In [66]:
import os
import numpy as np
import pandas as pd

In [67]:
def make_Corpus(root_dir):
    """Read every review file under ``root_dir`` into a list of token lists.

    ``root_dir`` is expected to contain one sub-directory per polarity class,
    each holding one text file per review. Sub-directories and the files inside
    them are visited in sorted path order, so the document order is stable for
    a given directory tree.

    Parameters
    ----------
    root_dir : str
        Directory whose sub-directories contain the review files.

    Returns
    -------
    list of list of str
        One token list per review file. Every document, including the first
        one, is tokenised the same way. Tokens are obtained by removing
        carriage returns, turning newlines and tabs into spaces, collapsing
        double spaces once and splitting on single spaces, so runs of
        whitespace can still yield empty-string tokens.
    """
    # Only descend into directories: a stray file in root_dir would otherwise
    # make the os.listdir() call below fail.
    polarity_dirs = sorted(
        path
        for path in (os.path.join(root_dir, name) for name in os.listdir(root_dir))
        if os.path.isdir(path)
    )

    corpus = []
    for polarity_dir in polarity_dirs:
        # Only regular files are reviews; nested directories are skipped.
        reviews = sorted(
            path
            for path in (
                os.path.join(polarity_dir, name) for name in os.listdir(polarity_dir)
            )
            if os.path.isfile(path)
        )
        for review in reviews:
            with open(review) as rev:
                doc_string = rev.read()
            tokens = (
                doc_string.replace("\r", "")
                .replace("\n", " ")
                .replace("\t", " ")
                .replace("  ", " ")
                .split(" ")
            )
            # Always append the token list; the first document must not be
            # stored as a raw string, or it would be iterated character by
            # character by the code that consumes the corpus.
            corpus.append(tokens)
    return corpus
 
# Build the corpus: make_Corpus adds one entry per review file found in the
# sub-directories of root_dir. The path is relative, so it is resolved against
# the kernel's current working directory.
root_dir = '../preprocessedDataset'
corpus = make_Corpus(root_dir)


In [68]:
def computeTF(wordDict, bow):
    """Compute the term frequency of every word for one document.

    Parameters
    ----------
    wordDict : dict
        Maps each vocabulary word to its raw count in the document.
    bow : sequence
        The document's tokens; only its length is used, as the normaliser.

    Returns
    -------
    dict
        Maps each word of ``wordDict`` to ``count / len(bow)``. For an empty
        ``bow`` every word maps to ``0.0`` instead of raising
        ``ZeroDivisionError``.
    """
    bowCount = len(bow)
    if bowCount == 0:
        # An empty document has no terms, so every frequency is zero.
        return dict.fromkeys(wordDict, 0.0)
    return {word: count / float(bowCount) for word, count in wordDict.items()}

def computeIDF(docList):
    """Compute the inverse document frequency of every word.

    Parameters
    ----------
    docList : list of dict
        One word -> count mapping per document.

    Returns
    -------
    dict
        Maps each word seen as a key in any document to
        ``log10(N / df)``, where ``N`` is the number of documents and ``df``
        the number of documents in which the word's count is positive.
        A word whose count is never positive gets ``0.0`` (its term frequency
        is zero everywhere, so its TF-IDF is zero regardless). An empty
        ``docList`` yields an empty dict.
    """
    import math

    N = len(docList)
    if N == 0:
        return {}

    # Collect keys from every document, not just the first one, so a word
    # missing from docList[0] does not raise KeyError. Insertion order starts
    # with the first document's keys.
    docFreq = {}
    for doc in docList:
        for word, val in doc.items():
            docFreq.setdefault(word, 0)
            if val > 0:
                docFreq[word] += 1

    idfDict = {}
    for word, df in docFreq.items():
        # Guard against df == 0 to avoid a division by zero.
        idfDict[word] = math.log10(N / float(df)) if df > 0 else 0.0
    return idfDict

def computeTFIDF(tfBow, idfs):
    """Combine one document's term frequencies with the corpus-wide IDF values.

    Parameters
    ----------
    tfBow : dict
        Maps each word to its term frequency in the document.
    idfs : dict
        Maps each word to its inverse document frequency; it must contain
        every key of ``tfBow``.

    Returns
    -------
    dict
        Maps each word of ``tfBow`` to ``tf * idf``.
    """
    return {term: freq * idfs[term] for term, freq in tfBow.items()}

In [69]:
# Number of documents in the corpus.
N = len(corpus)

# Vocabulary: every distinct token that occurs in any document. Starting from
# an empty set (rather than corpus[0]) also works for an empty corpus.
wordSet = set()
for bow in corpus:
    wordSet.update(bow)

# Set iteration order depends on string hashing, which can differ between
# kernel sessions; sorting gives every per-document dict (and therefore the
# exported columns) the same, reproducible key order.
vocabulary = sorted(wordSet)

# One dense word -> count dict per document, covering the whole vocabulary.
wordDict = []
for bow in corpus:
    counts = dict.fromkeys(vocabulary, 0)
    for word in bow:
        counts[word] += 1
    wordDict.append(counts)

# Per-document term frequencies.
tfBow = []
for i in range(N):
    tfBow.append(computeTF(wordDict[i], corpus[i]))

# Corpus-wide inverse document frequencies; skipped for an empty corpus, where
# there is no document to take the vocabulary from.
idfs = computeIDF(wordDict) if wordDict else {}

# Per-document TF-IDF weights.
tfidfBow = []
for i in range(N):
    tfidfBow.append(computeTFIDF(tfBow[i], idfs))

In [70]:
# Turn the list of per-document TF-IDF dicts into a table: one row per
# document, one column per word key.
tfidf = pd.DataFrame(tfidfBow)

In [71]:
# Write the TF-IDF table to disk. The target directory is created first so
# the export does not fail, after all the computation above, just because
# '../tfidf' does not exist yet.
output_path = '../tfidf/complete.csv'
output_dir = os.path.dirname(output_path)
if not os.path.isdir(output_dir):
    os.makedirs(output_dir)
tfidf.to_csv(output_path, index=False)