In [1]:
import glob
import pandas as pd
import numpy as np

In [2]:
path_neg = glob.glob('imdb1/neg/cv*.txt')
path_pos = glob.glob('imdb1/pos/cv*.txt')

In [3]:
neg_list, pos_list = [],[]
for _file_neg, _file_pos in zip(path_neg, path_pos):
    with open(_file_neg) as _open_neg, open(_file_pos) as _open_pos:
        neg_list.append(_open_neg.readlines())
        pos_list.append(_open_pos.readlines())

In [4]:
import itertools
pos_list_n = list(itertools.chain(*pos_list))
neg_list_n = list(itertools.chain(*neg_list))

In [5]:
pos_df = pd.DataFrame(pos_list_n)
pos_df['class'] = np.ones((pos_df.shape[0],), dtype=int)
pos_df.columns = ['review', 'class']

In [6]:
neg_df = pd.DataFrame(neg_list_n)
neg_df['class'] = np.zeros((neg_df.shape[0],), dtype=int)
neg_df.columns = ['review', 'class']

In [7]:
import re
def clean_data(text):
    text = str(text)
    text = re.sub('\n', ' ', text)
    text = re.sub(r'[^\w\s]', ' ', text)
    text = re.sub('  ', ' ', text)
    return text
clean_round = lambda x: clean_data(x)

In [8]:
from collections import OrderedDict 

def removeDupWithoutOrder(str): 
    return "".join(OrderedDict.fromkeys(str))  
duplicate_round = lambda x: removeDupWithoutOrder(x)

In [9]:
data = pd.concat([pos_df,neg_df], axis=0, ignore_index=True)
data['reviewCleaned'] = pd.DataFrame(data.review.apply(clean_round))
data['review_naduplicate'] = pd.DataFrame(data.review.apply(duplicate_round))

In [10]:
def count_vector(text):
    count = len(re.split(' ', text))
    return count

count_round = lambda x: count_vector(x)
data['count'] = pd.DataFrame(data.review.apply(count_round))
data.head()

Unnamed: 0,review,class,reviewCleaned,review_naduplicate,count
0,films adapted from comic books have had plenty...,1,films adapted from comic books have had plenty...,"films adpterocbkhvnyu,w'()g.\n",53
1,"for starters , it was created by alan moore ( ...",1,for starters it was created by alan moore an...,"for stae,iwcdbylnm(p)hugv'8012-.\n",37
2,to say moore and campbell thoroughly researche...,1,to say moore and campbell thoroughly researche...,to saymrendcpblhugjfkiw.\n,28
3,"the book ( or "" graphic novel , "" if you will ...",1,the book or graphic novel if you will is ...,"the bok(r""gapicnvl,fyuw)s50d3m.\n",31
4,"in other words , don't dismiss this film becau...",1,in other words don t dismiss this film becaus...,"in otherwds,'mflbcau.\n",14


In [15]:
from numpy import array
from sklearn.model_selection import KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
from sklearn.metrics import confusion_matrix

def MNBCrossFold(df, k, column=[], _class='class', **kwargs):
    vect = TfidfVectorizer(**kwargs)
    _nb_model = MultinomialNB(alpha=1.0, fit_prior=True)
    
    X, y = df[column], df[_class]
    kf = KFold(n_splits=k)
    kf.get_n_splits(X)
    confusion_sum = np.zeros((2, 2))
    for train_index, test_index in kf.split(X):
    #     print('TRAIN:', len(train_index), 'TEST:', len(test_index))
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        vect.fit(X_train)
        X_train_dtm = vect.transform(X_train)
        X_train_dtm = vect.fit_transform(X_train)
        X_test_dtm = vect.transform(X_test)

        _nb_model.fit(X_train_dtm, y_train)
        y_pred_class = _nb_model.predict(X_test_dtm)
        print(metrics.accuracy_score(y_test, y_pred_class))
#         print(confusion_matrix(y_test, y_pred_class))
        confusion_sum = np.add(confusion_sum, confusion_matrix(y_test, y_pred_class))
    print(confusion_sum)

print('------ Text As it\'s ------')
MNBCrossFold(data, k=10, column = 'review')
print('------ Removeing Stop Words ------')
MNBCrossFold(data, k=10, column = 'reviewCleaned', stop_words='english')
print('------ Duplicate Removed ------')
MNBCrossFold(data, k=10, column = 'review_naduplicate', stop_words='english', ngram_range=(2, 2), lowercase=True)
print('------ Bigram Feature ------')
MNBCrossFold(data, k=10, column = 'review', stop_words='english', ngram_range=(2, 2), lowercase=True)
print('------ Trigrams Feature ------')
MNBCrossFold(data, k=10, column = 'review', stop_words='english', ngram_range=(3, 3), lowercase=True)

------ Text As it's ------
0.4527194066749073
0.4624536464771323
0.4445302843016069
0.45750927070457353
0.43139678615574784
0.4408220024721879
0.35058714462299134
0.35939431396786153
0.3666563658838072
0.36341161928306553
[[11696. 20087.]
 [17907. 15030.]]
------ Removeing Stop Words ------
0.4780593325092707
0.484548825710754
0.47141532756489496
0.4791409147095179
0.4530284301606922
0.45488257107540175
0.3777812113720643
0.3855067985166873
0.3977132262051916
0.39137824474660077
[[12533. 19250.]
 [17165. 15772.]]
------ Duplicate Removed ------
0.01993201483312732
0.023022249690976514
0.011588380716934486
0.014678615574783683
0.010352286773794808
0.10599505562422744
0.015451174289245983
0.015451174289245983
0.022867737948084055
0.010352286773794808
[[  529. 31254.]
 [31850.  1087.]]
------ Bigram Feature ------
0.34085290482076636
0.36016687268232384
0.33420889987639063
0.34517923362175523
0.3318912237330037
0.36418417799752784
0.27796662546353523
0.2804388133498146
0.2835290482076638


In [None]:
.....ll9
