In [1]:
import nltk 
import numpy as np 
import pickle
import random
import pandas as pd
from nltk.classify.scikitlearn import SklearnClassifier
#from sklearn.naive_bayes import MultinomialNB, BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import  LinearSVC, NuSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from nltk.classify import ClassifierI
from statistics import mode

In [2]:
pos_reviews=open("/Users/ashvinsrinivasan/Desktop/positive.txt","r", encoding='latin2').read()
neg_reviews=open("/Users/ashvinsrinivasan/Desktop/negative.txt","r", encoding='latin2').read()

####### Documents have each word entries with its associated category ######
documents=[]
for ii in pos_reviews.split('\n'):
    documents.append((ii,'pos'))
for ii in neg_reviews.split('\n'):
    documents.append((ii,'neg'))

In [3]:
def extract(word_list):
    all_word_types=['J','V','N']
    ret_list=[]
    temp=nltk.pos_tag(word_list)
    for w in temp:
        if (w[1][0]) in all_word_types:
            ret_list.append(w[0].lower())
        
    return ret_list

In [4]:
%%time
####### pos_reviews_words have words  from positive text and likewise for neg_reviews_words   

pos_reviews_words=nltk.word_tokenize(pos_reviews)
neg_reviews_words=nltk.word_tokenize(neg_reviews)
imp_pos_words=extract(pos_reviews_words)
imp_neg_words=extract(neg_reviews_words)

##### all_words contain the set of words occuring in all documents ########

all_words=[]
for ii in imp_pos_words:  # put pos_reviews_words to include all words
    all_words.append(ii.lower())
for ii in imp_neg_words:  # put neg_reviews_words to include all words
    all_words.append(ii.lower())    

####### all_words_freq contains all words with its associated frequencies #######
all_words_freq=nltk.FreqDist(all_words)
word_feats=list(all_words_freq.keys())[:5000]

''' find_feats function returns true if each word of the 5000 words is present 
in the document else false. So every features returned is a 5000 vector with 
true or false entries
'''

CPU times: user 12.9 s, sys: 67.5 ms, total: 12.9 s
Wall time: 13 s


In [5]:
temp_var=open('word_feats.pickle','wb')
pickle.dump(word_feats,temp_var)
temp_var.close()



In [6]:
def find_feats(document):
    words=nltk.word_tokenize(document)
    words=extract(words)
    features={}
    for w in word_feats:
         features[w]=w in words
    return features


### Creating feature list for each document 
features_list=[(find_feats(review),category) for (review,category) in documents]
random.shuffle(features_list)

train_set=features_list[:10000]
test_set=features_list[10000:]

def find_pred(clf,test_set):
    y_pred=[clf.classify(test_set[i][0]) for i in range(len(test_set))]
    y_true=[test_set[i][1] for i in range(len(test_set))]
    y_pred=pd.Series(y_pred).apply(lambda x:0 if x=='neg' else 1)
    y_true=pd.Series(y_true).apply(lambda x:0 if x=='neg' else 1)
    accuracy=accuracy_score(y_true,y_pred)*100
    f_beta=fbeta_score(y_true,y_pred,beta=0.5)
    return accuracy,f_beta

In [7]:
temp_var=open('train_data.pickle','wb')
pickle.dump(train_set,temp_var)
temp_var.close()

temp_var=open('test_data.pickle','wb')
pickle.dump(test_set,temp_var)
temp_var.close()

In [8]:
train_train=train_set[:9000]
train_val=train_set[9000:]
    
classifier = nltk.NaiveBayesClassifier.train(train_set)
print("Original Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, test_set))*100)


Original Naive Bayes Algo accuracy percent: 66.71686746987952


In [9]:
save_clf=open('OrigNB.pickle','wb')
pickle.dump(classifier,save_clf)
save_clf.close()

In [10]:
pwd

'/Users/ashvinsrinivasan/Desktop/Machinelearning/Udacity_dir/Advanced/capstone'

In [11]:
%%time
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(train_set)
acc,beta=find_pred(LogisticRegression_classifier,test_set)
print('Logistic Regression accuracy is {} and f_beta is {}'.format(acc,beta))



Logistic Regression accuracy is 66.26506024096386 and f_beta is 0.6750788643533122
CPU times: user 34.8 s, sys: 972 ms, total: 35.8 s
Wall time: 35.8 s


In [12]:

save_clf=open('Logistic_reg.pickle','wb')
pickle.dump(LogisticRegression_classifier,save_clf)
save_clf.close()

In [13]:
%%time
######## linear SVC Classifier with grid search for different hyper parameters combination ###########

linsvc_params={}
linsvc_params['C']=[2**-3,2**-1,2**0,2**2,2**4,2**6,2**8,2**10]
dict_map={}
var=0
linsvc_acc=[]
linsvc_beta=[]
unopt_linSVC_classifier = SklearnClassifier(LinearSVC())
for ii in linsvc_params['C']:
        linSVC_classifier = SklearnClassifier(LinearSVC(C=ii))
        dict_map[var]=[ii] 
        linSVC_classifier.train(train_train)
        acc,beta=find_pred(linSVC_classifier,train_val)
        linsvc_acc.append(acc)
        linsvc_beta.append(beta)
        var+=1
best_linsvcparams=dict_map[np.where(linsvc_acc==np.max(linsvc_acc))[0][0]]        
        
##### Training lin svc classifier with best hyper parameters
linsvc_bestclf = SklearnClassifier(LinearSVC(C=best_linsvcparams[0]))

linsvc_bestclf.train(train_set)
unopt_linSVC_classifier.train(train_set)

acc,beta=find_pred(linsvc_bestclf,test_set)
unopt_acc,unopt_beta=find_pred(unopt_linSVC_classifier,test_set)

print('optimised Linear SVC accuracy is {} and f_beta is {}'.format(acc,beta))
print('unoptimised Linear SVC accuracy is {} and f_beta is {}'.format(unopt_acc,unopt_beta))

optimised Linear SVC accuracy is 64.7590361445783 and f_beta is 0.6530717604543108
unoptimised Linear SVC accuracy is 63.704819277108435 and f_beta is 0.6441558441558441
CPU times: user 5min 41s, sys: 9.44 s, total: 5min 50s
Wall time: 5min 50s


In [15]:
save_clf=open('LinearSVC.pickle','wb')
pickle.dump(linsvc_bestclf,save_clf)
save_clf.close()
save_clf=open('unopt_LinearSVC.pickle','wb')
pickle.dump(unopt_linSVC_classifier,save_clf)
save_clf.close()

In [16]:
######## Nu SVC Classifier with grid search for different hyper parameters combination ###########
Nusvc_params={}
Nusvc_params['nu']=[0.2,0.4,0.6,0.8]
Nusvc_params['kernel']=['rbf','poly']
dict_map={}
var=0
Nusvc_acc=[]
Nusvc_beta=[]
unopt_NuSVC_classifier = SklearnClassifier(NuSVC())
for ii in Nusvc_params['nu']:
    for jj in Nusvc_params['kernel']:
        NuSVC_classifier = SklearnClassifier(NuSVC(nu=ii,kernel=jj))
        dict_map[var]=[ii,jj] 
        NuSVC_classifier.train(train_train)
        acc,beta=find_pred(NuSVC_classifier,train_val)
        Nusvc_acc.append(acc)
        Nusvc_beta.append(beta)
        var+=1
best_nusvcparams=dict_map[np.where(Nusvc_acc==np.max(Nusvc_acc))[0][0]]        
        
##### Training Nu SVC classifier with best hyper parameters
NuSVC_bestclf = SklearnClassifier(NuSVC(nu=best_nusvcparams[0],
                                        kernel=best_nusvcparams[1]))
NuSVC_bestclf.train(train_set)
acc,beta=find_pred(NuSVC_bestclf,test_set)

unopt_NuSVC_classifier.train(train_set)
unopt_acc,unopt_beta=find_pred(unopt_NuSVC_classifier,test_set)

print('Nu SVC accuracy is {} and f_beta is {}'.format(acc,beta))
print('unoptimised Nu SVC accuracy is {} and f_beta is {}'.format(unopt_acc,unopt_beta))



Nu SVC accuracy is 66.41566265060241 and f_beta is 0.6663301362948006
unoptimised Nu SVC accuracy is 59.48795180722891 and f_beta is 0.6094117647058824


In [17]:
save_clf=open('NuSVC.pickle','wb')
pickle.dump(NuSVC_bestclf,save_clf)
save_clf.close()
save_clf=open('unopt_NuSVC.pickle','wb')
pickle.dump(unopt_NuSVC_classifier,save_clf)
save_clf.close()

In [18]:
######## Random Forest Classifier with grid search for different hyper parameters combination ###########
rf_params={}
rf_params['n_estimators']=[5,10,15]
rf_params['min_samples_split']=[2,10,50]
rf_params['criterion']=['entropy']
dict_map={}
var=0
rf_acc=[]
rf_beta=[]
unopt_rf_classifier=SklearnClassifier(RandomForestClassifier())
for ii in rf_params['n_estimators']:
    for jj in rf_params['min_samples_split']:
        for kk in rf_params['criterion']:
            rf_classifier=SklearnClassifier(RandomForestClassifier(n_estimators=ii,
                                                               min_samples_split=jj,
                                                               criterion=kk))
            dict_map[var]=[ii,jj,kk] 
            rf_classifier.train(train_train)
            acc,beta=find_pred(rf_classifier,train_val)
            rf_acc.append(acc)
            rf_beta.append(beta)
            var+=1
                                                             
                                                                                                                                                               
best_rfparams=dict_map[np.where(rf_acc==np.max(rf_acc))[0][0]]

####### Best random forest classifier ###########
rf_bestclf=SklearnClassifier(RandomForestClassifier(n_estimators=best_rfparams[0],
                                                       min_samples_split=best_rfparams[1],
                                                       criterion=best_rfparams[2]))
rf_bestclf.train(train_set)
#print("rf_classifier accuracy percent:", (nltk.classify.accuracy(rf_classifier, test_set))*100)
acc,beta=find_pred(rf_bestclf,test_set)

unopt_rf_classifier.train(train_set)
unopt_acc,unopt_beta=find_pred(unopt_rf_classifier,test_set)

print('Random Forest accuracy is {} and f_beta is {}'.format(acc,beta))
print('unoptimised Random Forest accuracy is {} and f_beta is {}'.format(unopt_acc,unopt_beta))

Random Forest accuracy is 62.65060240963856 and f_beta is 0.6346826586706646
unoptimised Random Forest accuracy is 60.69277108433735 and f_beta is 0.6183206106870229


In [19]:
save_clf=open('randomforest.pickle','wb')
pickle.dump(rf_bestclf,save_clf)
save_clf.close()
save_clf=open('unopt_randomforest.pickle','wb')
pickle.dump(unopt_rf_classifier,save_clf)
save_clf.close()

In [20]:
'''
Hyper parameters for each model is optimized based on the accuracy on validation set(train_val), upon which the model with 
best hyper parameters is trained on the entire training set(train_set).
'''

'\nHyper parameters for each model is optimized based on the accuracy on validation set(train_val), upon which the model with \nbest hyper parameters is trained on the entire training set(train_set).\n'