In [162]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import random
import pickle
import numpy as np
import pandas as pd

#Sklearn wrapper in nltk
from nltk.classify.scikitlearn import SklearnClassifier

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

In [190]:
def save_pickles(item, file_name):
    # https://stackoverflow.com/questions/14622314/python-inserting-variable-string-as-file-name
    f = open('pickled/%s.pickle' % file_name, 'wb')
    pickle.dump(item, f)
    f.close()

In [163]:
short_pos = open("short_reviews/positive.txt", "r", encoding="latin2").read()
short_neg =  open("short_reviews/negative.txt", "r", encoding="latin2").read()

In [164]:
documents = []
all_words = []
allowed_types = ["J"]

#Appending the positive 
for r in short_pos.split('\n'):
    documents.append((r, "pos"))
    words = word_tokenize(r)
    #Extracting position of speech
    pos = nltk.pos_tag(words)
    #Adding only adjectives
    for w in pos:
        if w[1][0] in allowed_types:
            all_words.append(w[0].lower())
        

#Appending the negative
for r in short_neg.split('\n'):
    documents.append((r, "neg"))
    # example of r: documents[0]
    
    words = word_tokenize(r)
    # example of tokenize: word_tokenize(documents[0][0])
    
    # Extracting position of speech
    pos = nltk.pos_tag(words)
    # example of position tagging: nltk.pos_tag(word_tokenize(documents[0][0]))
    
    #Adding only adjectives
    for w in pos:
        # We use 1 to extract the part of speech
        # We use 0 to extract the first element from the xx or xxx part of speech
        # (Because we want to capture all adjectives, adverbs etc. )
        # We dont want nouns and comma etc.
        # example: nltk.pos_tag(word_tokenize(documents[0][0]))[0][1][0]
        if w[1][0] in allowed_types:
            all_words.append(w[0].lower())

In [191]:
save_pickles(documents, "documets")

In [166]:
all_words = nltk.FreqDist(all_words)

In [167]:
word_features = [w[0] for w in all_words.most_common(5000)]

In [168]:
save_pickles(word_features, "features")

In [169]:
def find_features(document):
    '''
    document = list of all the words in a review
    '''
    # To extract only the unique words in a document
    words = word_tokenize(document)
    features = {}
    
    #Set true or false based on the if the word is 
    #present in the top 3000 words
    for w in word_features:
        features[w] = (w in words)
        
    return features    

In [170]:
feature_sets = [(find_features(rev), category) for (rev, category) in documents]

In [171]:
random.shuffle(feature_sets)

In [172]:
X = []
y = []
for rev, category in documents:
    X.append(find_features(rev))
    y.append(category)

In [173]:
X = pd.DataFrame.from_dict(X)
#https://stackoverflow.com/questions/18837262/convert-python-dict-into-a-dataframe

In [174]:
save_X = open("pickled/X.pickle", "wb")
pickle.dump(X, save_X)
save_X.close()
save_pickles(X, "X")

In [175]:
save_pickles(y, "y")

In [176]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=101)

# Multinomial Naive Bayes

In [177]:
MNB_classifier = MultinomialNB()
MNB_classifier.fit(X_train, y_train)
MNB_classifier.score(X_train, y_train)

0.82499120853358343

In [178]:
MNB_classifier.score(X_test, y_test)

0.72714486638537268

In [179]:
confusion_matrix(y_pred=MNB_classifier.predict(X_train), y_true=y_train)

array([[3472,  791],
       [ 702, 3566]])

In [180]:
save_pickles(MNB_classifier, "MNB_classifier")

# Bernoulli Naive Bayes

In [181]:
BNB_classifier = BernoulliNB()
BNB_classifier.fit(X_train, y_train)
BNB_classifier.score(X_train, y_train)

0.83436877271128829

In [182]:
BNB_classifier.score(X_test, y_test)

0.72948898265353956

In [183]:
confusion_matrix(y_pred=BNB_classifier.predict(X_train), y_true=y_train)

array([[3739,  524],
       [ 889, 3379]])

In [None]:
save_pickles(BNB_classifier, "BNB_classifier")

# LogisticRegression

In [142]:
LR_classifier = LogisticRegression()
LR_classifier.fit(X_train, y_train)
LR_classifier.score(X_train, y_train)

0.85394443793224706

In [143]:
LR_classifier.score(X_test, y_test)

0.72058134083450542

In [144]:
confusion_matrix(y_pred=LR_classifier.predict(X_train), y_true=y_train)

array([[3708,  555],
       [ 691, 3577]])

In [None]:
save_pickles(LR_classifier, "LR_classifier")

# SGDClassifier

In [145]:
SGD_classifier = SGDClassifier()
SGD_classifier.fit(X_train, y_train)
SGD_classifier.score(X_train, y_train)



0.8205368655491736

In [146]:
SGD_classifier.score(X_test, y_test)

0.68682606657290202

In [147]:
confusion_matrix(y_pred=SGD_classifier.predict(X_train), y_true=y_train)

array([[3091, 1172],
       [ 359, 3909]])

In [None]:
save_pickles(SGD_classifier, "SGD_classifier")

In [25]:
# Not using SGD Classifier as it is predicting everything as positive.

# SVC -> Takes too Long to run!

SVC_classifier = SVC()
SVC_classifier.fit(X_train, y_train)
SVC_classifier.score(X_train, y_train)

SVC_classifier.score(X_test, y_test)

confusion_matrix(y_pred = SVC_classifier.predict(X_train), y_true=y_train)

# Random Forest Classifier

In [148]:
RF_classifier = RandomForestClassifier()
RF_classifier.fit(X_train, y_train)
RF_classifier.score(X_train, y_train)

0.96565467119915604

In [149]:
RF_classifier.score(X_test, y_test)

0.67135489920300051

In [150]:
confusion_matrix(y_pred = RF_classifier.predict(X_train), y_true=y_train)

array([[4187,   76],
       [ 217, 4051]])

In [None]:
save_pickles(RF_classifier, "RF_classifier")

In [32]:
# Basic RFC model appears to be overfitting

# KNN Classifier -> Takes too long to run!

KNN_classifier = KNeighborsClassifier()
KNN_classifier.fit(X_train, y_train)
KNN_classifier.score(X_train, y_train)

KNN_classifier.score(X_test, y_test)

confusion_matrix(y_pred = KNN_classifier.predict(X_train), y_true=y_train)

# Ensemble Models

In [222]:
# http://scikit-learn.org/stable/modules/ensemble.html
eclf = VotingClassifier(estimators=[('MNB', MNB_classifier),
                                    ('BNB', BNB_classifier),
                                    #('SGD', SGDClassifier_classifier),
                                    ('LR', LogisticRegression_classifier)],
                                    #('RFC', RFC_classifier)],
                        voting='soft')

In [223]:
eclf.fit(X_train, y_train)
eclf.score(X_train, y_train)

0.84421521509787834

In [224]:
eclf.score(X_test, y_test)

0.73370839193624005

In [225]:
confusion_matrix(y_pred = eclf.predict(X_train), y_true=y_train)

array([[3707,  556],
       [ 773, 3495]])

In [228]:
save_pickles(eclf, "Voting_classifier")

In [229]:
pred = eclf.predict_proba(np.array(new_list).reshape(1, -1))

In [231]:
np.max(pred)

0.68202996877625532

In [227]:
eclf.predict(np.array(new_list).reshape(1, -1))

array(['neg'],
      dtype='<U3')

# Build Classifier Module

In [88]:
def sentiment_module(text):
    feats = find_features(text)
    feats = [items for (keys, items) in feats.items()]
    feats = np.array(feats).reshape(1, -1)
    return eclf.predict(feats)

In [155]:
# Testing positive sentiment
feats = find_features("This movie was great! The acting was great, plot was wonderfule, and there were pythons...")

In [158]:
# Testing negative sentiment
feats = find_features("The movie was utter junk. There were absolutely 0 pythons. I don't see what the point was at all. Horrible movie, 0/10")

In [117]:
# new_list = []
# for keys, items in feats.items():
#     new_list.append(items)

In [159]:
new_list = [items for (keys, items) in feats.items()]

In [160]:
eclf.predict(np.array(new_list).reshape(1, -1))

array(['neg'],
      dtype='<U3')

In [218]:
import sentiment_analysis as sa

In [219]:
positive = "This movie was great! The acting was great, plot was wonderfule, and there were pythons..."

In [220]:
sa.sentiment_module(positive)

NameError: name 'word_tokenize' is not defined