# Sentiment Anlysis on Movie review Dataset

In [1]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.classify import DecisionTreeClassifier
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
#Returns a dictionary of all tokens (key=token and value=True)
#In this format NaiveBayes expects the input
def create_feature(word_list):
    useful_words=[]
    for token in word_list:
        if token not in stopwords.words("english"):
            useful_words.append(token)  
            
    my_dict =dict([w,True] for w in useful_words)
    return my_dict    

In [3]:
create_feature(["mohan","am","goes","to","market","school","goes","I"])

{'I': True, 'goes': True, 'market': True, 'mohan': True, 'school': True}

In [4]:
neg_words = []
for f_id in movie_reviews.fileids('neg'):
    words = movie_reviews.words(f_id)
    dict4_each_file = create_feature(words)
    neg_words.append((dict4_each_file,"Bad"))
#print(len(neg_words))

In [7]:
pos_words = []
for f_id in movie_reviews.fileids('pos'):
    words = movie_reviews.words(f_id)
    dict4_each_file = create_feature(words)
    pos_words.append((dict4_each_file,"Good"))
#print(pos_words[0])
#print(len(pos_words))

In [8]:
#Traning and testing data
training_data = pos_words[:750]+neg_words[:750]
testing_data = pos_words[750:]+neg_words[750:]
#print(len(training_data),len(testing_data))

# Naive Bayes

In [9]:
classifer = NaiveBayesClassifier.train(training_data)

In [10]:
accuracy = nltk.classify.util.accuracy(classifer,testing_data)
print(accuracy*100)

72.39999999999999


In [11]:
#User input text 
user_input = '''I am abhijeet adhikary i do practice on competitive programming, and i am happy to do that kill belongs kill from kishnaganj,blood donate my father is doctor fight and my mother is great
my younger brothe is student'''
print(user_input)

I am abhijeet adhikary i do practice kill belongs kill from kishnaganj,blood donate my father is doctor fight and my mother is great
my younger brothe is student


In [12]:
words = word_tokenize(user_input)
featured_words = create_feature(words)
classifer.classify(featured_words)

'Good'

In [13]:
from sklearn.svm import LinearSVC

In [14]:
classifer.show_most_informative_features()

Most Informative Features
             magnificent = True             Good : Bad    =     15.0 : 1.0
             outstanding = True             Good : Bad    =     13.6 : 1.0
               insulting = True              Bad : Good   =     13.0 : 1.0
              vulnerable = True             Good : Bad    =     12.3 : 1.0
               ludicrous = True              Bad : Good   =     11.8 : 1.0
                  avoids = True             Good : Bad    =     11.7 : 1.0
             uninvolving = True              Bad : Good   =     11.7 : 1.0
              astounding = True             Good : Bad    =     10.3 : 1.0
             fascination = True             Good : Bad    =     10.3 : 1.0
                 idiotic = True              Bad : Good   =      9.8 : 1.0


## Support Vector Machine(SVM)

In [15]:
import os
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

In [16]:
#Reads data from each file
def add_data_from_files (file_list,data_list):
    for f in file_list:
        with open(f,'r') as fh:
            data_list.append(fh.read())

In [17]:
directory = os.path.join("C:\\Users\\kiit\\AppData\\Roaming\\","nltk_data\\corpora\\movie_reviews\\")
print(directory)

C:\Users\kiit\AppData\Roaming\nltk_data\corpora\movie_reviews\


In [35]:
clses = ["pos","neg"]
# The data is in the data_dir, sorted into subdirectories, one for each class.
data_dirs = [os.path.join(directory,cls) for cls in clses]
print(data_dirs)

['C:\\Users\\kiit\\AppData\\Roaming\\nltk_data\\corpora\\movie_reviews\\pos', 'C:\\Users\\kiit\\AppData\\Roaming\\nltk_data\\corpora\\movie_reviews\\neg']


In [36]:
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,stop_words='english')

In [37]:
train_labels = []
test_labels = []

In [38]:
train_data = []
test_data = []
training_proportion = (9,10)

In [39]:
for i,cls  in enumerate(clses):
    d_dir = data_dirs[i]
    #chnage directory
    os.chdir(d_dir)
    #all file inside d_dir
    cls_files = os.listdir(d_dir)
    #store total no. of files 
    num_cls_files = len(cls_files)
    #start and end index of traning and testing data 
    training_index = (training_proportion[0] *(num_cls_files/training_proportion[1]))
    
    train_labels.extend(cls for f in cls_files[:int(training_index)])
    test_labels.extend(cls for f in cls_files[int(training_index):])
    
    add_data_from_files (cls_files[:int(training_index)],train_data)
    add_data_from_files (cls_files[int(training_index):],test_data)

In [40]:
train_features = vectorizer.fit_transform(train_data)
test_features = vectorizer.transform(test_data)
train_features.shape

(1800, 37673)

In [41]:
clf = LinearSVC(loss='squared_hinge', penalty="l2",dual=False, tol=1e-3)

In [42]:
# Train (or "fit") the model to the training data.
clf.fit(train_features,train_labels)

LinearSVC(C=1.0, class_weight=None, dual=False, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.001,
     verbose=0)

In [43]:
# Test the model on the test data.
predicted_svm = clf.predict(test_features)

In [44]:
acc = clf.score(test_features,test_labels)*100

In [45]:
print(acc)

89.5


In [46]:
#user_input = "I am abhijeet adhikary i do practice kill belongs kill from kishnaganj,blood donate my father is doctor fight and my mother is great my younger brothe is student"
#vectorizer1 = TfidfVectorizer(sublinear_tf=True, max_df=1,stop_words='english')
#user_input_features = vectorizer1.fit_transform([user_input])
#newlabels = clf.predict(user_input_features)

# K-Nearest Neighbor

In [47]:
from sklearn.neighbors import KNeighborsClassifier

In [48]:
negih = KNeighborsClassifier(n_neighbors = 3)

In [49]:
negih.fit(train_features,train_labels)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=3, p=2,
           weights='uniform')

In [50]:
predicted_knn = negih.predict(test_features)

In [51]:
acc = negih.score(test_features,test_labels)*100

In [52]:
print(acc)

71.0


# Decision Tree

In [53]:
from sklearn import tree

In [54]:
clf = tree.DecisionTreeClassifier()

In [55]:
clf.fit(train_features,train_labels)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [56]:
predicted_dt = clf.predict(test_features)

In [57]:
acc = clf.score(test_features,test_labels)
print(acc)

0.64


# Random forest

In [64]:
from sklearn.ensemble import RandomForestClassifier

In [200]:
classifier = RandomForestClassifier(n_estimators = 53, criterion = 'entropy', random_state = 0)

In [201]:
classifier.fit(train_features,train_labels)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=53, n_jobs=1,
            oob_score=False, random_state=0, verbose=0, warm_start=False)

In [202]:
predicted_rf = classifier.predict(test_features)

In [203]:
acc = classifier.score(test_features,test_labels)

In [204]:
print(acc*100)

85.5
