## Initialization

In [1]:
import pandas as pd
import numpy as np
import re

In [2]:
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

In [3]:
model_accuracies = {'LogReg':1, 'DT':1, 'RF':1, 'LinearSVC':1, 'KernelSVC':1, 'NB':1, 'KNN':1}

## Importing the data

In [4]:
dataset = pd.read_csv('imdb_labelled.txt', header = None, delimiter = '\t', quoting = 3)
dataset.shape

(1000, 2)

In [5]:
dataset.head()

Unnamed: 0,0,1
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


## Preprocess the Data

In [6]:
corpus = []

In [7]:
for i in range(0, dataset.shape[0]):
    sent = re.sub('[^a-zA-Z]', ' ', dataset[0][i])
    sent = sent.lower()
    sent = sent.split()
    ps = PorterStemmer()
    sent = [ps.stem(word) for word in sent if not word in set(stopwords.words('english'))]
    sent = ' '.join(sent)
    corpus.append(sent)

In [8]:
corpus

['slow move aimless movi distress drift young man',
 'sure lost flat charact audienc nearli half walk',
 'attempt arti black white clever camera angl movi disappoint becam even ridicul act poor plot line almost non exist',
 'littl music anyth speak',
 'best scene movi gerardo tri find song keep run head',
 'rest movi lack art charm mean empti work guess empti',
 'wast two hour',
 'saw movi today thought good effort good messag kid',
 'bit predict',
 'love cast jimmi buffet scienc teacher',
 'babi owl ador',
 'movi show lot florida best made look appeal',
 'song best muppet hilari',
 'cool',
 'right case movi deliv everyth almost right face',
 'averag act main person low budget clearli see',
 'review long overdu sinc consid tale two sister singl greatest film ever made',
 'put gem movi term screenplay cinematographi act post product edit direct aspect film make',
 'practic perfect true masterpiec sea faux masterpiec',
 'structur film easili tightli construct histori cinema',
 'think fil

## Create Bag of Words Model

In [15]:
cv = CountVectorizer(max_features = 2300)

In [16]:
X = cv.fit_transform(corpus).toarray()

In [17]:
Y = dataset.iloc[:, 1].values

In [18]:
X.shape

(1000, 2300)

In [19]:
Y.shape

(1000,)

## Create Train and Test Data

In [20]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 4)

In [21]:
X_train.shape

(800, 2300)

In [22]:
X_test.shape

(200, 2300)

In [23]:
Y_train.shape

(800,)

In [24]:
Y_test.shape

(200,)

In [25]:
pd.DataFrame(pd.DataFrame(Y_train)[0].value_counts())

Unnamed: 0,0
1,411
0,389


In [26]:
pd.DataFrame(pd.DataFrame(Y_test)[0].value_counts())

Unnamed: 0,0
0,111
1,89


## DecisionTree

In [27]:
clf_dt = DecisionTreeClassifier(criterion = 'entropy')

In [28]:
clf_dt.fit(X_train, Y_train)

DecisionTreeClassifier(class_weight=None, criterion='entropy', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [29]:
Y_pred_dt = clf_dt.predict(X_test)

In [30]:
confusion_matrix(Y_pred_dt, Y_test)

array([[80, 15],
       [31, 74]], dtype=int64)

## Random Forest

In [31]:
clf_rf = RandomForestClassifier(n_estimators = 10, criterion = 'entropy')

In [32]:
clf_rf.fit(X_train, Y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [33]:
Y_pred_rf = clf_rf.predict(X_test)

In [34]:
confusion_matrix(Y_pred_rf, Y_test)

array([[72, 12],
       [39, 77]], dtype=int64)

## Naive Bayes

In [35]:
clf_nb = GaussianNB()

In [36]:
clf_nb.fit(X_train, Y_train)

GaussianNB(priors=None)

In [37]:
Y_pred_nb = clf_nb.predict(X_test)

In [38]:
confusion_matrix(Y_pred_nb, Y_test)

array([[92, 38],
       [19, 51]], dtype=int64)

## KNN

In [39]:
clf_knn = KNeighborsClassifier(n_neighbors = 5)

In [40]:
clf_knn.fit(X_train, Y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [41]:
Y_pred_knn = clf_knn.predict(X_test)

In [42]:
confusion_matrix(Y_pred_knn, Y_test)

array([[43,  8],
       [68, 81]], dtype=int64)

## Logistic Regression

In [43]:
clf_lr = LogisticRegression()

In [44]:
clf_lr.fit(X_train, Y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [45]:
Y_pred_lr = clf_lr.predict(X_test)

In [46]:
confusion_matrix(Y_pred_lr, Y_test)

array([[85, 12],
       [26, 77]], dtype=int64)

## Linear SVC

In [47]:
clf_lsvc = SVC(kernel = 'linear')

In [48]:
clf_lsvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='linear',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [49]:
Y_pred_lsvc = clf_lsvc.predict(X_test)

In [50]:
confusion_matrix(Y_pred_lsvc, Y_test)

array([[82, 15],
       [29, 74]], dtype=int64)

## Kernel SVC

In [51]:
clf_ksvc = SVC(kernel = 'rbf')

In [52]:
clf_ksvc.fit(X_train, Y_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)

In [53]:
Y_pred_ksvc = clf_ksvc.predict(X_test)

In [54]:
confusion_matrix(Y_pred_ksvc, Y_test)

array([[  0,   0],
       [111,  89]], dtype=int64)

## Accuracy of Various Models

In [55]:
model_accuracies['DT'] = accuracy_score(Y_pred_dt, Y_test)
model_accuracies['KNN'] = accuracy_score(Y_pred_knn, Y_test)
model_accuracies['KernelSVC'] = accuracy_score(Y_pred_ksvc, Y_test)
model_accuracies['LinearSVC'] = accuracy_score(Y_pred_lsvc, Y_test)
model_accuracies['LogReg'] = accuracy_score(Y_pred_lr, Y_test)
model_accuracies['NB'] = accuracy_score(Y_pred_nb, Y_test)
model_accuracies['RF'] = accuracy_score(Y_pred_rf, Y_test)
model_accuracies

{'DT': 0.77000000000000002,
 'KNN': 0.62,
 'KernelSVC': 0.44500000000000001,
 'LinearSVC': 0.78000000000000003,
 'LogReg': 0.81000000000000005,
 'NB': 0.71499999999999997,
 'RF': 0.745}