In [41]:
#importing libraries

import numpy as np
import pandas as pd

import os

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer

from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

#models
from sklearn.naive_bayes import MultinomialNB
import xgboost as xgb

#success metric
from  sklearn.metrics  import accuracy_score

In [12]:
#load the data from txt files to pandas dataframe

folder = 'aclImdb'

labels ={'pos': 1, 'neg': 0}

df = pd.DataFrame()

for f in ('test', 'train'):
    for l in ('pos', 'neg'):
        path = os.path.join(folder, f, l)
        for file in os.listdir(path):
            with open(os.path.join(path,file), 'r', encoding='utf-8') as infile:
                txt = infile.read()
            df = df.append([[txt, labels[l]]], ignore_index=True)

df.columns = ['review', 'sentiment']
            

In [13]:
#saving to csv for easier access
df.to_csv('imdb_data', index=False, encoding = 'utf-8')

In [47]:
df.sample(10)

Unnamed: 0,review,sentiment
12221,this was a thoughtful and well-shot and direct...,1
9812,I'm not a movie maker but I do know it is hard...,1
5683,AristoCats is such a terrific Disney classic t...,1
5934,"The movie is excellent. Acting, cinematography...",1
155,Gorgeous Annie Belle in her prime stars in thi...,1
47945,"Leslie Sands' stilted play ""Deadlock"" becomes ...",0
21557,I can find no redeeming value to this movie. I...,0
35851,Russ and Valerie are having discussions about ...,1
15899,This film appears to be an exposé of the curre...,0
43701,Soon after watching this film you will realize...,0


In [27]:
#checking the top words used in reviews by frequency distribution

#combining all reviews into one string
reviews = df['review'].str.cat(sep='')

#spliting text into separate words
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(reviews)

#setting vocabulary
vocabulary = set(tokens)
print(len(vocabulary))

freq_distr = nltk.FreqDist(tokens)

#sorting top 50 tokens
sorted(freq_distr, key=freq_distr.__getitem__, reverse=True)[0:50]


132599


['the',
 'and',
 'a',
 'of',
 'to',
 'is',
 'br',
 'in',
 'I',
 'it',
 'that',
 's',
 'this',
 'was',
 'The',
 'movie',
 'as',
 'with',
 'for',
 'film',
 'but',
 't',
 'on',
 'you',
 'are',
 'not',
 'have',
 'his',
 'be',
 'one',
 'he',
 'at',
 'by',
 'all',
 'an',
 'who',
 'they',
 'from',
 'like',
 'It',
 'so',
 'or',
 'about',
 'out',
 'her',
 'just',
 'has',
 'This',
 'some',
 'good']

In [28]:
#we have to remove stopwords

stop_words = set(stopwords.words('english'))
tokens = [w for w in tokens if not w in stop_words]
freq_distr = nltk.FreqDist(tokens)

#sorting top 50 tokens
sorted(freq_distr, key=freq_distr.__getitem__, reverse=True)[0:50]

['br',
 'I',
 'The',
 'movie',
 'film',
 'one',
 'like',
 'It',
 'This',
 'good',
 'time',
 'would',
 'story',
 'really',
 'see',
 'even',
 'much',
 'well',
 'get',
 'bad',
 'people',
 'great',
 'made',
 'first',
 'make',
 'also',
 'could',
 'way',
 'movies',
 'But',
 'characters',
 'think',
 'character',
 'films',
 'And',
 'seen',
 'A',
 'watch',
 'plot',
 'many',
 'two',
 'acting',
 'know',
 'life',
 'never',
 'There',
 'show',
 'love',
 'In',
 'little']

Naive Bayes Classifier

In [29]:
X_train = df.loc[:24999, 'review'].values
y_train = df.loc[:24999, 'sentiment'].values
X_test = df.loc[25000:, 'review'].values
y_test = df.loc[25000:, 'sentiment'].values

In [31]:
#converting text into feature vectors with TF-IDF
vectorizer = TfidfVectorizer()
train_vectors = vectorizer.fit_transform(X_train)
test_vectors = vectorizer.transform(X_test)
print(train_vectors.shape, test_vectors.shape)

(25000, 73822) (25000, 73822)


In [33]:
clf = MultinomialNB().fit(train_vectors, y_train)

In [35]:
predicted = clf.predict(test_vectors)
print(accuracy_score(y_test,predicted))

0.83664


XGBOOST

In [38]:
model = xgb.XGBClassifier(random_state=8888,max_depth=7, n_estimators=300, objective='binary:logistic')

In [39]:
model.fit(train_vectors, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bynode=1, colsample_bytree=1, gamma=0, learning_rate=0.1,
       max_delta_step=0, max_depth=7, min_child_weight=1, missing=None,
       n_estimators=300, n_jobs=1, nthread=None,
       objective='binary:logistic', random_state=8888, reg_alpha=0,
       reg_lambda=1, scale_pos_weight=1, seed=None, silent=None,
       subsample=1, verbosity=1)

In [40]:
prediction = model.predict(test_vectors)
print(accuracy_score(y_test,prediction))

0.8602
