In [30]:
import numpy as np 
import pandas as pd 
import nltk
from nltk.corpus import stopwords
import gensim
import string

In [31]:
# Load the review/sentiment data; the path is relative to the notebook's working directory.
df = pd.read_csv('imdb_sentiment.csv')

### Load Google's pretrained word2vec model (GoogleNews, 300-dimensional vectors)
- Download: https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
- Size: 1.5GB

In [4]:
# Location of the gzipped, binary-format pretrained vectors (relative path).
google_model = '..//word2vec//GoogleNews-vectors-negative300.bin.gz'

# Read the vectors into a gensim KeyedVectors object; binary=True matches the .bin file format.
embeddings = gensim.models.KeyedVectors.load_word2vec_format(
    google_model,
    binary=True,
)

In [32]:
# Sanity check: look up a single word and inspect the dimensionality of its vector.
embeddings['google'].shape

(300,)

In [33]:
# Preview the first five rows of the loaded data.
df.head(5)

Unnamed: 0,review,sentiment
0,"A very, very, very slow-moving, aimless movie ...",0
1,Not sure who was more lost - the flat characte...,0
2,Attempting artiness with black & white and cle...,0
3,Very little music or anything to speak of.,0
4,The best scene in the movie was when Gerardo i...,1


In [24]:
# (rows, columns) of the loaded frame.
df.shape

(745, 2)

In [25]:
# Column names available in the loaded frame.
df.columns

Index(['review', 'sentiment'], dtype='object')

In [35]:
# Remove exact duplicate rows; the original index labels of the surviving rows are kept.
df = df.drop_duplicates()

In [36]:
# Re-check the shape after de-duplication to see whether any rows were removed.
df.shape

(745, 2)

In [28]:
# Count missing values per column.
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [37]:
from string import punctuation
from nltk.corpus import stopwords
from nltk import word_tokenize
import re

In [41]:
import re
def _stop_terms():
    """Return the set of terms that ``clean_txt`` discards, building it only once.

    The set combines NLTK's English stop words, every ASCII punctuation
    character, the literal ``"..."`` and a few extra words. It is cached on the
    function object so the stop-word corpus is not re-read for every review.
    """
    cached = getattr(_stop_terms, "_cache", None)
    if cached is None:
        # The regex in clean_txt leaves only a-z and spaces, so the punctuation
        # and "..." entries can no longer match a token; they are kept so the
        # filter is exactly the one the original list expressed.
        cached = frozenset(
            stopwords.words("english")
            + list(punctuation)
            + ["..."]
            + ["would", "could", "told", "subject"]
        )
        _stop_terms._cache = cached
    return cached


def clean_txt(sent):
    """Normalize one review into a space-separated string of content words.

    Steps: lower-case the text, replace every character other than ``a-z`` and
    space with a space, tokenize with NLTK's ``word_tokenize``, then drop stop
    terms and tokens of two characters or fewer.

    Parameters
    ----------
    sent : str
        Raw review text.

    Returns
    -------
    str
        The surviving tokens joined by single spaces (empty string if none).
    """
    tokens = word_tokenize(re.sub('[^a-z ]', ' ', sent.lower()))
    stop_terms = _stop_terms()  # frozenset: O(1) membership instead of a list scan
    kept = [term for term in tokens if len(term) > 2 and term not in stop_terms]
    return " ".join(kept)

In [54]:
# Apply the text cleaner to every review and store the result alongside the raw text.
df['clean_review'] = df['review'].apply(clean_txt)

In [55]:
# Preview the frame with the new clean_review column next to the raw review text.
df.head()

Unnamed: 0,review,sentiment,clean_review
0,"A very, very, very slow-moving, aimless movie ...",0,slow moving aimless movie distressed drifting ...
1,Not sure who was more lost - the flat characte...,0,sure lost flat characters audience nearly half...
2,Attempting artiness with black & white and cle...,0,attempting artiness black white clever camera ...
3,Very little music or anything to speak of.,0,little music anything speak
4,The best scene in the movie was when Gerardo i...,1,best scene movie gerardo trying find song keep...


## Create document vectors using word embeddings

In [98]:
def _mean_word_vector(doc, keyed_vectors):
    """Average the embedding vectors of the words in one cleaned document.

    Parameters
    ----------
    doc : str
        Space-separated tokens (as produced for the ``clean_review`` column).
    keyed_vectors
        Word-vector lookup supporting ``keyed_vectors[word]`` (raising
        ``KeyError`` for unknown words) and exposing ``vector_size``.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``vector_size`` holding the element-wise mean of
        the vectors that were found, or all-NaN when no word of ``doc`` is in
        the vocabulary.
    """
    found = []
    for word in doc.split(' '):
        try:
            found.append(keyed_vectors[word])
        except KeyError:
            # Out-of-vocabulary word (or the empty token from an empty
            # document): skip it. Other errors are deliberately not swallowed.
            continue
    if not found:
        # Keep a placeholder row so row positions stay aligned with df.
        return np.full(keyed_vectors.vector_size, np.nan)
    return np.mean(found, axis=0, dtype=np.float64)


# Compute every row first and build the frame once: DataFrame.append was
# removed in pandas 2.0, and appending row by row copies the frame each time.
docs_vectors = pd.DataFrame(
    [_mean_word_vector(doc, embeddings) for doc in df.clean_review]
)

In [99]:
# Preview the first few document vectors (one row per review, one column per embedding dimension).
docs_vectors.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,290,291,292,293,294,295,296,297,298,299
0,0.158325,0.205505,-0.08754,0.064842,-0.106415,0.008987,-0.062489,0.000954,0.057213,0.050742,...,-0.106262,0.061035,-0.097076,-0.020645,-0.056183,0.068665,0.088745,0.022263,0.062592,0.040619
1,0.143494,-0.017984,-0.009964,0.106308,-0.052292,-0.075745,-0.015942,-0.037323,0.114197,0.029335,...,-0.080658,0.051147,-0.152344,-0.048691,-0.065536,-0.029953,0.032959,-0.05365,0.021364,-0.096176
2,0.103201,0.049657,-0.00123,0.012548,-0.097672,-0.008192,0.047252,0.028645,0.043961,0.026582,...,-0.127988,0.071552,-0.118704,0.033023,-0.079702,-0.009722,0.041518,-0.037813,0.016389,0.02189
3,0.070648,-0.059143,-0.003357,0.084737,-0.092316,0.107788,0.156372,-0.170471,0.006287,-0.007477,...,-0.052734,0.080322,-0.023529,0.03006,-0.011902,-0.059326,0.144897,-0.071167,0.074524,-0.052612
4,0.120809,0.007568,0.040127,0.048381,-0.056264,0.004886,0.024143,-0.053096,0.066806,-0.012214,...,-0.098731,0.089857,-0.140744,0.013428,-0.042019,-0.01529,0.079617,2.7e-05,0.037869,0.076067


In [84]:
# Fill NaN entries (e.g. rows for reviews where no word had an embedding) from the row above.
# DataFrame.ffill() is the direct equivalent of fillna(method='ffill'), whose
# `method` argument is deprecated in recent pandas releases.
# NOTE(review): forward-filling gives such a review the previous review's vector, and a
# NaN first row would stay NaN — confirm this is the intended imputation.
docs_vectors = docs_vectors.ffill()

In [92]:
# (number of documents, embedding dimensions) of the feature matrix.
docs_vectors.shape

(745, 300)

In [85]:
# Features: one averaged embedding vector per review.
X = docs_vectors
# Targets: sentiment labels as a plain array, aligned with X by row position.
y = df['sentiment'].values

In [86]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.30,
    random_state=100,
)

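We cannot use count-based naive Bayes (e.g. multinomial naive Bayes) with word embeddings:
embedding components can be either positive or negative, and such models cannot take negative feature values, so logistic regression is used instead.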

In [87]:
from sklearn.linear_model import LogisticRegression 
# Logistic regression with library-default hyperparameters, fitted on the training split only.
classifier = LogisticRegression()
# fit() is the last expression so the notebook displays the fitted estimator.
classifier.fit(X_train, y_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn',
          n_jobs=None, penalty='l2', random_state=None, solver='warn',
          tol=0.0001, verbose=0, warm_start=False)

In [88]:
# Predict labels for the held-out test split.
y_pred = classifier.predict(X_test)

In [89]:
from sklearn.metrics import accuracy_score, confusion_matrix

In [90]:
# Fraction of test reviews whose predicted label matches the true label.
accuracy_score(y_test,y_pred)

0.8482142857142857

In [91]:
# Confusion matrix of true vs. predicted labels on the test split.
confusion_matrix(y_test,y_pred)

array([[102,  17],
       [ 17,  88]], dtype=int64)