In [5]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\paras\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\wordnet.zip.


True

In [6]:
import pandas as pd
import numpy as np
import re
import os
from IPython.display import HTML

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text 
from sklearn.decomposition import PCA

from tensorflow.python.keras.models import Sequential, load_model
from tensorflow.python.keras.layers import Dense, Dropout
from tensorflow.python.keras import optimizers

import nltk
from nltk.stem.porter import PorterStemmer
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import words
from nltk.corpus import wordnet 
allEnglishWords = words.words() + [w for w in wordnet.words()]
allEnglishWords = np.unique([x.lower() for x in allEnglishWords])

import plotly.offline as py
import plotly.graph_objs as go
py.init_notebook_mode(connected=True)

import warnings
warnings.filterwarnings('ignore')

## Data Import

In [2]:
path = "aclImdb/"
positiveFiles = [x for x in os.listdir(path+"train/pos/") if x.endswith(".txt")]
negativeFiles = [x for x in os.listdir(path+"train/neg/") if x.endswith(".txt")]
testFiles1 = [x for x in os.listdir(path+"test/pos/") if x.endswith(".txt")]
testFiles2 = [x for x in os.listdir(path+"test/neg/") if x.endswith(".txt")]

In [3]:
len(testFiles2)

12500

In [4]:
positiveReviews, negativeReviews, pos_testReviews, neg_testReviews = [], [], [], []
for pfile in positiveFiles:
    with open(path+"train/pos/"+pfile, encoding="latin1") as f:
        positiveReviews.append(f.read())
for nfile in negativeFiles:
    with open(path+"train/neg/"+nfile, encoding="latin1") as f:
        negativeReviews.append(f.read())
for tfile in testFiles1:
    with open(path+"test/pos/"+tfile, encoding="latin1") as f:
        pos_testReviews.append(f.read())
for tfile in testFiles2:
    with open(path+"test/neg/"+tfile,encoding="latin1") as f:
        neg_testReviews.append(f.read())
        

In [5]:
reviews_train = pd.concat([
    pd.DataFrame({"review":positiveReviews, "label":1, "file":positiveFiles}),
    pd.DataFrame({"review":negativeReviews, "label":0, "file":negativeFiles})
], ignore_index=True).sample(frac=1, random_state=1)


#test set 
reviews_test= pd.concat([
     pd.DataFrame({"review":pos_testReviews, "label":1, "file":testFiles1}),
    pd.DataFrame({"review":neg_testReviews, "label":0, "file":testFiles2})
], ignore_index=True).sample(frac=1, random_state=1)


print(reviews_train.head())
print(reviews_test.head())

NameError: name 'reviews' is not defined

With everything centralized in 1 dataframe, we now perform train, validation and test set splits.

In [None]:
#reviews = reviews[["review", "label", "file"]].sample(frac=1, random_state=1)
#train = reviews[reviews.label!=-1].sample(frac=0.8, random_state=1)
#valid = reviews[reviews.label!=-1].drop(train.index)
#test = reviews[reviews.label==-1]

In [None]:
#print(train.shape)
#print(valid.shape)
#print(test.shape)

In [None]:
HTML(reviews_train.review.iloc[0])

---

## Data Preprocessing

It can perform the following operations.
* Discard non alpha-numeric characters
* Set everything to lower case
* Stems all words using PorterStemmer, and change the stems back to the most occurring existent word.
* Discard non-Egnlish words (not by default).

In [8]:
class Preprocessor(object):
    ''' Preprocess data for NLP tasks. '''

    def __init__(self, alpha=True, lower=True, stemmer=True, english=False):
        self.alpha = alpha
        self.lower = lower
        self.stemmer = stemmer
        self.english = english
        
        self.uniqueWords = None
        self.uniqueStems = None
        
    def fit(self, texts):
        texts = self._doAlways(texts)

        allwords = pd.DataFrame({"word": np.concatenate(texts.apply(lambda x: x.split()).values)})
        self.uniqueWords = allwords.groupby(["word"]).size().rename("count").reset_index()
        self.uniqueWords = self.uniqueWords[self.uniqueWords["count"]>1]
        if self.stemmer:
            self.uniqueWords["stem"] = self.uniqueWords.word.apply(lambda x: PorterStemmer().stem(x)).values
            self.uniqueWords.sort_values(["stem", "count"], inplace=True, ascending=False)
            self.uniqueStems = self.uniqueWords.groupby("stem").first()
        
        #if self.english: self.words["english"] = np.in1d(self.words["mode"], allEnglishWords)
        print("Fitted.")
            
    def transform(self, texts):
        texts = self._doAlways(texts)
        if self.stemmer:
            allwords = np.concatenate(texts.apply(lambda x: x.split()).values)
            uniqueWords = pd.DataFrame(index=np.unique(allwords))
            uniqueWords["stem"] = pd.Series(uniqueWords.index).apply(lambda x: PorterStemmer().stem(x)).values
            uniqueWords["mode"] = uniqueWords.stem.apply(lambda x: self.uniqueStems.loc[x, "word"] if x in self.uniqueStems.index else "")
            texts = texts.apply(lambda x: " ".join([uniqueWords.loc[y, "mode"] for y in x.split()]))
        #if self.english: texts = self.words.apply(lambda x: " ".join([y for y in x.split() if self.words.loc[y,"english"]]))
        print("Transformed.")
        return(texts)

    def fit_transform(self, texts):
        texts = self._doAlways(texts)
        self.fit(texts)
        texts = self.transform(texts)
        return(texts)
    
    def _doAlways(self, texts):
        # Remove parts between <>'s
        texts = texts.apply(lambda x: re.sub('<.*?>', ' ', x))
        # Keep letters and digits only.
        if self.alpha: texts = texts.apply(lambda x: re.sub('[^a-zA-Z0-9 ]+', ' ', x))
        # Set everything to lower case
        if self.lower: texts = texts.apply(lambda x: x.lower())
        return texts  

In [9]:
reviews_train.head()

Unnamed: 0,review,label,file
21492,"I have copy of this on VHS, I think they (The ...",0,6844_1.txt
9488,After several extremely well ratings to the po...,1,7290_10.txt
16933,I still don't know why I forced myself to sit ...,0,2740_1.txt
12604,Mt little sister and I are self-proclaimed hor...,0,10094_1.txt
8222,I have personally seen many Disney movies in m...,1,6150_7.txt


In [10]:
preprocess = Preprocessor(alpha=True, lower=True, stemmer=True)

In [11]:
%%time
trainX = preprocess.fit_transform(reviews_train.review)
testX =preprocess.fit_transform(reviews_test.review)

Fitted.
Transformed.
Fitted.
Transformed.
Wall time: 2min 26s


In [12]:
trainX.head()

21492    i have copy of this on vhs i think they the te...
9488     after several extremely well rating to the poi...
16933    i still don t know why i forced myself to sit ...
12604    mt little sister and i are self proclaimed hor...
8222     i have person seen many disney movie in my lif...
Name: review, dtype: object

In [13]:
print(preprocess.uniqueWords.shape)
preprocess.uniqueWords[preprocess.uniqueWords.word.str.contains("disappoint")]

(46433, 3)


Unnamed: 0,word,count,stem
18412,disappointingly,24,disappointingli
18410,disappointed,900,disappoint
18411,disappointing,414,disappoint
18413,disappointment,372,disappoint
18409,disappoint,94,disappoint
18414,disappointments,31,disappoint
18415,disappoints,20,disappoint


In [14]:
print(preprocess.uniqueStems.shape)
preprocess.uniqueStems[preprocess.uniqueStems.word.str.contains("disappoint")]

(30714, 2)


Unnamed: 0_level_0,word,count
stem,Unnamed: 1_level_1,Unnamed: 2_level_1
disappoint,disappointed,900
disappointingli,disappointingly,24


---

## Feature Engineering
Next, we take the preprocessed texts as input and calculate their TF-IDF's ([info](http://www.tfidf.com)). We retain 10000 features per text.

In [15]:
stop_words = text.ENGLISH_STOP_WORDS.union(["thats","weve","dont","lets","youre","im","thi","ha",
    "wa","st","ask","want","like","thank","know","susan","ryan","say","got","ought","ive","theyre"])
tfidf = TfidfVectorizer(min_df=2, max_features=10000, stop_words=stop_words) #, ngram_range=(1,3)

In [16]:
%%time
trainX = tfidf.fit_transform(trainX).toarray()
testX = tfidf.fit_transform(testX).toarray()

Wall time: 10.8 s


In [17]:
print(trainX.shape)

(25000, 10000)


In [18]:
trainY = reviews_train.label
testY =reviews_test.label

In [19]:
print(trainX.shape, trainY.shape)

print(testX.shape, testY.shape)

(25000, 10000) (25000,)
(25000, 10000) (25000,)


---

## Feature Selection
Next, we take the 10k dimensional tfidf's as input, and keep the 2000 dimensions that correlate the most with our sentiment target. The corresponding words - see below - make sense.

In [20]:
from scipy.stats.stats import pearsonr

In [21]:
getCorrelation = np.vectorize(lambda x: pearsonr(trainX[:,x], trainY)[0])
correlations = getCorrelation(np.arange(trainX.shape[1]))
print(correlations)

[-0.01274173 -0.0180222   0.00906162 ...  0.01701741  0.02091131
  0.00975959]


In [22]:
allIndeces = np.argsort(-correlations)
bestIndeces = allIndeces[np.concatenate([np.arange(1000), np.arange(-1000, 0)])]

In [23]:
vocabulary = np.array(tfidf.get_feature_names())
print(vocabulary[bestIndeces][:10])
print(vocabulary[bestIndeces][-10:])

['grateful' 'loudly' 'evidenced' 'bend' 'bbc' 'perpetrator' 'fascism'
 'endeavors' 'amber' 'perplexed']
['pope' 'stubborn' 'hoskins' 'words' 'teller' 'bond' 'avery' 'waning'
 'work' 'babysitter']


In [47]:
trainX = trainX[:,bestIndeces]
testX  = testX [:,bestIndeces]

In [49]:
print(trainX.shape, trainY.shape)
print(testX.shape,testY.shape)

(25000, 2000) (25000,)
(25000, 2000) (25000,)


---

## Model Architecture
We choose a very simple dense network with 6 layers, performing binary classification.

In [26]:
DROPOUT = 0.5
ACTIVATION = "tanh"

model = Sequential([    
    Dense(int(trainX.shape[1]/2), activation=ACTIVATION, input_dim=trainX.shape[1]),
    Dropout(DROPOUT),
    Dense(int(trainX.shape[1]/2), activation=ACTIVATION, input_dim=trainX.shape[1]),
    Dropout(DROPOUT),
    Dense(int(trainX.shape[1]/4), activation=ACTIVATION),
    Dropout(DROPOUT),
    Dense(100, activation=ACTIVATION),
    Dropout(DROPOUT),
    Dense(20, activation=ACTIVATION),
    Dropout(DROPOUT),
    Dense(5, activation=ACTIVATION),
    Dropout(DROPOUT),
    Dense(1, activation='sigmoid'),
])

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [27]:
model.compile(optimizer=optimizers.Adam(0.00005), loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1000)              2001000   
_________________________________________________________________
dropout (Dropout)            (None, 1000)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1000)              1001000   
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000)              0         
_________________________________________________________________
dense_2 (Dense)              (None, 500)               500500    
_________________________________________________________________
dropout_2 (Dropout)          (None, 500)               0         
_________________________________________________________________
dense_3 (Dense)              (None, 100)               50100     
__________

---

## Model Training
Let's go.

In [28]:
EPOCHS = 100
BATCHSIZE = 1500

In [29]:
model.fit(trainX, trainY, epochs=EPOCHS, batch_size=BATCHSIZE, validation_split=0.2,verbose=1)

Train on 20000 samples, validate on 5000 samples
Instructions for updating:
Use tf.cast instead.
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Ep

<tensorflow.python.keras.callbacks.History at 0x25cdb5fb1d0>

In [30]:
x = np.arange(EPOCHS)
history = model.history.history

data = [
    go.Scatter(x=x, y=history["acc"], name="Train Accuracy", marker=dict(size=5), yaxis='y2'),
    go.Scatter(x=x, y=history["val_acc"], name="Valid Accuracy", marker=dict(size=5), yaxis='y2'),
    go.Scatter(x=x, y=history["loss"], name="Train Loss", marker=dict(size=5)),
    go.Scatter(x=x, y=history["val_loss"], name="Valid Loss", marker=dict(size=5))
]
layout = go.Layout(
    title="Model Training Evolution", font=dict(family='Palatino'), xaxis=dict(title='Epoch', dtick=1),
    yaxis1=dict(title="Loss", domain=[0, 0.45]), yaxis2=dict(title="Accuracy", domain=[0.55, 1]),
)
py.iplot(go.Figure(data=data, layout=layout), show_link=False)

---

## Model Evaluation

### Accuracy & Loss
Let's first centralize the probabilities and predictions with the original train and validation dataframes. Then we can print out the respective accuracies and losses.

In [33]:
reviews_train["probability"] = model.predict(trainX)
reviews_train["prediction"] = reviews_train.probability-0.5>0
reviews_train["truth"] = reviews_train.label==1
reviews_train.tail()

Unnamed: 0,review,label,file,probability,prediction,truth
10955,Diane Keaton gave an outstanding performance i...,1,8610_10.txt,0.895504,True,True
17289,"This has to be creepiest, most twisted holiday...",0,3060_1.txt,0.094351,False,False
5192,"Do not expect a depiction of the ""truth"". Howe...",1,3423_7.txt,0.903977,True,True
12172,The League of Gentlemen is one of the funniest...,1,9706_10.txt,0.903593,True,True
235,"Narratives Â whether written, visual or poeti...",1,10211_7.txt,0.903887,True,True


In [35]:
print(model.evaluate(trainX, trainY))
print((reviews_train.truth==reviews_train.prediction).mean())

[0.24397008286714553, 0.93012]
0.93012


In [38]:
len(testX)

25000

In [50]:
reviews_test["probability"] = model.predict(testX)4
reviews_test["prediction"] = reviews_test.probability-0.5>0
reviews_test["truth"] = reviews_test.label==1
reviews_test.tail()

Unnamed: 0,review,label,file,probability,prediction,truth
10955,This movie is a fascinating drama about the Ma...,1,8610_8.txt,0.094942,False,True
17289,"It's too kind to call this a ""fictionalized"" a...",0,3060_3.txt,0.096035,False,False
5192,I was unsure of this movie before renting and ...,1,3423_9.txt,0.904331,True,True
12172,"Just got out of an advance screening, and wow ...",1,9706_7.txt,0.164973,False,True
235,I doubt if the real story of the development o...,1,10211_8.txt,0.094346,False,True


In [52]:
print(model.evaluate(testX, testY))
print((reviews_test.truth==reviews_test.prediction).mean())

[1.1097714756011963, 0.4962]
0.4962


### Error Analysis
Error analysis gives us great insight in the way the model is making its errors. Often, it shows data quality issues.

In [53]:
trainCross = reviews_train.groupby(["prediction", "truth"]).size().unstack()
trainCross

truth,False,True
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
False,11567,814
True,933,11686


In [54]:
validCross = reviews_test.groupby(["prediction", "truth"]).size().unstack()
validCross

truth,False,True
prediction,Unnamed: 1_level_1,Unnamed: 2_level_1
False,7551,7646
True,4949,4854


In [55]:
truepositives = reviews_test[(reviews_test.truth==True)&(reviews_test.truth==reviews_test.prediction)]
print(len(truepositives), "true positives.")
truepositives.sort_values("probability", ascending=False).head(3)

4854 true positives.


Unnamed: 0,review,label,file,probability,prediction,truth
7866,I admit that I almost gave up on watching TV s...,1,5830_10.txt,0.90443,True,True
8527,A very insightful psychological thriller! Foot...,1,6425_9.txt,0.904416,True,True
5567,"""Pearl Harbor, buddy."" This movie is brilliant...",1,3761_8.txt,0.904412,True,True


In [56]:
truenegatives = reviews_test[(reviews_test.truth==False)&(reviews_test.truth==reviews_test.prediction)]
print(len(truenegatives), "true negatives.")
truenegatives.sort_values("probability", ascending=True).head(3)

7551 true negatives.


Unnamed: 0,review,label,file,probability,prediction,truth
22221,Talented screenwriter Alvin Sargent sadly cann...,0,74_4.txt,0.094289,False,False
23544,Does anyone care about any of the characters i...,0,8691_1.txt,0.094292,False,False
24709,The movie is not as funny as the director's pr...,0,973_4.txt,0.094298,False,False


In [57]:
falsepositives = reviews_test[(reviews_test.truth==True)&(reviews_test.truth!=reviews_test.prediction)]
print(len(falsepositives), "false positives.")
falsepositives.sort_values("probability", ascending=True).head(3)

7646 false positives.


Unnamed: 0,review,label,file,probability,prediction,truth
10857,"You don't review James Bond movies, you evalua...",1,8522_8.txt,0.09429,False,True
10858,"In 1983 two Bond movies was made, one was the ...",1,8523_10.txt,0.09429,False,True
3105,"Not a film of entertainment, but of real lives...",1,1545_7.txt,0.094291,False,True


In [58]:
falsenegatives = reviews_test[(reviews_test.truth==False)&(reviews_test.truth!=reviews_test.prediction)]
print(len(falsenegatives), "false negatives.")
falsenegatives.sort_values("probability", ascending=False).head(3)

4949 false negatives.


Unnamed: 0,review,label,file,probability,prediction,truth
14948,I sat (uncomfortably) through this film becomi...,0,12203_2.txt,0.904428,True,False
19084,File this one in the `How do movies like this ...,0,4677_4.txt,0.904428,True,False
12855,Here is a rundown of a typical Rachael Ray Sho...,0,1031_1.txt,0.904425,True,False


This is the review that got predicted as positive most certainly - while being labeled as negative. However, we can easily recognize it as a poorly labeled sample.

In [60]:
HTML(reviews_test.loc[22148].review)

---

## Model Application

### Custom Reviews
To use this model, we would store the model, along with the preprocessing vectorizers, and run the unseen texts through following pipeline.

In [61]:
unseen = pd.Series("this movie very good")

In [62]:
unseen = preprocess.transform(unseen)       # Text preprocessing
unseen = tfidf.transform(unseen).toarray()  # Feature engineering
unseen = unseen[:,bestIndeces]              # Feature selection
probability = model.predict(unseen)[0,0]  # Network feedforward

Transformed.


In [63]:
print(probability)
print("Positive!") if probability > 0.5 else print("Negative!")

0.19737606
Negative!
