In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer, text_to_word_sequence
import keras
from keras import Sequential
from keras.layers import Dense
from keras import losses
import os
path_pos = "/Users/aaditkapoor/Desktop/aclImdb/train/pos"
path_neg = "/Users/aaditkapoor/Desktop/aclImdb/train/neg"

Using TensorFlow backend.


In [3]:
pos = os.listdir(path="/Users/aaditkapoor/Desktop/aclImdb/train/pos")
neg = os.listdir(path="/Users/aaditkapoor/Desktop/aclImdb/train/neg")

In [5]:
pos[0]

'4715_9.txt'

In [4]:
pos_reviews = [] # 1
for i in pos:
    file = open(path_pos+"/"+i, "r")
    pos_reviews.append(file.read())
    file.close()

In [5]:
neg_reviews = [] # 0
for j in neg:
    file = open(path_neg+"/"+j, "r")
    neg_reviews.append(file.read())
    file.close()

In [6]:
# Creating a dataframe
data = {}
index = 1
for i,j in zip(pos_reviews, neg_reviews):
    data[i] = 1 # Positive review
    data[j] = 0 # Negative review

In [17]:
data # All the reviews with data

{'For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.': 1,
 "Working with one of the best Shakespeare sources, this film manages to be creditable to it's source, whilst still appealing to a wider audience.<br /><br />Branagh steals the film from under Fishburne's nose, and there's a talented cast on good form.": 0,
 'Bizarre horror movie filled with famous faces but stolen by Cristina Raines (later of TV\'s "Flamingo Road") as a pretty but somewhat unstable model with a gummy smile who is slated to pay for her attempted suicides by guarding the Gateway to Hell! The scenes with Raines modeling are very well captured, the mood music is perfect, Deborah Raffin is charming as Cristina\'s pal, but when Raines moves into a creepy Brooklyn Heights brownstone (inha

In [7]:
df = pd.DataFrame(list(data.items()), columns=["review","sentiment"])
df.reset_index(drop=True, inplace=True)

In [8]:
df.head()

Unnamed: 0,review,sentiment
0,For a movie that gets no respect there sure ar...,1
1,Working with one of the best Shakespeare sourc...,0
2,Bizarre horror movie filled with famous faces ...,1
3,"Well...tremors I, the original started off in ...",0
4,"A solid, if unremarkable film. Matthau, as Ein...",1


In [9]:
# Gathering features and labels
features = df.review.values
labels = df.sentiment.values

In [10]:
# Tokenizing text using tfidf
t = Tokenizer()
t.fit_on_texts(features)
features = t.texts_to_matrix(features, mode="tfidf")

In [11]:
# Now we have the data, let us split
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, random_state=42, shuffle=True)

In [73]:
len(features_train)

18678

In [74]:
len(features_test)

6226

In [77]:
# First we will try traditional ml algorithms
# let us start with naive bayes
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(features_train, labels_train)
clf.score(features_test,labels_test) # We are getting a 83%

0.83472534532605203

In [None]:
# Trying SVM
from sklearn.svm import SVC
clf2 = SVC()
clf2.fit(features_train, labels_train)
clf2.score(features_test, labels_test)
# Took too much time to train.

In [12]:
# Trying a neural network
model = Sequential()
model.add(Dense(100, input_dim = features_train.shape[1], activation="relu"))
model.add(Dense(75, activation="relu"))
model.add(Dense(1, activation="sigmoid"))
model.compile(optimizer="adam", loss=losses.binary_crossentropy, metrics=["accuracy"])

In [14]:
features_train.shape

(18678, 88583)

In [13]:
model.fit(features_train, labels_train, epochs=5)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x122ff2dd8>

In [14]:
model.evaluate(features_test, labels_test)



[0.78310250927665126, 0.88387407645358174]

In [15]:
from keras.utils import plot_model

In [16]:
plot_model(model, to_file="sentiment-analysis-imdb.png",show_layer_names=True, show_shapes=True)

In [None]:
model.get_weights()

In [24]:
model.save("sentiment-anaylsis-imdb.h5")

In [18]:
pred = model.predict(features_test)

In [19]:
pred = np.round(pred).astype(int)

In [37]:
pred[1]

array([ 1.], dtype=float32)

In [20]:
from sklearn.metrics import accuracy_score, roc_curve
accuracy_score(labels_test, pred)

0.88387407645358174

In [21]:
labels_train

array([1, 1, 1, ..., 1, 1, 1])

In [44]:
features_train[0]

array([ 0.        ,  2.14732195,  1.49047154, ...,  0.        ,
        0.        ,  0.        ])

In [45]:
labels_train[0]

1

In [50]:
labels[0]

1

In [51]:
pred[0]

array([ 0.], dtype=float32)

In [54]:
labels_test.keys()

Int64Index([12552,  3045, 23925,  4400,  4580, 12310,  3974, 19367,  3794,
             5995,
            ...
             9248,  2002, 21827,  9626, 21816, 13177,  4779,  7306, 13991,
            18801],
           dtype='int64', length=6226)

In [23]:
labels[0]

1

In [22]:
features[0]

array([ 0.        ,  1.18064658,  0.        , ...,  0.        ,
        0.        ,  0.        ])

In [25]:
pred[0]

array([0])

In [27]:
a = "aadit is great!"

In [28]:
words = text_to_word_sequence(a)

In [29]:
words

['aadit', 'is', 'great']

In [30]:
i = Tokenizer()

In [31]:
i.fit_on_texts(words)

In [32]:
a = i.texts_to_matrix(words, mode="tfidf")

In [34]:
a.shape

(3, 4)

In [40]:
features[2]

array([ 0.        ,  2.22945311,  1.49047154, ...,  0.        ,
        0.        ,  0.        ])