In [1]:
import pandas as pd    # to load dataset
import numpy as np     # for mathematic equation
import nltk
from nltk.corpus import stopwords # to get collection of stopwords
from sklearn.model_selection import train_test_split       # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer  # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences   # to do padding or truncating
from tensorflow.keras.models import Sequential     # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint   # save model
from tensorflow.keras.models import load_model   # load saved model
import re

In [2]:
# importing the dataset
data = pd.read_csv(r"C:\Users\ajayg\Desktop\CS\Projects\Sentimental analysis using LSTM\IMDBDataset.csv")
print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [3]:
english_stops = set(stopwords.words('english'))

In [4]:
def load_dataset():
    df = pd.read_csv(r"C:\Users\ajayg\Desktop\CS\Projects\Sentimental analysis using LSTM\IMDBDataset.csv")
    x_data = df['review']       # Reviews/Input
    y_data = df['sentiment']    # Sentiment/Output

    # PRE-PROCESS REVIEW
    x_data = x_data.replace({'<.*?>': ''}, regex = True)          # remove html tag
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True)     # remove non alphabet
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])  # remove stop words
    x_data = x_data.apply(lambda review: [w.lower() for w in review])   # lower case
    
    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data

x_data, y_data = load_dataset()

print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


In [5]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)

print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
1943     [i, saw, the, merchant, venice, london, last, ...
29990    [this, terrible, movie, barely, recognizable, ...
14429    [the, worst, movie, i, seen, yeah, fun, fantas...
2384     [truly, wonderful, movie, bruce, willis, gives...
25759    [everybody, wants, editor, watch, movie, it, s...
                               ...                        
23761    [know, read, countless, films, doubt, reading,...
22640    [i, really, care, had, gotten, rid, comedy, sl...
6771     [just, another, example, stepehn, king, books,...
34318    [i, picked, film, based, plot, summary, critic...
5034     [there, actually, good, reasons, person, take,...
Name: review, Length: 40000, dtype: object 

10170    [i, know, people, think, movie, without, even,...
39796    [please, help, economy, spend, money, elsewher...
15324    [i, rather, disappointed, the, first, tetsuo, ...
6195     [i, really, enjoyed, movie, in, my, dvd, colle...
12120    [when, showed, seattle, int, l, film, fest, i,...
 

In [6]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review))

    return int(np.ceil(np.mean(review_length)))

In [7]:
# ENCODE REVIEW
token = Tokenizer(lower=False)    # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)

max_length = get_max_length()

x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1   # add 1 because of 0 padding

print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   1  120    2 ...    0    0    0]
 [   8  290    3 ...    0    0    0]
 [   2  157    3 ...    0    0    0]
 ...
 [ 453   73  367 ...    0    0    0]
 [   1 1490    4 ...    0    0    0]
 [  50   75    9 ...  589    6  393]] 

Encoded X Test
 [[    1    47    20 ...     0     0     0]
 [  495   247  7686 ...     0     0     0]
 [    1   159   562 ...     0     0     0]
 ...
 [   50    48   714 ...   404   340  2670]
 [    5  3521   931 ...   979  7417   838]
 [    1  7181 12575 ...  3650   103   197]] 

Maximum review length:  130


In [8]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])

print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 130, 32)           2962816   
                                                                 
 lstm (LSTM)                 (None, 64)                24832     
                                                                 
 dense (Dense)               (None, 1)                 65        
                                                                 
Total params: 2,987,713
Trainable params: 2,987,713
Non-trainable params: 0
_________________________________________________________________
None


In [9]:
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)

In [12]:
model.fit(x_train, y_train, batch_size = 128, epochs = 6, callbacks=[checkpoint])

Epoch 1/6
Epoch 1: accuracy improved from 0.95117 to 0.95565, saving model to models\LSTM.h5
Epoch 2/6
Epoch 2: accuracy improved from 0.95565 to 0.96562, saving model to models\LSTM.h5
Epoch 3/6
Epoch 3: accuracy improved from 0.96562 to 0.97102, saving model to models\LSTM.h5
Epoch 4/6
Epoch 4: accuracy improved from 0.97102 to 0.97107, saving model to models\LSTM.h5
Epoch 5/6
Epoch 5: accuracy did not improve from 0.97107
Epoch 6/6
Epoch 6: accuracy improved from 0.97107 to 0.97513, saving model to models\LSTM.h5


<keras.callbacks.History at 0x2b90a6ac3d0>

In [13]:
#evaluate our model
result = model.evaluate(x_train, y_train)



In [14]:
loaded_model = load_model('models/LSTM.h5')

In [15]:
review = str(input('Movie Review: '))

Movie Review: I sure would like to see a resurrection of a up dated Seahunt series with the tech they have today it would bring back the kid excitement in me.I grew up on black and white TV and Seahunt with Gunsmoke were my hero's every week.You have my vote for a comeback of a new sea hunt.We need a change of pace in TV and this would work for a world of under water adventure.Oh by the way thank you for an outlet like this to view many viewpoints about TV and the many movies.So any ole way I believe I've got what I wanna say.Would be nice to read some more plus points about sea hunt.If my rhymes would be 10 lines would you let me submit,or leave me out to be in doubt and have me to quit,If this is so then I must go so lets do it.


In [16]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)

words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]

print('Filtered: ', filtered)

Cleaned:  I sure would like to see a resurrection of a up dated Seahunt series with the tech they have today it would bring back the kid excitement in meI grew up on black and white TV and Seahunt with Gunsmoke were my heros every weekYou have my vote for a comeback of a new sea huntWe need a change of pace in TV and this would work for a world of under water adventureOh by the way thank you for an outlet like this to view many viewpoints about TV and the many moviesSo any ole way I believe Ive got what I wanna sayWould be nice to read some more plus points about sea huntIf my rhymes would be  lines would you let me submitor leave me out to be in doubt and have me to quitIf this is so then I must go so lets do it
Filtered:  ['i sure would like see resurrection dated seahunt series tech today would bring back kid excitement mei grew black white tv seahunt gunsmoke heros every weekyou vote comeback new sea huntwe need change pace tv would work world water adventureoh way thank outlet lik

In [17]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length, padding='post', truncating='post')
print(tokenize_words)

[[    1   154    12     6    15  8554  1997 56916   108  5010   405    12
    636    63   421  2358 23494  2080   219   356   142 56916 18687 14266
     84  1967  6878    82  1565   265   573   950   142    12    74    85
    867    26  1266 14255     6   548    36 12729   142    36 11382    26
      1   166 16552    99     1  2919   236   241   827   717  1565 12287
     12   316    12   181   469   704     1   112    62  1485     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0]]


In [18]:
result = loaded_model.predict(tokenize_words)
print(result)

[[0.97409457]]


In [19]:
if result >= 0.7:
    print('positive')
else:
    print('negative')

positive
