<a href="https://colab.research.google.com/github/asimn7/DeepLearning/blob/main/Sentiment_analysis_using_LSTM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd # to load dataset
import numpy as np # for mathematic equation
from nltk.corpus import stopwords # to get collection of stopwords
from sklearn.model_selection import train_test_split # for splitting dataset
from tensorflow.keras.preprocessing.text import Tokenizer # to encode text to int
from tensorflow.keras.preprocessing.sequence import pad_sequences # to do padding or truncating
from tensorflow.keras.models import Sequential # the model
from tensorflow.keras.layers import Embedding, LSTM, Dense # layers of the architecture
from tensorflow.keras.callbacks import ModelCheckpoint # save model
from tensorflow.keras.models import load_model # load saved model
import re

In [4]:
data = pd.read_csv('IMDB Dataset.csv')
print(data)

                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]


In [5]:
import nltk
nltk.download('stopwords')
english_stops = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [6]:
#LOAD AND CLEAN DATASET
def load_dataset():
 df = pd.read_csv('IMDB Dataset.csv')
 x_data = df['review'] # Reviews/Input
 y_data = df['sentiment'] # Sentiment/Output
 # PRE-PROCESS REVIEW
 x_data = x_data.replace({'<.*?>': ''}, regex = True) # remove html tag
 x_data = x_data.replace({'[^A-Za-z]': ' '}, regex = True) # remove non alphabet
 x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops]) # remove stop words
 x_data = x_data.apply(lambda review: [w.lower() for w in review]) #lower case

 # ENCODE SENTIMENT -> 0 & 1
 y_data = y_data.replace('positive', 1)
 y_data = y_data.replace('negative', 0)
 return x_data, y_data
x_data, y_data = load_dataset()
print('Reviews')
print(x_data, '\n')
print('Sentiment')
print(y_data)

Reviews
0        [one, reviewers, mentioned, watching, oz, epis...
1        [a, wonderful, little, production, the, filmin...
2        [i, thought, wonderful, way, spend, time, hot,...
3        [basically, family, little, boy, jake, thinks,...
4        [petter, mattei, love, time, money, visually, ...
                               ...                        
49995    [i, thought, movie, right, good, job, it, crea...
49996    [bad, plot, bad, dialogue, bad, acting, idioti...
49997    [i, catholic, taught, parochial, elementary, s...
49998    [i, going, disagree, previous, comment, side, ...
49999    [no, one, expects, star, trek, movies, high, a...
Name: review, Length: 50000, dtype: object 

Sentiment
0        1
1        1
2        1
3        0
4        1
        ..
49995    1
49996    0
49997    0
49998    0
49999    0
Name: sentiment, Length: 50000, dtype: int64


  y_data = y_data.replace('negative', 0)


In [7]:
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size = 0.2)
print('Train Set')
print(x_train, '\n')
print(x_test, '\n')
print('Test Set')
print(y_train, '\n')
print(y_test)

Train Set
27919    [a, great, film, requiring, acquired, taste, i...
41843    [the, lifetime, channel, aired, october, i, go...
10754    [pyramid, clues, deep, blue, sea, tremors, sli...
7884     [i, went, crooked, earth, see, piece, new, zea...
27990    [this, excellent, modern, day, film, noir, exc...
                               ...                        
37378    [there, many, things, fall, aro, tolbukhin, en...
2648     [this, movie, crap, even, though, directors, c...
20329    [hmmm, worst, film, ever, well, sort, cast, sh...
9676     [at, start, one, england, course, i, chances, ...
46387    [silly, simplistic, short, gun, crazy, volume,...
Name: review, Length: 40000, dtype: object 

41479    [i, quite, know, explain, darkend, room, summa...
25636    [this, could, much, better, turned, tom, pittm...
27142    [the, love, letter, one, time, favorite, books...
11857    [this, movie, filmed, hometown, i, acquainted,...
21638    [this, one, boring, horror, movies, i, ever, s...
 

In [8]:
def get_max_length():
    review_length = []
    for review in x_train:
        review_length.append(len(review)) # Indented this line by 4 spaces
    return int(np.ceil(np.mean(review_length)))

In [9]:
# ENCODE REVIEW
token = Tokenizer(lower=False) # no need lower, because already lowered the data in load_data()
token.fit_on_texts(x_train)
x_train = token.texts_to_sequences(x_train)
x_test = token.texts_to_sequences(x_test)
max_length = get_max_length()
x_train = pad_sequences(x_train, maxlen=max_length, padding='post',
truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post',
truncating='post')
total_words = len(token.word_index) + 1 # add 1 because of 0 padding
print('Encoded X Train\n', x_train, '\n')
print('Encoded X Test\n', x_test, '\n')
print('Maximum review length: ', max_length)

Encoded X Train
 [[   39    20     4 ...     0     0     0]
 [    2  2437  1080 ...   496  1701  7410]
 [14031  3472   739 ...     0     0     0]
 ...
 [ 7417   153     4 ...     0     0     0]
 [  288   284     5 ...     5  2421    33]
 [  566  4249   233 ... 16105  4368   189]] 

Encoded X Test
 [[   1   88   47 ...    0    0    0]
 [   8   26   17 ...    0    0    0]
 [   2   41 2836 ...    0    0    0]
 ...
 [ 107  131    1 ... 1622   12 5931]
 [ 338  165 1510 ...    0    0    0]
 [  36   70  511 ...  548   43  390]] 

Maximum review length:  130


In [10]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64
model = Sequential()
model.add(Embedding(total_words, EMBED_DIM, input_length = max_length))
model.add(LSTM(LSTM_OUT))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics =
['accuracy'])
print(model.summary())




None


In [11]:
checkpoint = ModelCheckpoint(
    'models/LSTM.keras',  # Changed the file extension to .keras
    monitor='accuracy',
    save_best_only=True,
    verbose=1
)
model.fit(x_train, y_train, batch_size=128, epochs=5, callbacks=[checkpoint])

Epoch 1/5
[1m311/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 9ms/step - accuracy: 0.5212 - loss: 0.6898
Epoch 1: accuracy improved from -inf to 0.56243, saving model to models/LSTM.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 10ms/step - accuracy: 0.5216 - loss: 0.6897
Epoch 2/5
[1m312/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.6117 - loss: 0.6556
Epoch 2: accuracy improved from 0.56243 to 0.59380, saving model to models/LSTM.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 13ms/step - accuracy: 0.6115 - loss: 0.6556
Epoch 3/5
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.6040 - loss: 0.6572
Epoch 3: accuracy improved from 0.59380 to 0.60742, saving model to models/LSTM.keras
[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 10ms/step - accuracy: 0.6040 - loss: 0.6572
Epoch 4/5
[1m311/313[0m [32m━━━━━━━━━━━━━━━━━━━[0m[3

<keras.src.callbacks.history.History at 0x7c449f4c7ee0>

In [14]:
#TESTING
y_pred = (model.predict(x_test, batch_size = 128) > 0.5).astype(int) # Use model.predict and convert to class labels
true = 0
for i, y in enumerate(y_test):
    if y == y_pred[i]:
        true += 1
print('Correct Prediction: {}'.format(true))
print('Wrong Prediction: {}'.format(len(y_pred) - true))
print('Accuracy: {}'.format(true/len(y_pred)*100))

[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step
Correct Prediction: 5440
Wrong Prediction: 4560
Accuracy: 54.400000000000006


In [16]:
#LOAD SAVED MODEL
loaded_model = load_model('models/LSTM.keras')


In [20]:
review = str(input('Movie Review: '))

Movie Review: Nothing was typical about this Everything was beautifully done in t his movie the story the flow the scenario everything I highly recommend it fo r mystery lovers for anyone who wants to watch a good movie


In [21]:
# Pre-process input
regex = re.compile(r'[^a-zA-Z\s]')
review = regex.sub('', review)
print('Cleaned: ', review)
words = review.split(' ')
filtered = [w for w in words if w not in english_stops]
filtered = ' '.join(filtered)
filtered = [filtered.lower()]
print('Filtered: ', filtered)

Cleaned:  Nothing was typical about this Everything was beautifully done in t his movie the story the flow the scenario everything I highly recommend it fo r mystery lovers for anyone who wants to watch a good movie
Filtered:  ['nothing typical everything beautifully done movie story flow scenario everything i highly recommend fo r mystery lovers anyone wants watch good movie']


In [22]:
tokenize_words = token.texts_to_sequences(filtered)
tokenize_words = pad_sequences(tokenize_words, maxlen=max_length,
padding='post', truncating='post')
print(tokenize_words)

[[   77   697   168  1166   129     3    13  2674  2695   168     1   447
    280 55779  1123   705  1666   151   400    34     9     3     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0     0     0
      0     0     0     0     0     0     0     0     0     0]]


In [23]:
result = loaded_model.predict(tokenize_words)
print(result)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 104ms/step
[[0.47175416]]


In [24]:
if result >= 0.7:
 print('positive')
else:
 print('negative')

negative
