# Deep Learning Project: Sentiment Analysis on IMDB reviews using Long short-term memory (LSTM)

### Importing Dependencies 

In [96]:
import os 
import json

from zipfile import ZipFile
import pandas as pd 
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential 
from tensorflow.keras.layers import Dense, Embedding, LSTM
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input


### Data collection - Kaggle API 

Note: The following steps outline how to download the file directly from Kaggle. Alternatively, you can use the zip file uploaded in the datasets folder.

In [97]:
# Load the Kaggle API credentials. The context manager closes the file handle
# as soon as the JSON is parsed instead of leaving it open until garbage collection.
with open("datasets/kaggle.json", encoding="utf-8") as credentials_file:
    kaggle_dict = json.load(credentials_file)

In [98]:
# Check which fields the credentials file provides; only the key names are shown, not their values.
kaggle_dict.keys()

dict_keys(['username', 'key'])

Set up the Kaggle credentials as environment variables.


In [99]:
# Export both Kaggle credentials to the process environment in one call.
os.environ.update(
    KAGGLE_USERNAME=kaggle_dict["username"],
    KAGGLE_KEY=kaggle_dict["key"],
)

In [100]:
# Download the IMDB 50k reviews archive into the datasets/ folder (-p sets the target path).
# NOTE(review): presumably the CLI picks up the KAGGLE_USERNAME / KAGGLE_KEY variables set above — confirm.
!kaggle datasets download -d lakshmi25npathi/imdb-dataset-of-50k-movie-reviews -p datasets

Dataset URL: https://www.kaggle.com/datasets/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews
License(s): other
imdb-dataset-of-50k-movie-reviews.zip: Skipping, found more recently modified local copy (use --force to force download)


Unzip the dataset file.

In [101]:
zip_file_path = "datasets/imdb-dataset-of-50k-movie-reviews.zip"
extract_to_folder = "datasets"

# Open the archive read-only (ZipFile's default mode) and unpack every member
# into the datasets folder; the context manager closes the archive afterwards.
with ZipFile(zip_file_path) as zip_file:
    zip_file.extractall(path=extract_to_folder)


### Load the dataset

In [102]:
# Read the extracted CSV of reviews and their sentiment labels into a DataFrame.
df = pd.read_csv("datasets/IMDB Dataset.csv")

In [103]:
# Preview the first rows: one free-text `review` column and one `sentiment` label column.
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [104]:
# (rows, columns) of the raw dataset.
df.shape


(50000, 2)

In [105]:
# Check class balance before training.
df['sentiment'].value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

#### Copy data 

In [106]:
# Work on a copy so the raw `df` stays available unchanged.
df_copy = df.copy()

#### Encode sentiment 

In [107]:
# Encode the labels as integers (positive -> 1, negative -> 0).
# - Built from the untouched `df`, so re-running this cell always gives the same result
#   (no in-place mutation of an already-encoded frame).
# - `Series.map` avoids the warning that `replace(..., inplace=True)` emits when it
#   changes the column's dtype from object to int.
# - `astype("int64")` raises if a label outside the mapping produced NaN, instead of
#   letting an unexpected value pass through silently.
df_copy = df.assign(
    sentiment=df["sentiment"].map({"positive": 1, "negative": 0}).astype("int64")
)


  df_copy.replace({"sentiment": {"positive": 1, "negative": 0}}, inplace=True)


In [108]:
# Confirm the sentiment column now holds 1/0 instead of the text labels.
df_copy.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1


#### Split data into training and test data 

In [109]:
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
train_data, test_data = train_test_split(
    df_copy,
    test_size=0.2,
    random_state=42,
)

### Data Preprocessing 

#### Tokenize text data 
- `Tokenizer(num_words=5000)` limits the encoding to the most frequent words in the training data (Keras keeps the top `num_words - 1` words by frequency). Rarer words are dropped when texts are converted to sequences, which reduces the dimensionality of the input data and focuses the model on the most relevant words.
- `fit_on_texts` learns the vocabulary from the training reviews only and assigns an integer index to each unique word, starting from 1 (0 is reserved for padding).
- `pad_sequences(..., maxlen=200)` makes every sequence exactly 200 tokens long. Sequences longer than 200 tokens are truncated and shorter sequences are padded with zeros; by default both happen at the beginning of the sequence.

In [110]:
# Fit the vocabulary on the training reviews only, so nothing from the test set leaks in.
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(train_data["review"])

# Encode each review as a list of word indices...
train_sequences = tokenizer.texts_to_sequences(train_data["review"])
test_sequences = tokenizer.texts_to_sequences(test_data["review"])

# ...then pad/truncate every review to exactly 200 indices.
X_train = pad_sequences(train_sequences, maxlen=200)
X_test = pad_sequences(test_sequences, maxlen=200)

In [111]:
# Inspect the padded training matrix; rows that start with 0s are reviews shorter than 200 tokens.
print(X_train)

[[1935    1 1200 ...  205  351 3856]
 [   3 1651  595 ...   89  103    9]
 [   0    0    0 ...    2  710   62]
 ...
 [   0    0    0 ... 1641    2  603]
 [   0    0    0 ...  245  103  125]
 [   0    0    0 ...   70   73 2062]]


In [112]:
# Inspect the padded test matrix, encoded with the vocabulary learned from the training data.
print(X_test)

[[   0    0    0 ...  995  719  155]
 [  12  162   59 ...  380    7    7]
 [   0    0    0 ...   50 1088   96]
 ...
 [   0    0    0 ...  125  200 3241]
 [   0    0    0 ... 1066    1 2305]
 [   0    0    0 ...    1  332   27]]


In [113]:
# Target labels (1 = positive, 0 = negative) for the two partitions.
Y_train, Y_test = train_data["sentiment"], test_data["sentiment"]

In [114]:
# Inspect the training labels; the index still carries the row ids from the original frame.
print(Y_train)

39087    0
30893    0
45278    1
16398    0
13653    0
        ..
11284    1
44732    1
38158    0
860      1
15795    1
Name: sentiment, Length: 40000, dtype: int64


In [115]:
# Inspect the held-out test labels.
print(Y_test)

33553    1
9427     1
199      0
12447    1
39489    0
        ..
28567    0
25079    1
18707    1
15200    0
5857     1
Name: sentiment, Length: 10000, dtype: int64


### LSTM - Long Short-Term Memory

#### Build the model


In [116]:
# Layer stack, declared in one place:
#   Input     - sequences of 200 token indices (the padded review length)
#   Embedding - maps each of the 5000 vocabulary indices to a 128-dim vector
#   LSTM      - 128 units, with dropout on both inputs and recurrent state
#   Dense     - single sigmoid unit producing the positive-class probability
model = Sequential(
    [
        Input(shape=(200,)),
        Embedding(input_dim=5000, output_dim=128),
        LSTM(128, dropout=0.2, recurrent_dropout=0.2),
        Dense(1, activation="sigmoid"),
    ]
)

In [117]:
# Show the model summary to check the architecture before compiling.
model.summary()

In [118]:
# Binary cross-entropy matches the single sigmoid output; accuracy is tracked for reporting.
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

#### Training the model 

In [119]:
# Train for 5 epochs; 20% of the training rows are set aside as a validation set.
model.fit(
    x=X_train,
    y=Y_train,
    epochs=5,
    batch_size=64,
    validation_split=0.2,
)

Epoch 1/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 115ms/step - accuracy: 0.7175 - loss: 0.5376 - val_accuracy: 0.8526 - val_loss: 0.3567
Epoch 2/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 149ms/step - accuracy: 0.8581 - loss: 0.3478 - val_accuracy: 0.8413 - val_loss: 0.3974
Epoch 3/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 144ms/step - accuracy: 0.8531 - loss: 0.3478 - val_accuracy: 0.8595 - val_loss: 0.3407
Epoch 4/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m70s[0m 140ms/step - accuracy: 0.8917 - loss: 0.2707 - val_accuracy: 0.8696 - val_loss: 0.3197
Epoch 5/5
[1m500/500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m72s[0m 145ms/step - accuracy: 0.9099 - loss: 0.2382 - val_accuracy: 0.8716 - val_loss: 0.3222


<keras.src.callbacks.history.History at 0x7fd8c74ecad0>

#### Model Evaluation 

In [120]:
# Score the trained model on the held-out test split; evaluate() returns
# the loss followed by the compiled metric (accuracy).
loss, accuracy = model.evaluate(x=X_test, y=Y_test)

print(f"Test Loss: {loss}")
print(f"Test Accuracy: {accuracy}")

[1m313/313[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 27ms/step - accuracy: 0.8732 - loss: 0.3110
Test Loss: 0.3078487813472748
Test Accuracy: 0.8758999705314636


##### Building a predictive system

In [126]:
def predict_review_sentiment(review_text, threshold=0.5):
    """Classify a single review as "Positive" or "Negative".

    The text is encoded with the notebook-level ``tokenizer``, padded or
    truncated to 200 token indices (the same length used for the training
    data), and scored by the notebook-level ``model``.

    Args:
        review_text: The raw review as a single ``str``.
        threshold: Minimum model score that counts as "Positive".
            Defaults to 0.5.

    Returns:
        "Positive" if the model score is >= ``threshold``, otherwise "Negative".

    Raises:
        TypeError: If ``review_text`` is not a ``str``. Without this check a
            list of reviews would be wrapped in another list and tokenized
            as one malformed sample instead of failing visibly.
    """
    if not isinstance(review_text, str):
        raise TypeError(
            f"review_text must be a str, got {type(review_text).__name__}"
        )

    # Tokenize and pad the review (a batch containing one sample).
    review_sequence = tokenizer.texts_to_sequences([review_text])
    padded_sequence = pad_sequences(review_sequence, maxlen=200)

    # One sample in, one sigmoid unit out: the score sits at [0][0].
    score = float(model.predict(padded_sequence)[0][0])

    return "Positive" if score >= threshold else "Negative"

In [127]:
# Example review to predict sentiment
sample_review = "I really enjoyed this film, it was fantastic!"  # short, clearly positive wording

# Get the sentiment prediction from the trained model
review_sentiment = predict_review_sentiment(sample_review)

# Display the result
print(f"The sentiment of the review is: {review_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
The sentiment of the review is: Positive


In [128]:
# Example review to predict sentiment
sample_review = "This movie was ok but not that good."  # lukewarm wording that ends on a negation

# Get the sentiment prediction from the trained model
review_sentiment = predict_review_sentiment(sample_review)

# Display the result
print(f"The sentiment of the review is: {review_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
The sentiment of the review is: Negative


In [129]:
# Example review to predict sentiment
sample_review = "This movie was not that good"  # negated positive word ("not ... good")

# Get the sentiment prediction from the trained model
review_sentiment = predict_review_sentiment(sample_review)

# Display the result
print(f"The sentiment of the review is: {review_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 19ms/step
The sentiment of the review is: Negative


In [130]:
# Example review to predict sentiment
# Long multi-paragraph review with mixed opinions: praise for the visuals, criticism of plot and characters.
sample_review = "I just watched the latest installment of the action franchise, and I have mixed feelings about it. On one hand, the cinematography was absolutely stunning; every scene was beautifully crafted and the visual effects were top-notch. The action sequences were intense, and I found myself on the edge of my seat during the climax. However, the plot was quite predictable and lacked depth. The character development was almost non-existent, making it hard to connect with any of the protagonists. \n While I appreciate the effort put into the special effects, it felt like the filmmakers prioritized style over substance. The dialogue was cheesy at times, and some of the supporting characters were more of a distraction than a contribution to the story. \n Overall, I would say it's a film worth watching for the visuals, but if you're looking for a compelling story with rich characters, you might want to skip it. It's a decent movie, but it could have been so much more."

# Get the sentiment prediction from the trained model
review_sentiment = predict_review_sentiment(sample_review)

# Display the result
print(f"The sentiment of the review is: {review_sentiment}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
The sentiment of the review is: Negative
