In [14]:
!pip install tensorflow




In [15]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Download stopwords (if not already installed)
nltk.download('stopwords')

# Load dataset
data = pd.read_csv('/content/drive/MyDrive/INST 750/Assignment 1/IMDB Dataset.csv')

# Initialize stopwords
english_stops = set(stopwords.words('english'))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def load_dataset():
    df = data.copy()
    x_data = df['review']
    y_data = df['sentiment']

    # Pre-processing
    x_data = x_data.replace({'<.*?>': ''}, regex=True)
    x_data = x_data.replace({'[^A-Za-z]': ' '}, regex=True)
    x_data = x_data.apply(lambda review: [w for w in review.split() if w not in english_stops])
    x_data = x_data.apply(lambda review: [w.lower() for w in review])

    # ENCODE SENTIMENT -> 0 & 1
    y_data = y_data.replace('positive', 1)
    y_data = y_data.replace('negative', 0)

    return x_data, y_data


In [17]:
# Load dataset
x_data, y_data = load_dataset()

# First split: 75% train, 25% temp (validation + test)
x_train, x_temp, y_train, y_temp = train_test_split(x_data, y_data, test_size=0.25, random_state=42, stratify=y_data)

# Second split: 15% validation, 10% test (relative to full dataset)
x_val, x_test, y_val, y_test = train_test_split(x_temp, y_temp, test_size=0.4, random_state=42, stratify=y_temp)


  y_data = y_data.replace('negative', 0)


In [18]:
# Print dataset sizes
print(f"Training Set Size: {len(x_train)}")
print(f"Validation Set Size: {len(x_val)}")
print(f"Test Set Size: {len(x_test)}")

print('\nTrain Set')
print(x_train[:5], '\n')
print(y_train[:5], '\n')

print('Validation Set')
print(x_val[:5], '\n')
print(y_val[:5], '\n')

print('Test Set')
print(x_test[:5], '\n')
print(y_test[:5])

Training Set Size: 37500
Validation Set Size: 7500
Test Set Size: 5000

Train Set
17949    [i, saw, adam, had, four, sons, first, time, t...
5786     [i, one, shamelessly, enjoyed, every, episode,...
42175    [this, movie, journey, mind, screenwriter, cau...
39484    [this, absolutely, one, best, movies, i, seen,...
34209    [oh, geez, there, many, films, i, want, see, i...
Name: review, dtype: object 

17949    0
5786     1
42175    1
39484    1
34209    0
Name: sentiment, dtype: int64 

Validation Set
11066    [the, good, thing, movie, created, made, hungr...
19236    [i, provided, location, services, film, every,...
49911    [after, empire, strikes, back, return, jedi, s...
16692    [this, one, creepy, movie, creepier, anything,...
27069    [the, saddest, thing, tribute, almost, singers...
Name: review, dtype: object 

11066    0
19236    1
49911    1
16692    1
27069    0
Name: sentiment, dtype: int64 

Test Set
43988    [i, watched, movie, last, week, sometime, bigg...
41806    [i

In [19]:
# Function to determine max review length
def get_max_length():
    review_length = [len(review) for review in x_train]
    return int(np.ceil(np.mean(review_length)))

In [20]:
# ENCODE REVIEW
token = Tokenizer(lower=False)
token.fit_on_texts(x_train)

# Convert text to sequences
x_train = token.texts_to_sequences(x_train)
x_val = token.texts_to_sequences(x_val)
x_test = token.texts_to_sequences(x_test)

# Determine max sequence length
max_length = get_max_length()

# Pad sequences
x_train = pad_sequences(x_train, maxlen=max_length, padding='post', truncating='post')
x_val = pad_sequences(x_val, maxlen=max_length, padding='post', truncating='post')
x_test = pad_sequences(x_test, maxlen=max_length, padding='post', truncating='post')

total_words = len(token.word_index) + 1

print('\nEncoded X Train\n', x_train[:5], '\n')
print('Encoded X Validation\n', x_val[:5], '\n')
print('Encoded X Test\n', x_test[:5], '\n')
print('Maximum review length: ', max_length)


Encoded X Train
 [[    1   119  1871  2891   575  3358    23    10    66  3067     1   164
   2026   234  8395  8206   575  3358   111  9331  2181   575   786     4
   2847  8723   993  1871 38735  3458  1113   133  3042    12   453   705
   1816   116 21973  1317 25546    33     6 25546 38735  3358  2655    85
    204     1   323    70  5675  2107   664   210   434    31   571 34663
    124   819  4798 21974    87   143  2664 11685   202    49     5  7536
   5497   508 11685 18768  9089   148  5449     5   387  1417    72   444
    140   199   780     7   308 23055   145    84   206   732   695     1
     92   321     1  1245   856  2026 27135  8395  8206  2847  8723  2053
    196  1540   285   232    80     2  3358    20 13185  6224  3043   767
     43   124    48   566  3043  1582    89 13185  2656  8723]
 [    1     5  8607   410    83   298  4015 12395   773   339   859  3622
   4318    54  9904     9    46  3976  3675  2542  5451  3414   554   403
    931   519 38736  1842  7082

In [21]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np

In [22]:
# ARCHITECTURE
EMBED_DIM = 32
LSTM_OUT = 64

model = Sequential([
    Embedding(total_words, EMBED_DIM, input_length=max_length),
    LSTM(LSTM_OUT, dropout=0.3, recurrent_dropout=0.3),  # ✅ Added dropout to prevent overfitting
    Dense(1, activation='sigmoid')  # ✅ Sigmoid for binary classification
])

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()



In [23]:
# CHECKPOINTING
checkpoint = ModelCheckpoint(
    'models/LSTM.h5',
    monitor='val_loss',
    save_best_only=True,
    verbose=1
)

In [24]:
# TRAINING
history = model.fit(
    x_train, y_train,
    validation_data=(x_val, y_val),
    epochs=10,
    batch_size=128,
    callbacks=[checkpoint]
)

Epoch 1/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 147ms/step - accuracy: 0.5323 - loss: 0.6896
Epoch 1: val_loss improved from inf to 0.66679, saving model to models/LSTM.h5




[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m52s[0m 158ms/step - accuracy: 0.5324 - loss: 0.6895 - val_accuracy: 0.5876 - val_loss: 0.6668
Epoch 2/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.5805 - loss: 0.6650
Epoch 2: val_loss improved from 0.66679 to 0.65813, saving model to models/LSTM.h5




[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 144ms/step - accuracy: 0.5805 - loss: 0.6650 - val_accuracy: 0.5984 - val_loss: 0.6581
Epoch 3/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 138ms/step - accuracy: 0.6562 - loss: 0.6285
Epoch 3: val_loss improved from 0.65813 to 0.53901, saving model to models/LSTM.h5




[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m83s[0m 147ms/step - accuracy: 0.6563 - loss: 0.6284 - val_accuracy: 0.7628 - val_loss: 0.5390
Epoch 4/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 134ms/step - accuracy: 0.7326 - loss: 0.5646
Epoch 4: val_loss improved from 0.53901 to 0.53124, saving model to models/LSTM.h5




[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 143ms/step - accuracy: 0.7326 - loss: 0.5646 - val_accuracy: 0.7647 - val_loss: 0.5312
Epoch 5/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.7752 - loss: 0.5096
Epoch 5: val_loss improved from 0.53124 to 0.49014, saving model to models/LSTM.h5




[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 144ms/step - accuracy: 0.7752 - loss: 0.5095 - val_accuracy: 0.7921 - val_loss: 0.4901
Epoch 6/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.7851 - loss: 0.4911
Epoch 6: val_loss improved from 0.49014 to 0.40894, saving model to models/LSTM.h5




[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 144ms/step - accuracy: 0.7851 - loss: 0.4909 - val_accuracy: 0.8404 - val_loss: 0.4089
Epoch 7/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 135ms/step - accuracy: 0.8679 - loss: 0.3444
Epoch 7: val_loss improved from 0.40894 to 0.39795, saving model to models/LSTM.h5




[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 141ms/step - accuracy: 0.8679 - loss: 0.3444 - val_accuracy: 0.8539 - val_loss: 0.3980
Epoch 8/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 155ms/step - accuracy: 0.8959 - loss: 0.2850
Epoch 8: val_loss improved from 0.39795 to 0.37282, saving model to models/LSTM.h5




[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m88s[0m 161ms/step - accuracy: 0.8960 - loss: 0.2850 - val_accuracy: 0.8556 - val_loss: 0.3728
Epoch 9/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 137ms/step - accuracy: 0.9253 - loss: 0.2171
Epoch 9: val_loss did not improve from 0.37282
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m78s[0m 146ms/step - accuracy: 0.9253 - loss: 0.2171 - val_accuracy: 0.8595 - val_loss: 0.3815
Epoch 10/10
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 141ms/step - accuracy: 0.9437 - loss: 0.1699
Epoch 10: val_loss did not improve from 0.37282
[1m293/293[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 147ms/step - accuracy: 0.9436 - loss: 0.1699 - val_accuracy: 0.8599 - val_loss: 0.3916


In [25]:
# EVALUATE MODEL
loss, accuracy = model.evaluate(x_test, y_test)
print(f"LSTM Test Accuracy: {accuracy:.4f}")

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 16ms/step - accuracy: 0.8800 - loss: 0.3403
LSTM Test Accuracy: 0.8668


In [26]:
# PREDICTIONS
y_pred = (model.predict(x_test) > 0.5).astype("int32")

# CALCULATE CORRECT & WRONG PREDICTIONS
correct = np.sum(y_pred.flatten() == y_test)
wrong = len(y_pred) - correct

print(f'Correct Predictions: {correct}')
print(f'Wrong Predictions: {wrong}')
print(f'Final Accuracy: {correct / len(y_pred) * 100:.2f}%')

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 17ms/step
Correct Predictions: 4334
Wrong Predictions: 666
Final Accuracy: 86.68%
