In [None]:
# Step 1: Install necessary libraries (if not already installed)
!pip install tensorflow nltk

# Import libraries
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, LSTM, Bidirectional, Dense, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import matplotlib.pyplot as plt




[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the dataset (movie reviews)
# The reviews are read from 'IMDB_Dataset.csv', resolved relative to the working directory
import pandas as pd
data = pd.read_csv('IMDB_Dataset.csv')



In [None]:

# Split into input features (X) and target (y)
X = data['review'].values
y = data['sentiment'].values  # Labels as loaded from the CSV; NOTE(review): a later cell encodes them with LabelEncoder before training, so they are presumably not already 0/1 — confirm

In [None]:
# Step 2: Tokenize the text data
vocab_size = 10000  # Limit the vocabulary size (used as Tokenizer num_words and as the Embedding input dimension)
max_len = 200  # Sequence length every review is padded/truncated to by pad_sequences

In [None]:

# Fit the tokenizer on every review text, with num_words capped at vocab_size.
# NOTE(review): the tokenizer is fitted on all of X before the train/test split,
# so the vocabulary is built from the test reviews as well.
tokenizer = Tokenizer(num_words=vocab_size)
tokenizer.fit_on_texts(X)
# Convert each review into a sequence of integer word indices
X_seq = tokenizer.texts_to_sequences(X)
# Pad/truncate every sequence to max_len entries
X_pad = pad_sequences(X_seq, maxlen=max_len)

In [None]:

# Split data into training and testing sets (20% held out, fixed random_state)
# NOTE(review): this split uses the unencoded labels `y`; a later cell repeats the
# split with LabelEncoder-encoded labels and overwrites these four variables.
X_train, X_test, y_train, y_test = train_test_split(X_pad, y, test_size=0.2, random_state=42)


In [None]:
# Display the first two padded training sequences
print(X_train[:2])

[[3015 3757 1882    2  146  145    3  226    4    3  207  326    2  145
  1077   16   88    4  132 2872 9046   18   10  154 9889   99    4    1
  3898  300   11   17  989   36    1  495  492 2592  250   73   76  106
   106  696   60   85 1057 1343    5  229  132   23 4201   31  138  212
  1133   14 4511 5339   31    3 2409    2    8   11    6    3  448   14
   619    4    1  719 3052    1 1245    2   73 3596    1  164 1514    1
  1239    5 1674    1  888 1261    5    1  309  140 2889    2  410  624
     7    7    1  272    6 3683 1010    5   26   39   14 1378  217   65
     2   46    6   30  224   27  191 1473    8 1088   18   10 4656   84
     1  227   66  358   68   54   27    5 3749   15   44   21  193    5
  8407    3  879 3438 1773   22   25    5  160  197  176    3  111   12
  1569  472   75  221    5  327    2 3498   35   23   51   71 1890 4808
    14    9 1388   11   19    6    3 3574 2064   16   61    1 2414  469
   533    2    3  171 2643 2811 1776    5  586    9   36    1 39

In [None]:
# Common function to build and compile models
def build_model(model_type='RNN', bidirectional=False):
    """Build and compile a binary text classifier on top of an embedding layer.

    Reads the notebook-level globals ``vocab_size`` and ``max_len`` for the
    embedding's input dimension and input length.

    Args:
        model_type: 'RNN' adds a SimpleRNN(128) base layer, 'LSTM' adds an
            LSTM(128) base layer. Any other value adds no base recurrent layer.
        bidirectional: If True, a Bidirectional(LSTM(128)) layer is stacked
            after the base recurrent layer (if any).

    Returns:
        A compiled Sequential model (binary cross-entropy loss, Adam optimizer,
        accuracy metric) ending in Dropout(0.5) and a single sigmoid unit.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length=max_len))

    # The stacked Bidirectional LSTM consumes a 3-D (batch, time, features)
    # tensor, so the base layer must emit the full sequence whenever a
    # bidirectional layer follows it; otherwise only the last state is needed.
    if model_type == 'RNN':
        model.add(SimpleRNN(128, return_sequences=bidirectional))
    elif model_type == 'LSTM':
        model.add(LSTM(128, return_sequences=bidirectional))

    if bidirectional:
        model.add(Bidirectional(LSTM(128, return_sequences=False)))

    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

In [None]:
import numpy as np
from sklearn.preprocessing import LabelEncoder

# Learn the label -> integer mapping and encode `y` in a single step;
# the fitted encoder remains available as `le`.
le = LabelEncoder()
y_num = le.fit_transform(y)

# Repeat the train/test split on the encoded labels, using the same
# test fraction and random_state as the earlier split.
X_train, X_test, y_train, y_test = train_test_split(
    X_pad,
    y_num,
    test_size=0.2,
    random_state=42,
)

In [None]:

# Build the SimpleRNN variant and train it for 5 epochs.
# The held-out test split is passed as the validation data.
rnn_model = build_model('RNN')
history_rnn = rnn_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
)

Epoch 1/5




[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m93s[0m 145ms/step - accuracy: 0.5154 - loss: 0.7386 - val_accuracy: 0.7901 - val_loss: 0.4692
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m98s[0m 157ms/step - accuracy: 0.7653 - loss: 0.5068 - val_accuracy: 0.7574 - val_loss: 0.5166
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m95s[0m 151ms/step - accuracy: 0.7694 - loss: 0.4997 - val_accuracy: 0.6286 - val_loss: 0.7996
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 150ms/step - accuracy: 0.7099 - loss: 0.5752 - val_accuracy: 0.8047 - val_loss: 0.4549
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m146s[0m 156ms/step - accuracy: 0.8351 - loss: 0.4070 - val_accuracy: 0.8165 - val_loss: 0.4413


In [None]:

# Build the LSTM variant and train it for 5 epochs.
# The held-out test split is passed as the validation data.
lstm_model = build_model('LSTM')
history_lstm = lstm_model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=64,
)


Epoch 1/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m350s[0m 554ms/step - accuracy: 0.7635 - loss: 0.4709 - val_accuracy: 0.8798 - val_loss: 0.2962
Epoch 2/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 539ms/step - accuracy: 0.9033 - loss: 0.2516 - val_accuracy: 0.8879 - val_loss: 0.2722
Epoch 3/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m385s[0m 545ms/step - accuracy: 0.9284 - loss: 0.1891 - val_accuracy: 0.8925 - val_loss: 0.2777
Epoch 4/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m378s[0m 538ms/step - accuracy: 0.9457 - loss: 0.1474 - val_accuracy: 0.8804 - val_loss: 0.2949
Epoch 5/5
[1m625/625[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m384s[0m 542ms/step - accuracy: 0.9552 - loss: 0.1243 - val_accuracy: 0.8848 - val_loss: 0.3187


In [None]:
# Common function to build and compile models
def build_model(model_type='RNN', bidirectional=False):
    """Build and compile a binary text classifier on top of an embedding layer.

    Uses the notebook-level globals ``vocab_size`` (embedding input dimension)
    and ``max_len`` (embedding input length).

    Args:
        model_type: 'RNN' for a SimpleRNN(128) base layer, 'LSTM' for an
            LSTM(128) base layer. Other values add no base recurrent layer.
        bidirectional: When True, a Bidirectional(LSTM(128)) layer is added
            on top of the base recurrent layer (if any).

    Returns:
        The compiled Sequential model: binary cross-entropy loss, Adam
        optimizer, accuracy metric, single sigmoid output unit.
    """
    model = Sequential()
    model.add(Embedding(vocab_size, 128, input_length=max_len))

    # Whichever base layer is chosen, it has to return the whole sequence when
    # a Bidirectional LSTM is stacked on top, because that layer needs 3-D
    # input. This applies to the SimpleRNN branch as much as the LSTM branch.
    if model_type == 'RNN':
        model.add(SimpleRNN(128, return_sequences=bidirectional))
    elif model_type == 'LSTM':
        model.add(LSTM(128, return_sequences=bidirectional))

    if bidirectional:
        model.add(Bidirectional(LSTM(128, return_sequences=False)))

    model.add(Dropout(0.5))
    model.add(Dense(1, activation='sigmoid'))

    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

TUTORIAL 7
