In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [5]:
df = pd.read_csv('/content/drive/MyDrive/IMDB Dataset.csv')

In [6]:
df.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [13]:
df['sentiment'].value_counts()

positive    25000
negative    25000
Name: sentiment, dtype: int64

In [14]:
df.isnull().sum()

review       0
sentiment    0
dtype: int64

In [22]:

import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import string

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [23]:
# Define a function to clean and preprocess text
def clean_text(text):
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Convert to lowercase
    text = text.lower()
    
    # Remove stop words
    stop_words = set(stopwords.words('english'))
    words = text.split()
    filtered_words = [word for word in words if word.lower() not in stop_words]
    text = ' '.join(filtered_words)
    
    return text

In [24]:
df['review'] = df['review'].apply(clean_text)

In [26]:
print(df['review'].iloc[2])

thought wonderful way spend time hot summer weekend sitting air conditioned theater watching lighthearted comedy plot simplistic dialogue witty characters likable even well bread suspected serial killer may disappointed realize match point 2 risk addiction thought proof woody allen still fully control style many us grown lovebr br id laughed one woodys comedies years dare say decade ive never impressed scarlet johanson managed tone sexy image jumped right average spirited young womanbr br may crown jewel career wittier devil wears prada interesting superman great comedy go see friends


In [27]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from sklearn.model_selection import train_test_split

In [28]:
# Step 1: Cutoff reviews after 150 words
df['review'] = df['review'].apply(lambda x: ' '.join(x.split()[:150]))

In [29]:
# Step 2: Restrict training samples to 100
df_train = df[:100]

In [30]:
# Step 3: Validate on 10,000 samples
df_val = df[10000:20000]

In [31]:
# Step 4: Consider only the top 10,000 words
num_words = 10000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(df_train['review'])
X_train = tokenizer.texts_to_sequences(df_train['review'])
X_val = tokenizer.texts_to_sequences(df_val['review'])
X_train = pad_sequences(X_train, maxlen=150)
X_val = pad_sequences(X_val, maxlen=150)
y_train = np.array(df_train['sentiment'])
y_val = np.array(df_val['sentiment'])

In [32]:
# Step 5a: Use an embedding layer
model1 = Sequential()
model1.add(Embedding(num_words, 32, input_length=150))
model1.add(Bidirectional(LSTM(64)))
model1.add(Dense(1, activation='sigmoid'))
model1.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model1.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 150, 32)           320000    
                                                                 
 bidirectional (Bidirectiona  (None, 128)              49664     
 l)                                                              
                                                                 
 dense (Dense)               (None, 1)                 129       
                                                                 
Total params: 369,793
Trainable params: 369,793
Non-trainable params: 0
_________________________________________________________________


In [None]:
# Step 5b: Use a pre-trained word embedding
from gensim.models import KeyedVectors
pretrained_path = 'path/to/pretrained/word/embedding'
pretrained_model = KeyedVectors.load_word2vec_format(pretrained_path, binary=True)
embedding_matrix = np.zeros((num_words, 300))
for word, i in tokenizer.word_index.items():
    if i >= num_words:
        break
    if word in pretrained_model:
        embedding_matrix[i] = pretrained_model[word]
model2 = Sequential()
model2.add(Embedding(num_words, 300, weights=[embedding_matrix], input_length=150, trainable=False))
model2.add(Bidirectional(LSTM(64)))
model2.add(Dense(1, activation='sigmoid'))
model2.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model2.summary()

In [None]:
# Train and evaluate the models
batch_size = 32
epochs = 10
history1 = model1.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))
history2 = model2.fit(X_train, y_train, batch_size=batch_size, epochs=epochs, validation_data=(X_val, y_val))

# Compare the results
print('Model 1 (embedding layer) accuracy:', max(history1.history['val_accuracy']))
print('Model 2 (pretrained word embedding) accuracy:', max(history2.history['val_accuracy']))

# Vary the number of training samples to see at what point the embedding layer gives better performance
num_train_samples = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
for n in num_train_samples:
    df_train = df[:n]
    X_train = tokenizer.texts_to_sequences(df_train['review'])
    X_train = pad_sequences(X_train, maxlen=150)
    y_train = np.array(df_train['sentiment'])
    history1 = model