<a href="https://colab.research.google.com/github/andr3w1699/HumanLanguageTechnologyProject/blob/main/SentimentClassificationWithRecurrent.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install -q gdown

In [2]:
# Replace FILE_ID with your actual file ID
file_id = '0Bz8a_Dbh9QhbZVhsUnRWRDhETzA'
output_name = 'amazon_review_full_csv.tar.gz'

!gdown --id {file_id} -O {output_name}

Downloading...
From (original): https://drive.google.com/uc?id=0Bz8a_Dbh9QhbZVhsUnRWRDhETzA
From (redirected): https://drive.google.com/uc?id=0Bz8a_Dbh9QhbZVhsUnRWRDhETzA&confirm=t&uuid=404193b5-b8bb-43e2-ad74-4bd1fd984256
To: /content/amazon_review_full_csv.tar.gz
100% 644M/644M [00:07<00:00, 88.9MB/s]


In [3]:
import tarfile

with tarfile.open(output_name, "r:gz") as tar:
    tar.extractall("Dataset")

In [4]:
!ls -R Dataset

Dataset:
amazon_review_full_csv

Dataset/amazon_review_full_csv:
readme.txt  test.csv  train.csv


In [5]:
import pandas as pd

# Set options to show full text and all rows
pd.set_option('display.max_colwidth', None)

df_train = pd.read_csv(
    './Dataset/amazon_review_full_csv/train.csv',
    header=None,
    names=['label', 'title', 'text'],
    quotechar='"',
    doublequote=True,
    escapechar='\\',
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip'  # Skip rows with parsing errors
)

df_train.head()

Unnamed: 0,label,title,text
0,3,more like funchuck,"Gave this to my dad for a gag gift after directing ""Nunsense,"" he got a reall kick out of it!"
1,5,Inspiring,"I hope a lot of people hear this cd. We need more strong and positive vibes like this. Great vocals, fresh tunes, cross-cultural happiness. Her blues is from the gut. The pop sounds are catchy and mature."
2,5,The best soundtrack ever to anything.,"I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny."
3,4,Chrono Cross OST,"The music of Yasunori Misuda is without question my close second below the great Nobuo Uematsu.Chrono Cross OST is a wonderful creation filled with rich orchestra and synthesized sounds. While ambiance is one of the music's major factors, yet at times it's very uplifting and vigorous. Some of my favourite tracks include; ""Scars Left by Time, The Girl who Stole the Stars, and Another World""."
4,5,Too good to be true,Probably the greatest soundtrack in history! Usually it's better to have played the game first but this is so enjoyable anyway! I worked so hard getting this soundtrack and after spending [money] to get it it was really worth every penny!! Get this OST! it's amazing! The first few tracks will have you dancing around with delight (especially Scars Left by Time)!! BUY IT NOW!!


In [6]:
# Number of rows
print("Number of rows:", len(df_train))

# Check for null values
if df_train.isnull().values.any():
    print("There are null elements in the DataFrame.")
else:
    print("There are no null elements in the DataFrame.")

Number of rows: 2999746
There are null elements in the DataFrame.


In [8]:
df_test = pd.read_csv(
    './Dataset/amazon_review_full_csv/train.csv',
    header=None,
    names=['label', 'title', 'text'],
    quotechar='"',
    doublequote=True,
    escapechar='\\',
    engine='python',
    encoding='utf-8',
    on_bad_lines='skip'  # Skip rows with parsing errors
)

df_test.head()

Unnamed: 0,label,title,text
0,3,more like funchuck,"Gave this to my dad for a gag gift after directing ""Nunsense,"" he got a reall kick out of it!"
1,5,Inspiring,"I hope a lot of people hear this cd. We need more strong and positive vibes like this. Great vocals, fresh tunes, cross-cultural happiness. Her blues is from the gut. The pop sounds are catchy and mature."
2,5,The best soundtrack ever to anything.,"I'm reading a lot of reviews saying that this is the best 'game soundtrack' and I figured that I'd write a review to disagree a bit. This in my opinino is Yasunori Mitsuda's ultimate masterpiece. The music is timeless and I'm been listening to it for years now and its beauty simply refuses to fade.The price tag on this is pretty staggering I must say, but if you are going to buy any cd for this much money, this is the only one that I feel would be worth every penny."
3,4,Chrono Cross OST,"The music of Yasunori Misuda is without question my close second below the great Nobuo Uematsu.Chrono Cross OST is a wonderful creation filled with rich orchestra and synthesized sounds. While ambiance is one of the music's major factors, yet at times it's very uplifting and vigorous. Some of my favourite tracks include; ""Scars Left by Time, The Girl who Stole the Stars, and Another World""."
4,5,Too good to be true,Probably the greatest soundtrack in history! Usually it's better to have played the game first but this is so enjoyable anyway! I worked so hard getting this soundtrack and after spending [money] to get it it was really worth every penny!! Get this OST! it's amazing! The first few tracks will have you dancing around with delight (especially Scars Left by Time)!! BUY IT NOW!!


In [9]:
# Keep only positive (4,5) and negative (1,2) ratings
df_train_binary = df_train[df_train['label'] != 3].copy()

# Map ratings to binary sentiment
df_train_binary['sentiment'] = df_train_binary['label'].apply(lambda x: 1 if x > 3 else 0)

In [10]:
df_train_binary['review'] = df_train_binary['title'].fillna('') + ' ' + df_train_binary['text'].fillna('')
df_train_sampled = df_train_binary.sample(n=100000, random_state=42)
X = df_train_sampled['review'].values
y = df_train_sampled['sentiment'].values

In [11]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Parameters
max_words = 30000  # Size of vocabulary
max_len = 200      # Max review length

# Tokenizer
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X)

# Convert text to sequences
sequences = tokenizer.texts_to_sequences(X)
X_padded = pad_sequences(sequences, maxlen=max_len, padding='post', truncating='post')

In [12]:
from sklearn.model_selection import train_test_split

X_train, X_val, y_train, y_val = train_test_split(X_padded, y, test_size=0.2, random_state=42)

In [13]:
from tensorflow.keras.layers import Bidirectional,LSTM, Input, Dense, Dropout, Embedding
from tensorflow.keras.models import Sequential

model_BiLSTM = Sequential([
    Input(shape=(max_len,)),  # Define the input shape
    Embedding(input_dim=max_words, output_dim=128),
    Bidirectional(LSTM(64)),  # BiLSTM instead of LSTM
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])
model_BiLSTM.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_BiLSTM.summary()

In [15]:
from tensorflow.keras.layers import LSTM, Input, Dense, Dropout, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping


# Define EarlyStopping callback
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

# Train the model with EarlyStopping
history = model_BiLSTM.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=1,
    callbacks=[early_stop]  # 👈 Early stopping in action
)

[1m157/157[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m343s[0m 2s/step - accuracy: 0.7293 - loss: 0.5283 - val_accuracy: 0.8856 - val_loss: 0.2837


In [16]:
# Keep only positive (4,5) and negative (1,2) ratings
df_test_binary = df_test[df_test['label'] != 3].copy()

# Map ratings to binary sentiment
df_test_binary['sentiment'] = df_test_binary['label'].apply(lambda x: 1 if x > 3 else 0)


# Preprocess test set
df_test_binary['review'] = df_test_binary['title'].fillna('') + ' ' + df_test_binary['text'].fillna('')

df_test_sampled = df_test_binary.sample(n=100000, random_state=42)

X_test_seq = tokenizer.texts_to_sequences(df_test_sampled['review'].values)
X_test_padded = pad_sequences(X_test_seq, maxlen=max_len, padding='post', truncating='post')
y_test = df_test_sampled['sentiment'].values

# Evaluate
loss, acc = model_BiLSTM.evaluate(X_test_padded, y_test)
print(f"Test accuracy: {acc:.2f}")



[1m3125/3125[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m193s[0m 62ms/step - accuracy: 0.9048 - loss: 0.2532
Test accuracy: 0.91


In [33]:
from tensorflow.keras.layers import LSTM, Input, Dense, Dropout, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.layers import SpatialDropout1D


# 1. Build model
model = Sequential([
    Input(shape=(max_len,)),
    Embedding(max_words, 128),
    SpatialDropout1D(0.3),
    LSTM(128),
    Dropout(0.7),
    Dense(1, activation='sigmoid')
])

# 2. Compile with gradient clipping
optimizer = Adam(learning_rate=1e-3, clipnorm=0.5)
model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=['accuracy'])

model_LSTM.compile(
    loss='binary_crossentropy',
    optimizer=optimizer,
    metrics=['accuracy']
)
model_LSTM.summary()

In [None]:
from tensorflow.keras.layers import LSTM, Input, Dense, Dropout, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.callbacks import ReduceLROnPlateau

reduce_lr = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,        # lr = lr * 0.5
    patience=2,        # wait 2 epochs before reducing
    min_lr=1e-7,       # lower bound on lr
    verbose=1
)


# Define EarlyStopping callback
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True
)

# Train the model with EarlyStopping
history = model_LSTM.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=50,
    callbacks=[early_stop, reduce_lr]  # 👈 Early stopping in action
)

Epoch 1/50
[1m 13/157[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m5:05[0m 2s/step - accuracy: 0.5004 - loss: nan

In [None]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dropout, Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

model_BiLSTM_CNN = Sequential([
    Input(shape=(max_len,)),
    Embedding(input_dim=max_words, output_dim=128),

    Bidirectional(LSTM(64, return_sequences=True)),  # Keep sequences for CNN
    Conv1D(filters=64, kernel_size=3, activation='relu'),
    MaxPooling1D(pool_size=2),

    GlobalMaxPooling1D(),  # 👈 This flattens (batch, time, features) → (batch, features)
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

# Compile the model
model_BiLSTM_CNN.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Early stopping callback
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=2,
    restore_best_weights=True
)

# Train the model
history = model_BiLSTM_CNN.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=10,
    callbacks=[early_stop]
)

# Show model summary
model_BiLSTM_CNN.summary()

In [None]:
# Evaluate
loss, acc = model_BiLSTM_CNN.evaluate(X_test_padded, y_test)
print(f"Test accuracy: {acc:.2f}")


In [None]:
from tensorflow.keras.layers import GRU, Input, Dense, Dropout, Embedding
from tensorflow.keras.models import Sequential

model_GRU = Sequential([
    Input(shape=(max_len,)),  # Same input shape
    Embedding(input_dim=max_words, output_dim=128),
    GRU(32),                  # 👈 GRU instead of LSTM
    Dropout(0.5),
    Dense(1, activation='sigmoid')
])

model_GRU.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model_GRU.summary()

In [None]:
# Early stopping callback
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=6,
    restore_best_weights=True
)

# Train the model
history = model_GRU.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=20,
    callbacks=[early_stop]
)

# Show model summary
model_GRU.summary()

In [None]:
from tensorflow.keras.layers import SimpleRNN, Input, Dense, Dropout, Embedding
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

model_Simple_RNN = Sequential([
    Input(shape=(max_len,)),               # input_length = max_len
    Embedding(input_dim=max_words,         # vocabulary size
              output_dim=128),             # embedding dimension
    Bidirectional(SimpleRNN(128)),                        # simple RNN with 128 units
    Dropout(0.5),
    Dense(1, activation='sigmoid')         # binary output
])

model_Simple_RNN.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['accuracy']
)

model_Simple_RNN.summary()

# Early stopping callback
early_stop = EarlyStopping(
    monitor='val_loss',
    patience=4,
    restore_best_weights=True
)

# Train
history = model_Simple_RNN.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=20,
    callbacks=[early_stop]
)

In [None]:
!pip install gdown

In [None]:
!pip install gensim

In [None]:
import gdown # Import gdown
# Google Drive URL for the Word2Vec model
url = 'https://drive.google.com/uc?id=0B7XkCwpI5KDYNlNUTTlSS21pQmM'
output = 'GoogleNews-vectors-negative300.bin.gz'

# Download the file
gdown.download(url, output, quiet=False)

In [None]:
import gzip
import shutil

with gzip.open('GoogleNews-vectors-negative300.bin.gz', 'rb') as f_in:
    with open('GoogleNews-vectors-negative300.bin', 'wb') as f_out:
        shutil.copyfileobj(f_in, f_out)

In [None]:
from gensim.models import KeyedVectors

# Load the Word2Vec model
w2v_model = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin', binary=True)

# Get vector for a word (e.g., "computer")
vector = w2v_model['computer']
print(vector)

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer

# Set up tokenizer
max_words = 10000  # Max number of words in the vocabulary
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(df_train_sampled['review'])  # Replace with your column name

# Prepare embedding matrix
embedding_dim = 300  # Google News vectors are 300-dimensional
embedding_matrix = np.zeros((max_words, embedding_dim))

for word, i in tokenizer.word_index.items():
    if i < max_words:
        try:
            embedding_matrix[i] = w2v_model[word]  # Fetch embedding
        except KeyError:
            embedding_matrix[i] = np.random.normal(scale=0.6, size=(embedding_dim,))  # Random embedding

In [None]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping

# Define the LSTM model
model_LSTM_W2V = Sequential([
    Input(shape=(max_len,)),  # Input shape is the padded sequence length
    Embedding(input_dim=max_words,
              output_dim=embedding_dim,
              weights=[embedding_matrix],  # Use Word2Vec embeddings
              input_length=max_len,
              trainable=False),  # Freeze embeddings
    LSTM(64),  # LSTM layer
    Dropout(0.5),
    Dense(1, activation='sigmoid')  # Binary classification
])

model_LSTM_W2V.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model_LSTM_W2V.summary()

In [None]:
# Set up early stopping
early_stop = EarlyStopping(monitor='val_loss', patience=4, restore_best_weights=True)

# Train the model
history = model_LSTM_W2V.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=512,
    epochs=20,
    callbacks=[early_stop]
)
