In [1]:
import nltk
import pandas as pd
import numpy as np
import re
import string
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Model
from keras.layers import Input, Embedding, SimpleRNN, LSTM, Dense, Dropout, concatenate
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding, BatchNormalization, SimpleRNN
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import EarlyStopping

# Download necessary NLTK data
nltk.download("wordnet")
nltk.download('stopwords')
nltk.download('punkt')

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [37]:
# Load the dataset
df = pd.read_csv("amazon_reviews.csv")

# Inspect the data
df.head()

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,positive,i wish would have gotten one earlier love it a...,19,5.0
1,neutral,i ve learned this lesson again open the packag...,88,1.0
2,neutral,it is so slow and lags find better option,9,2.0
3,neutral,roller ball stopped working within months of m...,12,1.0
4,neutral,i like the color and size but it few days out ...,21,1.0


### Data Preprocessing

In [38]:
# Check for missing values
print(df.isnull().sum())

sentiments               0
cleaned_review           3
cleaned_review_length    0
review_score             0
dtype: int64


In [39]:
df = df.dropna()

In [40]:
print(df.isnull().sum())

sentiments               0
cleaned_review           0
cleaned_review_length    0
review_score             0
dtype: int64


In [41]:
df['cleaned_review'] = df['cleaned_review'].astype(str)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['cleaned_review'] = df['cleaned_review'].astype(str)


In [42]:
# Remove punctuation
def remove_punctuation(text):
    return text.translate(str.maketrans('', '', string.punctuation))

In [43]:
# Remove special characters
def special_characters(text):
    pattern = re.compile('[^a-zA-z0-9\s]')
    text = re.sub(pattern, '', text)
    return text

In [44]:
# Removing numbers
def remove_numbers(text):
  pattern = re.compile('\d+')
  text = re.sub(pattern, '', text)
  return text

In [45]:
# lowercase text
def lowercase(text):
    text = text.lower()
    return text

In [46]:
# Tokenize text
def tokenize_text(text):
    tokens = word_tokenize(text)
    return tokens

In [47]:
# Remove stopwords
def stopwords_removal(text):
    stopwords_list = set(stopwords.words('english'))
    filtered_words = [word for word in text if word.lower() not in stopwords_list]
    return filtered_words

In [48]:
# Lemmatize text
def lemmatizer(text):
    wordnet_lemmatizer = WordNetLemmatizer()
    lemm_text = [wordnet_lemmatizer.lemmatize(word) for word in text]
    return lemm_text

In [49]:
df.head()

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score
0,positive,i wish would have gotten one earlier love it a...,19,5.0
1,neutral,i ve learned this lesson again open the packag...,88,1.0
2,neutral,it is so slow and lags find better option,9,2.0
3,neutral,roller ball stopped working within months of m...,12,1.0
4,neutral,i like the color and size but it few days out ...,21,1.0


In [50]:
def preprocess(text):
    text = remove_numbers(text)
    text = remove_punctuation(text)
    text = special_characters(text)
    text = lowercase(text)
    text = tokenize_text(text)
    text = stopwords_removal(text)
    text = lemmatizer(text)
    return text

In [51]:
df['tokenized_reviews'] = df['cleaned_review'].apply(lambda x: preprocess(x))

In [52]:
df.head()

Unnamed: 0,sentiments,cleaned_review,cleaned_review_length,review_score,tokenized_reviews
0,positive,i wish would have gotten one earlier love it a...,19,5.0,"[wish, would, gotten, one, earlier, love, make..."
1,neutral,i ve learned this lesson again open the packag...,88,1.0,"[learned, lesson, open, package, use, product,..."
2,neutral,it is so slow and lags find better option,9,2.0,"[slow, lag, find, better, option]"
3,neutral,roller ball stopped working within months of m...,12,1.0,"[roller, ball, stopped, working, within, month..."
4,neutral,i like the color and size but it few days out ...,21,1.0,"[like, color, size, day, return, period, hold,..."


### Encoding Labels

In [53]:
# Encode the sentiment labels
df = pd.get_dummies(df, columns=['sentiments'])

In [54]:
df[['sentiments_negative', 'sentiments_neutral', 'sentiments_positive']] = df[['sentiments_negative', 'sentiments_neutral', 'sentiments_positive']].astype(int)

In [55]:
df.head()

Unnamed: 0,cleaned_review,cleaned_review_length,review_score,tokenized_reviews,sentiments_negative,sentiments_neutral,sentiments_positive
0,i wish would have gotten one earlier love it a...,19,5.0,"[wish, would, gotten, one, earlier, love, make...",0,0,1
1,i ve learned this lesson again open the packag...,88,1.0,"[learned, lesson, open, package, use, product,...",0,1,0
2,it is so slow and lags find better option,9,2.0,"[slow, lag, find, better, option]",0,1,0
3,roller ball stopped working within months of m...,12,1.0,"[roller, ball, stopped, working, within, month...",0,1,0
4,i like the color and size but it few days out ...,21,1.0,"[like, color, size, day, return, period, hold,...",0,1,0


### Word Embedding

In [56]:
def Word_Embedding(df, max_length=100):
    list_reviews = df['tokenized_reviews'].tolist()

    # Instantiate Tokenizer and fit on training data
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(list_reviews)

    # Convert text to sequences of indices
    sequences = tokenizer.texts_to_sequences(list_reviews)

    # Pad sequences
    padded_sequences = pad_sequences(sequences, maxlen=max_length)

    # Prepare X and y
    X = padded_sequences
    y = df[['sentiments_positive', 'sentiments_neutral', 'sentiments_negative']]

    return X, y, tokenizer

In [57]:
X, y, tokenizer = Word_Embedding(df)

### Data Splitting

In [58]:
# Split the data into training and validation sets (80% training, 20% validation)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

### Building LTSM Model

In [59]:
# LSTM MODEL
def create_lstm_model(tokenizer, max_length=100, embedding_size=500, lstm_units=300, dense_units=50, dropout_rate=0.3):
    # Define the LSTM model
    model_LSTM = Sequential()
    model_LSTM.add(Embedding(len(tokenizer.word_index) + 1, embedding_size, input_length=max_length))
    model_LSTM.add(LSTM(lstm_units))
    model_LSTM.add(BatchNormalization())
    model_LSTM.add(Dropout(dropout_rate))
    model_LSTM.add(Dense(dense_units, activation='relu'))
    model_LSTM.add(BatchNormalization())
    model_LSTM.add(Dropout(dropout_rate))
    model_LSTM.add(Dense(3, activation='softmax'))

    return model_LSTM

### Building RNN Model

In [60]:
# Simple RNN Model
def create_rnn_model(tokenizer, max_length=100, embedding_size=500, rnn_units=150, dense_units=50, dropout_rate=0.5, l2_reg=0.01):
    # Define the RNN model
    model_RNN = Sequential()
    model_RNN.add(Embedding(len(tokenizer.word_index) + 1, embedding_size, input_length=max_length))
    model_RNN.add(SimpleRNN(rnn_units))
    model_RNN.add(BatchNormalization())
    model_RNN.add(Dropout(dropout_rate))
    model_RNN.add(Dense(dense_units, activation='relu', kernel_regularizer=tf.keras.regularizers.l2(l2_reg)))
    model_RNN.add(BatchNormalization())
    model_RNN.add(Dropout(dropout_rate))
    model_RNN.add(Dense(3, activation='softmax'))

    return model_RNN

### Training LSTM Model

In [63]:
# Compile the LTSM model
model_LSTM = create_lstm_model(tokenizer)
model_LSTM.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

In [64]:
# Train the model
modelLTSM = model_LSTM.fit(X_train, y_train, batch_size=64, epochs=3, verbose=1, validation_data=(X_test, y_test))

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [65]:
# Evaluate the model
test_loss, test_accuracy = model_LSTM.evaluate(X_test, y_test, verbose=1)
print("LTSM Test Loss:", test_loss)
print("LTSM Test Accuracy:", test_accuracy)

LTSM Test Loss: 0.40118664503097534
LTSM Test Accuracy: 0.8566897511482239


### Training RNN Model

In [66]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

In [67]:
# Compile the RNN model
model_RNN = create_rnn_model(tokenizer)
model_RNN.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])

In [68]:
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

In [69]:
# Train the RNN model
modelRNN = model_RNN.fit(X_train, y_train, batch_size=64, epochs=10, verbose=1, validation_data=(X_test, y_test), callbacks=[early_stopping])

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [70]:
# Evaluate the model
test_loss, test_accuracy = model_RNN.evaluate(X_test, y_test, verbose=1)
print("RNN Test Loss:", test_loss)
print("RNN Test Accuracy:", test_accuracy)

RNN Test Loss: 0.5450009107589722
RNN Test Accuracy: 0.8497692942619324


### Bonus Part

In [None]:
def predict_sentiment(new_review, model, tokenizer, max_length):
    # Preprocess the review
    review_cleaned = preprocess(new_review)
    # reviews = review_cleaned.tolist()

    # Convert review text to sequence
    review_sequence = tokenizer.texts_to_sequences([review_cleaned])
    review_padded = pad_sequences(review_sequence, maxlen=max_length)

    # Predict sentiment
    prediction = model.predict(review_padded)

    sentiment_index = np.argmax(prediction)
    sentiment_labels = ['sentiments_positive', 'sentiments_neutral', 'sentiments_negative']
    sentiment_dict = {0: 'Positive', 1: 'Neutral'}

    if sentiment_index in sentiment_dict:
        sentiment = sentiment_dict[sentiment_index]
    else:
        sentiment = "Negative"

    return sentiment

    # return prediction

In [None]:
# Function to get user input and predict sentiment using both models
def evaluate_user_review():
    review = input("Enter a new review: ")

    print("\nEvaluating review using LSTM model:")
    lstm_sentiment = predict_sentiment(review, model_LSTM, tokenizer, max_length)
    print(f"Predicted sentiment: {lstm_sentiment}")

    print("\nEvaluating review using RNN model:")
    rnn_sentiment = predict_sentiment(review, model_RNN, tokenizer, max_length)
    print(f"Predicted sentiment: {rnn_sentiment}")

In [None]:
evaluate_user_review()

Enter a new review: i love this product!

Evaluating review using LSTM model:
Predicted sentiment: Positive

Evaluating review using RNN model:
Predicted sentiment: Positive
