# Data Preparation

In [2]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

df = kagglehub.load_dataset(
    KaggleDatasetAdapter.PANDAS,
    "arhamrumi/amazon-product-reviews",
    "Reviews.csv"
)

print(df.head())

  df = kagglehub.load_dataset(


   Id   ProductId          UserId                      ProfileName  \
0   1  B001E4KFG0  A3SGXH7AUHU8GW                       delmartian   
1   2  B00813GRG4  A1D87F6ZCVE5NK                           dll pa   
2   3  B000LQOCH0   ABXLMWJIXXAIN  Natalia Corres "Natalia Corres"   
3   4  B000UA0QIQ  A395BORC6FGVXV                             Karl   
4   5  B006K2ZZ7K  A1UQRSCLF8GW1T    Michael D. Bigham "M. Wassir"   

   HelpfulnessNumerator  HelpfulnessDenominator  Score        Time  \
0                     1                       1      5  1303862400   
1                     0                       0      1  1346976000   
2                     1                       1      4  1219017600   
3                     3                       3      2  1307923200   
4                     0                       0      5  1350777600   

                 Summary                                               Text  
0  Good Quality Dog Food  I have bought several of the Vitality canned d...  
1 

In [3]:
def make_sentiment_label(score):
    if score <= 2:
        return 0  # negative
    elif score == 3:
        return 1  # neutral
    else:
        return 2  # positive

df['label'] = df['Score'].apply(make_sentiment_label)

print("\nValue counts of newly created sentiment labels:")
print(df['label'].value_counts())

# For convenience, rename the text column to something shorter:
df.rename(columns={'Text': 'review_text'}, inplace=True)


Value counts of newly created sentiment labels:
label
2    443777
0     82037
1     42640
Name: count, dtype: int64


In [4]:
from sklearn.model_selection import train_test_split

X = df['review_text']
y = df['label']

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y  # ensure balanced splits
)

print(f"\nTraining set size: {len(X_train)}")
print(f"Testing set size:  {len(X_test)}")


Training set size: 454763
Testing set size:  113691


In [6]:
import pandas as pd

train_counts = pd.Series(y_train).value_counts().sort_index()
test_counts = pd.Series(y_test).value_counts().sort_index()
print("\n===TRAINING DATASET===")
print(f"Number of negative reviews: {train_counts.get(0,0)}")
print(f"Number of neutral reviews:  {train_counts.get(1,0)}")
print(f"Number of positive reviews: {train_counts.get(2,0)}")

print("\n===TESTING DATASET===")
print(f"Number of negative reviews: {test_counts.get(0,0)}")
print(f"Number of neutral reviews:  {test_counts.get(1,0)}")
print(f"Number of positive reviews: {test_counts.get(2,0)}")


===TRAINING DATASET===
Number of negative reviews: 65630
Number of neutral reviews:  34112
Number of positive reviews: 355021

===TESTING DATASET===
Number of negative reviews: 16407
Number of neutral reviews:  8528
Number of positive reviews: 88756


In [7]:
import numpy as np

train_sentences = X_train.to_numpy()
val_sentences   = X_test.to_numpy()

train_labels = y_train.to_numpy()
val_labels   = y_test.to_numpy()

# Converting Text into Numbers

## Text vectorization (tokenization)

In [8]:
import tensorflow as tf
from tensorflow.keras import layers

max_vocab_length = 10000  # limit vocabulary
max_length = 50           # truncate/pad reviews to 50 tokens (adjust as needed)

text_vectorizer = layers.TextVectorization(
    max_tokens=max_vocab_length,
    output_mode="int",
    output_sequence_length=max_length
)

In [9]:
# Fit the vectorizer to our training text
text_vectorizer.adapt(train_sentences)

In [10]:
words_in_vocab = text_vectorizer.get_vocabulary()
print("\nVocab size (truncate to max_vocab_length):", len(words_in_vocab))
print("Top 5 words:", words_in_vocab[:5])
print("Bottom 5 words:", words_in_vocab[-5:])


Vocab size (truncate to max_vocab_length): 10000
Top 5 words: ['', '[UNK]', np.str_('the'), np.str_('i'), np.str_('and')]
Bottom 5 words: [np.str_('unscrew'), np.str_('twang'), np.str_('thrived'), np.str_('tale'), np.str_('stave')]


## Creating an Embedding using an Embedding Layer

In [11]:
tf.random.set_seed(42)

embedding = layers.Embedding(
    input_dim=max_vocab_length,   # same as number of words in vocab
    output_dim=128,              # desired embedding size
    name="embedding"
)

In [14]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

def calculate_results(y_true, y_pred, average="weighted"):
    """
    Returns a dictionary of accuracy, precision, recall, f1 for multi-class.
    Uses 'precision_recall_fscore_support' from scikit-learn with the specified
    averaging (e.g. 'weighted' or 'macro').
    """
    acc = accuracy_score(y_true, y_pred) * 100
    prec, rec, f1, _ = precision_recall_fscore_support(y_true, y_pred, average=average)
    return {
        "accuracy": acc,
        "precision": prec,
        "recall": rec,
        "f1": f1,
    }

# Model 0: Naive Bayes (baseline)

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

model_0 = Pipeline([
    ("tfidf", TfidfVectorizer()),
    ("nb",    MultinomialNB())  # by default handles multi-class
])

model_0.fit(X_train, y_train)
baseline_acc = model_0.score(X_test, y_test)
baseline_preds = model_0.predict(X_test)
baseline_results = calculate_results(y_test, baseline_preds)
print("\n=== Baseline (MultinomialNB) results ===")
print(f"Accuracy: {baseline_acc*100:.2f}%")
print(baseline_results)


=== Baseline (MultinomialNB) results ===
Accuracy: 79.78%
{'accuracy': 79.77676333218989, 'precision': 0.7614360180287343, 'recall': 0.7977676333218988, 'f1': 0.7236049016939615}


# Model 1: Feed-forward Neural Network (Dense Model)

In [16]:
# Build
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GlobalAveragePooling1D()(x)
outputs = layers.Dense(3, activation="softmax")(x)  # 3 classes
model_1 = tf.keras.Model(inputs, outputs)

In [17]:
# Compile
model_1.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [18]:
# Fit
print("\n=== Training Simple Dense model (3-class) ===")
# Fit
model_1.fit(
    train_sentences,
    train_labels,
    epochs=3,  # increase for better performance
    validation_data=(val_sentences, val_labels),
    verbose=1
)


=== Training Simple Dense model (3-class) ===
Epoch 1/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 10ms/step - accuracy: 0.8254 - loss: 0.4769 - val_accuracy: 0.8500 - val_loss: 0.4098
Epoch 2/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m122s[0m 8ms/step - accuracy: 0.8525 - loss: 0.3986 - val_accuracy: 0.8522 - val_loss: 0.4070
Epoch 3/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m149s[0m 9ms/step - accuracy: 0.8555 - loss: 0.3908 - val_accuracy: 0.8529 - val_loss: 0.4063


<keras.src.callbacks.history.History at 0x7b6f54946c50>

In [19]:
# Evaluate
model_1_eval = model_1.evaluate(val_sentences, val_labels, verbose=0)
print(f"Validation accuracy: {model_1_eval[1]*100:.2f}%")

Validation accuracy: 85.29%


In [20]:
# Predict
model_1_probas = model_1.predict(val_sentences)
model_1_preds = tf.argmax(model_1_probas, axis=1)  # convert softmax -> int label

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step


In [21]:
model_1_results = calculate_results(val_labels, model_1_preds)
print("=== Simple Dense model results ===")
print(model_1_results)

=== Simple Dense model results ===
{'accuracy': 85.28907301369502, 'precision': 0.8330467957041239, 'recall': 0.8528907301369502, 'f1': 0.8344316281960862}


# Model 2: LSTM

In [22]:
# Build LSTM model
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.LSTM(64)(x)
outputs = layers.Dense(3, activation="softmax")(x)
model_2 = tf.keras.Model(inputs, outputs, name="model_2_LSTM")

In [23]:
# Compile Build LSTM model
model_2.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [24]:
print("\n=== Training LSTM model (3-class) ===")
model_2.fit(
    train_sentences,
    train_labels,
    epochs=3,
    validation_data=(val_sentences, val_labels),
    verbose=1
)


=== Training LSTM model (3-class) ===
Epoch 1/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 17ms/step - accuracy: 0.8575 - loss: 0.3914 - val_accuracy: 0.8764 - val_loss: 0.3388
Epoch 2/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m172s[0m 12ms/step - accuracy: 0.8889 - loss: 0.3046 - val_accuracy: 0.8849 - val_loss: 0.3227
Epoch 3/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m182s[0m 13ms/step - accuracy: 0.9078 - loss: 0.2550 - val_accuracy: 0.8885 - val_loss: 0.3270


<keras.src.callbacks.history.History at 0x7b6fb7351e50>

In [25]:
# Evaluate LSTM model
model_2_probas = model_2.predict(val_sentences)
model_2_preds = tf.argmax(model_2_probas, axis=1)
model_2_results = calculate_results(val_labels, model_2_preds)
print("=== LSTM model results ===")
print(model_2_results)

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 6ms/step
=== LSTM model results ===
{'accuracy': 88.8478419575868, 'precision': 0.8783985459700132, 'recall': 0.8884784195758679, 'f1': 0.8810600585697077}


# Model 3: GRU

In [26]:
# Build GRU model
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.GRU(64)(x)
outputs = layers.Dense(3, activation="softmax")(x)
model_3 = tf.keras.Model(inputs, outputs, name="model_GRU_3class")

In [27]:
# Compile GRU model
model_3.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [28]:
print("\n=== Training GRU model (3-class) ===")
model_3.fit(
    train_sentences,
    train_labels,
    epochs=3,
    validation_data=(val_sentences, val_labels),
    verbose=1
)


=== Training GRU model (3-class) ===
Epoch 1/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m211s[0m 15ms/step - accuracy: 0.8822 - loss: 0.3260 - val_accuracy: 0.8852 - val_loss: 0.3330
Epoch 2/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m150s[0m 11ms/step - accuracy: 0.9146 - loss: 0.2395 - val_accuracy: 0.8860 - val_loss: 0.3604
Epoch 3/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m165s[0m 12ms/step - accuracy: 0.9296 - loss: 0.2023 - val_accuracy: 0.8850 - val_loss: 0.3828


<keras.src.callbacks.history.History at 0x7b6f54af6ed0>

In [29]:
# Evaluate GRU model
model_3_probs = model_3.predict(val_sentences)
model_3_preds = tf.argmax(model_3_probs, axis=1)
model_3_results = calculate_results(val_labels, model_3_preds)
print("=== GRU model results ===")
print(model_3_results)

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 5ms/step
=== GRU model results ===
{'accuracy': 88.4951315407552, 'precision': 0.87857816935765, 'recall': 0.884951315407552, 'f1': 0.8812088744748546}


# Model 4: Bidirectonal LSTM

In [30]:
# Build Bidirectional LSTM
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Bidirectional(layers.LSTM(64))(x)
outputs = layers.Dense(3, activation="softmax")(x)
model_4 = tf.keras.Model(inputs, outputs, name="model_BiLSTM_3class")

In [31]:
# Compile Bidirectional LSTM
model_4.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [32]:
print("\n=== Training Bidirectional LSTM model (3-class) ===")
model_4.fit(
    train_sentences,
    train_labels,
    epochs=3,
    validation_data=(val_sentences, val_labels),
    verbose=1
)


=== Training Bidirectional LSTM model (3-class) ===
Epoch 1/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m208s[0m 14ms/step - accuracy: 0.8969 - loss: 0.2860 - val_accuracy: 0.8830 - val_loss: 0.3577
Epoch 2/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m269s[0m 15ms/step - accuracy: 0.9292 - loss: 0.2020 - val_accuracy: 0.8864 - val_loss: 0.3864
Epoch 3/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m219s[0m 15ms/step - accuracy: 0.9419 - loss: 0.1659 - val_accuracy: 0.8856 - val_loss: 0.4371


<keras.src.callbacks.history.History at 0x7b6fb033c490>

In [33]:
# Evaluate Bidirectional LSTM
model_4_probs = model_4.predict(val_sentences)
model_4_preds = tf.argmax(model_4_probs, axis=1)
model_4_results = calculate_results(val_labels, model_4_preds)
print("=== GRU model results ===")
print(model_4_results)


[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 8ms/step
=== GRU model results ===
{'accuracy': 88.56461813160233, 'precision': 0.8789715649738844, 'recall': 0.8856461813160232, 'f1': 0.8816702297604523}


# Model 5: 1D Convolutional Neural Network

In [34]:
# Build Conv1D model
inputs = layers.Input(shape=(1,), dtype=tf.string)
x = text_vectorizer(inputs)
x = embedding(x)
x = layers.Conv1D(filters=32, kernel_size=5, activation="relu")(x)
x = layers.GlobalMaxPool1D()(x)
outputs = layers.Dense(3, activation="softmax")(x)
model_5 = tf.keras.Model(inputs, outputs, name="model_Conv1D_3class")

In [35]:
# Compile Conv1D model
model_5.compile(
    loss="sparse_categorical_crossentropy",
    optimizer=tf.keras.optimizers.Adam(),
    metrics=["accuracy"]
)

In [36]:
print("\n=== Training Conv1D model (3-class) ===")
model_5.fit(
    train_sentences,
    train_labels,
    epochs=3,
    validation_data=(val_sentences, val_labels),
    verbose=1
)


=== Training Conv1D model (3-class) ===
Epoch 1/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m137s[0m 9ms/step - accuracy: 0.8702 - loss: 0.3542 - val_accuracy: 0.8723 - val_loss: 0.3709
Epoch 2/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m138s[0m 10ms/step - accuracy: 0.8996 - loss: 0.2811 - val_accuracy: 0.8794 - val_loss: 0.3754
Epoch 3/3
[1m14212/14212[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m145s[0m 10ms/step - accuracy: 0.9209 - loss: 0.2283 - val_accuracy: 0.8809 - val_loss: 0.3995


<keras.src.callbacks.history.History at 0x7b6f819bf790>

In [37]:
# Evaluate Conv1D
model_5_probs = model_5.predict(val_sentences)
model_5_preds = tf.argmax(model_5_probs, axis=1)
model_5_results = calculate_results(val_labels, model_5_preds)
print("=== GRU model results ===")
print(model_5_results)

[1m3553/3553[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 3ms/step
=== GRU model results ===
{'accuracy': 88.08700776666579, 'precision': 0.8717259934554324, 'recall': 0.8808700776666579, 'f1': 0.8746617629572737}


# Model 6: BERT (Pre-trained)

In [40]:
def star_label_to_triplet(label_str: str) -> int:
    """
    Convert '1 star', '2 stars', etc. (from the HF pipeline)
    into numerical sentiment label:
       0 = negative, 1 = neutral, 2 = positive
    """
    label_str = label_str.lower()  # e.g. "4 stars" => "4 stars"
    if "1 star" in label_str or "2 stars" in label_str:
        return 0  # negative
    elif "3 stars" in label_str:
        return 1  # neutral
    elif "4 stars" in label_str or "5 stars" in label_str:
        return 2  # positive
    else:
        # fallback (shouldn't happen if model is correct)
        return 2

In [47]:
from transformers import pipeline

model_name = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_pipeline = pipeline("sentiment-analysis", model=model_name, tokenizer=model_name)

Device set to use cuda:0


In [45]:
pred_labels = []
true_labels = y_test.to_list()

# Convert X_test to list (if it's a pandas Series)
test_reviews = X_test.tolist()

In [48]:
print("Running inference with the pre-trained pipeline...")
pred_labels = []

for review_text in test_reviews:
    # Truncate at 512 tokens
    result = sentiment_pipeline(
        review_text,
        truncation=True,
        max_length=512
    )[0]

    # Convert "1 star", "2 stars", etc. => 0,1,2
    label_3class = star_label_to_triplet(result['label'])
    pred_labels.append(label_3class)

Running inference with the pre-trained pipeline...


You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset


In [49]:
model_6_results = calculate_results(true_labels, pred_labels)
print("\n=== BERT Pretrained (Zero-Fine-Tuning) Results ===")
print(model_6_results)


=== BERT Pretrained (Zero-Fine-Tuning) Results ===
{'accuracy': 84.92668724877079, 'precision': 0.8790900960584925, 'recall': 0.8492668724877079, 'f1': 0.8606725943438792}


# Comparing the performance of all the models

In [50]:
# Combine model results into a DataFrame
all_model_results = pd.DataFrame({"baseline": baseline_results,
                                  "simple_dense": model_1_results,
                                  "lstm": model_2_results,
                                  "gru": model_3_results,
                                  "bidirectional": model_4_results,
                                  "conv1d": model_5_results,
                                  "bert": model_6_results})

all_model_results = all_model_results.transpose()
all_model_results

Unnamed: 0,accuracy,precision,recall,f1
baseline,79.776763,0.761436,0.797768,0.723605
simple_dense,85.289073,0.833047,0.852891,0.834432
lstm,88.847842,0.878399,0.888478,0.88106
gru,88.495132,0.878578,0.884951,0.881209
bidirectional,88.564618,0.878972,0.885646,0.88167
conv1d,88.087008,0.871726,0.88087,0.874662
bert,84.926687,0.87909,0.849267,0.860673


In [53]:
# Save models
import joblib

joblib.dump(model_0, "model_0.joblib")
model_1.save("model_1.keras")
model_2.save("model_2.keras")
model_3.save("model_3.keras")
model_4.save("model_4.keras")
model_5.save("model_5.keras")