Dataset
https://www.kaggle.com/datasets/yasserh/amazon-product-reviews-dataset

In [54]:
# 1. Load Data
import pandas as pd

# Load the Amazon product reviews dataset (the Kaggle dataset linked at the top
# of this notebook). The relative path means the CSV must sit in the current
# working directory.
df = pd.read_csv("amazon_product_review.csv")
# Last expression of the cell: renders a preview of the first five rows.
df.head()

Unnamed: 0,id,asins,brand,categories,colors,dateAdded,dateUpdated,dimension,ean,keys,...,reviews.rating,reviews.sourceURLs,reviews.text,reviews.title,reviews.userCity,reviews.userProvince,reviews.username,sizes,upc,weight
0,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I initially had trouble deciding between the p...,"Paperwhite voyage, no regrets!",,,Cristina M,,,205 grams
1,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,Allow me to preface this with a little history...,One Simply Could Not Ask For More,,,Ricky,,,205 grams
2,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,4.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I am enjoying it so far. Great for reading. Ha...,Great for those that just want an e-reader,,,Tedd Gardiner,,,205 grams
3,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I bought one of the first Paperwhites and have...,Love / Hate relationship,,,Dougal,,,205 grams
4,AVpe7AsMilAPnD_xQ78G,B00QJDU3KY,Amazon,"Amazon Devices,mazon.co.uk",,2016-03-08T20:21:53Z,2017-07-18T23:52:58Z,169 mm x 117 mm x 9.1 mm,,kindlepaperwhite/b00qjdu3ky,...,5.0,https://www.amazon.com/Kindle-Paperwhite-High-...,I have to say upfront - I don't like coroporat...,I LOVE IT,,,Miljan David Tanic,,,205 grams


In [55]:
# List every column name; the review text and rating columns are selected from
# this set in the next step.
print(df.columns)


Index(['id', 'asins', 'brand', 'categories', 'colors', 'dateAdded',
       'dateUpdated', 'dimension', 'ean', 'keys', 'manufacturer',
       'manufacturerNumber', 'name', 'prices', 'reviews.date',
       'reviews.doRecommend', 'reviews.numHelpful', 'reviews.rating',
       'reviews.sourceURLs', 'reviews.text', 'reviews.title',
       'reviews.userCity', 'reviews.userProvince', 'reviews.username', 'sizes',
       'upc', 'weight'],
      dtype='object')


In [56]:
# Keep only the review body and its star rating, under short names, and drop
# rows where either value is missing.
# Renaming first (rename() ignores keys that are absent) makes the cell safe to
# re-run: once the columns are already called 'Text'/'Score' the rename is a
# no-op, whereas selecting 'reviews.text' a second time would raise KeyError.
df = (
    df.rename(columns={'reviews.text': 'Text', 'reviews.rating': 'Score'})
      [['Text', 'Score']]
      .dropna()
)

Preprocess: Tokenize, Lemmatize, Remove Stopwords

In [57]:
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# The stopword list and the WordNet lemmatizer read NLTK corpora that must be
# present in the local nltk_data directory. Probe for each one and download it
# only when missing, so the notebook also runs on a fresh environment instead
# of failing with LookupError.
for _corpus in ('stopwords', 'wordnet'):
    try:
        nltk.data.find('corpora/' + _corpus)
    except LookupError:
        nltk.download(_corpus, quiet=True)

# A set gives constant-time membership checks inside clean_text_advanced.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text_advanced(text):
    """Normalise one review into a space-separated string of cleaned tokens.

    Steps, in order:
      1. cast to ``str`` and lower-case;
      2. delete URLs (anything starting with ``http`` or ``www``);
      3. delete ASCII punctuation;
      4. glue every ``not`` to the token that follows it (``not good`` ->
         ``not_good``); a trailing ``not`` with nothing after it is left alone;
      5. drop tokens found in the module-level ``stop_words`` set and pass the
         rest through the module-level ``lemmatizer``.

    Args:
        text: the raw review; any object is accepted and converted with ``str``.

    Returns:
        The cleaned tokens joined by single spaces (possibly an empty string).
    """
    lowered = str(text).lower()
    without_links = re.sub(r"http\S+|www\S+", '', lowered)
    punctuation_table = str.maketrans('', '', string.punctuation)
    words = without_links.translate(punctuation_table).split()

    # Walk the tokens with an explicit iterator so that a "not" can consume
    # the word right after it in the same step.
    merged = []
    word_iter = iter(words)
    for word in word_iter:
        if word != 'not':
            merged.append(word)
            continue
        follower = next(word_iter, None)
        if follower is None:
            # "not" was the last token: nothing to attach it to.
            merged.append(word)
        else:
            merged.append('not_' + follower)

    # Stopword filtering happens after merging, so "not_<word>" tokens survive
    # even though plain "not" is usually a stopword.
    kept = []
    for word in merged:
        if word in stop_words:
            continue
        kept.append(lemmatizer.lemmatize(word))
    return ' '.join(kept)
# Clean every review element-wise; the str cast guards against non-string cells.
df['Cleaned_Text'] = df['Text'].astype(str).map(clean_text_advanced)

Convert Text to TF-IDF Vectors

In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Unigrams and bigrams, capped at the 5000 highest-ranked features.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Converting score to sentiment label.
# 3-star reviews are treated as neutral and excluded from the binary task.
df_binary = df[df['Score'] != 3].copy()
def score_to_sentiment_binary(score):
    """Return 'positive' for a rating above 3, otherwise 'negative'."""
    return 'positive' if score > 3 else 'negative'
df_binary['Sentiment'] = df_binary['Score'].apply(score_to_sentiment_binary)

# df normally already carries 'Cleaned_Text' from the preprocessing cell and
# .copy() keeps that column, so re-cleaning every row would only repeat the
# slowest step of the notebook. Clean here only if the column is missing.
if 'Cleaned_Text' not in df_binary.columns:
    df_binary['Cleaned_Text'] = df_binary['Text'].astype(str).apply(clean_text_advanced)

# NOTE(review): the vectorizer is fitted on all rows here, before the
# train/test split performed later, so the vocabulary and IDF weights are
# influenced by test documents. For an unbiased evaluation, split the cleaned
# text first and fit the vectorizer on the training portion only.
X = vectorizer.fit_transform(df_binary['Cleaned_Text'])
y = df_binary['Sentiment']

Train and Test Split

In [68]:
# Turn the string sentiment labels into integer class ids for the classifier.
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
# Learn the label set from y and encode y in a single call.
y_encoded = le.fit_transform(y)

In [60]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. Stratifying on the encoded label keeps
# the class proportions the same in both parts; the fixed seed makes the split
# repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)


Model Training (Logistic Regression)

In [65]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

# Single-step pipeline: the features in X_train are already TF-IDF vectors, so
# only the classifier lives here. class_weight='balanced' re-weights the
# classes inversely to their frequency in y_train.
model_pipeline = Pipeline(
    steps=[
        (
            'classifier',
            LogisticRegression(class_weight='balanced', max_iter=1000),
        ),
    ]
)
# Fit on the vectorized training split; as the cell's last expression this
# also displays the fitted estimator.
model_pipeline.fit(X_train, y_train)

In [69]:
# Evaluate the model on the held-out test split.
# classification_report is imported here because no earlier cell brings it
# into scope; without this the cell raises NameError on a fresh kernel.
from sklearn.metrics import classification_report

y_pred = model_pipeline.predict(X_test)
print("Classification Report:")
# le.classes_ is ordered by encoded value, so the row names line up with the
# integer labels in y_test / y_pred.
print(classification_report(y_test, y_pred, target_names=le.classes_))

Classification Report:
              precision    recall  f1-score   support

    negative       0.56      0.67      0.61        15
    positive       0.97      0.96      0.97       196

    accuracy                           0.94       211
   macro avg       0.76      0.81      0.79       211
weighted avg       0.94      0.94      0.94       211



In [70]:

# Save the model pipeline and label encoder.
# pickle is imported here because no earlier cell brings it into scope.
import pickle

# model.pkl keeps its existing layout: a (pipeline, label_encoder) tuple.
with open("model.pkl", "wb") as f:
    pickle.dump((model_pipeline, le), f)

# The pipeline contains only the classifier and expects TF-IDF features, so
# the fitted vectorizer is persisted as well; without it the saved model
# cannot be applied to new review text.
with open("vectorizer.pkl", "wb") as f:
    pickle.dump(vectorizer, f)