Data Preprocessing

In [1]:
import pandas as pd
import numpy as np
import re, nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectKBest, chi2
from scipy.sparse import hstack
from sklearn.decomposition import TruncatedSVD

nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')
nltk.download('wordnet')

# ================= LOAD DATA =================
df = pd.read_csv("amazon_review_dataset.csv")
print("Initial Dataset Shape:", df.shape)
print(df.head(3))

# ================= MISSING VALUES =================
print("\n--- BEFORE MISSING VALUES ---")
print(df[["content","score","thumbsUpCount"]].isnull().sum())

df = df.dropna(subset=["content","score"])
df["thumbsUpCount"] = df["thumbsUpCount"].fillna(0)

print("\n--- AFTER MISSING VALUES ---")
print(df[["content","score","thumbsUpCount"]].isnull().sum())

# ================= TEXT NORMALISATION =================
stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    text = text.lower()
    text = re.sub(r"[^a-z\s]", "", text)
    tokens = word_tokenize(text)
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return " ".join(tokens)

print("\n--- BEFORE TEXT CLEANING ---")
print(df["content"].head(3))

df["clean_content"] = df["content"].apply(clean_text)

print("\n--- AFTER TEXT CLEANING ---")
print(df["clean_content"].head(3))

# ================= FEATURE ENGINEERING =================
df["review_length"] = df["clean_content"].apply(lambda x: len(x.split()))
print("\n--- REVIEW LENGTH ---")
print(df["review_length"].head())

# ================= OUTLIER HANDLING =================
print("\n--- BEFORE OUTLIER CLIPPING ---")
print(df[["review_length","thumbsUpCount"]].describe())

df["review_length"] = df["review_length"].clip(upper=df["review_length"].quantile(0.99))
df["thumbsUpCount"] = df["thumbsUpCount"].clip(upper=df["thumbsUpCount"].quantile(0.99))

print("\n--- AFTER OUTLIER CLIPPING ---")
print(df[["review_length","thumbsUpCount"]].describe())

# ================= SKEWNESS HANDLING =================
print("\n--- BEFORE LOG TRANSFORMATION ---")
print(df[["review_length", "thumbsUpCount"]].head())
print(df[["review_length", "thumbsUpCount"]].describe())

df["review_length"] = np.log1p(df["review_length"])
df["thumbsUpCount"] = np.log1p(df["thumbsUpCount"])
print("\n--- AFTER LOG TRANSFORMATION ---")
print(df[["review_length","thumbsUpCount"]].head())

# ================= NORMALISATION =================
print("\n--- BEFORE NORMALISATION ---")
print(df[["review_length","thumbsUpCount"]].describe())

scaler = StandardScaler()
df[["thumbsUpCount","review_length"]] = scaler.fit_transform(df[["thumbsUpCount","review_length"]])

print("\n--- AFTER NORMALISATION ---")
print(df[["review_length","thumbsUpCount"]].describe())

# ================= TARGET ENCODING =================
print("\n--- BEFORE TARGET ENCODING ---")
print(df["score"].head())

df["sentiment"] = df["score"].apply(lambda x: 1 if x > 3 else 0)
df = df.drop(columns=["score"])

print("\n--- AFTER TARGET ENCODING ---")
print(df["sentiment"].head())

# ================= FEATURE SELECTION =================
X_text = df["clean_content"]
X_num = df[["thumbsUpCount","review_length"]]
y = df["sentiment"]

tfidf = TfidfVectorizer(max_features=50000)
X_tfidf = tfidf.fit_transform(X_text)
print("\n--- TF-IDF SHAPE ---", X_tfidf.shape)

selector = SelectKBest(chi2, k=5000)
X_selected = selector.fit_transform(X_tfidf, y)
print("--- SELECTED FEATURES SHAPE ---", X_selected.shape)

X_final = hstack([X_selected, X_num])

print("Before Feature Selection:", X_tfidf.shape)
print("After Feature Selection:", X_selected.shape)

print("\n--- FINAL DATASET ---")
print("X shape:", X_final.shape)
print("y shape:", y.shape)


# ================= Preprocessed File =================

orange_df = df[[
    "content",
    "thumbsUpCount",
    "review_length",
    "sentiment"
]]

orange_df.to_csv("amazon_orange_readable.csv", index=False)
print("Saved: amazon_orange_readable.csv")






[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\banzu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\banzu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\banzu\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\banzu\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Initial Dataset Shape: (80291, 8)
                               reviewId          userName  \
0  ed55cacc-b2da-4626-9a71-4299c5824726        Tene Smith   
1  ffda6f08-60df-4c45-8c13-82cef4959f5d            Fucc U   
2  3d416dda-8de9-4bfd-9f2d-38c20a010266  Derek and Sarita   

                                             content  score  thumbsUpCount  \
0                         Great shopping experience.      5            0.0   
1  I love Amazon but I'm tired of being punished ...      1            0.0   
2  Shopping couldn't be easier and should you nee...      5            0.0   

  reviewCreatedVersion               at   appVersion  
0          30.20.0.100  2025-11-1 11:27  30.20.0.100  
1          30.19.0.100  2025-11-1 10:42  30.19.0.100  
2          30.20.0.100   2025-11-1 9:54  30.20.0.100  

--- BEFORE MISSING VALUES ---
content           6
score             0
thumbsUpCount    24
dtype: int64

--- AFTER MISSING VALUES ---
content          0
score            0
thumbsUpCount   

Trainâ€“Test Split Strategy & Predictive Modelling

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib

# ================= SPLIT RATIOS =================
split_ratios = {
    "60:40": 0.40,
    "70:30": 0.30,
    "80:20": 0.20,
    "90:10": 0.10
}

# ================= MODELS =================
models = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "SVM": LinearSVC(),
    "Random Forest": RandomForestClassifier(n_estimators=200, random_state=42),
    "kNN": KNeighborsClassifier(n_neighbors=5)
}

# ================= TRAIN & EVALUATE =================
for split_name, test_size in split_ratios.items():
    print(f"\n================ SPLIT {split_name} ================")

    X_train, X_test, y_train, y_test = train_test_split(
        X_final, y, test_size=test_size, stratify=y, random_state=42
    )

    print("Train size:", len(y_train), " | Test size:", len(y_test))

    for model_name, model in models.items():
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        acc = accuracy_score(y_test, y_pred)
        print(f"\nModel: {model_name}")
        print("Accuracy:", acc)
        print(classification_report(y_test, y_pred))

# ================= SAVE MODEL FOR APP =================
# Using Logistic Regression with 80:20 split as it's simple and performs well
X_train, X_test, y_train, y_test = train_test_split(
    X_final, y, test_size=0.20, stratify=y, random_state=42
)

model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Save the model and preprocessing objects
joblib.dump(model, 'sentiment_model.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')
joblib.dump(scaler, 'scaler.pkl')
joblib.dump(selector, 'feature_selector.pkl')

print("\nModel and preprocessing objects saved for the app.")



Train size: 48171  | Test size: 32114

Model: Logistic Regression
Accuracy: 0.9001681509621972
              precision    recall  f1-score   support

           0       0.90      0.95      0.93     20823
           1       0.90      0.80      0.85     11291

    accuracy                           0.90     32114
   macro avg       0.90      0.88      0.89     32114
weighted avg       0.90      0.90      0.90     32114


Model: SVM
Accuracy: 0.9013514355109921
              precision    recall  f1-score   support

           0       0.90      0.95      0.93     20823
           1       0.90      0.81      0.85     11291

    accuracy                           0.90     32114
   macro avg       0.90      0.88      0.89     32114
weighted avg       0.90      0.90      0.90     32114


Model: Random Forest
Accuracy: 0.8815158497851404
              precision    recall  f1-score   support

           0       0.87      0.96      0.91     20823
           1       0.92      0.73      0.81     1