In [1]:
import numpy as np
import pandas as pd
from pathlib import Path
import joblib
import os
import re

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [3]:
import nltk
nltk.download('stopwords', quiet=True)
from nltk.corpus import stopwords


In [4]:
RAW_PATH = "DS_DATASET.xlsx"
CLEANED_PATH = "Cleaned_DS_DATASET.xlsx"
MODEL_PATH = "sentiment_model.pkl"    # final output (model + vectorizer)
TEXT_COL_CANDIDATES = ["content", "review text", "review", "review_text", "text"]
RATING_COL_CANDIDATES = ["rating", "product rating", "rating_value", "stars"]

In [5]:
STOPWORDS = set(stopwords.words("english"))

def clean_text(s: str) -> str:
    """Basic text cleaning pipeline:
       - to lowercase
       - remove html tags
       - remove non-alphabetic characters (keep spaces)
       - collapse multiple spaces
       - strip
    """
    if pd.isna(s):
        return ""
    s = str(s)
    s = s.lower()
    # remove html tags
    s = re.sub(r"<[^>]+>", " ", s)
    # remove URLs
    s = re.sub(r"http\S+|www\S+|https\S+", " ", s)
    # remove non-alpha (keep spaces)
    s = re.sub(r"[^a-z\s]", " ", s)
    # collapse spaces
    s = re.sub(r"\s+", " ", s).strip()
    # optional: remove stopwords (we will leave heavy filtering to TF-IDF stop_words)
    # tokens = [t for t in s.split() if t not in STOPWORDS]
    # return " ".join(tokens)
    return s

In [6]:
if not Path(RAW_PATH).exists():
    raise FileNotFoundError(f"Raw dataset not found at '{RAW_PATH}'. Put DS_DATASET.xlsx in the working directory.")

print("Loading raw dataset:", RAW_PATH)
df = pd.read_excel(RAW_PATH)
print("Initial shape:", df.shape)
print("Columns:", df.columns.tolist())


Loading raw dataset: DS_DATASET.xlsx
Initial shape: (625, 9)
Columns: ['product id', 'product category', 'product brand', 'product name', 'product description', 'product tags', 'product rating', 'content', 'sentiment_labels']


In [7]:
def find_first_present(candidates, columns):
    for c in candidates:
        if c in columns:
            return c
        # also try case-insensitive match
        for col in columns:
            if col.lower().strip() == c.lower().strip():
                return col
    return None


In [8]:
text_col = find_first_present(TEXT_COL_CANDIDATES, df.columns)
rating_col = find_first_present(RATING_COL_CANDIDATES, df.columns)

print("Detected text column:", text_col)
print("Detected rating column:", rating_col)

Detected text column: content
Detected rating column: product rating


In [9]:
before_dup = df.shape[0]
df = df.drop_duplicates()
after_dup = df.shape[0]
print(f"Dropped {before_dup - after_dup} duplicate rows.")

Dropped 0 duplicate rows.


In [10]:
for col in df.columns:
    if df[col].dtype == object:
        df[col] = df[col].astype(str).str.strip()


In [11]:
if text_col is None:
    # fallback: try product description + reviews
    possible_combo = []
    if 'product description' in df.columns:
        possible_combo.append('product description')
    if 'product tags' in df.columns:
        possible_combo.append('product tags')
    if possible_combo:
        df['content'] = df[possible_combo].fillna('').agg(' '.join, axis=1)
        text_col = 'content'
        print("Created 'content' by concatenating:", possible_combo)
    else:
        raise ValueError("No suitable text column found. Add a column named 'content' or 'review text' to the dataset.")


In [12]:
df[text_col] = df[text_col].fillna('').astype(str)
before_text_drop = df.shape[0]
df = df[df[text_col].str.strip() != ""].copy()
after_text_drop = df.shape[0]
print(f"Dropped {before_text_drop - after_text_drop} rows with empty text in '{text_col}'.")


Dropped 0 rows with empty text in 'content'.


In [13]:
df[text_col] = df[text_col].apply(clean_text)

In [14]:
if 'sentiment_labels' not in df.columns:
    print("'sentiment_labels' not found. Attempting to create from rating column...")
    if rating_col is not None:
        # create mapping: rating >=4 -> positive, <=2 -> negative, else neutral
        def rating_to_label(x):
            try:
                r = float(x)
            except:
                return "neutral"
            if r >= 4.0:
                return "positive"
            elif r <= 2.0:
                return "negative"
            else:
                return "neutral"
        df['sentiment_labels'] = df[rating_col].apply(rating_to_label)
        print("Created 'sentiment_labels' from rating column:", rating_col)
    else:
        # If no rating column, attempt to create a placeholder neutral label for all rows (user may want to label later)
        print("No rating column found to infer labels. Defaulting all labels to 'neutral'.")
        df['sentiment_labels'] = "neutral"

# Quick check: class balance
print("Sentiment value counts:\n", df['sentiment_labels'].value_counts())


Sentiment value counts:
 sentiment_labels
POSITIVE    551
NEUTRAL      56
NEGATIVE     18
Name: count, dtype: int64


In [15]:
print("Saving cleaned dataset to:", CLEANED_PATH)
df.to_excel(CLEANED_PATH, index=False)
print("Saved cleaned dataset.")

Saving cleaned dataset to: Cleaned_DS_DATASET.xlsx
Saved cleaned dataset.


In [16]:
print("Preparing training data...")
X = df['content'].astype(str)
y = df['sentiment_labels'].astype(str)

Preparing training data...


In [17]:
valid_idx = y.str.strip() != ""
X = X[valid_idx]
y = y[valid_idx]


In [18]:
unique_labels = y.unique()
if len(unique_labels) < 2:
    raise ValueError(f"Not enough label variety to train a classifier. Found labels: {unique_labels}")


In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


In [20]:
print("Training samples:", X_train.shape[0], "Test samples:", X_test.shape[0])


Training samples: 500 Test samples: 125


In [21]:
vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

In [22]:
model = LogisticRegression(max_iter=500)
print("Training Logistic Regression model...")
model.fit(X_train_vec, y_train)


Training Logistic Regression model...


0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,500


In [23]:
print("Evaluating on test set...")
preds = model.predict(X_test_vec)
print("Accuracy:", accuracy_score(y_test, preds))
print("\nClassification report:\n", classification_report(y_test, preds))

Evaluating on test set...
Accuracy: 0.88

Classification report:
               precision    recall  f1-score   support

    NEGATIVE       0.00      0.00      0.00         4
     NEUTRAL       0.00      0.00      0.00        11
    POSITIVE       0.88      1.00      0.94       110

    accuracy                           0.88       125
   macro avg       0.29      0.33      0.31       125
weighted avg       0.77      0.88      0.82       125



  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [24]:
print(f"Saving model+vectorizer to {MODEL_PATH} ...")
joblib.dump((model, vectorizer), MODEL_PATH)
print("Saved model successfully.")

print("All done. Cleaned dataset and model are ready.")

Saving model+vectorizer to sentiment_model.pkl ...
Saved model successfully.
All done. Cleaned dataset and model are ready.


In [26]:
import joblib

# model → your trained LogisticRegression model
# vectorizer → your trained TF-IDF vectorizer

MODEL_PATH = "recommendation_model.pkl"

# Save model + vectorizer as a tuple
joblib.dump((model, vectorizer), MODEL_PATH)

print(f"Model exported successfully to {MODEL_PATH}")

Model exported successfully to recommendation_model.pkl
