**AAI-590: Capstone Project**

By: Aditya, Deepak and Rajesh

In [None]:
# ================================================================================
# LIBRARY IMPORTS
# ================================================================================
# Import essential libraries for data processing, machine learning, and visualization

# Data manipulation and numerical operations
import pandas as pd
import numpy as np
import re

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

# JSON handling and type annotations
import json
from typing import List, Tuple, Dict, Set

# Machine learning and model evaluation
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import classification_report, f1_score
from sklearn.utils import class_weight
import joblib

# Gradio interface and image processing
import gradio as gr
import io
from PIL import Image
from wordcloud import WordCloud

# Google Drive integration for data storage
from google.colab import drive
drive.mount('/content/drive')

# Add custom module path for helper functions
import sys
sys.path.append('/content/drive/MyDrive/AAI-590/')

Mounted at /content/drive


## Define Helper Functions

In [None]:
%%writefile /content/drive/MyDrive/AAI-590/text_helpers.py

# ================================================================================
# TEXT PREPROCESSING HELPER FUNCTIONS
# ================================================================================
# This module contains utility functions for text preprocessing and cleaning

import re
import pandas as pd
from typing import *

def build_text(title: str, review: str) -> str:
    """
    Concatenate review title and review text into a single lowercase string.
    
    Args:
        title (str): Review title
        review (str): Review text content
        
    Returns:
        str: Combined and lowercased text with title and review separated by newline
    """
    return (str(title or "") + " \n" + str(review or "")).strip().lower()

def safe_get(row: pd.Series, col: str) -> str:
    """
    Safely extract and clean text from a DataFrame row.
    
    Handles missing values (NaN) and removes non-English characters while preserving
    basic punctuation. Keeps only: letters, numbers, spaces, and .,!?-
    
    Args:
        row (pd.Series): DataFrame row
        col (str): Column name to extract
        
    Returns:
        str: Cleaned text string, empty string if value is missing
    """
    value = row[col] if col in row and not pd.isna(row[col]) else ""
    if isinstance(value, str):
        # Keep only English letters, numbers, spaces, and basic punctuation
        return re.sub(r'[^a-zA-Z0-9\s.,!?-]', '', value)
    return str(value)

Overwriting /content/drive/MyDrive/AAI-590/text_helpers.py


In [None]:
# ================================================================================
# IMPORT HELPER FUNCTIONS
# ================================================================================
# Load custom text preprocessing functions from the helper module

import sys
sys.path.append('/content/drive/MyDrive/AAI-590/')
from text_helpers import build_text, safe_get

print("build_text and safe_get functions imported from text_helpers.py")

build_text and safe_get functions imported from text_helpers.py


## Define Business Rules for Attribute Extraction

This cell creates a comprehensive rule-based system stored in `attribute_rules.py` that defines regex patterns for extracting product attributes from review text.

In [None]:
%%writefile /content/drive/MyDrive/AAI-590/attribute_rules.py

# ================================================================================
# ATTRIBUTE EXTRACTION RULES
# ================================================================================
# Comprehensive regex-based rules for extracting product attributes from reviews
# Each tuple contains: (regex_pattern, attribute_tag)
# These rules cover multiple product categories including clothing, footwear, and accessories

rules_list = [
    # ============================================================================
    # WARMTH ATTRIBUTES
    # ============================================================================
    (r"\b(warmest|toasty|cozy|very warm|super warm|warm & toasty|warm hat)\b", "Warmth:high"),
    (r"\b(keeps .* head warm|keeps .* ears warm|kept .* head warm|keeps me warm|kept me warm)\b", "Warmth:high"),
    (r"\b(not warm|isn't warm|cold passes through|wind (?:passes|blows) right through|doesn't keep .* warm|not as warm)\b", "Warmth:low"),
    (r"\b(wind (?:blocking|proof)|blocks the wind)\b", "Warmth:wind_blocking"),
    
    # ============================================================================
    # FIT & SIZING ATTRIBUTES
    # ============================================================================
    (r"\b(too tight|very tight|super tight|tight fit|gives me headaches)\b", "Fit:tight"),
    (r"\b(too small|runs small|child'?s size|youth size|smaller than|tiny)\b", "Fit:small"),
    (r"\b(snug fit|snug)\b", "Fit:snug"),
    (r"\b(too loose|loose fit|baggy|very loose|ill[- ]?fitting|slides off|slips off)\b", "Fit:loose"),
    (r"\b(rides up|creeps up|shrinks off my head)\b", "Fit:rides_up"),
    (r"\b(too long|very long|super long|tall|conehead|pointy|smurf|extra fabric|excess fabric|sticks up|floppy|looks like a cone|cat in the hat)\b", "Fit:excess_length"),
    (r"\b(one size fits all|osfa)\b", "Sizing:one_size"),
    (r"\b(inconsistent size|quality control|different sizes|wildly different sizes|manufacturing oversight)\b", "Sizing:inconsistent"),
    
    # ============================================================================
    # COLOR ATTRIBUTES
    # ============================================================================
    (r"\b(color (?:exactly|as pictured|as described|true to (?:pic|picture|site)))\b", "Color:accurate"),
    (r"\b(color (?:wrong|off|not .* (?:as pictured|as described)|different in person)|looks (?:brown|dark) not .* (?:purple|burgundy|blackberry)|blackberry .* brown)\b", "Color:off"),
    (r"\b(too bright|very bright|hunter orange|blaze orange|brite lime|high(?:-)?vis|hi(?:-)?vis|osha)\b", "Color:bright_hi_vis"),
    (r"\b(darker than|much darker|very dark|almost black)\b", "Color:darker"),
    
    # ============================================================================
    # MATERIAL & TEXTURE ATTRIBUTES
    # ============================================================================
    (r"\b(soft|super soft|so soft)\b", "Material:soft"),
    (r"\b(itchy|makes my forehead itchy|not soft)\b", "Material:itchy"),
    (r"\b(thin|not thick|light ?weight)\b", "Material:thin"),
    (r"\b(thick|hefty knit|double layer)\b", "Material:thick"),
    (r"\bacrylic\b", "Material:acrylic"),
    (r"\b(quality (?:went down|declined|poor)|worse quality|quality change|not the same quality)\b", "Quality:declined"),
    
    # ============================================================================
    # STRETCH ATTRIBUTES
    # ============================================================================
    (r"\b(stretches (?:well|easily)|good stretch)\b", "Stretch:good"),
    (r"\b(doesn't stretch|won't stretch)\b", "Stretch:poor"),
    (r"\b(stretched out|gets baggy|stretch(?:es)? out easy)\b", "Stretch:stretches_out"),
    
    # ============================================================================
    # DURABILITY & CONSTRUCTION ATTRIBUTES
    # ============================================================================
    (r"\b(durable|lasts forever|last for years|very well made|well made)\b", "Durability:high"),
    (r"\b(stitching (?:crooked|came out)|logo (?:fell off|upside down|incorrect)|hole|defective|poor quality control)\b", "Durability:issues"),
    
    # ============================================================================
    # STYLE & USE CASE ATTRIBUTES
    # ============================================================================
    (r"\b(cute|stylish|looks great|fashion|trendy)\b", "Style:cute"),
    (r"\b(classic|staple|iconic)\b", "Style:classic"),
    (r"\b(hard hat|jobsite|work|warehouse|freezer|construction|farm)\b", "Use:work"),
    (r"\b(hunt(?:ing)?|blaze orange)\b", "Use:hunting"),
    (r"\b(ski(?:ing)?|snowboard|hiking|mountaineering|outdoors|camping)\b", "Use:outdoors"),
    (r"\b(gift|present|stocking stuffer)\b", "Use:gift"),
    
    # ============================================================================
    # COUNTRY OF ORIGIN
    # ============================================================================
    (r"made in usa", "Made_in:USA"),
    (r"made in canada", "Made_in:Canada"),
    (r"made in china", "Made_in:China"),
    (r"made in vietnam", "Made_in:Vietnam"),
    
    # ============================================================================
    # PRICE & SERVICE ATTRIBUTES
    # ============================================================================
    (r"\b(overpriced|pricey)\b", "Price:overpriced"),
    (r"\b(great price|fair price|good price|value)\b", "Price:value"),
    (r"\b(fast shipping|arrived quickly|quick delivery)\b", "Shipping:fast"),
    (r"\b(slow shipping|arrived late|took (?:weeks|long)|delivery .* (?:late|slow))\b", "Shipping:slow"),
    
    # ============================================================================
    # CARE INSTRUCTIONS
    # ============================================================================
    (r"\b(hand wash|do not (?:machine )?dry|shrinks|shrinkage|wash(?:es)? well|launder)\b", "Care:notes"),
    
    # ============================================================================
    # PRODUCT TYPES (Footwear & Accessories)
    # ============================================================================
    (r"\bodor[- ]?x\b|\bodor[- ]?fighting foot powder\b|\bfoot powder\b", "Product:FootPowder"),
    (r"\binsoles?\b|\bshoe inserts?\b", "Product:Insole"),
    (r"\bheel (?:cushion|cup)s?\b", "Product:HeelCushion"),
    (r"\blaces?\b|\bshoelaces?\b", "Product:Laces"),
    (r"\bshoe ?horn\b", "Product:ShoeHorn"),
    (r"\bshoe trees?\b", "Product:ShoeTrees"),

    # ============================================================================
    # ODOR & SWEAT CONTROL (Footwear-specific)
    # ============================================================================
    (r"\b(kills the funk|odor fighter|no more (?:stinky|smelly) feet|"
      r"eliminat(?:e|es|ed) odor|destroy(?:s|ed)? odor|neutraliz(?:e|es|ed)|"
      r"odor[- ]?control)\b", "OdorControl:effective"),
    (r"\b(?:odor|smell) (?:still|worse|bad)|does(?:n't| not) (?:work|help).*(?:odor|smell)\b", "OdorControl:ineffective"),
    (r"\b(keeps|kept) (?:feet|socks) (?:dry|cool)\b", "Sweat:dry"),
    (r"\b(?:too|very|brutally) dry|dries out (?:feet|skin)\b", "Sweat:overdry"),
    (r"\b(messy|powder footprints?|white footprint|powder all over)\b", "Powder:messy"),
    (r"\b(light(?:ly)?|mild) (?:scent|smell)\b", "Scent:mild"),
    (r"\b(strong|heavy) (?:scent|smell)\b", "Scent:strong"),
    (r"\b(makes .*feet.* sweaty|feet (?:sweaty|hot))\b", "Sweat:increased"),

    # ============================================================================
    # COMFORT & SUPPORT (Footwear-specific)
    # ============================================================================
    (r"\b(walking on (?:air|clouds|pillows)|cushion(?:ing)?|cushy|massaging gel|"
      r"comfortable|comfort)\b", "Comfort:high"),
    (r"\b(reduce(?:s|d)? (?:fatigue|tired)|more (?:energy|support)|support(?:ive)?)\b", "Support:good"),
    (r"\b(plantar fasciitis|heel pain relief|help(?:s)? my heels)\b", "Health:heel_pain_relief"),
    (r"\barch support\b", "Support:arch"),
    (r"\b(no arch support|arch too (?:low|high)|painful arch support)\b", "Support:arch_issue"),
    (r"\b(hard|rigid|too (?:hard|stiff)|uncomfortable|painful)\b", "Comfort:low"),

    # ============================================================================
    # FIT & INSTALLATION (Footwear-specific)
    # ============================================================================
    (r"\b(trim to size|cut to size|easy to trim|sizing guide)\b", "Fit:trim_to_size"),
    (r"\b(too thick|bulky|makes shoes tight|snug fit|no room|raised heel)\b", "Fit:too_thick"),
    (r"\b(not wide enough|narrow|size mismark(?:ed)?|wrong size)\b", "Fit:size_issue"),
    (r"\b(slid(?:e|es|ing)|moves?|bunch(?:es)? up|curl(?:s|ed)|does(?:n't| not) stay in place)\b", "Fit:moves_in_shoe"),
    (r"\b(squeak|gurgle|nois(?:e|y))\b", "Fit:noise"),

    # ============================================================================
    # DURABILITY (Footwear-specific)
    # ============================================================================
    (r"\b(fell apart|separat(?:e|ed)|delaminat(?:e|ed)|"
      r"gel (?:leaked|disintegrated|broke|flattened)|"
      r"wore out (?:quickly|fast)|poor lifespan|quality (?:declined|down))\b", "Durability:issues"),
    (r"\b(last(?:s|ed) (?:weeks|months|years)|durable)\b", "Durability:high"),

    # ============================================================================
    # USE CONTEXTS (Profession & Activity)
    # ============================================================================
    (r"\b(work boots?|steel toe|warehouse|factory|concrete|long shift|12 ?hours?|"
      r"standing all day|on my feet all day)\b", "Use:work_long_hours"),
    (r"\b(hiking|trail|outdoors|boots)\b", "Use:hiking"),
    (r"\b(nurse|teacher|barista|server|construction|landscap(?:er|ing))\b", "Use:profession"),
    (r"\b(dress shoes|heels?|cowboy boots|rain boots|sneakers|tennis shoes|keds|converse)\b", "Use:shoe_type"),

    # ============================================================================
    # LACES ATTRIBUTES
    # ============================================================================
    (r"\bstay tied\b", "Laces:stay_tied"),
    (r"\bthin\b", "Laces:thin"),
    (r"\bwaxed\b", "Laces:waxed"),

    # ============================================================================
    # SHOE ACCESSORIES
    # ============================================================================
    (r"\bshoe ?horn\b.*\b(?:useless|spring).*(?:rigid|support)|\bspring\b.*\bdoes(?:n't| not) allow support\b", "ShoeHorn:ineffective"),
    (r"\bprevent wrinkles|stretch the length\b", "ShoeTrees:effective"),
    
    # ============================================================================
    # RECOMMENDATION
    # ============================================================================
    (r"\b(highly recommend|would recommend|recommend to|definitely recommend)\b", "Recommendation:recommend"),
    (r"\b(would not recommend|won't recommend|not recommend)\b", "Recommendation:not_recommend")
]

Overwriting /content/drive/MyDrive/AAI-590/attribute_rules.py


## Load and Prepare Data


Loading the two CSV files into DataFrames, selecting relevant columns, add the 'Category' column, and concatenate them into a single `df_selected` DataFrame. Ensure consistent use of `df_selected` throughout the notebook.

In [None]:
# ================================================================================
# DATA LOADING AND PREPARATION
# ================================================================================
# Load review datasets from CSV files and combine them into a single DataFrame

# Define file paths for the two datasets
csv_file_path_1 = '/content/drive/MyDrive/AAI-590/Womens-clothingReviews.csv'
csv_file_path_2 = '/content/drive/MyDrive/AAI-590/shoe-care-insolesReviews.csv'

# Load datasets into DataFrames
df1 = pd.read_csv(csv_file_path_1)
df2 = pd.read_csv(csv_file_path_2)

print("DataFrame loaded successfully.")
print("First 5 rows of the DataFrame:")
print(df1.head())

print("\nColumn names and their data types:")
print(df1.info())

# Select relevant columns for analysis
selected_columns = ['SourceClient', 'OriginalProductName', 'Title', 'ReviewText', 'Rating', 'IsRecommended']

# Process first dataset and add category label
df_selected1 = df1[selected_columns]
df_selected1['Category'] = 'Women_Clothing'

# Process second dataset and add category label
df_selected2 = df2[selected_columns]
df_selected2['Category'] = 'Shoe_insole'

# Combine both datasets into a single DataFrame
df_selected = pd.concat([df_selected1, df_selected2], ignore_index=True)

display(df_selected.head())

DataFrame loaded successfully.
First 5 rows of the DataFrame:
          Id                                   CID SourceClient  \
0  314505599  43bb9f86-6037-5f19-8dc4-06e2f59bb9f2     carhartt   
1  311030383  e2655a8b-e7d1-56e2-9196-193c93bd2dbf     carhartt   
2  311030373  287c81dc-05c3-5b2a-94b5-4c39e389cc21     carhartt   
3  276723966  5d33f88d-c9ec-52bb-ad02-7f59df3c8dd5     carhartt   
4  276451217  c146de06-eb01-5b8a-be47-c2574b577259     carhartt   

               LastModeratedTime           LastModificationTime  ProductId  \
0  2024-07-17T14:46:11.000+00:00  2024-07-17T14:46:11.000+00:00     328283   
1  2024-06-04T14:47:21.000+00:00  2024-06-04T14:47:21.000+00:00     328283   
2  2024-06-04T14:47:21.000+00:00  2024-06-04T14:47:21.000+00:00     328283   
3  2024-04-15T14:15:47.000+00:00  2024-04-15T14:15:47.000+00:00     328283   
4  2024-04-10T14:46:17.000+00:00  2024-04-10T14:46:17.000+00:00     328283   

  OriginalProductName    UserLocation                   AuthorId  

Unnamed: 0,SourceClient,OriginalProductName,Title,ReviewText,Rating,IsRecommended,Category
0,carhartt,Knit Cuffed Beanie,Great hat,"These hats are great. They are warm, comfortab...",5,,Women_Clothing
1,carhartt,Knit Cuffed Beanie,"Awkward fit, miss my old Carhartt beanie",I ordered this beanie to replace a Carhartt be...,2,,Women_Clothing
2,carhartt,Knit Cuffed Beanie,,But for these beanies and they all were great....,5,,Women_Clothing
3,carhartt,Knit Cuffed Beanie,Beanie2024Chicago,My son immediately worn his beanie to work. Ap...,5,,Women_Clothing
4,carhartt,Knit Cuffed Beanie,Best Beanie,Great fit and fast shipping. My Daughters need...,5,,Women_Clothing


## Sentiment Model Training

Train a multi-class sentiment classification model using TF-IDF features and Logistic Regression. The model classifies reviews into three categories: positive, neutral, and negative based on ratings.

In [6]:
def rating_to_sentiment_label(rating):
    if rating >= 4:
        return 'positive'
    elif rating == 3:
        return 'neutral'
    elif rating <= 2:
        return 'negative'
    return None # Handle cases where rating might be NaN or unexpected value

df_selected["__text__"] = df_selected.apply(lambda row: build_text(safe_get(row, "Title"),
                                                     safe_get(row, "ReviewText")), axis=1)
# =========================================================
# 1) Train ML Sentiment Model (multiclass)
# =========================================================
df_selected["__sentiment_label__"] = df_selected["Rating"].apply(rating_to_sentiment_label)

# Use rows that have labels (rating present)
senti_train = df_selected.dropna(subset=["__sentiment_label__"]).copy()
X_senti = senti_train["__text__"].values
y_senti = senti_train["__sentiment_label__"].values

# Train / test split
X_tr, X_te, y_tr, y_te = train_test_split(
    X_senti, y_senti, test_size=0.2, random_state=26, stratify=y_senti
)

# TF-IDF + Logistic Regression (multiclass)
# class_weight balances imbalanced classes
sentiment_clf = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),
        min_df=5,
        max_df=0.9,
        strip_accents="unicode",
        sublinear_tf=True
    )),
    ("logreg", LogisticRegression(
        multi_class="auto",
        class_weight="balanced",
        solver="lbfgs",
        max_iter=200
    ))
])

sentiment_clf.fit(X_tr, y_tr)
y_pred = sentiment_clf.predict(X_te)
print("\n=== Sentiment classification report ===")
print(classification_report(y_te, y_pred, digits=3))

try:
  proba_all = sentiment_clf.predict_proba(df_selected["__text__"].values)
  # get class order
  senti_classes = sentiment_clf.named_steps["logreg"].classes_
  # make nice probability dicts
  prob_df = pd.DataFrame(proba_all, columns=[f"ML_Sentiment_Prob_{c}" for c in senti_classes])
except Exception:
  # fallback to decision_function (scaled to 0..1 per class via softmax)
  from scipy.special import softmax
  scores = sentiment_clf.decision_function(df_selected["__text__"].values)
  if scores.ndim == 1:
      scores = np.vstack([scores, -scores]).T
      senti_classes = np.array(["Positive", "Negative"])
  else:
      senti_classes = sentiment_clf.named_steps["logreg"].classes_
  proba_all = softmax(scores, axis=1);
  prob_df = pd.DataFrame(proba_all, columns=[f"ML_Sentiment_Prob_{c}" for c in senti_classes])

# Add probability columns to df_selected
for c in prob_df.columns:
        df_selected[c] = prob_df[c]

senti_pred = sentiment_clf.predict(df_selected["__text__"].values)
df_selected["ML_Sentiment_Label"] = senti_pred

# Create a continuous score in [-1, +1] from probs:
# Positive prob - Negative prob (Neutral reduces the magnitude naturally)
pos_col = [c for c in prob_df.columns if c.endswith("positive")] # Changed to lowercase 'positive'
neg_col = [c for c in prob_df.columns if c.endswith("negative")] # Changed to lowercase 'negative'
pos_prob = prob_df[pos_col[0]] if pos_col else 0.0
neg_prob = prob_df[neg_col[0]] if neg_col else 0.0
df_selected["ML_Sentiment_Score"] = (pos_prob - neg_prob).fillna(0.0)

# Save sentiment model to Google Drive
joblib.dump(sentiment_clf, "/content/drive/MyDrive/AAI-590/models_sentiment.pkl")
print("Sentiment model trained, evaluated, probabilities calculated, and saved.")


=== Sentiment classification report ===
              precision    recall  f1-score   support

    negative      0.863     0.952     0.905      1053
     neutral      0.655     0.892     0.755       517
    positive      0.994     0.957     0.975      7881

    accuracy                          0.953      9451
   macro avg      0.837     0.933     0.878      9451
weighted avg      0.961     0.953     0.955      9451

Sentiment model trained, evaluated, probabilities calculated, and saved.


## Multi-Label Attribute Classification Model Training

Train a multi-label classification model to identify product attributes from review text. Uses TF-IDF vectorization with OneVsRestClassifier and Logistic Regression to handle multiple simultaneous labels per review.

In [None]:
# ================================================================================
# ATTRIBUTE EXTRACTION MODEL TRAINING
# ================================================================================
# Train a multi-label classifier to identify product attributes from review text

# Import attribute rules
import sys
sys.path.append('/content/drive/MyDrive/AAI-590/')
from attribute_rules import rules_list

def compile_attribute_rules():
    """
    Compile regex patterns from the rules list for efficient matching.
    
    Returns:
        list: Tuples of (compiled_pattern, attribute_tag)
    """
    rules = rules_list
    return [(re.compile(pat), tag) for pat, tag in rules]

# Configuration constants
ATTR_COL = "Highlighted_Product_attributes"  # Column name for attribute labels
MIN_ATTR_SUPPORT = 10  # Minimum occurrences required for an attribute to be included
RANDOM_SEED = 26  # Random seed for reproducibility

# ================================================================================
# PREPARE LABELS FOR TRAINING
# ================================================================================
# Use existing attributes if available, otherwise generate weak labels using rules

if ATTR_COL in df_selected.columns and df_selected[ATTR_COL].notna().any():
    print("\nUsing existing Highlighted_Product_attributes as labels for ML attribute model.")
    labels = df_selected[ATTR_COL].fillna("").apply(lambda s: [t.strip() for t in s.split(";") if t.strip()])
else:
    print("\nHighlighted_Product_attributes missing/empty — generating weak labels via rules.")
    compiled_rules = compile_attribute_rules()
    labels = df_selected["__text__"].apply(lambda t: extract_rule_tags(t, compiled_rules))

# ================================================================================
# FILTER ATTRIBUTES BY MINIMUM SUPPORT
# ================================================================================
# Keep only attributes that appear frequently enough to be meaningful

attr_counts: Dict[str, int] = {}
for tag_list in labels:
    for t in tag_list:
        attr_counts[t] = attr_counts.get(t, 0) + 1

# Filter to common attributes (support >= MIN_ATTR_SUPPORT)
common_attrs = {t for t, cnt in attr_counts.items() if cnt >= MIN_ATTR_SUPPORT}

def filter_common(tag_list):
    """Filter tag list to include only common attributes."""
    return [t for t in tag_list if t in common_attrs]

filtered_labels = labels.apply(filter_common)

# ================================================================================
# PREPARE TRAINING DATA
# ================================================================================
# Use only rows that have at least one label

ml_rows = filtered_labels.apply(lambda lst: len(lst) > 0)
X_attr_all = df_selected.loc[ml_rows, "__text__"].values
Y_attr_all = filtered_labels.loc[ml_rows].values

# Binarize multi-labels using MultiLabelBinarizer
mlb = MultiLabelBinarizer()
Y_bin = mlb.fit_transform(Y_attr_all)

# ================================================================================
# HANDLE STRATIFICATION FOR TRAIN-TEST SPLIT
# ================================================================================
# Remove samples with unique stratification values to enable stratified splitting

# Calculate label counts per sample for stratification
stratify_values = Y_bin.sum(axis=1)

# Identify and remove samples that appear only once
value_counts = pd.Series(stratify_values).value_counts()
single_occurrence_values = value_counts[value_counts == 1].index
valid_indices = ~np.isin(stratify_values, single_occurrence_values)

# Filter datasets to valid indices
X_attr_all_filtered = X_attr_all[valid_indices]
Y_bin_filtered = Y_bin[valid_indices]
stratify_values_filtered = stratify_values[valid_indices]

# ================================================================================
# TRAIN-TEST SPLIT
# ================================================================================
# Split data with stratification to maintain class distribution

Xa_tr, Xa_te, Ya_tr, Ya_te = train_test_split(
    X_attr_all_filtered, Y_bin_filtered, 
    test_size=0.2, 
    random_state=RANDOM_SEED, 
    stratify=stratify_values_filtered
)

# ================================================================================
# BUILD AND TRAIN MODEL PIPELINE
# ================================================================================
# Create pipeline with TF-IDF vectorization and OneVsRestClassifier

attr_clf = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(
        ngram_range=(1,2),      # Use unigrams and bigrams
        min_df=5,               # Minimum document frequency
        max_df=0.95,            # Maximum document frequency (remove very common terms)
        strip_accents="unicode",
        sublinear_tf=True       # Apply sublinear TF scaling
    )),
    ("ovr", OneVsRestClassifier(
        LogisticRegression(
            class_weight="balanced",  # Handle class imbalance
            solver="lbfgs",
            max_iter=200
        ),
        n_jobs=-1  # Use all available CPU cores
    ))
])

# Train the model
attr_clf.fit(Xa_tr, Ya_tr)

# ================================================================================
# EVALUATE MODEL PERFORMANCE
# ================================================================================

Ya_pred = attr_clf.predict(Xa_te)
macro_f1 = f1_score(Ya_te, Ya_pred, average="macro")

print(f"\n=== Attribute multi-label model ===\nMacro-F1 (held-out): {macro_f1:.3f}")
print("Label space size:", len(mlb.classes_))
print("Top 15 attribute labels:", mlb.classes_[:15])

# ================================================================================
# GENERATE PREDICTIONS FOR ENTIRE DATASET
# ================================================================================

try:
    # Get probability scores for each attribute
    attr_proba = attr_clf.predict_proba(df_selected["__text__"].values)
except Exception:
    # Fallback: use decision function with sigmoid transformation
    from scipy.special import expit
    scores = attr_clf.decision_function(attr_clf.named_steps['tfidf'].transform(df_selected["__text__"].values))
    attr_proba = expit(scores)

# Apply threshold of 0.5 to convert probabilities to binary predictions
attr_pred_bin = (attr_proba >= 0.5).astype(int)

# Convert binary predictions back to attribute tag lists
def tags_from_bin_row(bin_row):
    """Convert binary row to list of attribute tags."""
    return [mlb.classes_[i] for i, v in enumerate(bin_row) if v == 1]

ml_attr_tags = [tags_from_bin_row(row) for row in attr_pred_bin]
df_selected["ML_Attribute_Tags"] = ["; ".join(tags) for tags in ml_attr_tags]

# Store raw probabilities as JSON for detailed analysis
df_selected["ML_Attribute_Proba_JSON"] = [
    json.dumps({mlb.classes_[i]: float(p) for i, p in enumerate(attr_proba_row)}, ensure_ascii=False)
    for attr_proba_row in attr_proba
]

# ================================================================================
# SAVE MODEL AND LABEL BINARIZER
# ================================================================================

joblib.dump({"pipeline": attr_clf, "mlb": mlb}, "/content/drive/MyDrive/AAI-590/models_attributes.pkl")
print("Attribute model and label binarizer trained, evaluated, and saved.")


Highlighted_Product_attributes missing/empty — generating weak labels via rules.

=== Attribute multi-label model ===
Macro-F1 (held-out): 0.941
Label space size: 59
Top 15 attribute labels: ['Care:notes' 'Color:bright_hi_vis' 'Color:darker' 'Color:off'
 'Comfort:high' 'Comfort:low' 'Durability:high' 'Durability:issues'
 'Fit:excess_length' 'Fit:loose' 'Fit:moves_in_shoe' 'Fit:noise'
 'Fit:rides_up' 'Fit:size_issue' 'Fit:small']
Attribute model and label binarizer trained, evaluated, and saved.
