
# E2E: Product Category Classification (TF‑IDF + LinearSVC)

Ovaj notebook trenira i evaluira dva modela (LinearSVC i ComplementNB) za klasifikaciju kategorija proizvoda na osnovu kolone "Product Title" i čuva onaj koji ima veću tačnost na test skupu.
Generiše izlaze u:
- `reports/classification_report.txt`
- `reports/confusion_matrix.png` (ograničeno do 50 najčešćih klasa ako ih ima previše)
- `reports/confusion_matrix_top20.png` (napomena: nijedna ćelija u trenutnoj verziji notebook-a ne generiše ovaj fajl)
- `models/product_category_model.pkl` (najbolji model, ceo pipeline)


In [1]:

import os
import re
import warnings
from typing import List, Optional

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.sparse as sp
import seaborn as sns
from sklearn.base import BaseEstimator, TransformerMixin, clone
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import ComplementNB
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.svm import LinearSVC

# Global notebook setup: silence every warning and pick the seaborn grid style.
warnings.filterwarnings("ignore")
sns.set(style="whitegrid")

# Make sure the output folders written to by later cells exist.
for out_dir in ("reports", "models"):
    os.makedirs(out_dir, exist_ok=True)


In [2]:

# Configuration
DATA_PATH = "data/products.csv"  # CSV file read by pd.read_csv in the loading cell

# If the column names differ, set them manually here (e.g. "title", "category").
# When left as None they are auto-detected from the CSV header in the loading cell.
TEXT_COL = None   # e.g. "Product Title"
LABEL_COL = None  # e.g. "Category Label"
TEST_SIZE = 0.2      # passed as test_size to train_test_split
RANDOM_STATE = 42    # seed used for the train/test split and for LinearSVC

print(f"DATA_PATH = {DATA_PATH}")


DATA_PATH = data/products.csv


In [3]:

# Load the raw data and show its size and header.
df = pd.read_csv(DATA_PATH, low_memory=False)
print("Broj redova:", df.shape[0])
print("Kolone:", df.columns.tolist())

# Helper functions for robust column detection
def norm_col(s: str) -> str:
    """Return ``s`` lower-cased with every character outside a-z / 0-9 removed."""
    return re.sub(r"[^a-z0-9]+", "", str(s).lower())

def find_text_col(cands=None, columns=None) -> Optional[str]:
    """Guess which column holds the product title.

    Args:
        cands: Normalized column names (see ``norm_col``) tried in priority
            order. Defaults to common title/name spellings.
        columns: Column names to search. Defaults to ``df.columns`` of the
            module-level frame, which keeps existing no-argument calls working.

    Returns:
        The original (un-normalized) column name, or ``None`` if nothing matches.
    """
    if cands is None:
        cands = ["producttitle", "title", "productname", "name"]
    if columns is None:
        columns = df.columns
    # If two columns normalize to the same key, the later one wins.
    norm_map = {norm_col(c): c for c in columns}
    for key in cands:
        if key in norm_map:
            return norm_map[key]
    # Fallback. The word-boundary test for "name" must run on the original
    # column text: the normalized key has all separators stripped, so
    # "Item Name" -> "itemname" could never satisfy a boundary check there.
    name_word = re.compile(r"(^|[^a-z])name([^a-z]|$)")
    for k, orig in norm_map.items():
        if "title" in k or name_word.search(str(orig).lower()):
            return orig
    return None

def find_label_col(cands=None, columns=None) -> Optional[str]:
    """Guess which column holds the category label.

    Args:
        cands: Normalized column names (see ``norm_col``) tried in priority
            order. Defaults to common category/label spellings.
        columns: Column names to search. Defaults to ``df.columns`` of the
            module-level frame.

    Returns:
        The original (un-normalized) column name, or ``None`` if nothing matches.
    """
    if cands is None:
        cands = ["categorylabel", "category", "label"]
    if columns is None:
        columns = df.columns
    norm_map = {norm_col(c): c for c in columns}
    for key in cands:
        if key in norm_map:
            return norm_map[key]
    # Fallback: first column whose normalized name mentions category or label.
    for k, orig in norm_map.items():
        if "category" in k or "label" in k:
            return orig
    return None

# Detect candidate columns from the loaded frame.
auto_text = find_text_col()
auto_label = find_label_col()

# A value set manually in the configuration cell wins over auto-detection.
TEXT_COL = auto_text if TEXT_COL is None else TEXT_COL
LABEL_COL = auto_label if LABEL_COL is None else LABEL_COL

print(f"TEXT_COL  = {TEXT_COL}")
print(f"LABEL_COL = {LABEL_COL}")
# Stop here if either column is still unknown.
assert not (TEXT_COL is None or LABEL_COL is None), "Nisu pronadjene potrebne kolone. Podesite TEXT_COL i LABEL_COL u konfiguraciji."


Broj redova: 35311
Kolone: ['product ID', 'Product Title', 'Merchant ID', ' Category Label', '_Product Code', 'Number_of_Views', 'Merchant Rating', ' Listing Date  ']
TEXT_COL  = Product Title
LABEL_COL =  Category Label


In [4]:

 # Čišćenje i osnovna priprema + kanonikalizacija labela
# Keep only the two columns the model needs.
df = df[[TEXT_COL, LABEL_COL]].copy()

before = len(df)
# Drop rows with a missing title or label, then trim surrounding whitespace.
df = df.dropna(subset=[TEXT_COL, LABEL_COL])
for col in (TEXT_COL, LABEL_COL):
    df[col] = df[col].astype(str).str.strip()
# Discard rows that became empty after trimming, then exact (title, label) repeats.
non_empty = df[TEXT_COL].ne("") & df[LABEL_COL].ne("")
df = df.loc[non_empty].drop_duplicates(subset=[TEXT_COL, LABEL_COL])
after_basic = len(df)
print(f"Obrisano redova (osnovno čišćenje): {before - after_basic}; Zadržano: {after_basic}")

# Merge spelling variants of class names (singular/plural, letter case, slang).
# Each entry lists the lower-cased variants and the canonical label they map to.
_label_variants = [
    (('cpu', 'cpus'), 'CPUs'),
    (('mobile phone', 'mobile phones'), 'Mobile Phones'),
    (('fridge', 'fridges'), 'Fridges'),
    (('fridge freezer', 'fridge freezers'), 'Fridge Freezers'),
    (('freezer', 'freezers'), 'Freezers'),
    (('tv', 'tvs'), 'TVs'),
    (('digital camera', 'digital cameras'), 'Digital Cameras'),
    (('washing machine', 'washing machines'), 'Washing Machines'),
    (('microwave', 'microwaves'), 'Microwaves'),
    (('dishwasher', 'dishwashers'), 'Dishwashers'),
]

# Flat lookup table: lower-cased variant -> canonical label.
label_map = {}
for _variants, _canonical in _label_variants:
    for _variant in _variants:
        label_map[_variant] = _canonical

def canonicalize_label(x: str) -> str:
    """Map a raw label to its canonical class name.

    The lookup is done on the trimmed, lower-cased text of ``x``. When that
    key is not in ``label_map`` the input is returned exactly as it was given.
    """
    key = str(x).strip().lower()
    if key in label_map:
        return label_map[key]
    return x  # not in the map: keep the original value

# Collapse label variants into their canonical class names.
df[LABEL_COL] = df[LABEL_COL].apply(canonicalize_label)

# Rows that differed only in label spelling (e.g. "fridge" vs "Fridges") are
# exact duplicates after canonicalization. Remove them again so the same
# (title, label) pair cannot end up in both the train and the test split.
n_before_dedup = len(df)
df = df.drop_duplicates(subset=[TEXT_COL, LABEL_COL])
print(f"Obrisano duplikata posle kanonikalizacije: {n_before_dedup - len(df)}")

print("\nTop 20 klasa po učestalosti (posle kanonikalizacije):")
print(df[LABEL_COL].value_counts().head(20))


Obrisano redova (osnovno čišćenje): 4425; Zadržano: 30886

Top 20 klasa po učestalosti (posle kanonikalizacije):
 Category Label
Fridge Freezers     4809
Mobile Phones       3669
Washing Machines    3403
TVs                 3275
Fridges             3216
CPUs                3070
Dishwashers         3042
Digital Cameras     2405
Microwaves          2094
Freezers            1903
Name: count, dtype: int64


In [5]:

# Numeric features derived from the title (all non-negative, so usable with NB)
def extract_numeric_features(texts: List[str]):
    """Build a sparse matrix of five hand-crafted features per title.

    Columns, in order: character length, number of digit characters, and
    three 0/1 flags for a "<n> gb", "<n> inch / inches / quote mark" and
    "<n> mp" pattern (case-insensitive).

    Args:
        texts: Iterable of titles; every item is converted with ``str()``.

    Returns:
        ``scipy.sparse.csr_matrix`` of floats with one row per title.
    """
    gb_pat = re.compile(r'\b(\d+)\s?gb\b', re.I)
    inch_pat = re.compile(r'\b(\d+(?:\.\d+)?)\s?(?:(?:inches|inch)\b|["”″])', re.I)
    mp_pat = re.compile(r'\b(\d+)\s?mp\b', re.I)

    # Materialize once so the input is traversed a single time.
    titles = [str(t) for t in texts]

    def flag(pattern):
        """Return a 0/1 column telling whether ``pattern`` occurs in each title."""
        return [1 if pattern.search(s) else 0 for s in titles]

    feature_columns = [
        [len(s) for s in titles],
        [sum(ch.isdigit() for ch in s) for s in titles],
        flag(gb_pat),
        flag(inch_pat),
        flag(mp_pat),
    ]
    dense = np.column_stack(feature_columns).astype(float)
    return sp.csr_matrix(dense)

# Robust variant without a lambda (pickle-friendly)
class NumericFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that exposes ``extract_numeric_features`` as a pipeline step."""
    def fit(self, X, y=None):
        """Learn nothing; return ``self`` unchanged."""
        return self
    def transform(self, X):
        """Return the sparse numeric feature matrix computed from the titles in ``X``."""
        return extract_numeric_features(X)

# Word uni- and bi-gram TF-IDF over the lower-cased, accent-stripped title.
tfidf = TfidfVectorizer(
    lowercase=True,
    strip_accents='unicode',
    ngram_range=(1, 2),
    min_df=2,
    max_df=0.9,
)

# Hand-crafted numeric features computed from the same title.
numeric_ft = NumericFeatures()

# Both feature blocks side by side.
feats = FeatureUnion(transformer_list=[("tfidf", tfidf), ("num", numeric_ft)])


In [6]:

    # Feature text and target labels as plain string arrays.
    X = df[TEXT_COL].astype(str).values
    y = df[LABEL_COL].astype(str).values

    # Stratified hold-out split so class proportions match in train and test.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=TEST_SIZE, random_state=RANDOM_STATE, stratify=y
    )

    svc = LinearSVC(C=1.0, random_state=RANDOM_STATE)
    nb = ComplementNB()

    # A Pipeline keeps its steps by reference. Each pipeline therefore gets
    # its own unfitted copy of the feature union; otherwise refitting one
    # pipeline (e.g. the final fit on all data) would replace the vocabulary
    # underneath the other pipeline's already-trained classifier.
    pipe_svc = Pipeline([
        ("feats", clone(feats)),
        ("clf", svc),
    ])

    pipe_nb = Pipeline([
        ("feats", clone(feats)),
        ("clf", nb),
    ])

    models = {
        "LinearSVC": pipe_svc,
        "ComplementNB": pipe_nb,
    }

    # Fit every candidate on the training split and score it on the test split.
    results = {}
    for name, model in models.items():
        print(f"Treniram: {name}...")
        model.fit(X_train, y_train)
        pred = model.predict(X_test)
        acc = accuracy_score(y_test, pred)
        print(f"Accuracy ({name}): {acc:.4f}")
        results[name] = {"model": model, "pred": pred, "acc": acc}

    # NOTE(review): the winner is chosen on the same test split that is later
    # reported, so the reported accuracy is slightly optimistic. Consider a
    # separate validation split or cross-validation for model selection.
    best_name = max(results.keys(), key=lambda k: results[k]["acc"])
    best_model = results[best_name]["model"]
    y_pred = results[best_name]["pred"]
    print(f"Najbolji model: {best_name} (accuracy={results[best_name]['acc']:.4f})")


Treniram: LinearSVC...
Accuracy (LinearSVC): 0.9535
Treniram: ComplementNB...
Accuracy (ComplementNB): 0.9550
Najbolji model: ComplementNB (accuracy=0.9550)


In [7]:

from collections import Counter

# Classification report (written as a .txt file only)
report_text = classification_report(y_test, y_pred, digits=4)
print(report_text)

# Header with the winning model name, then the report and a trailing newline.
report_file_body = f"Model: {best_name}\n\n" + report_text + "\n"
with open("reports/classification_report.txt", "w", encoding="utf-8") as f:
    f.write(report_file_body)

# Confusion matrix (a single .png): order the classes by test-set support.
counts = Counter(y_test)
classes_by_support = [pair[0] for pair in counts.most_common()]

def plot_cm(y_true, y_hat, classes_to_use, title, out_path, normalize=False):
    """Render a confusion-matrix heatmap and save it to ``out_path``.

    Args:
        y_true: True labels.
        y_hat: Predicted labels.
        classes_to_use: Labels (and their order) shown on both axes.
        title: Plot title; the module-level ``best_name`` is appended to it.
        out_path: Destination image path.
        normalize: When true, each row is divided by its sum and rows with
            no samples become zeros.
    """
    matrix = confusion_matrix(y_true, y_hat, labels=classes_to_use)
    cell_fmt = "d"
    if normalize:
        cell_fmt = ".2f"
        # Rows that sum to zero give 0/0; silence the warning and map NaN to 0.
        with np.errstate(all='ignore'):
            row_totals = matrix.sum(axis=1, keepdims=True)
            matrix = np.nan_to_num(matrix.astype('float') / row_totals)

    # Square figure that grows with the number of classes, at least 8 inches.
    side = max(8, 0.35 * len(classes_to_use))
    fig, ax = plt.subplots(figsize=(side, side))
    sns.heatmap(
        matrix,
        annot=False,
        cmap="Blues",
        xticklabels=classes_to_use,
        yticklabels=classes_to_use,
        fmt=cell_fmt,
        ax=ax,
    )
    ax.set_title(f"{title} - {best_name}")
    ax.set_xlabel("Predicted")
    ax.set_ylabel("True")
    fig.tight_layout()
    fig.savefig(out_path, dpi=200)
    plt.close(fig)

# Use every class; if there are too many, show only the Top 50 for readability.
n_classes_total = len(classes_by_support)
too_many_classes = n_classes_total > 50
use_classes = classes_by_support[:50] if too_many_classes else classes_by_support
title_full = (
    f"Confusion Matrix (Top 50 classes of {len(classes_by_support)})"
    if too_many_classes
    else f"Confusion Matrix (All {len(use_classes)} classes)"
)

plot_cm(y_test, y_pred, use_classes, title_full, "reports/confusion_matrix.png", normalize=False)

# Summary of the files written by this cell.
print("\nSačuvani fajlovi:")
for saved_line in (" - reports/classification_report.txt", " - reports/confusion_matrix.png"):
    print(saved_line)

                  precision    recall  f1-score   support

            CPUs     0.9808    1.0000    0.9903       614
 Digital Cameras     0.9917    0.9958    0.9938       481
     Dishwashers     0.9390    0.9622    0.9504       608
        Freezers     0.9782    0.8241    0.8946       381
 Fridge Freezers     0.8889    0.9480    0.9175       962
         Fridges     0.9272    0.8523    0.8882       643
      Microwaves     0.9878    0.9666    0.9771       419
   Mobile Phones     0.9959    0.9891    0.9925       734
             TVs     0.9819    0.9954    0.9886       655
Washing Machines     0.9433    0.9765    0.9596       681

        accuracy                         0.9550      6178
       macro avg     0.9615    0.9510    0.9553      6178
    weighted avg     0.9557    0.9550    0.9546      6178


Sačuvani fajlovi:
 - reports/classification_report.txt
 - reports/confusion_matrix.png


In [8]:

out_model_path = "models/product_category_model.pkl"

# Refit the selected pipeline on the full data set, then persist the whole pipeline.
best_model.fit(X, y)
joblib.dump(best_model, out_model_path)
print(f"Model sacuvan u: {out_model_path}")


Model sacuvan u: models/product_category_model.pkl


In [9]:

# A few hand-written titles to sanity-check the fitted pipeline.
demo_titles = [
    "apple iphone 8 plus 64gb space grey",
    "samsung galaxy s21 ultra 128gb phantom black",
    "dyson v12 cordless vacuum cleaner"
]

preds = best_model.predict(demo_titles)
for title, predicted_label in zip(demo_titles, preds):
    print(f"{title} -> {predicted_label}")


apple iphone 8 plus 64gb space grey -> Mobile Phones
samsung galaxy s21 ultra 128gb phantom black -> Mobile Phones
dyson v12 cordless vacuum cleaner -> Fridge Freezers
