In [1]:
"""
01_exploration_and_model.py

Ovaj fajl služi kao 'analitički izveštaj'.
Prikazuje:
- učitavanje podataka
- osnovnu analizu kategorija
- treniranje baznog modela (TF-IDF + LinearSVC)
- testiranje na ručnim primerima
"""

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score, classification_report


# =========================
# 1. Ucitavanje podataka
# =========================

# Ako pokreces ovaj fajl direktno iz root foldera projekta,
# promeni putanju u "data/IMLP4_TASK_03-products.csv".
# Ako pokreces iz ovog notebooks/ foldera, "../data/..." je ok.
CSV_PATH = "C:\\Users\\User\\product category classifier\\data\\IMLP4_TASK_03-products.csv"

df = pd.read_csv(CSV_PATH)

# ocisti eventualne whitespace-ove iz naziva kolona
df.columns = [c.strip() for c in df.columns]

print("\n[HEAD] Prvih par redova:")
print(df.head())

print("\n[INFO] Informacije o kolona i tipovima:")
print(df.info())

print("\n[VALUE COUNTS - ORIGINAL Category Label]")
print(df["Category Label"].value_counts().head(20))


# =========================
# 2. Ciscenje target kolone
# =========================

mapping = {
    "fridge": "fridges",
    "cpu": "cpus",
    "mobile phone": "mobile phones",
}

df["clean_category"] = (
    df["Category Label"]
    .astype(str)
    .str.strip()
    .str.lower()
    .replace(mapping)
)

# izbacimo redove koji nemaju title ili kategoriju
df = df.dropna(subset=["Product Title", "clean_category"])

print("\n[VALUE COUNTS - clean_category]")
print(df["clean_category"].value_counts().head(20))


# =========================
# 3. Train / test split
# =========================

X = df["Product Title"]
y = df["clean_category"]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)

print("\n[DATA SPLIT] Velicine skupova:")
print("Train:", len(X_train), "Test:", len(X_test))


# =========================
# 4. Bazni model (bez dodatnih feature-a)
#    TF-IDF ngrami + LinearSVC
# =========================

baseline_pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(
        ngram_range=(1, 2),
        min_df=2,
        max_df=0.9
    )),
    ("clf", LinearSVC())
])

print("\n[TRAIN] Treniranje baznog pipeline-a...")
baseline_pipeline.fit(X_train, y_train)

y_pred = baseline_pipeline.predict(X_test)

acc = accuracy_score(y_test, y_pred)
print("\n[RESULTS] Accuracy:", round(acc, 4))

print("\n[CLASSIFICATION REPORT]")
print(classification_report(y_test, y_pred))


# =========================
# 5. Rucno testiranje par naslova
#    (ovo je "praktična upotreba sistema")
# =========================

test_titles = [
    "iphone 7 32gb gold 4.7 inch",
    "bosch wap28390gb 8kg 1400 spin",
    "kenwood k20mss15 solo",
    "olympus e m10 mark iii kit",
    "bosch serie 4 kgv39vl31g"
]

print("\n[MANUAL TESTS]")
for t in test_titles:
    pred = baseline_pipeline.predict([t])[0]
    print(f"{t}  -->  {pred}")

"""
Zakljucak:
- Ovde u .py fajlu smo pokazali analizu i test modela,
  sto je analogno onome sto bi bio Jupyter notebook.
- U glavnom kodu projekta (src/train_model.py) imamo napredniji pipeline
  sa dodatnim numerickim feature-ima iz naslova i cuvanjem modela u .pkl fajl.
- predict_category.py koristi taj sacuvani model da predvidi kategoriju
  za bilo koji novi naslov proizvoda.
"""



[HEAD] Prvih par redova:
   product ID                                      Product Title  Merchant ID  \
0           1                    apple iphone 8 plus 64gb silver            1   
1           2                apple iphone 8 plus 64 gb spacegrau            2   
2           3  apple mq8n2b/a iphone 8 plus 64gb 5.5 12mp sim...            3   
3           4                apple iphone 8 plus 64gb space grey            4   
4           5  apple iphone 8 plus gold 5.5 64gb 4g unlocked ...            5   

  Category Label _Product Code  Number_of_Views  Merchant Rating Listing Date  
0  Mobile Phones    QA-2276-XC            860.0              2.5    5/10/2024  
1  Mobile Phones    KA-2501-QO           3772.0              4.8   12/31/2024  
2  Mobile Phones    FP-8086-IE           3092.0              3.9   11/10/2024  
3  Mobile Phones    YI-0086-US            466.0              3.4     5/2/2022  
4  Mobile Phones    NZ-3586-WP           4426.0              1.6    4/12/2023  

[INFO]

'\nZakljucak:\n- Ovde u .py fajlu smo pokazali analizu i test modela,\n  sto je analogno onome sto bi bio Jupyter notebook.\n- U glavnom kodu projekta (src/train_model.py) imamo napredniji pipeline\n  sa dodatnim numerickim feature-ima iz naslova i cuvanjem modela u .pkl fajl.\n- predict_category.py koristi taj sacuvani model da predvidi kategoriju\n  za bilo koji novi naslov proizvoda.\n'