# 02 — Treniranje i evaluacija modela

In [None]:
# 4 - threshold/cost analysis (XGBoost)
# Sweeps the decision threshold applied to the XGBoost spam scores and reports
# the threshold with the lowest total misclassification cost. The full sweep is
# written to threshold_sweep_xgb.csv inside FIG_DIR.
# Requires y_proba_xgb, y_test_bin and FIG_DIR from the cells labelled 1 and 2.
# NOTE(review): the threshold is selected on the test split itself; consider
# choosing it on a separate validation split before reporting the cost.
import os

import numpy as np, pandas as pd
from sklearn.metrics import confusion_matrix

c_fp, c_fn = 1, 5  # a false negative costs five times as much as a false positive
ths = np.linspace(0, 1, 101)  # candidate thresholds 0.00, 0.01, ..., 1.00
rows, best = [], (None, 1e18)  # best = (threshold, cost); 1e18 means "nothing seen yet"

for t in ths:
    y_hat = (y_proba_xgb >= t).astype(int)
    # labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking still
    # works when only one class occurs among y_test_bin and y_hat.
    tn, fp, fn, tp = confusion_matrix(y_test_bin, y_hat, labels=[0, 1]).ravel()
    cost = c_fp*fp + c_fn*fn
    rows.append({"threshold": float(t), "TP": int(tp), "FP": int(fp), "FN": int(fn), "TN": int(tn), "cost": int(cost)})
    # Strict "<" keeps the lowest threshold when several share the minimum cost.
    if cost < best[1]:
        best = (t, cost)

pd.DataFrame(rows).to_csv(os.path.join(FIG_DIR, "threshold_sweep_xgb.csv"), index=False)
print("Best threshold (cost-min):", round(best[0], 3), "cost:", best[1])


Best threshold (cost-min): 0.68 cost: 17


In [None]:
# 3 - FP/FN summary
# Builds one row of confusion-matrix counts per model, saves the table as
# fp_fn_summary.csv in FIG_DIR and displays it as the cell output.
import pandas as pd
from sklearn.metrics import confusion_matrix

# Test-set predictions of each model, keyed by the name used in the table.
model_predictions = {
    "MultinomialNB": y_pred_nb,
    "LinearSVC": y_pred_svc,
    "XGBoost": y_pred_xgb,
}

summary = []
for name, y_pred in model_predictions.items():
    # With labels ordered ["ham", "spam"], ravel() yields TN, FP, FN, TP,
    # i.e. "spam" is treated as the positive class.
    counts = confusion_matrix(y_test, y_pred, labels=["ham","spam"]).ravel()
    tn, fp, fn, tp = (int(value) for value in counts)
    summary.append({"model": name, "TP": tp, "FP": fp, "FN": fn, "TN": tn})

df_fpfn = pd.DataFrame(summary)
df_fpfn.to_csv(os.path.join(FIG_DIR, "fp_fn_summary.csv"), index=False)
df_fpfn


Unnamed: 0,model,TP,FP,FN,TN
0,MultinomialNB,84,3,1,91
1,LinearSVC,83,1,2,93
2,XGBoost,82,7,3,87


In [None]:
# 2 - XGBoost + ROC/PR
# Trains an XGBoost classifier on the TF-IDF features, stores its
# classification report in `results`, and saves the confusion matrix, ROC and
# precision-recall figures to FIG_DIR.
# Requires X_train_vec, X_test_vec, y_train, y_test, y_test_bin, results and
# FIG_DIR from the cell labelled 1.
import numpy as np, os, matplotlib.pyplot as plt, json
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve

# The classifier is fitted on 0/1 targets, with 1 standing for "spam".
y_train_bin = (y_train == "spam").astype(int)

xgb = XGBClassifier(
    n_estimators=300,
    max_depth=5,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    n_jobs=-1,
    eval_metric="logloss",
)
xgb.fit(X_train_vec, y_train_bin)

# Map the 0/1 predictions back to string labels so they can be compared with y_test.
y_pred_xgb_bin = xgb.predict(X_test_vec)
y_pred_xgb = np.where(y_pred_xgb_bin == 1, "spam", "ham")
report_xgb = classification_report(y_test, y_pred_xgb, output_dict=True, zero_division=0)
results["XGBoost"] = report_xgb

# Row-normalized confusion matrix.
disp = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred_xgb, labels=["ham","spam"], normalize="true"
)
disp.ax_.set_title("Confusion Matrix — XGBoost (normalized)")
disp.figure_.savefig(os.path.join(FIG_DIR, "cm_XGBoost.png"), bbox_inches="tight")
plt.close(disp.figure_)

# Column 1 of predict_proba is the score of class 1 ("spam").
y_proba_xgb = xgb.predict_proba(X_test_vec)[:, 1]

# ROC curve.
fpr, tpr, _ = roc_curve(y_test_bin, y_proba_xgb)
roc_auc = auc(fpr, tpr)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
ax.set_title("ROC — XGBoost")
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.legend()
fig.savefig(os.path.join(FIG_DIR, "roc_xgb.png"), bbox_inches="tight")
plt.close(fig)

# Precision-recall curve.
precision, recall, _ = precision_recall_curve(y_test_bin, y_proba_xgb)
fig, ax = plt.subplots()
ax.plot(recall, precision)
ax.set_title("Precision-Recall — XGBoost")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
fig.savefig(os.path.join(FIG_DIR, "pr_xgb.png"), bbox_inches="tight")
plt.close(fig)

# Rewrite the combined report file so that it now includes the XGBoost entry.
reports_path = os.path.join(FIG_DIR, "classification_reports.json")
with open(reports_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print("Cell B OK — XGBoost added; y_pred_xgb & y_proba_xgb exist.")


Cell B OK — XGBoost added; y_pred_xgb & y_proba_xgb exist.


In [None]:
# 1 - TF-IDF, NB + SVC, basic plots, results
# Loads the processed dataset, makes a stratified 80/20 split, vectorizes the
# text with TF-IDF and fits MultinomialNB and LinearSVC. Classification reports
# are collected in `results`; figures and the report JSON are saved to FIG_DIR.
import os, json, numpy as np, pandas as pd, matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve, confusion_matrix

CSV_PATH = "../data/processed/dataset.csv"
FIG_DIR  = "../reports/figures"
os.makedirs(FIG_DIR, exist_ok=True)

# --- Data: stratified split with a fixed seed ---
df = pd.read_csv(CSV_PATH)
X_train, X_test, y_train, y_test = train_test_split(
    df["text"],
    df["label"],
    test_size=0.2,
    stratify=df["label"],
    random_state=42,
)

# --- Features: the vectorizer is fitted on the training split only ---
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1,2), stop_words="english")
X_train_vec = tfidf.fit_transform(X_train)
X_test_vec  = tfidf.transform(X_test)

# Classification reports keyed by model name.
results = {}

# 0/1 version of the test labels (1 = "spam"), used for the ROC/PR curves.
y_test_bin = (y_test == "spam").astype(int)

# --- MultinomialNB ---
nb = MultinomialNB()
nb.fit(X_train_vec, y_train)
y_pred_nb = nb.predict(X_test_vec)
# Pick the probability column that belongs to the "spam" class.
spam_col = list(nb.classes_).index("spam")
y_proba_nb = nb.predict_proba(X_test_vec)[:, spam_col]
report_nb  = classification_report(y_test, y_pred_nb, output_dict=True, zero_division=0)
results["MultinomialNB"] = report_nb

disp = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred_nb, labels=["ham","spam"], normalize="true"
)
disp.ax_.set_title("Confusion Matrix — MultinomialNB (normalized)")
disp.figure_.savefig(os.path.join(FIG_DIR, "cm_MultinomialNB.png"), bbox_inches="tight")
plt.close(disp.figure_)

fpr, tpr, _ = roc_curve(y_test_bin, y_proba_nb)
roc_auc = auc(fpr,tpr)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}")
ax.set_title("ROC — MultinomialNB")
ax.set_xlabel("FPR")
ax.set_ylabel("TPR")
ax.legend()
fig.savefig(os.path.join(FIG_DIR, "roc_nb.png"), bbox_inches="tight")
plt.close(fig)

precision, recall, _ = precision_recall_curve(y_test_bin, y_proba_nb)
fig, ax = plt.subplots()
ax.plot(recall, precision)
ax.set_title("Precision-Recall — MultinomialNB")
ax.set_xlabel("Recall")
ax.set_ylabel("Precision")
fig.savefig(os.path.join(FIG_DIR, "pr_nb.png"), bbox_inches="tight")
plt.close(fig)

# --- LinearSVC (only a report and confusion matrix are produced for it here) ---
svc = LinearSVC()
svc.fit(X_train_vec, y_train)
y_pred_svc = svc.predict(X_test_vec)
report_svc = classification_report(y_test, y_pred_svc, output_dict=True, zero_division=0)
results["LinearSVC"] = report_svc

disp = ConfusionMatrixDisplay.from_predictions(
    y_test, y_pred_svc, labels=["ham","spam"], normalize="true"
)
disp.ax_.set_title("Confusion Matrix — LinearSVC (normalized)")
disp.figure_.savefig(os.path.join(FIG_DIR, "cm_LinearSVC.png"), bbox_inches="tight")
plt.close(disp.figure_)

# --- Persist the reports gathered so far ---
reports_path = os.path.join(FIG_DIR, "classification_reports.json")
with open(reports_path, "w", encoding="utf-8") as f:
    json.dump(results, f, indent=2)

print("Cell A OK — variables ready: y_train, y_test, y_test_bin, X_*_vec, y_pred_nb, y_pred_svc, y_proba_nb, results.")


Cell A OK — variables ready: y_train, y_test, y_test_bin, X_*_vec, y_pred_nb, y_pred_svc, y_proba_nb, results.


In [None]:
# Threshold/cost sweep for the XGBoost spam scores.
# NOTE(review): this repeats the cell labelled "# 4 - threshold/cost analysis
# (XGBoost)". It needs y_proba_xgb, y_test_bin and FIG_DIR from the cells
# labelled 1 and 2; the recorded NameError comes from running it before them.
import os

import numpy as np
import pandas as pd
from sklearn.metrics import confusion_matrix

c_fp, c_fn = 1, 5  # a false negative costs five times as much as a false positive
ths = np.linspace(0,1,101)  # candidate thresholds 0.00, 0.01, ..., 1.00
best = (None, 1e18)  # (threshold, cost); 1e18 means "nothing seen yet"
rows = []
for t in ths:
    y_hat = (y_proba_xgb >= t).astype(int)
    # labels=[0, 1] pins the matrix to 2x2, so the four-way unpacking still
    # works when only one class occurs among y_test_bin and y_hat.
    cm = confusion_matrix(y_test_bin, y_hat, labels=[0, 1])
    tn, fp, fn, tp = cm.ravel()
    cost = c_fp*fp + c_fn*fn
    rows.append({"threshold": float(t), "TP": int(tp), "FP": int(fp), "FN": int(fn), "TN": int(tn), "cost": int(cost)})
    # Strict "<" keeps the lowest threshold when several share the minimum cost.
    if cost < best[1]:
        best = (t, cost)
pd.DataFrame(rows).to_csv(os.path.join(FIG_DIR, "threshold_sweep_xgb.csv"), index=False)
print("Best threshold (cost-min):", round(best[0],3), "cost:", best[1])


NameError: name 'y_proba_xgb' is not defined

In [4]:
# Per-model confusion-matrix counts, saved as fp_fn_summary.csv in FIG_DIR and
# shown as the cell output. Expects y_test, the three y_pred_* arrays, os and
# FIG_DIR to exist already in the kernel.
import pandas as pd
from sklearn.metrics import confusion_matrix

predictions_by_model = [
    ("MultinomialNB", y_pred_nb),
    ("LinearSVC", y_pred_svc),
    ("XGBoost", y_pred_xgb),
]

summary = []
for name, y_pred in predictions_by_model:
    cm = confusion_matrix(y_test, y_pred, labels=["ham","spam"])
    # Flattened row by row: (TN, FP, FN, TP), with "spam" as the positive class.
    tn, fp, fn, tp = cm.ravel()
    record = {"model": name}
    record.update({"TP": int(tp), "FP": int(fp), "FN": int(fn), "TN": int(tn)})
    summary.append(record)

df_fpfn = pd.DataFrame(summary)
csv_path = os.path.join(FIG_DIR, "fp_fn_summary.csv")
df_fpfn.to_csv(csv_path, index=False)
df_fpfn


NameError: name 'y_pred_nb' is not defined

In [None]:
# XGBoost training plus confusion matrix, ROC and precision-recall figures.
# NOTE(review): the author marked this cell "not good I think". It needs
# X_train_vec, X_test_vec, y_train, y_test, results and FIG_DIR to exist already
# in the kernel; the recorded NameError comes from running it without them.
import numpy as np, os
import matplotlib.pyplot as plt  # used below, but was never imported in this cell
from sklearn.metrics import classification_report, ConfusionMatrixDisplay, roc_curve, auc, precision_recall_curve


from xgboost import XGBClassifier


# 0/1 targets, with 1 standing for "spam".
y_train_bin = (y_train == "spam").astype(int)
y_test_bin  = (y_test  == "spam").astype(int)

xgb = XGBClassifier(
    n_estimators=300, max_depth=5, learning_rate=0.1,
    subsample=0.8, colsample_bytree=0.8, n_jobs=-1, eval_metric="logloss"
)
xgb.fit(X_train_vec, y_train_bin)


# Map the 0/1 predictions back to string labels so they can be compared with y_test.
y_pred_xgb_bin = xgb.predict(X_test_vec)
y_pred_xgb = np.where(y_pred_xgb_bin == 1, "spam", "ham")
report_xgb = classification_report(y_test, y_pred_xgb, output_dict=True, zero_division=0)
results["XGBoost"] = report_xgb


disp = ConfusionMatrixDisplay.from_predictions(y_test, y_pred_xgb, labels=["ham","spam"], normalize='true')
disp.ax_.set_title("Confusion Matrix — XGBoost (normalized)")
fig = disp.figure_
fig.savefig(os.path.join(FIG_DIR, "cm_XGBoost.png"), bbox_inches="tight")
# Close the figure once saved, as the other plotting cells do, so that open
# figures do not accumulate in the kernel.
plt.close(fig)


# Column 1 of predict_proba is the score of class 1 ("spam").
y_proba_xgb = xgb.predict_proba(X_test_vec)[:,1]
fpr, tpr, _ = roc_curve(y_test_bin, y_proba_xgb); roc_auc = auc(fpr, tpr)
fig = plt.figure(); plt.plot(fpr, tpr, label=f"AUC={roc_auc:.3f}"); plt.title("ROC — XGBoost"); plt.xlabel("FPR"); plt.ylabel("TPR"); plt.legend()
fig.savefig(os.path.join(FIG_DIR, "roc_xgb.png"), bbox_inches="tight"); plt.close(fig)

precision, recall, _ = precision_recall_curve(y_test_bin, y_proba_xgb)
fig = plt.figure(); plt.plot(recall, precision); plt.title("Precision-Recall — XGBoost"); plt.xlabel("Recall"); plt.ylabel("Precision")
fig.savefig(os.path.join(FIG_DIR, "pr_xgb.png"), bbox_inches="tight"); plt.close(fig)


NameError: name 'y_train' is not defined

In [None]:

# Runs the packaged pipeline from src.models and prints the result as JSON.
import os, json
import sys

# Make the parent of the current working directory importable so that `src`
# resolves. The membership check stops sys.path from gaining a duplicate entry
# every time this cell is re-run.
PROJECT_ROOT = os.path.abspath(os.path.join(os.getcwd(), ".."))
if PROJECT_ROOT not in sys.path:
    sys.path.append(PROJECT_ROOT)

from src.models import train_and_evaluate


CSV_PATH = "../data/processed/dataset.csv"
FIG_DIR  = "../reports/figures"


# train_and_evaluate is defined outside this notebook. Judging by the recorded
# output it returns a JSON-serializable dict of per-model classification
# reports — TODO confirm against src/models.py.
results = train_and_evaluate(CSV_PATH, FIG_DIR)
# The message below is Serbian for "Finished reports and figures are in:".
print("Gotovi izveštaji i grafici su u:", FIG_DIR)
print(json.dumps(results, indent=2))


Gotovi izveštaji i grafici su u: ../reports/figures
{
  "MultinomialNB": {
    "ham": {
      "precision": 0.9891304347826086,
      "recall": 0.9680851063829787,
      "f1-score": 0.978494623655914,
      "support": 94.0
    },
    "spam": {
      "precision": 0.9655172413793104,
      "recall": 0.9882352941176471,
      "f1-score": 0.9767441860465116,
      "support": 85.0
    },
    "accuracy": 0.9776536312849162,
    "macro avg": {
      "precision": 0.9773238380809595,
      "recall": 0.9781602002503129,
      "f1-score": 0.9776194048512128,
      "support": 179.0
    },
    "weighted avg": {
      "precision": 0.9779174658480815,
      "recall": 0.9776536312849162,
      "f1-score": 0.9776634102659744,
      "support": 179.0
    }
  },
  "LinearSVC": {
    "ham": {
      "precision": 0.9789473684210527,
      "recall": 0.9893617021276596,
      "f1-score": 0.9841269841269841,
      "support": 94.0
    },
    "spam": {
      "precision": 0.9880952380952381,
      "recall": 0.97647