In [1]:
# Directory the precomputed train/test embedding and label .npy files are read from.
SAVE_EMBEDDINGS_DIR = "models/embeddings_train_test"
# Directory the fitted, calibrated classifier is written to with joblib.
SAVE_MODEL_DIR = "models/svm"

In [2]:
import os

import joblib
import numpy as np
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import accuracy_score, f1_score, classification_report
from sklearn.svm import LinearSVC

# -------------------------------------------------
# Step 1: load the saved embeddings and labels
# -------------------------------------------------
# Resolve all four file locations first, then read them in the same order
# as before: train features, train labels, test features, test labels.
train_features_path = f"{SAVE_EMBEDDINGS_DIR}/X_train_emb.npy"
train_labels_path = f"{SAVE_EMBEDDINGS_DIR}/y_train.npy"
test_features_path = f"{SAVE_EMBEDDINGS_DIR}/X_test_emb.npy"
test_labels_path = f"{SAVE_EMBEDDINGS_DIR}/y_test.npy"

X_train = np.load(train_features_path)
y_train = np.load(train_labels_path)
X_test = np.load(test_features_path)
y_test = np.load(test_labels_path)

# Quick shape report for both feature matrices.
print("Train shape:", X_train.shape)
print("Test shape :", X_test.shape)

# -------------------------------------------------
# Step 2: base model (LinearSVC)
# -------------------------------------------------
# NOTE(review): the origin of this C value is not shown in this notebook --
# presumably it came from a hyperparameter search; confirm before changing.
SVC_C = 1.76702
SVC_RANDOM_STATE = 42

base_clf = LinearSVC(
    random_state=SVC_RANDOM_STATE,
    class_weight="balanced",
    C=SVC_C,
)

# -------------------------------------------------
# Step 3: calibrate (approach B) -- cross-validation on TRAIN
# -------------------------------------------------
# method="sigmoid" is the stable choice; "isotonic" should only be used
# when there is a lot of data.
calibration_kwargs = dict(
    estimator=base_clf,
    method="sigmoid",
    cv=5,
)
clf = CalibratedClassifierCV(**calibration_kwargs)

# Announce before fitting so the message appears ahead of the training work.
print(">> Training + Calibrating (LinearSVC + Platt sigmoid, cv=5)...")
clf.fit(X_train, y_train)

# -------------------------------------------------
# Step 4: evaluate on the test split
# -------------------------------------------------
y_pred = clf.predict(X_test)

# Headline metrics: plain accuracy and macro-averaged F1.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
f1 = f1_score(y_true=y_test, y_pred=y_pred, average="macro")

# One print call, one line per entry; output is identical to separate prints.
summary_lines = (
    f"Accuracy : {acc:.4f}",
    f"F1-macro : {f1:.4f}",
    "\nClassification report:",
)
print(*summary_lines, sep="\n")
print(classification_report(y_true=y_test, y_pred=y_pred))

# =========================
# 5. Print confidence
# =========================
proba = clf.predict_proba(X_test)      # (n_samples, n_classes)
conf = proba.max(axis=1)               # largest class probability per sample

# The previews below walk the test set in stored order (indices 0..4).
# They are NOT ranked by confidence or gap, so the headers say "First-5"
# rather than "Top-5", which would wrongly suggest the highest-scoring samples.
print("\nFirst-5 samples (confidence):")
for i in range(min(5, len(conf))):
    print(f"  sample {i}: pred={y_pred[i]}  conf={float(conf[i]):.4f}")

# Margin between the two largest probabilities of each sample (from proba).
# Rows are sorted ascending, so the last two columns hold top-1 and top-2.
sorted_proba = np.sort(proba, axis=1)
gap = sorted_proba[:, -1] - sorted_proba[:, -2]
print("\nFirst-5 samples (gap top1-top2):")
for i in range(min(5, len(gap))):
    print(f"  sample {i}: gap={float(gap[i]):.4f}")

# =========================
# 6. Save the calibrated model
# =========================
# Create the output directory first: joblib.dump does not create missing
# parent directories, and failing here would discard the whole training run.
os.makedirs(SAVE_MODEL_DIR, exist_ok=True)
joblib.dump(clf, f"{SAVE_MODEL_DIR}/linear_svc_cso_calibrated.joblib")
print(">> Model saved to linear_svc_cso_calibrated.joblib")

Train shape: (37698, 768)
Test shape : (9425, 768)
>> Training + Calibrating (LinearSVC + Platt sigmoid, cv=5)...
Accuracy : 0.8761
F1-macro : 0.8761

Classification report:
              precision    recall  f1-score   support

           0       0.94      0.90      0.92       730
           1       0.81      0.80      0.81       716
           2       0.86      0.88      0.87       714
           3       0.95      0.94      0.95       734
           4       0.81      0.84      0.83       733
           5       0.88      0.86      0.87       710
           6       0.96      0.96      0.96       735
           7       0.81      0.88      0.84       714
           8       0.86      0.86      0.86       733
           9       0.98      0.97      0.97       726
          10       0.80      0.78      0.79       731
          11       0.93      0.95      0.94       721
          12       0.79      0.79      0.79       728

    accuracy                           0.88      9425
   macro avg  