In [3]:
import os
import re
import joblib
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.metrics import (
    accuracy_score,
    f1_score,
    confusion_matrix
)

# =========================
# CONFIG
# =========================
# Serialized model bundle read with joblib further down; the bundle is
# expected to expose the fitted estimator under the "pipeline" key.
MODEL_PATH = "outputs/LogisticRegression.joblib"   # or LinearSVM.joblib
# CSV file holding the evaluation samples.
DATA_PATH = "data/2.synthetic_wisesight_like_thai_sentiment_hard_5000 (1).csv"
# Directory that receives every artifact written by this script; it is
# created up front so later writes cannot fail on a missing folder.
OUT_DIR = "outputs"
os.makedirs(OUT_DIR, exist_ok=True)

# Names of the CSV columns holding the input text and the gold label.
TEXT_COL = "text"
LABEL_COL = "sentiment"

# =========================
# UTIL
# =========================
def clean(text):
    """Collapse each run of whitespace to one space and trim both ends.

    The argument is converted with ``str()`` first, so non-string values
    are cleaned as their string representation. No other normalization
    (case folding, punctuation removal, tokenization) is applied.
    """
    as_string = str(text)
    single_spaced = re.compile(r"\s+").sub(" ", as_string)
    return single_spaced.strip()

def plot_cm(cm, labels, path):
    """Render a confusion matrix as an annotated heat map and save it.

    Args:
        cm: Square 2-D array indexed as ``cm[row, col]``; one row and one
            column per entry of ``labels``.
        labels: Class names used for both axes' tick labels, in the same
            order as the rows/columns of ``cm``.
        path: Destination passed to ``savefig``.

    The figure is created explicitly and closed in a ``finally`` clause so
    that a failure while drawing or saving does not leave an open figure
    behind, and so that only this figure (not whichever one happens to be
    "current") is closed.
    """
    fig, ax = plt.subplots(figsize=(6, 5))
    try:
        ax.imshow(cm)
        ax.set_title("Confusion Matrix")

        ticks = list(range(len(labels)))
        ax.set_xticks(ticks)
        ax.set_xticklabels(labels, rotation=30)
        ax.set_yticks(ticks)
        ax.set_yticklabels(labels)

        # Write each cell's count at its centre; x is the column, y the row.
        for row in ticks:
            for col in ticks:
                ax.text(col, row, cm[row, col], ha="center", va="center")

        fig.tight_layout()
        fig.savefig(path, dpi=200)
    finally:
        plt.close(fig)

# =========================
# LOAD MODEL
# =========================
# NOTE: joblib files are pickle-based; only load bundles from a trusted
# source. The fitted estimator is stored under the "pipeline" key.
bundle = joblib.load(MODEL_PATH)
pipe = bundle["pipeline"]

# =========================
# LOAD DATA
# =========================
# Keep only the text and label columns, drop rows missing either one,
# then normalize whitespace in the text.
df = (
    pd.read_csv(DATA_PATH)
    .loc[:, [TEXT_COL, LABEL_COL]]
    .dropna()
)
df[TEXT_COL] = df[TEXT_COL].map(clean)

print("Label distribution:")
print(df[LABEL_COL].value_counts(), "\n")

# =========================
# PREDICT
# =========================
y_true = df[LABEL_COL].values
y_pred = pipe.predict(df[TEXT_COL])

acc = accuracy_score(y_true, y_pred)
macro_f1 = f1_score(y_true, y_pred, average="macro")

print("✅ DONE")
print(f"Accuracy : {acc:.4f}")
print(f"Macro-F1 : {macro_f1:.4f}")

# =========================
# CONFUSION MATRIX
# =========================
# Build the axis from the union of true and predicted labels. With only the
# true labels, any sample predicted as a class absent from the ground truth
# would be silently left out of the matrix, making it disagree with the
# accuracy / macro-F1 figures printed above.
labels = sorted(set(y_true) | set(y_pred))
cm = confusion_matrix(y_true, y_pred, labels=labels)
plot_cm(cm, labels, os.path.join(OUT_DIR, "confusion_matrix.png"))

# =========================
# ERROR ANALYSIS
# =========================
# Per-sample confidence of the predicted class. Probabilistic models expose
# predict_proba; margin-based models (e.g. a linear SVM) only expose
# decision_function, whose scores are not probabilities but still order
# samples from least to most certain. Zeros are the last resort when the
# model offers neither.
if hasattr(pipe, "predict_proba"):
    proba = pipe.predict_proba(df[TEXT_COL])
    conf = proba.max(axis=1)
elif hasattr(pipe, "decision_function"):
    scores = np.asarray(pipe.decision_function(df[TEXT_COL]))
    # Binary problems yield one signed score per sample; multi-class
    # problems yield one score per class.
    conf = np.abs(scores) if scores.ndim == 1 else scores.max(axis=1)
else:
    conf = np.zeros(len(y_pred))

err_df = pd.DataFrame({
    "text": df[TEXT_COL],
    "true_label": y_true,
    "pred_label": y_pred,
    "confidence": conf
})

# Keep only the misclassified samples.
err_df = err_df[err_df["true_label"] != err_df["pred_label"]]

# Order from lowest to highest confidence. A stable sort keeps tied rows in
# their original order, so the selection below is reproducible.
err_df = err_df.sort_values("confidence", kind="mergesort")

# Analyse more than the 10 exported examples so that negation / sarcasm
# cases have a chance to show up in the summary.
errN = err_df.head(30).copy()

# =========================
# ERROR TYPE HEURISTIC
# =========================
# NOTE: these literals must be stored as real UTF-8 Thai / emoji text.
# Mis-encoded (mojibake) versions can never match correctly decoded input,
# which silently disables the heuristics below.

# Thai negation markers ("not", "did not", "there is no", "not really",
# "not at all", "is not").
NEGATION_WORDS = [
    "ไม่", "ไม่ได้", "ไม่มี", "ไม่ค่อย", "ไม่เลย", "ไม่ใช่"
]

# Praise phrases, laughter ("555" reads as "ha ha ha" in Thai), laughing
# emoji and sentence-final particles treated as sarcasm / irony cues.
SARCASM_PATTERNS = [
    "ดีมากนะ", "สุดยอดเลย", "เก่งมาก",
    "555", "ฮ่าๆ", "😂", "🤣",
    "เนอะ", "อะนะ", "เหอะ"
]

# Repetition mark, elongated spellings and slang intensifiers.
INFORMAL_NOISE = [
    "ๆ", "มากกก", "จังงง", "โคตร", "สุดๆ"
]


def classify_error(text):
    """Assign a coarse error category to a misclassified text.

    The checks are plain substring tests applied in priority order; the
    first one that matches wins:

    1. any ``SARCASM_PATTERNS`` entry  -> ``"sarcasm / irony"``
    2. any ``NEGATION_WORDS`` entry    -> ``"negation"``
    3. any ``INFORMAL_NOISE`` entry    -> ``"informal / noise"``
    4. three or fewer whitespace-separated tokens
                                       -> ``"ambiguous / short text"``
    5. otherwise                       -> ``"other"``

    Args:
        text: The sample text; converted with ``str()`` before matching.

    Returns:
        One of the five category strings listed above.
    """
    text = str(text)

    # 1) sarcasm / irony -- checked first so it takes precedence over the
    #    negation and informal cues that such texts may also contain.
    if any(p in text for p in SARCASM_PATTERNS):
        return "sarcasm / irony"

    # 2) negation
    if any(w in text for w in NEGATION_WORDS):
        return "negation"

    # 3) informal / noise
    if any(n in text for n in INFORMAL_NOISE):
        return "informal / noise"

    # 4) ambiguous / short
    # NOTE(review): tokens are counted by whitespace only. Thai is usually
    # written without spaces between words, so long sentences can still
    # land in this bucket -- confirm whether that is intended.
    if len(text.split()) <= 3:
        return "ambiguous / short text"

    return "other"

# Tag every selected misclassification with a heuristic error category.
errN["error_type"] = errN["text"].apply(classify_error)

# =========================
# SAVE OUTPUTS
# =========================

# For the web page: export only the first 10 examples. "utf-8-sig" adds a
# BOM to the CSV file.
errN.head(10).to_csv(
    os.path.join(OUT_DIR, "misclassified_10.csv"),
    index=False,
    encoding="utf-8-sig"
)

# The summary counts error types over every row of errN, not only the 10
# exported above. Counts are coerced to plain int so json.dump never
# receives a NumPy integer, which it cannot serialize.
summary = {
    error_type: int(count)
    for error_type, count in errN["error_type"].value_counts().items()
}

with open(os.path.join(OUT_DIR, "error_summary.json"), "w", encoding="utf-8") as f:
    json.dump(summary, f, ensure_ascii=False, indent=2)

print("\n📊 Error type summary:")
for k, v in summary.items():
    print(f"- {k}: {v}")


Label distribution:
sentiment
Positive    2000
Neutral     1500
Negative    1500
Name: count, dtype: int64 

✅ DONE
Accuracy : 0.9948
Macro-F1 : 0.9947

📊 Error type summary:
- ambiguous / short text: 16
- informal / noise: 9
- sarcasm / irony: 1
