In [None]:
!pip install -q gradio scikit-learn joblib matplotlib seaborn xgboost imbalanced-learn

import re, unicodedata
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import joblib

from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, precision_recall_curve, f1_score
from imblearn.over_sampling import RandomOverSampler

CSV_PATH = "/content/facebook_captions_comments_spam_dataset (1).csv"  # change if the file lives elsewhere

# Try a short list of encodings, most specific first. "latin-1" can decode any
# byte sequence, so it acts as the last-resort fallback.
encodings_to_try = ["utf-8-sig", "utf-8", "cp1258", "latin-1"]
df = None
for enc in encodings_to_try:
    try:
        df = pd.read_csv(CSV_PATH, encoding=enc, low_memory=False)
    except UnicodeError as e:
        # Only a decoding failure justifies retrying with another encoding.
        # A missing file or a malformed CSV is not an encoding problem and is
        # allowed to propagate with its real error message.
        print("Lỗi với", enc, ":", e)
    else:
        print("Đã đọc file với encoding:", enc)
        break

# Defensive guard: reached only if every encoding failed to decode the file.
if df is None:
    raise ValueError("Không đọc được file CSV với encodings thử.")

print("Columns:", df.columns.tolist())
display(df.head(5))

def remove_accents(txt):
    """Strip diacritics from *txt*, returning plain base letters.

    The text is decomposed with NFKD and every combining mark is dropped.
    The Vietnamese letters U+0111/U+0110 (d with stroke) have no Unicode
    decomposition, so they are mapped to ``d``/``D`` explicitly.

    Args:
        txt: Value to process. Non-string values are returned unchanged.

    Returns:
        The accent-free string, or *txt* itself when it is not a ``str``.
    """
    if not isinstance(txt, str):
        return txt
    decomposed = unicodedata.normalize("NFKD", txt)
    stripped = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # NFKD leaves the stroked d untouched, so handle it by hand.
    return stripped.replace("\u0111", "d").replace("\u0110", "D")

def clean_text(s, remove_accent=False):
    """Normalise a raw caption/comment string for vectorisation.

    Steps, in order: trim, replace URLs with ``<URL>``, replace phone-like
    digit runs with ``<PHONE>``, blank out every character that is not a word
    character, a Latin letter with diacritics (U+00C0..U+1EF9), ``<``, ``>``
    or whitespace, collapse whitespace, lowercase, and optionally strip
    accents. Because lowercasing happens after the substitutions, the
    placeholders appear as ``<url>`` and ``<phone>`` in the result.

    Args:
        s: Raw text. Any non-string value yields an empty string.
        remove_accent: When true, pass the result through ``remove_accents``.

    Returns:
        The cleaned, lowercased string.
    """
    if not isinstance(s, str):
        return ""
    s = s.strip()
    s = re.sub(r"http\S+|www\.\S+", " <URL> ", s)
    s = re.sub(r"\+?\d[\d\-\s]{7,}\d", " <PHONE> ", s)
    # The upper bound of this range had been mangled into "?" by a bad
    # re-encoding, which made the class a reversed range and raised re.error.
    # Escapes keep the pattern safe from future encoding damage.
    s = re.sub(r"[^\w\u00C0-\u1EF9<>\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    s = s.lower()
    if remove_accent:
        s = remove_accents(s)
    return s

# Resolve the caption (optional) and comment (required) columns, accepting the
# spelling/capitalisation variants this script knows about. The first match wins.
caption_col = next((c for c in ("caption", "Caption") if c in df.columns), None)
comment_col = next(
    (c for c in ("comment", "coment", "comments", "Comment", "Coment") if c in df.columns),
    None,
)

if comment_col is None:
    raise KeyError("Không tìm thấy cột comment/coment trong file.")

# Model input is "caption + comment" when a caption column exists, otherwise
# the comment alone. Missing cells become empty strings.
if caption_col:
    df["text"] = df[caption_col].fillna("").astype(str) + " " + df[comment_col].fillna("").astype(str)
else:
    df["text"] = df[comment_col].fillna("").astype(str)

df["text"] = df["text"].apply(lambda z: clean_text(z, remove_accent=False))

if "label" not in df.columns:
    raise KeyError("Không thấy cột 'label' trong file.")

if pd.api.types.is_numeric_dtype(df["label"]):
    # Already numeric (or boolean): use the values as they are.
    df["label_num"] = pd.to_numeric(df["label"], errors="coerce")
else:
    # Textual labels: normalise case and surrounding whitespace so that
    # "Spam", " spam " and "SPAM" are all recognised, then fall back to a
    # numeric parse for labels stored as "0"/"1" strings.
    label_text = df["label"].astype(str).str.strip().str.lower()
    df["label_num"] = label_text.map({"ham": 0, "spam": 1})
    df["label_num"] = df["label_num"].fillna(pd.to_numeric(label_text, errors="coerce"))

# Unrecognised labels are still treated as ham (0) so the rest of the script
# keeps working, but the count is reported instead of being hidden.
n_unmapped = int(df["label_num"].isna().sum())
if n_unmapped:
    print(f"Warning: {n_unmapped} row(s) have an unrecognised label and are treated as ham (0).")
df["label_num"] = df["label_num"].fillna(0).astype(int)

print("Sample text after clean:")
display(df[["text","label","label_num"]].head(5))

def extract_numeric_features_from_text(raw):
    """Compute hand-crafted numeric spam indicators for one text.

    Args:
        raw: The text to analyse. Missing values (``None``/NaN) are treated
            as an empty string; anything else is converted with ``str``.

    Returns:
        A ``pandas.Series`` with the keys ``has_url``, ``has_phone``,
        ``num_digits``, ``num_words``, ``num_exclaim``, ``num_question``,
        ``num_at``, ``num_hash``, ``upper_ratio`` and ``has_discount_kw``.
    """
    s = "" if pd.isna(raw) else str(raw)
    lower = s.lower()
    # Recognise both the placeholders inserted by the cleaning step and the
    # raw forms, so the function works on cleaned and uncleaned text alike.
    has_url = int(bool(re.search(r'<url>|http[s]?://|www\.', lower)))
    has_phone = int(bool(re.search(r'<phone>|\d{9,}', lower)))
    num_digits = sum(c.isdigit() for c in s)
    # The upper bound of this range had been mangled into "?", which made the
    # pattern a reversed range and raised re.error on every call.
    num_words = len(re.findall(r"[\w\u00C0-\u1EF9]+", s))
    num_exclaim = s.count('!')
    num_question = s.count('?')
    num_at = s.count('@')
    num_hash = s.count('#')
    # Guard against division by zero on empty input.
    upper_ratio = sum(1 for c in s if c.isupper()) / (len(s) if len(s) > 0 else 1)
    kw_pattern = r'giảm|sale|khuyến mãi|miễn phí|voucher|mua ngay|click|link|inbox|zalo|bán|giá rẻ|tặng'
    has_discount_kw = int(bool(re.search(kw_pattern, lower)))
    return pd.Series({
        "has_url": has_url,
        "has_phone": has_phone,
        "num_digits": num_digits,
        "num_words": num_words,
        "num_exclaim": num_exclaim,
        "num_question": num_question,
        "num_at": num_at,
        "num_hash": num_hash,
        "upper_ratio": upper_ratio,
        "has_discount_kw": has_discount_kw
    })

# Derive the hand-crafted numeric features from the cleaned text column and
# attach them to the frame side by side (both indexes are reset so rows align
# positionally).
# NOTE(review): df['text'] has already been lowercased and stripped of
# punctuation by the cleaning step, so num_exclaim, num_question, num_at,
# num_hash and upper_ratio look like they will always be 0 here. Changing the
# source column would also require the same change wherever features are
# built at inference time. TODO confirm.
num_feats = df['text'].apply(extract_numeric_features_from_text)
df = pd.concat(
    [df.reset_index(drop=True), num_feats.reset_index(drop=True)],
    axis=1,
)

numeric_cols = [
    "has_url",
    "has_phone",
    "num_digits",
    "num_words",
    "num_exclaim",
    "num_question",
    "num_at",
    "num_hash",
    "upper_ratio",
    "has_discount_kw",
]
print("Feature cols:", numeric_cols)
display(df[numeric_cols + ["label_num"]].head(5))

# Feature matrix = cleaned text + numeric indicators; target = integer label.
X = df[["text", *numeric_cols]].copy()
y = df["label_num"].copy()

# Stratified 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
print("Class distribution train:\n", y_train.value_counts())
print("Class distribution test:\n", y_test.value_counts())

# Randomly oversample the training portion only; the test split is untouched.
ros = RandomOverSampler(random_state=42)
X_train_res, y_train_res = ros.fit_resample(X_train, y_train)
# Make sure the resampled features are a DataFrame with the original columns.
X_train_res = pd.DataFrame(X_train_res, columns=X_train.columns)

print("After oversample train dist:", np.bincount(y_train_res))

def vi_tokenizer(text):
    """Split *text* into word tokens, keeping Vietnamese letters intact.

    A token is a maximal run of word characters or Latin letters with
    diacritics (U+00C0..U+1EF9). Everything else acts as a separator.

    Args:
        text: The string to tokenise.

    Returns:
        A list of token strings, in order of appearance.
    """
    # The upper bound of this range had been mangled into "?", which made the
    # pattern a reversed range and raised re.error on every call.
    return re.findall(r"[\w\u00C0-\u1EF9]+", text)

# Local import: the sampler-aware Pipeline from imbalanced-learn, which this
# script already depends on. It applies samplers during fit only and skips
# them at predict time.
from imblearn.pipeline import Pipeline as ImbPipeline

# Word-level TF-IDF (1-3 grams) using the custom tokenizer. token_pattern=None
# silences scikit-learn's warning that the default pattern is unused when a
# tokenizer is supplied. Text is already lowercased upstream.
tf_word = TfidfVectorizer(
    tokenizer=vi_tokenizer,
    token_pattern=None,
    lowercase=False,
    max_df=0.95,
    min_df=2,
    ngram_range=(1,3),
    sublinear_tf=True,
    max_features=20000,
)
# Character n-grams within word boundaries.
tf_char = TfidfVectorizer(analyzer='char_wb', ngram_range=(3,5), min_df=3)

# Order matters: tf_word comes first, so its columns are the leading entries
# of the classifier's coefficient vector.
preprocessor = ColumnTransformer(
    transformers=[
        ("tf_word", tf_word, "text"),
        ("tf_char", tf_char, "text"),
        ("num", StandardScaler(), numeric_cols)
    ],
    remainder='drop'
)

# Informational fit only, to report vocabulary sizes. GridSearchCV clones the
# pipeline before fitting, so this fit does not affect the search below.
preprocessor.fit(X_train_res)
print("Vocab sizes:", len(preprocessor.named_transformers_['tf_word'].get_feature_names_out()),
      len(preprocessor.named_transformers_['tf_char'].get_feature_names_out()))

# Oversampling is a pipeline step so that it runs inside each CV training
# fold. Oversampling before cross-validation puts copies of the same minority
# rows into both the training and validation folds and inflates the CV score.
pipeline = ImbPipeline([
    ("ros", RandomOverSampler(random_state=42)),
    ("pre", preprocessor),
    ("clf", LogisticRegression(solver="saga", max_iter=5000, class_weight="balanced"))
])

param_grid = {
    "pre__tf_word__ngram_range": [(1,2),(1,3)],
    "pre__tf_word__max_df": [0.9, 0.95],
    "clf__C": [0.1, 1, 5]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring="f1", n_jobs=-1, verbose=1)

# Fit on the un-resampled training split; the "ros" step balances each fold.
grid.fit(X_train, y_train)

best_model = grid.best_estimator_
print("Best params:", grid.best_params_)
print("Best CV f1:", grid.best_score_)

# Hold-out evaluation of the tuned model.
y_pred = best_model.predict(X_test)
if hasattr(best_model, "predict_proba"):
    # Column 1 holds the probability of the positive (spam) class.
    y_prob = best_model.predict_proba(X_test)[:, 1]
else:
    y_prob = None

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))
print(
    "\nClassification report:\n",
    classification_report(y_test, y_pred, target_names=["ham","spam"]),
)

# Confusion matrix heatmap.
class_names = ["ham","spam"]
cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.title("Confusion matrix")
plt.show()

# ROC and precision-recall curves need class probabilities.
if y_prob is not None:
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    print("AUC:", roc_auc)
    plt.figure(figsize=(5,5))
    plt.plot(fpr, tpr, label=f"AUC={roc_auc:.4f}")
    plt.plot([0,1], [0,1], '--', color='gray')  # chance diagonal
    plt.legend()
    plt.show()

    prec, rec, _ = precision_recall_curve(y_test, y_prob)
    plt.figure(figsize=(5,5))
    plt.plot(rec, prec)
    plt.title("Precision-Recall")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.show()

# Extra 5-fold cross-validation of the selected configuration over all rows
# (train and test together); each fold refits a fresh copy of the model.
print("Cross-val F1 (5-fold) on full data (slow):")
full_data_f1 = cross_val_score(best_model, X, y, cv=5, scoring="f1", n_jobs=-1)
print(full_data_f1)

# Persist the fitted pipeline for the demo section.
joblib.dump(best_model, "/content/best_spam_improved.joblib")
print("Saved model to /content/best_spam_improved.joblib")

from gradio import Blocks, Row, Textbox, Button, Markdown

# Reload the fitted pipeline from disk for use by the demo functions below.
# joblib.load unpickles arbitrary Python objects, so only load files you trust.
best_pipe = joblib.load("/content/best_spam_improved.joblib")

def make_features_row_from_caption_comment(caption, comment):
    """Build the single-row feature frame the pipeline expects.

    The caption and comment are joined with a space (``None`` counts as an
    empty string), cleaned with ``clean_text``, and the numeric indicators are
    computed from that cleaned text.

    Args:
        caption: Post caption, or ``None``.
        comment: Comment text, or ``None``.

    Returns:
        A one-row ``pandas.DataFrame`` with a ``text`` column followed by the
        numeric feature columns.
    """
    caption_part = "" if caption is None else str(caption)
    comment_part = "" if comment is None else str(comment)
    cleaned = clean_text(caption_part + " " + comment_part, remove_accent=False)

    # NOTE(review): the numeric features are derived from the cleaned text,
    # not from the raw input.
    record = {"text": cleaned}
    record.update(extract_numeric_features_from_text(cleaned))
    return pd.DataFrame([record])

def explain_top_tokens(text, top_k=8):
    """List the word n-grams with the largest positive contribution.

    Each matched token's TF-IDF weight is multiplied by its classifier
    coefficient. The *top_k* highest products are returned, formatted as
    ``token (score)`` and joined by commas.

    Args:
        text: Raw text to explain; it is cleaned before vectorisation.
        top_k: Maximum number of tokens to report.

    Returns:
        The formatted token list, a fixed message when no vocabulary token
        matches, or an error message if anything raises.
    """
    try:
        cleaned = clean_text(text, remove_accent=False)
        word_vectorizer = best_pipe.named_steps['pre'].named_transformers_['tf_word']
        weights = word_vectorizer.transform([cleaned]).toarray()[0]
        if weights.sum() == 0:
            return "Không có token TF-IDF nào khớp."

        classifier = best_pipe.named_steps['clf']
        vocab = word_vectorizer.get_feature_names_out()
        # Only the first len(vocab) coefficients are used; this assumes the
        # word-level TF-IDF columns come first in the preprocessor output.
        word_coefs = classifier.coef_[0][:len(vocab)]

        contributions = [
            (vocab[i], weights[i] * word_coefs[i])
            for i in np.where(weights > 0)[0]
        ]
        contributions.sort(key=lambda pair: pair[1], reverse=True)
        return ", ".join(f"{tok} ({score:.3f})" for tok, score in contributions[:top_k])
    except Exception as e:
        # Best effort for the UI: show the failure instead of crashing.
        return f"Lỗi explain: {e}"

def predict_and_explain(caption, comment):
    """Classify a caption/comment pair and explain the decision.

    Args:
        caption: Post caption from the UI (may be empty or ``None``).
        comment: Comment text from the UI (may be empty or ``None``).

    Returns:
        A ``(label, probability_text, explanation)`` tuple. When both inputs
        are blank, the label slot carries a prompt and the other two are
        ``None``.
    """
    combined = " ".join([caption or "", comment or ""])
    if not combined.strip():
        return "⚠️ Vui lòng nhập caption hoặc comment!", None, None

    features = make_features_row_from_caption_comment(caption, comment)

    if hasattr(best_pipe, "predict_proba"):
        # Column 1 is the probability of the spam class.
        spam_prob = best_pipe.predict_proba(features)[:,1][0]
    else:
        spam_prob = None
    predicted = best_pipe.predict(features)[0]

    if int(predicted) == 1:
        label = "🚨 Spam"
    else:
        label = "✅ Hợp lệ"

    if spam_prob is not None:
        prob_str = f"{spam_prob*100:.2f}% khả năng là Spam"
    else:
        prob_str = "N/A"

    return label, prob_str, explain_top_tokens(combined)

# Gradio UI: two inputs (caption, comment), three read-only outputs (label,
# spam probability, top contributing tokens) and one button that runs
# predict_and_explain. Components are laid out in the order they are created.
with Blocks() as demo:
    Markdown("## 🔎 Nhận diện Spam (TF-IDF + đặc trưng mở rộng)")
    # Input row: caption and comment side by side.
    with Row():
        caption_in = Textbox(label="Caption bài viết", placeholder="Nhập caption...")
        comment_in = Textbox(label="Bình luận", placeholder="Nhập comment...")
    # Output row: predicted label and spam probability.
    with Row():
        out_label = Textbox(label="Kết quả")
        out_prob = Textbox(label="Xác suất (Spam)")
    out_explain = Textbox(label="Top từ/cụm từ đóng góp")
    btn = Button("🚀 Dự đoán & Giải thích")
    # The three return values of predict_and_explain map onto the outputs in order.
    btn.click(fn=predict_and_explain, inputs=[caption_in, comment_in], outputs=[out_label, out_prob, out_explain])

# share=True asks Gradio for a public share link in addition to the local server.
demo.launch(share=True)
