<a href="https://colab.research.google.com/github/alycnttk21clc/NLP-SPAM-FACEBOOK/blob/main/Untitled5_(3).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import re
import unicodedata

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_curve, auc, precision_recall_curve, f1_score
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler, LabelEncoder

In [2]:
# Location of the dataset CSV that the loading cell reads with pandas.
CSV_PATH = "/content/law_qa_dataset_with_answers_plus_300_UTF8.csv"

In [3]:
# Candidate encodings, tried in order until one of them reads the CSV.
encodings_to_try = ["utf-8-sig", "utf-8", "cp1258", "latin-1"]
df = None
for enc in encodings_to_try:
    try:
        df = pd.read_csv(CSV_PATH, encoding=enc, low_memory=False)
        print("ƒê√£ ƒë·ªçc file v·ªõi encoding:", enc)
    except Exception as e:
        # Report the failure and move on to the next candidate encoding.
        print("L·ªói v·ªõi", enc, ":", e)
    else:
        # First successful read wins.
        break
else:
    # Every candidate failed, so there is no frame to work with.
    raise ValueError("Kh√¥ng ƒë·ªçc ƒë∆∞·ª£c file CSV v·ªõi encodings th·ª≠.")
print("Columns:", df.columns.tolist())
display(df.head(5))

ƒê√£ ƒë·ªçc file v·ªõi encoding: utf-8-sig
Columns: ['caption', 'comment', 'label']


Unnamed: 0,caption,comment,label
0,T∆∞ v·∫•n ph√°p lu·∫≠t H√¥n nh√¢n v√† gia ƒë√¨nh,"Khi k·∫øt h√¥n, nam t·ª´ ƒë·ªß 20 tu·ªïi, n·ªØ t·ª´ ƒë·ªß 18 tu...",0
1,T∆∞ v·∫•n ph√°p lu·∫≠t H√¥n nh√¢n v√† gia ƒë√¨nh,Ng∆∞·ªùi ch∆∞a ƒë·ªß tu·ªïi v·∫´n c√≥ th·ªÉ ƒëƒÉng k√Ω k·∫øt h√¥n ...,1
2,H·ªèi ƒë√°p Di tr√∫,Ng∆∞·ªùi lao ƒë·ªông b·ªã tai n·∫°n trong khi l√†m vi·ªác ƒë...,0
3,H·ªèi ƒë√°p Di tr√∫,Ng∆∞·ªùi lao ƒë·ªông t·ª± ch·ªãu chi ph√≠ ƒëi·ªÅu tr·ªã khi b·ªã...,1
4,T∆∞ v·∫•n mi·ªÖn ph√≠ lƒ©nh v·ª±c Doanh nghi·ªáp,Ng∆∞·ªùi lao ƒë·ªông t·ª± ch·ªãu chi ph√≠ ƒëi·ªÅu tr·ªã khi b·ªã...,1


In [4]:
# 4. Text cleaning and accent-stripping helpers
def remove_accents(txt):
    """Return `txt` with diacritics removed.

    The text is NFKD-decomposed and every combining mark is dropped. The
    Vietnamese letters U+0111 / U+0110 (d with stroke) have no Unicode
    decomposition, so they are mapped to plain "d" / "D" explicitly.

    Args:
        txt: Value to process. Non-string values are returned unchanged.

    Returns:
        The accent-free string, or `txt` itself when it is not a `str`.
    """
    if not isinstance(txt, str):
        return txt
    decomposed = unicodedata.normalize("NFKD", txt)
    stripped = "".join(ch for ch in decomposed if not unicodedata.combining(ch))
    # NFKD leaves the stroked d untouched; fold it by hand.
    return stripped.replace("\u0111", "d").replace("\u0110", "D")

def clean_text(s, remove_accent=False):
    """Normalize a raw caption/comment string for vectorization.

    Steps, in order: trim, replace URLs with `<URL>`, replace phone-like digit
    runs with `<PHONE>`, replace every character that is not a word character,
    a character in U+00C0..U+1EF9, `<`, `>` or whitespace with a space,
    collapse whitespace, lowercase, and optionally strip accents.

    Args:
        s: Text to clean. Any non-string value yields "".
        remove_accent: When true, the result is passed through `remove_accents`.

    Returns:
        The cleaned, lowercased string (placeholders become `<url>` / `<phone>`).
    """
    if not isinstance(s, str):
        return ""
    s = s.strip()
    s = re.sub(r"http\S+|www\.\S+", " <URL> ", s)
    s = re.sub(r"\+?\d[\d\-\s]{7,}\d", " <PHONE> ", s)
    # The range is written with escapes: the literal form was mis-encoded into
    # a reversed range that made the pattern fail to compile.
    s = re.sub(r"[^\w\u00C0-\u1EF9<>\s]", " ", s)
    s = re.sub(r"\s+", " ", s).strip()
    s = s.lower()
    if remove_accent:
        s = remove_accents(s)
    return s

In [5]:
# 5. Join caption + comment and keep the result as raw_text
def _first_present(candidates, available):
    """Return the first name from `candidates` found in `available`, else None."""
    for name in candidates:
        if name in available:
            return name
    return None


caption_col = _first_present(["caption", "Caption"], df.columns)
comment_col = _first_present(["comment", "coment", "comments", "Comment", "Coment"], df.columns)
if comment_col is None:
    raise KeyError("Kh√¥ng t√¨m th·∫•y c·ªôt comment/coment trong file.")

# Missing cells become empty strings so the concatenation never yields NaN.
comment_as_text = df[comment_col].fillna("").astype(str)
if caption_col:
    df["raw_text"] = df[caption_col].fillna("").astype(str) + " " + comment_as_text
else:
    df["raw_text"] = comment_as_text

# Pass remove_accent=True instead to strip accents as well.
df["text"] = df["raw_text"].apply(clean_text, remove_accent=False)


In [6]:
# 6. Safe label encoding
if "label" not in df.columns:
    raise KeyError("Kh√¥ng th·∫•y c·ªôt 'label' trong file.")

# Labels are cast to str first so the encoder sees one uniform type.
label_text = df["label"].astype(str)
le = LabelEncoder()
le.fit(label_text)
df["label_num"] = le.transform(label_text)
print("Label classes:", le.classes_)

Label classes: ['0' '1']


In [7]:
# 7. Extract numeric features (computed on raw_text so upper_ratio is accurate)
def extract_numeric_features_from_text(raw):
    """Compute hand-crafted numeric features for one raw text.

    Args:
        raw: The un-cleaned text. NaN/None is treated as the empty string;
            anything else is converted with `str`.

    Returns:
        A `pd.Series` with, in order: has_url, has_phone, num_digits,
        num_words, num_exclaim, num_question, num_at, num_hash, upper_ratio,
        has_discount_kw. The `has_*` entries are 0/1 flags.
    """
    s = "" if pd.isna(raw) else str(raw)
    lower = s.lower()

    # Guard the denominator so an empty string yields a ratio of 0.
    upper_ratio = sum(1 for c in s if c.isupper()) / (len(s) if len(s) > 0 else 1)

    # Either the placeholder token or a literal URL / long digit run counts.
    has_url = int(bool(re.search(r'<url>|http[s]?://|www\.', lower)))
    has_phone = int(bool(re.search(r'<phone>|\d{9,}', lower)))

    # The range is written with escapes: the literal form was mis-encoded into
    # a reversed range that made the pattern fail to compile.
    num_words = len(re.findall(r"[\w\u00C0-\u1EF9]+", s))

    # Promotional keywords (Vietnamese entries, unaccented: giam, khuyen mai,
    # mien phi, ban, gia re, tang). Escapes keep them intact whatever the
    # file encoding; the mis-encoded literals could never match real text.
    kw_pattern = (
        "gi\u1ea3m|sale|khuy\u1ebfn m\u00e3i|mi\u1ec5n ph\u00ed|voucher|mua ngay|"
        "click|link|inbox|zalo|b\u00e1n|gi\u00e1 r\u1ebb|t\u1eb7ng"
    )
    has_discount_kw = int(bool(re.search(kw_pattern, lower)))

    return pd.Series({
        "has_url": has_url,
        "has_phone": has_phone,
        "num_digits": sum(c.isdigit() for c in s),
        "num_words": num_words,
        "num_exclaim": s.count('!'),
        "num_question": s.count('?'),
        "num_at": s.count('@'),
        "num_hash": s.count('#'),
        "upper_ratio": upper_ratio,
        "has_discount_kw": has_discount_kw
    })

num_feats = df["raw_text"].apply(extract_numeric_features_from_text)

# Drop feature columns left by an earlier run of this cell before concatenating,
# so re-executing the cell does not create duplicate column names.
df = pd.concat(
    [
        df.drop(columns=num_feats.columns, errors="ignore").reset_index(drop=True),
        num_feats.reset_index(drop=True),
    ],
    axis=1,
)

numeric_cols = num_feats.columns.tolist()
print("Numeric features:", numeric_cols)
display(df[numeric_cols + ["label_num"]].head(5))


Numeric features: ['has_url', 'has_phone', 'num_digits', 'num_words', 'num_exclaim', 'num_question', 'num_at', 'num_hash', 'upper_ratio', 'has_discount_kw']


Unnamed: 0,has_url,has_phone,num_digits,num_words,num_exclaim,num_question,num_at,num_hash,upper_ratio,has_discount_kw,label_num
0,0.0,0.0,4.0,27.0,0.0,0.0,0.0,0.0,0.027778,0.0,0
1,0.0,0.0,0.0,26.0,0.0,0.0,0.0,0.0,0.028037,0.0,1
2,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.031915,0.0,0
3,0.0,0.0,0.0,21.0,0.0,0.0,0.0,0.0,0.034483,0.0,1
4,0.0,0.0,0.0,25.0,0.0,0.0,0.0,0.0,0.027273,1.0,1


In [8]:
# 8. Prepare X, y and make a stratified train/test split
feature_columns = ["text", *numeric_cols]
X = df[feature_columns].copy()
y = df["label_num"].copy()

# Check the label distribution first
print("Overall class counts:", np.bincount(y))

split_parts = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
X_train, X_test, y_train, y_test = split_parts
print("Class distribution train:\n", y_train.value_counts())
print("Class distribution test:\n", y_test.value_counts())


Overall class counts: [404 396]
Class distribution train:
 label_num
0    323
1    317
Name: count, dtype: int64
Class distribution test:
 label_num
0    81
1    79
Name: count, dtype: int64


In [9]:
# 9. Optionally oversample the training split
ros = RandomOverSampler(random_state=42)
resampled_X, y_train_res = ros.fit_resample(X_train, y_train)
# Rebuild a DataFrame carrying the training column names.
X_train_res = pd.DataFrame(resampled_X, columns=X_train.columns)
print("After oversample train dist:", np.bincount(y_train_res))


After oversample train dist: [323 323]


In [10]:
# Tokenizer + TFIDF
def vi_tokenizer(text):
    """Split `text` into word tokens.

    A token is a maximal run of word characters or characters in the range
    U+00C0..U+1EF9. The range is written with escapes because the literal
    form was mis-encoded into a reversed range that failed to compile.

    Args:
        text: The string to tokenize.

    Returns:
        A list of token strings, in order of appearance.
    """
    return re.findall(r"[\w\u00C0-\u1EF9]+", text)

# Word-level TF-IDF: custom tokenizer, 1-3 grams, capped vocabulary.
word_tfidf_options = dict(
    tokenizer=vi_tokenizer,
    lowercase=False,
    max_df=0.95,
    min_df=2,
    ngram_range=(1, 3),
    sublinear_tf=True,
    max_features=20000,
)
tf_word = TfidfVectorizer(**word_tfidf_options)

# Character-level TF-IDF: 3-5 grams within word boundaries.
tf_char = TfidfVectorizer(analyzer='char_wb', ngram_range=(3, 5), min_df=3)

# Output columns follow this order: word TF-IDF, char TF-IDF, scaled numerics.
feature_branches = [
    ("tf_word", tf_word, "text"),
    ("tf_char", tf_char, "text"),
    ("num", MaxAbsScaler(), numeric_cols),
]
preprocessor = ColumnTransformer(transformers=feature_branches, remainder='drop')

In [None]:
# Pipeline + GridSearch with MultinomialNB
# The oversampler is a pipeline step, so it is re-fitted on the training part
# of every CV fold. Oversampling before the split would let duplicated rows
# land in both the training and validation folds and inflate the CV score.
pipeline = ImbPipeline([
    ("ros", RandomOverSampler(random_state=42)),
    ("pre", preprocessor),
    ("clf", MultinomialNB())
])

param_grid = {
    "pre__tf_word__ngram_range": [(1,2),(1,3)],
    "pre__tf_word__max_df": [0.9, 0.95],
    "clf__alpha": [0.1, 0.5, 1.0]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring="f1", n_jobs=-1, verbose=1)

# Fit on the original training split; resampling happens inside the pipeline.
grid.fit(X_train, y_train)

# The refitted pipeline still exposes named_steps['pre'] / ['clf'] and
# predict_proba; the sampler step is skipped at prediction time.
best_model = grid.best_estimator_
print("Best params:", grid.best_params_)
print("Best CV f1:", grid.best_score_)


Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [None]:
#ƒê√°nh gi√° tr√™n test
y_pred = best_model.predict(X_test)
# Class-1 probabilities, when the model can produce them
if hasattr(best_model, "predict_proba"):
    y_prob = best_model.predict_proba(X_test)[:, 1]
else:
    y_prob = None

print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1:", f1_score(y_test, y_pred))

# Names shown in the report, taken from the fitted label encoder.
if len(le.classes_) == 2:
    report_names = le.inverse_transform([0, 1])
else:
    report_names = le.classes_
report_text = classification_report(y_test, y_pred, target_names=report_names)
print("\nClassification report:\n", report_text)

cm = confusion_matrix(y_test, y_pred)
plt.figure(figsize=(5, 4))
heatmap_style = dict(annot=True, fmt="d", cmap="Blues")
sns.heatmap(cm, xticklabels=le.classes_, yticklabels=le.classes_, **heatmap_style)
plt.title("Confusion matrix")
plt.show()

In [None]:
# ROC/PR (only drawn when y_test contains both classes)
unique_ytest = np.unique(y_test)
if len(unique_ytest) < 2:
    print("C·∫£nh b√°o: y_test ch·ªâ c√≥ 1 nh√£n ({}). ROC/PR s·∫Ω kh√¥ng h·ª£p l·ªá.".format(unique_ytest))
elif y_prob is None:
    print("Model kh√¥ng h·ªó tr·ª£ predict_proba(), b·ªè qua ROC/PR.")
else:
    # ROC curve with the chance diagonal for reference
    fpr, tpr, _ = roc_curve(y_test, y_prob)
    roc_auc = auc(fpr, tpr)
    print("AUC:", roc_auc)
    plt.figure(figsize=(5, 5))
    plt.plot(fpr, tpr, label=f"AUC={roc_auc:.4f}")
    plt.plot([0, 1], [0, 1], '--', color='gray')
    plt.legend()
    plt.show()

    # Precision-recall curve
    prec, rec, _ = precision_recall_curve(y_test, y_prob)
    plt.figure(figsize=(5, 5))
    plt.plot(rec, prec)
    plt.title("Precision-Recall")
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.show()


In [None]:
import pandas as pd
from sklearn.model_selection import cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

# 1. Re-read the dataset into its own variable so the `df`, `X` and `y`
#    built by the earlier cells are not overwritten.
#    "utf-8-sig" reads UTF-8 with or without a byte-order mark.
baseline_df = pd.read_csv(CSV_PATH, encoding="utf-8-sig")

# 2. Split features and labels
#    Missing comments become "" because TfidfVectorizer rejects NaN documents.
X_baseline = baseline_df["comment"].fillna("").astype(str)   # comment text
y_baseline = baseline_df["label"]                            # target label

# 3. TF-IDF + Naive Bayes pipeline
model = Pipeline([
    ("tfidf", TfidfVectorizer(max_features=5000, ngram_range=(1,2))),
    ("clf", MultinomialNB())
])

# 4. Cross-validation (5-fold)
print("Cross-val F1 (5-fold) on full data (this may be slow):")
scores = cross_val_score(model, X_baseline, y_baseline, cv=5, scoring="f1", n_jobs=-1)
print("F1 scores:", scores)
print("Mean F1:", scores.mean())

In [None]:
# Persist the tuned pipeline to disk.
model_path = "/content/best_spam_nb.joblib"
joblib.dump(best_model, model_path)
print(f"Saved model to {model_path}")

In [None]:
# Prepare the handles used by the explanation helpers
pre = best_model.named_steps['pre']
clf = best_model.named_steps['clf']

# Fitted vectorizers from inside the column transformer.
tf_word_vect, tf_char_vect = (
    pre.named_transformers_[branch] for branch in ('tf_word', 'tf_char')
)

# Number of word-level TF-IDF features.
vocab_size = tf_word_vect.get_feature_names_out().shape[0]

def make_features_row_from_caption_comment(caption, comment):
    """Build the one-row DataFrame the pipeline expects for a single input.

    Args:
        caption: Caption value; None is treated as "".
        comment: Comment value; None is treated as "".

    Returns:
        A DataFrame with one row: a `text` column holding the cleaned
        caption + " " + comment, followed by the numeric feature columns
        computed from the un-cleaned text.
    """
    pieces = ["" if part is None else str(part) for part in (caption, comment)]
    raw = " ".join(pieces)
    record = {"text": clean_text(raw, remove_accent=False)}
    # Numeric features come from the raw text, not the cleaned one.
    record.update(extract_numeric_features_from_text(raw))
    return pd.DataFrame([record])

def explain_top_tokens(text, top_k=8):
    """Describe which word tokens of `text` score highest for class 1.

    Each token present in the word TF-IDF vector is scored as its TF-IDF
    weight times the difference between the class-1 and class-0 entries of
    the classifier's `feature_log_prob_`. Only the first `vocab_size` columns
    are used, because the word features are the first block emitted by the
    column transformer.

    Args:
        text: Raw text to explain; it is cleaned with `clean_text` first.
        top_k: Maximum number of tokens to report.

    Returns:
        A comma-separated string of "token (score)" entries sorted by
        descending score, or a message string when nothing can be explained.
        Any exception is caught and reported as a message string.
    """
    try:
        cleaned = clean_text(text, remove_accent=False)
        weights = tf_word_vect.transform([cleaned]).toarray()[0]  # length = vocab_size
        if weights.sum() == 0:
            return "Kh√¥ng c√≥ token TF-IDF n√†o kh·ªõp."
        if not hasattr(clf, "feature_log_prob_"):
            return "M√¥ h√¨nh hi·ªán t·∫°i kh√¥ng c√≥ feature_log_prob_."

        log_probs = clf.feature_log_prob_  # (n_classes, n_total_features)
        if log_probs.shape[1] < vocab_size:
            return "Kh√¥ng ƒë·ªß feature_log_prob_ cho word-vocab (b·∫•t th∆∞·ªùng)."

        # Keep only the word-feature columns.
        word_block = log_probs[:, :vocab_size]
        if word_block.shape[0] == 1:
            # Single-class model (unusual): no contrast is available.
            delta = word_block[0]
        else:
            delta = word_block[1] - word_block[0]

        vocab = tf_word_vect.get_feature_names_out()
        scored = [
            (vocab[i], weights[i] * delta[i])
            for i in np.flatnonzero(weights > 0)
        ]
        scored.sort(key=lambda pair: pair[1], reverse=True)
        return ", ".join(f"{tok} ({score:.3f})" for tok, score in scored[:top_k])
    except Exception as e:
        return f"L·ªói explain: {e}"


In [None]:
def predict_and_explain(caption, comment, threshold=0.5):
    """Classify one caption/comment pair and explain the decision.

    Text containing a link is reported as SPAM without consulting the model.
    Otherwise the fitted pipeline scores a feature row built by
    `make_features_row_from_caption_comment`, and the probability of class 1
    is compared against `threshold`.

    Args:
        caption: Caption value; None is treated as "".
        comment: Comment value; None is treated as "".
        threshold: Class-1 probability at or above which the label is SPAM.

    Returns:
        A multi-line report string.
    """
    caption_text = "" if caption is None else str(caption)
    comment_text = "" if comment is None else str(comment)
    text = (caption_text + " " + comment_text).strip()

    # A link in the text is treated as spam outright.
    if re.search(r"http[s]?://|www\.", text.lower()):
        return "K·∫øt qu·∫£ d·ª± ƒëo√°n: SPAM (ph√°t hi·ªán c√≥ ƒë∆∞·ªùng link)\n\nX√°c su·∫•t SPAM: 100.00%"

    # No link: score with the model. The pipeline was fitted on a DataFrame
    # with a `text` column plus numeric features, so build that row here
    # rather than passing a bare TF-IDF matrix.
    row = make_features_row_from_caption_comment(caption_text, comment_text)
    prob = best_model.predict_proba(row)[0][1]
    label = "SPAM" if prob >= threshold else "HAM"

    # explain_top_tokens takes the raw text and cleans it itself.
    top_tokens = explain_top_tokens(text)

    return (
        f"K·∫øt qu·∫£ d·ª± ƒëo√°n: {label}\n"
        f"X√°c su·∫•t SPAM: {prob:.2%}\n"
        f"Ng∆∞·ª°ng ƒëang ch·ªçn: {threshold:.2f}\n\n"
        f"C√°c token ·∫£nh h∆∞·ªüng m·∫°nh nh·∫•t:\n" + top_tokens
    )


In [None]:
import gradio as gr
import re

def predict_and_explain(caption, comment, threshold=0.5):
    """Classify one caption/comment pair and explain the decision.

    Text containing a link (http, https, www) is reported as SPAM without
    consulting the model. Otherwise the fitted pipeline scores the feature
    row and the class-1 probability is compared against `threshold`; the
    reported class name comes from the label encoder.

    Args:
        caption: Caption value; None is treated as "" and other values are
            converted with `str`.
        comment: Comment value, normalized the same way.
        threshold: Class-1 probability at or above which class 1 is reported.

    Returns:
        A multi-line report string.
    """
    # Normalize once so every later use sees plain strings. Concatenating the
    # raw arguments raised TypeError for None or non-string input.
    caption = "" if caption is None else str(caption)
    comment = "" if comment is None else str(comment)
    text = (caption + " " + comment).strip()

    # A link (http, https, www) is treated as spam outright.
    if re.search(r"http[s]?://|www\.", text.lower()):
        return (
            "K·∫øt qu·∫£: SPAM (ph√°t hi·ªán c√≥ ƒë∆∞·ªùng link)\n"
            "X√°c su·∫•t spam: 100.00%\n\n"
            "L√Ω do: Comment ch·ª©a ƒë∆∞·ªùng d·∫´n URL ‚Üí ƒë∆∞·ª£c x√°c ƒë·ªãnh l√† spam."
        )

    # No link: run the model as usual.
    row = make_features_row_from_caption_comment(caption, comment)
    prob = best_model.predict_proba(row)[0, 1] if hasattr(best_model, "predict_proba") else None

    # Apply the user-selected threshold; without a probability, report class 0.
    predicted_index = 1 if (prob is not None and prob >= threshold) else 0
    pred = le.inverse_transform([predicted_index])[0]
    tokens = explain_top_tokens(text)

    prob_txt = "N/A" if prob is None else f"{prob:.2%}"

    return (
        f"K·∫øt qu·∫£: {pred}\n"
        f"X√°c su·∫•t spam: {prob_txt}\n"
        f"Ng∆∞·ª°ng ƒëang d√πng: {threshold:.2f}\n\n"
        f"Top token ·∫£nh h∆∞·ªüng:\n{tokens}"
    )


# Input widgets, in the positional order predict_and_explain expects.
caption_box = gr.Textbox(label="Caption (n·∫øu c√≥)")
comment_box = gr.Textbox(label="Comment / N·ªôi dung c·∫ßn ki·ªÉm tra")
threshold_slider = gr.Slider(
    minimum=0.1,
    maximum=0.9,
    value=0.5,
    step=0.05,
    label="Ng∆∞·ª°ng quy·∫øt ƒë·ªãnh (Threshold)",
)
result_box = gr.Textbox(label="K·∫øt qu·∫£")

demo = gr.Interface(
    fn=predict_and_explain,
    inputs=[caption_box, comment_box, threshold_slider],
    outputs=result_box,
    title="Nh·∫≠n di·ªán Spam trong b√¨nh lu·∫≠n Facebook",
    description="Nh·∫≠p caption + comment ƒë·ªÉ d·ª± ƒëo√°n v√† gi·∫£i th√≠ch. C√≥ th·ªÉ ƒëi·ªÅu ch·ªânh ng∆∞·ª°ng x√°c su·∫•t ƒë·ªÉ quy·∫øt ƒë·ªãnh SPAM / HAM.",
)

# share=True asks Gradio for a shareable link in addition to the local one.
demo.launch(share=True)

