Mục đích

Thử vài cấu hình TF-IDF (n-grams, min_df, sublinear_tf, stopwords).

Chọn cấu hình tốt nhất (dùng một mô hình nhẹ làm “proxy” để so sánh nhanh bằng cross-validation).

Fit lại TF-IDF với cấu hình đã chọn trên toàn bộ dữ liệu và lưu lại.

In [1]:
from pathlib import Path

import joblib
import numpy as np
import pandas as pd

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression  # proxy nhanh
# (chỉ dùng để CHỌN vectorizer; model KNN nằm ở notebook 02)
from sklearn.model_selection import StratifiedKFold, cross_val_score, cross_validate
from sklearn.pipeline import make_pipeline

# Output directory for the artifacts this notebook writes. The path is relative,
# so it resolves against the kernel's current working directory.
ARTIF_DIR = Path("../app/artifacts")
# Create the directory (and any missing parents); a no-op if it already exists.
ARTIF_DIR.mkdir(parents=True, exist_ok=True)


In [2]:
# Load the labelled corpus: the "Text" column holds the documents and the
# "Label" column holds the class of each document.
df = pd.read_csv("../data/df_file.csv")
X = df["Text"].values
y = df["Label"].values

# Show the frame shape together with the per-class counts.
print(df.shape, df["Label"].value_counts())


(2225, 2) Label
1    511
4    510
0    417
2    401
3    386
Name: count, dtype: int64


In [3]:
# Flatten each document onto a single line: every run of whitespace (newlines,
# carriage returns, tabs, ...) collapses to one space and the ends are trimmed.
# Missing values become an empty document rather than the literal token "nan".
X_clean = ["" if pd.isna(t) else " ".join(str(t).split()) for t in X]


In [4]:
# Candidate TF-IDF settings to compare. Each tuple below is
# (ngram_range, min_df, max_df, sublinear_tf); English stop words are removed
# in every candidate.
candidates = [
    {
        "ngram_range": ngram_range,
        "min_df": min_df,
        "max_df": max_df,
        "sublinear_tf": sublinear_tf,
        "stop_words": "english",
    }
    for ngram_range, min_df, max_df, sublinear_tf in (
        ((1, 1), 2, 0.9, True),
        ((1, 2), 2, 0.9, True),
        ((1, 2), 3, 0.95, True),
        ((1, 2), 2, 0.9, False),
    )
]
print(f"Testing {len(candidates)} TF-IDF configs...")


Testing 4 TF-IDF configs...


In [5]:
# Rank the candidate TF-IDF configs with 5-fold stratified cross-validation,
# using a quick linear classifier as a proxy. The fixed random_state makes the
# folds identical for every candidate, so the scores are directly comparable.
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
results = []

for i, cfg in enumerate(candidates, 1):
    tfidf = TfidfVectorizer(**cfg)
    # Default solver (lbfgs); max_iter is raised above the default of 100.
    proxy_clf = LogisticRegression(max_iter=200)
    # Vectorizer lives inside the pipeline so it is re-fitted on each training
    # fold only (no vocabulary/idf leakage from the validation fold).
    pipe = make_pipeline(tfidf, proxy_clf)

    # Compute both metrics in a single CV pass: each fold is fitted once
    # instead of once per metric.
    cv_scores = cross_validate(
        pipe, X_clean, y, cv=skf, scoring=["f1_macro", "accuracy"], n_jobs=-1
    )
    f1 = cv_scores["test_f1_macro"]
    acc = cv_scores["test_accuracy"]

    results.append({
        "idx": i,
        "cfg": cfg,
        "F1_macro_mean": f1.mean(), "F1_macro_std": f1.std(),
        "ACC_mean": acc.mean(), "ACC_std": acc.std()
    })
    print(f"[{i}] {cfg} -> F1={f1.mean():.4f}±{f1.std():.3f} | ACC={acc.mean():.4f}±{acc.std():.3f}")

# Best config first: macro-F1 is the primary key, accuracy breaks ties.
# NOTE(review): in the recorded run the means differ by less than one fold
# standard deviation, so the ranking between candidates is not decisive.
res_df = pd.DataFrame(results).sort_values(["F1_macro_mean","ACC_mean"], ascending=False)
res_df


[1] {'ngram_range': (1, 1), 'min_df': 2, 'max_df': 0.9, 'sublinear_tf': True, 'stop_words': 'english'} -> F1=0.9801±0.007 | ACC=0.9802±0.007
[2] {'ngram_range': (1, 2), 'min_df': 2, 'max_df': 0.9, 'sublinear_tf': True, 'stop_words': 'english'} -> F1=0.9805±0.007 | ACC=0.9807±0.007
[3] {'ngram_range': (1, 2), 'min_df': 3, 'max_df': 0.95, 'sublinear_tf': True, 'stop_words': 'english'} -> F1=0.9800±0.008 | ACC=0.9802±0.008
[4] {'ngram_range': (1, 2), 'min_df': 2, 'max_df': 0.9, 'sublinear_tf': False, 'stop_words': 'english'} -> F1=0.9797±0.004 | ACC=0.9798±0.004


Unnamed: 0,idx,cfg,F1_macro_mean,F1_macro_std,ACC_mean,ACC_std
1,2,"{'ngram_range': (1, 2), 'min_df': 2, 'max_df':...",0.980493,0.006924,0.980674,0.00733
0,1,"{'ngram_range': (1, 1), 'min_df': 2, 'max_df':...",0.980106,0.006848,0.980225,0.007303
2,3,"{'ngram_range': (1, 2), 'min_df': 3, 'max_df':...",0.980023,0.007982,0.980225,0.008336
3,4,"{'ngram_range': (1, 2), 'min_df': 2, 'max_df':...",0.979685,0.003883,0.979775,0.004264


In [6]:
# The results table is sorted best-first, so its first row carries the
# selected TF-IDF configuration.
best_cfg = res_df["cfg"].iloc[0]
best_cfg


{'ngram_range': (1, 2),
 'min_df': 2,
 'max_df': 0.9,
 'sublinear_tf': True,
 'stop_words': 'english'}

In [7]:
# Refit the selected configuration on the full corpus (no CV split) and
# persist the fitted vectorizer to the artifacts directory.
tfidf_path = ARTIF_DIR / "tfidf.pkl"

best_tfidf = TfidfVectorizer(**best_cfg)
best_tfidf.fit(X_clean)
print("Vocab size:", len(best_tfidf.vocabulary_))

joblib.dump(best_tfidf, tfidf_path)
print("Saved ->", tfidf_path)


Vocab size: 77505
Saved -> ..\app\artifacts\tfidf.pkl


In [8]:
# Per-term IDF report. With the vectorizer's default smooth_idf=True the fitted
# weights follow  idf = ln((1 + n) / (1 + df)) + 1,  so the document frequency
# (df) of each term can be recovered by inverting that formula.
idf = best_tfidf.idf_
terms = np.array(best_tfidf.get_feature_names_out())
n_docs = len(X_clean)  # size of the corpus best_tfidf was fitted on
doc_freq = np.rint((1 + n_docs) / np.exp(idf - 1) - 1).astype(int)

# Many terms share the same idf (e.g. everything sitting exactly at min_df), so
# use a stable sort: ties keep the feature order and the saved lists are
# reproducible between runs.
report = pd.DataFrame({
    "term": terms,
    "idf": idf,
    "doc_freq": doc_freq,
}).sort_values("idf", ascending=False, kind="mergesort")

# Save the 100 rarest terms (highest idf) and the 100 most common terms
# (lowest idf) for a quick manual check of the vocabulary.
report.head(100).to_csv(ARTIF_DIR / "tfidf_top_idf_terms.csv", index=False)
report.tail(100).to_csv(ARTIF_DIR / "tfidf_common_terms.csv", index=False)
print("Saved top/bottom IDF term lists.")


Saved top/bottom IDF term lists.


In [9]:
# Sanity check: list the ten highest-weighted TF-IDF terms of the first three
# documents.
X_vec = best_tfidf.transform(X_clean[:3])
for i in range(X_vec.shape[0]):
    row = X_vec.getrow(i)
    # Positions of the stored (non-zero) weights, largest first; the stable
    # sort keeps tied weights in their stored order.
    order = np.argsort(-row.data, kind="stable")[:10]
    print(f"\nDoc {i} top tf-idf terms:")
    for pos in order:
        w = terms[row.indices[pos]]
        s = float(row.data[pos])
        print(f"  {w:25s}  {s:.3f}")



Doc 0 top tf-idf terms:
  stamp duty                 0.174
  stamp                      0.163
  duty threshold             0.154
  threshold                  0.140
  duty                       0.136
  60 000                     0.114
  freeze petrol              0.110
  raise stamp                0.110
  means testing              0.105
  petrol duty                0.105

Doc 1 top tf-idf terms:
  regiments                  0.270
  regiment                   0.216
  army                       0.178
  battalion                  0.131
  black watch                0.131
  royal scots                0.131
  scottish regiments         0.131
  super                      0.128
  scottish                   0.122
  campaigners                0.117

Doc 2 top tf-idf terms:
  id cards                   0.126
  id                         0.124
  cards                      0.106
  shadow cabinet             0.099
  howard denied              0.087
  yeo                        0.084
  howard       