In [8]:
import pandas as pd
import re
import unicodedata
import random

# ================== CONFIG ==================
TRAIN_PATH = "combined_train.csv"
TEST_PATH  = "combined_test.csv"
OUT_TRAIN  = "combined_train_clean_mix.csv"
OUT_TEST   = "combined_test_clean_mix.csv"
TEXT_COL   = "Comment"   # đổi nếu cột khác
RANDOM_SEED = 42

random.seed(RANDOM_SEED)

# ================== FUNCTIONS ==================
def normalize_text(text):
    if not isinstance(text, str):
        return ""

    text = unicodedata.normalize("NFC", text)
    text = text.lower()

    text = re.sub(r"http\S+|www\S+", "", text)
    text = re.sub(r"\S+@\S+", "", text)

    text = re.sub(
        r"[^a-z0-9àáạảãâầấậẩẫăằắặẳẵ"
        r"èéẹẻẽêềếệểễ"
        r"ìíịỉĩ"
        r"òóọỏõôồốộổỗơờớợởỡ"
        r"ùúụủũưừứựửữ"
        r"ỳýỵỷỹđ\s\.,!?]",
        " ",
        text
    )

    text = re.sub(r"\s+", " ", text).strip()
    return text


def detect_lang_simple(text):
    if re.search(r"[àáạảãâăđêôơưỳýị]", text):
        return "vi"
    return "en"


def mix_vi_en(text, vi_pool, en_pool):
    r = random.random()
    lang = detect_lang_simple(text)

    # 40% giữ nguyên
    if r < 0.4:
        return text

    # Việt + Anh
    if lang == "vi" and en_pool:
        en = random.choice(en_pool)
        return f"{text}. {en}" if r < 0.7 else f"{en}. {text}"

    # Anh + Việt
    if lang == "en" and vi_pool:
        vi = random.choice(vi_pool)
        return f"{vi}. {text}" if r < 0.7 else f"{text}. {vi}"

    return text


def process_csv(input_path, output_path):
    df = pd.read_csv(input_path)

    # Clean
    df[TEXT_COL] = df[TEXT_COL].apply(normalize_text)
    df = df[df[TEXT_COL].str.len() > 5]

    # Pool ngôn ngữ
    vi_pool = df[df[TEXT_COL].apply(detect_lang_simple) == "vi"][TEXT_COL].tolist()
    en_pool = df[df[TEXT_COL].apply(detect_lang_simple) == "en"][TEXT_COL].tolist()

    # Mix
    df[TEXT_COL] = df[TEXT_COL].apply(
        lambda x: mix_vi_en(x, vi_pool, en_pool)
    )

    # Shuffle
    df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    df.to_csv(output_path, index=False)
    print(f"Saved {output_path} | samples = {len(df)}")


# ================== RUN ==================
process_csv(TRAIN_PATH, OUT_TRAIN)
process_csv(TEST_PATH, OUT_TEST)


Saved combined_train_clean_mix.csv | samples = 11997
Saved combined_test_clean_mix.csv | samples = 5999


In [1]:
import pandas as pd
import numpy as np
from datasets import Dataset
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer
)
from sklearn.metrics import accuracy_score, precision_recall_fscore_support

# ================== CẤU HÌNH ==================
TRAIN_CSV = "combined_train_clean_mix.csv"   
TEST_CSV  = "combined_test_clean_mix.csv"    
TEXT_COL  = "Comment"                
LABEL_COL = "toxicity"
MODEL_NAME = "distilbert-base-multilingual-cased"

MAX_LEN = 128
BATCH_SIZE = 64
EPOCHS = 10
LR = 2e-5

# ================== LOAD DATA ==================
train_df = pd.read_csv(TRAIN_CSV)
test_df  = pd.read_csv(TEST_CSV)

train_ds = Dataset.from_pandas(train_df[[TEXT_COL, LABEL_COL]])
test_ds  = Dataset.from_pandas(test_df[[TEXT_COL, LABEL_COL]])

num_labels = train_df[LABEL_COL].nunique()

# ================== TOKENIZER ==================
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

def tokenize(batch):
    return tokenizer(
        batch[TEXT_COL],
        padding="max_length",
        truncation=True,
        max_length=MAX_LEN
    )

train_ds = train_ds.map(tokenize, batched=True)
test_ds  = test_ds.map(tokenize, batched=True)

train_ds = train_ds.rename_column(LABEL_COL, "labels")
test_ds  = test_ds.rename_column(LABEL_COL, "labels")

train_ds.set_format("torch")
test_ds.set_format("torch")

# ================== MODEL ==================
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels
)

# ================== METRICS ==================
def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=1)

    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="weighted"
    )
    acc = accuracy_score(labels, preds)

    return {
        "accuracy": acc,
        "precision": precision,
        "recall": recall,
        "f1": f1
    }

# ================== TRAINING ARGS ==================
training_args = TrainingArguments(
    output_dir="./outputs",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=LR,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    num_train_epochs=EPOCHS,
    weight_decay=0.01,
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy"
)

# ================== TRAINER ==================
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# ================== TRAIN ==================
trainer.train()

# ================== TEST ==================
metrics = trainer.evaluate(test_ds)

print("\n===== TEST METRICS =====")
for k, v in metrics.items():
    if k.startswith("eval_"):
        print(f"{k.replace('eval_', ''):10s}: {v:.4f}")


  from .autonotebook import tqdm as notebook_tqdm
Map: 100%|██████████| 11997/11997 [00:00<00:00, 24014.04 examples/s]
Map: 100%|██████████| 5999/5999 [00:00<00:00, 24915.49 examples/s]
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-multilingual-cased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5744,0.489556,0.766294,0.782454,0.766294,0.772029
2,0.4863,0.475041,0.669278,0.79654,0.669278,0.684622
3,0.4503,0.485314,0.634439,0.798003,0.634439,0.648258
4,0.4168,0.500843,0.680113,0.776906,0.680113,0.69607
5,0.3696,0.463761,0.767128,0.799223,0.767128,0.775855
6,0.33,0.547342,0.710618,0.779918,0.710618,0.724772
7,0.2878,0.61905,0.728788,0.789331,0.728788,0.74168
8,0.2546,0.605963,0.744791,0.783847,0.744791,0.755157
9,0.2299,0.724871,0.716953,0.782417,0.716953,0.730643
10,0.2033,0.738096,0.723454,0.78255,0.723454,0.7365



===== TEST METRICS =====
loss      : 0.4638
accuracy  : 0.7671
precision : 0.7992
recall    : 0.7671
f1        : 0.7759
runtime   : 12.7879
samples_per_second: 469.1160
steps_per_second: 7.3510


In [1]:
import torch
import gradio as gr
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

# ===== CONFIG =====
MODEL_DIR = r"C:\Users\Admin\Desktop\web\outputs\checkpoint-1880"  # <-- điền đường dẫn
LABEL_NAMES = ["non-toxic", "toxic"]  # <-- đổi nếu label khác

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# ===== LOAD MODEL =====
tokenizer = AutoTokenizer.from_pretrained(MODEL_DIR)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_DIR)
model.to(device)
model.eval()

# ===== PREDICT FUNCTION =====
def predict(text):
    if not text or text.strip() == "":
        return "Please enter some text", {}

    inputs = tokenizer(
        text,
        return_tensors="pt",
        truncation=True,
        padding=True,
        max_length=64
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    with torch.no_grad():
        logits = model(**inputs).logits
        probs = F.softmax(logits, dim=1)[0].cpu().numpy()

    result = {LABEL_NAMES[i]: float(probs[i]) for i in range(len(LABEL_NAMES))}
    pred_label = LABEL_NAMES[int(probs.argmax())]

    return pred_label, result

# ===== GRADIO UI =====
demo = gr.Interface(
    fn=predict,
    inputs=gr.Textbox(lines=3, placeholder="Enter text here..."),
    outputs=[
        gr.Label(label="Prediction"),
        gr.JSON(label="Confidence scores")
    ],
    title="Toxicity Detection Demo",
    description="DistilBERT multilingual fine-tuned for toxicity classification"
)

demo.launch()


  from .autonotebook import tqdm as notebook_tqdm
The tokenizer you are loading from 'C:\Users\Admin\Desktop\web\outputs\checkpoint-1880' with an incorrect regex pattern: https://huggingface.co/mistralai/Mistral-Small-3.1-24B-Instruct-2503/discussions/84#69121093e8b480e709447d5e. This will lead to incorrect tokenization. You should set the `fix_mistral_regex=True` flag when loading this tokenizer to fix this issue.


* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


