In [2]:
#!pip -q install -U datasets transformers accelerate evaluate scikit-learn
!pip -q install -U evaluate
import os, zipfile, numpy as np
from datasets import load_dataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    DataCollatorWithPadding, TrainingArguments, Trainer
)
import evaluate


In [3]:
# Hub identifier of the QEvasion corpus (train: 3448 rows, test: 308 rows).
DATASET_ID = "ailsntua/QEvasion"
ds = load_dataset(DATASET_ID)


In [4]:
# Hold out 10% of the training split as a validation set so macro-F1 can be
# monitored during training; the fixed seed keeps the partition repeatable.
splits = ds["train"].train_test_split(test_size=0.1, seed=42)
train_ds, val_ds = splits["train"], splits["test"]
# The provided test split is kept as-is for the final predictions.
test_ds = ds["test"]

# Task 1 labels; a label's position in this list is its integer class id.
label_list = ["Ambivalent", "Clear Non-Reply", "Clear Reply"]
label2id = dict(zip(label_list, range(len(label_list))))
id2label = dict(enumerate(label_list))


def encode_label(example):
    """Attach the integer class id of an example's clarity label.

    Args:
        example: Mapping with a ``"clarity_label"`` entry holding one of the
            strings in ``label_list``.

    Returns:
        The same ``example`` object, updated in place with a ``"labels"``
        entry containing the matching id from ``label2id``.

    Raises:
        KeyError: If ``"clarity_label"`` is missing or its value is not a
            key of ``label2id``.
    """
    class_name = example["clarity_label"]
    example["labels"] = label2id[class_name]
    return example

# Add the integer "labels" column to both labelled partitions (train first, then validation).
train_ds, val_ds = [part.map(encode_label) for part in (train_ds, val_ds)]


In [5]:
from transformers import AutoTokenizer, DataCollatorWithPadding

# Checkpoint identifier; the same name is passed again later when the
# sequence-classification model is instantiated.
model_name = "xlm-roberta-base"
# Load the tokenizer published under the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Dataset columns that hold the two halves of each input pair.
Q_COL, A_COL = "question", "interview_answer"


def preprocess(batch):
    """Tokenize a batch of (question, answer) pairs.

    The question column is passed as the first sequence and the answer column
    as the second, with truncation enabled and a 512-token length cap.

    Args:
        batch: Mapping of column name to a list of values; must contain the
            ``Q_COL`` and ``A_COL`` columns.

    Returns:
        Whatever the module-level ``tokenizer`` returns for the pair of
        columns.
    """
    questions = batch[Q_COL]
    answers = batch[A_COL]
    encoded = tokenizer(questions, answers, truncation=True, max_length=512)
    return encoded

# Tokenize the labelled partitions in batches.
train_tok = train_ds.map(preprocess, batched=True)
val_tok = val_ds.map(preprocess, batched=True)

# Columns the Trainer needs: input_ids / attention_mask / labels.
keep = {"input_ids", "attention_mask", "labels"}


def _extra_columns(dataset):
    """Return the column names of ``dataset`` that are not listed in ``keep``."""
    return [name for name in dataset.column_names if name not in keep]


# Drop everything else (raw text and other dataset columns).
train_tok = train_tok.remove_columns(_extra_columns(train_tok))
val_tok = val_tok.remove_columns(_extra_columns(val_tok))

# Collator built from the same tokenizer; handed to the Trainer for batching.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Map:   0%|          | 0/3103 [00:00<?, ? examples/s]

Map:   0%|          | 0/345 [00:00<?, ? examples/s]

In [None]:
import numpy as np
import evaluate
from transformers import AutoModelForSequenceClassification, TrainingArguments, Trainer

# Load the F1 metric a single time and reuse it on every evaluation pass.
f1 = evaluate.load("f1")


def compute_metrics(eval_pred):
    """Compute macro-averaged F1 for one evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``; the predicted class of each
            row is the argmax of ``logits`` along the last axis.

    Returns:
        A dict with the single key ``"macro_f1"``.
    """
    logits, labels = eval_pred
    predicted_ids = np.argmax(logits, axis=-1)
    scores = f1.compute(predictions=predicted_ids, references=labels, average="macro")
    return {"macro_f1": scores["f1"]}

# The classification head is not part of the pretrained checkpoint and is
# randomly initialized when the model is built. Seed the RNGs first so that
# initialization (and therefore the whole run) is repeatable; the Trainer only
# applies its own seed later, when it is constructed.
from transformers import set_seed

set_seed(42)

# Encoder from the checkpoint plus a fresh head with one output per Task 1
# label; the id<->label maps are stored in the model config.
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

import torch

# Mixed precision is only requested when a CUDA device is present, so the
# same cell also runs on CPU-only machines instead of failing at setup.
use_fp16 = torch.cuda.is_available()

# Baseline fine-tuning configuration: evaluate and checkpoint once per epoch,
# then reload the checkpoint with the highest validation macro-F1.
args = TrainingArguments(
    output_dir="xlmr_clarity_baseline",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",             # must match eval_strategy for load_best_model_at_end
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",  # key returned by compute_metrics
    greater_is_better=True,
    logging_steps=50,
    fp16=use_fp16,
    report_to="none",
)

import inspect

# Newer transformers releases take the tokenizer through `processing_class`
# and deprecate the `tokenizer` argument; older releases only know
# `tokenizer`. Pick whichever name the installed Trainer accepts so the cell
# runs without a deprecation warning now and keeps working once the old name
# is removed.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

# Wire model, data, collator and metric together.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=train_tok,
    eval_dataset=val_tok,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

# Fine-tune; evaluation on val_tok runs at the end of every epoch.
trainer.train()


Some weights of XLMRobertaForSequenceClassification were not initialized from the model checkpoint at xlm-roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Macro F1
1,0.9092,0.843346,0.255952


In [None]:
# Tokenize the test split (noted in the original as having no labels), dropping
# every original column so only the tokenizer outputs are left.
test_columns = test_ds.column_names
test_tok = test_ds.map(preprocess, batched=True, remove_columns=test_columns)

# Run inference, take the highest-scoring class per row and map each id back
# to its label string.
pred = trainer.predict(test_tok)
pred_ids = np.argmax(pred.predictions, axis=-1)
pred_labels = [id2label[class_id] for class_id in pred_ids]

# Write one predicted label per line to the extensionless file "prediction".
with open("prediction", "w", encoding="utf-8") as f:
    for lab in pred_labels:
        f.write(lab + "\n")

print("Wrote:", os.path.abspath("prediction"))

# Sanity check: re-read the file and count its lines. The context manager
# closes the handle, which the previous inline open() call never did.
with open("prediction", "r", encoding="utf-8") as written:
    print("Num lines:", sum(1 for _ in written))


In [None]:
zip_path = "submission.zip"

# Store the file at the archive root: arcname carries no folder component.
with zipfile.ZipFile(zip_path, mode="w", compression=zipfile.ZIP_DEFLATED) as archive:
    archive.write("prediction", arcname="prediction")

print("Zipped:", os.path.abspath(zip_path))

# Sanity check: list the archive members to confirm it holds only "prediction".
with zipfile.ZipFile(zip_path, mode="r") as archive:
    members = archive.namelist()
print("ZIP content:", members)
