
# Transformer Fine-tuning — Data4Good Factuality (DeBERTa/RoBERTa)

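This notebook fine-tunes a language model (default **distilroberta-base**, set via `MODEL_NAME`; another checkpoint such as **microsoft/deberta-v3-base** can be substituted there) on the dataset and exports `outputs/predictions.json`.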
Run it separately from your classical notebook if you like.


In [1]:

# Install deps (internet + GPU recommended)
# %%capture
!pip -q install -U transformers datasets accelerate evaluate scikit-learn sentencepiece


In [2]:
import os, json, re, numpy as np, pandas as pd, torch, random
from datasets import Dataset
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import precision_recall_fscore_support, classification_report
from transformers import (AutoTokenizer, AutoModelForSequenceClassification,
                          TrainingArguments, Trainer, DataCollatorWithPadding)

SEED = 42

# Seed every RNG this notebook uses so splits and weight initialisation are repeatable.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Resolve accelerator availability once so the seeding below and the report agree.
cuda_available = torch.cuda.is_available()
_mps_backend = getattr(torch.backends, "mps", None)  # attribute is absent on older torch builds
mps_available = _mps_backend is not None and _mps_backend.is_available()

if cuda_available:
    torch.cuda.manual_seed_all(SEED)
if mps_available:
    try:
        torch.mps.manual_seed(SEED)  # type: ignore[attr-defined]
    except (AttributeError, RuntimeError) as exc:
        # Seeding MPS stays best-effort, but the failure is reported instead of
        # being swallowed so a non-reproducible run is visible to the reader.
        print("Warning: could not seed the MPS generator:", exc)

print("Torch CUDA available:", cuda_available)
print("Torch MPS available:", mps_available)


Torch CUDA available: False
Torch MPS available: True


## Load & normalize JSON

In [3]:

# Input files, given relative to the notebook's working directory.
# Both are read with load_json_df below.
TRAIN_PATH = "data/train.json"
TEST_PATH  = "data/test.json"

def load_json_df(path: str) -> pd.DataFrame:
    """Read a UTF-8 JSON file into a DataFrame with normalised column names.

    Supported top-level layouts:
      * a list (typically a list of record dicts);
      * a dict wrapping the record list under a "data", "records" or "items"
        key — the first of these keys that holds a list is used;
      * any other value is passed to ``pd.DataFrame`` unchanged.

    Column labels are converted to ``str``, stripped and lower-cased, so files
    whose rows produce non-string labels (e.g. a list of lists) no longer fail
    during normalisation.

    Parameters
    ----------
    path : str
        Location of the JSON file.

    Returns
    -------
    pd.DataFrame
        The loaded records with lower-cased, whitespace-trimmed column names.
    """
    with open(path, "r", encoding="utf-8") as f:
        raw = json.load(f)

    # Unwrap a record list nested under one of the known container keys;
    # otherwise fall back to handing the parsed JSON to pandas as-is.
    records = raw
    if isinstance(raw, dict):
        for key in ("data", "records", "items"):
            if isinstance(raw.get(key), list):
                records = raw[key]
                break

    df = pd.DataFrame(records)
    # str() guards against integer (or other non-string) column labels.
    df.columns = [str(c).strip().lower() for c in df.columns]
    return df

# load_json_df lower-cases column names; restore the casing the rest of the notebook uses.
train_df = load_json_df(TRAIN_PATH).rename(columns={
    "question": "Question", "context": "Context", "answer": "Answer", "type": "Type"
})
test_df = load_json_df(TEST_PATH).rename(columns={
    "id": "ID", "question": "Question", "context": "Context", "answer": "Answer"
})

TEXT_COLS = ["Question", "Context", "Answer"]

# Fail early, naming the missing columns, instead of a bare KeyError further down.
for frame_name, frame, required in (
    ("train", train_df, TEXT_COLS + ["Type"]),
    ("test", test_df, TEXT_COLS),
):
    missing = [col for col in required if col not in frame.columns]
    if missing:
        raise KeyError(f"{frame_name} data is missing required column(s): {missing}")


def _clean_text(series):
    """Replace missing values with "", cast to str, collapse whitespace runs to one space, trim."""
    return series.fillna("").astype(str).str.replace(r"\s+", " ", regex=True).str.strip()


for col in TEXT_COLS:
    train_df[col] = _clean_text(train_df[col])
    test_df[col] = _clean_text(test_df[col])

label_map = {
    "factual":"Factual","contradiction":"Contradiction","irrelevant":"Irrelevant",
    "Factual":"Factual","Contradiction":"Contradiction","Irrelevant":"Irrelevant"
}
# Look labels up case-insensitively (via the lower-case keys) so spellings such as
# "FACTUAL" are accepted too; every label accepted before maps to the same value.
raw_labels = train_df["Type"].astype(str).str.strip()
train_df["Type"] = raw_labels.str.lower().map(label_map)
unknown_labels = sorted(raw_labels[train_df["Type"].isna()].unique())
assert not unknown_labels, f"Unknown label in Type: {unknown_labels}"

print("Train shape:", train_df.shape, "| Test shape:", test_df.shape)


FileNotFoundError: [Errno 2] No such file or directory: 'data/train.json'

## Tokenization & dataset

In [None]:
def build_text(q, c, a):
    """Combine question, context and answer into one string tagged with [Q], [C] and [A]."""
    template = "[Q] {} [C] {} [A] {}"
    return template.format(q, c, a)

# Turn each (question, context, answer) row into the single string fed to the model.
train_texts = list(map(build_text, train_df.Question, train_df.Context, train_df.Answer))
test_texts = list(map(build_text, test_df.Question, test_df.Context, test_df.Answer))

# Encode the string labels as integer ids; the encoder is reused later to decode predictions.
label_enc = LabelEncoder()
label_enc.fit(train_df["Type"])
y = label_enc.transform(train_df["Type"])
num_labels = len(label_enc.classes_)

# Checkpoint to fine-tune; the classifier is created with one output per label.
MODEL_NAME = "distilroberta-base"
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(
    MODEL_NAME,
    num_labels=num_labels,
)

from datasets import Dataset, ClassLabel

max_len = 256  # maximum number of tokens kept per example; longer inputs are truncated

def tok_func(batch):
    """Tokenize the "text" field of a batch, truncating to ``max_len`` tokens.

    Padding is intentionally not done here. The notebook builds a
    DataCollatorWithPadding, which pads each mini-batch to the length of its
    longest member; padding everything to ``max_len`` up front would make that
    collator a no-op and spend compute on pad tokens for every short example.

    NOTE(review): truncation presumably trims from the end of the sequence, which
    is where build_text places the answer — confirm long contexts do not cut it off.
    """
    return tokenizer(batch["text"], truncation=True, max_length=max_len)

# Stratified splitting requires the label column to be a ClassLabel feature,
# so the integer targets are cast before the split.
label_names = [name for name in label_enc.classes_]
label_feature = ClassLabel(names=label_names, num_classes=num_labels)

ds = Dataset.from_dict({"text": train_texts, "label": y}).cast_column("label", label_feature)

# Hold out 10% of the rows for validation, keeping class proportions equal in both parts.
ds = ds.train_test_split(test_size=0.1, stratify_by_column="label", seed=SEED)

# Tokenize each split, then expose only the tensors the model consumes.
model_columns = ["input_ids", "attention_mask", "label"]
train_ds = ds["train"].map(tok_func, batched=True)
val_ds = ds["test"].map(tok_func, batched=True)
for split in (train_ds, val_ds):
    split.set_format(type="torch", columns=model_columns)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at distilroberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Casting the dataset:   0%|          | 0/21021 [00:00<?, ? examples/s]

Map:   0%|          | 0/18918 [00:00<?, ? examples/s]

Map:   0%|          | 0/2103 [00:00<?, ? examples/s]

## Train

In [None]:

def compute_metrics(eval_pred):
    """Compute macro-averaged F1, precision and recall for one evaluation pass.

    ``eval_pred`` is unpacked into (logits, labels); the predicted class for each
    row is the arg-max of its logits along axis 1. Undefined scores count as 0.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=1)
    precision, recall, macro_f1, _support = precision_recall_fscore_support(
        gold, predicted, average="macro", zero_division=0
    )
    return {"macro_f1": macro_f1, "precision": precision, "recall": recall}

# Training configuration. Evaluation and checkpointing both happen once per epoch so
# that load_best_model_at_end can restore the epoch with the best validation macro-F1.
args = TrainingArguments(
    output_dir="hf_runs",
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    learning_rate=2e-5,
    num_train_epochs=3,
    weight_decay=0.01,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="macro_f1",  # key returned by compute_metrics
    fp16=torch.cuda.is_available(),    # mixed precision only when CUDA is present
    logging_steps=50,
    report_to="none",
    seed=SEED,  # tie the Trainer's seed to the notebook-wide SEED instead of the library default
)

import inspect

# Recent transformers releases accept the tokenizer as `processing_class` and deprecate
# the `tokenizer=` keyword; older releases only know `tokenizer=`. Pick whichever keyword
# the installed Trainer accepts so the cell runs without a deprecation warning or TypeError.
_tokenizer_kwarg = (
    "processing_class"
    if "processing_class" in inspect.signature(Trainer.__init__).parameters
    else "tokenizer"
)

trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    compute_metrics=compute_metrics,
    **{_tokenizer_kwarg: tokenizer},
)

trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss


## Evaluate

In [None]:

# Score the validation split and print a per-class precision/recall/F1 breakdown.
pred = trainer.predict(val_ds)
y_true, y_pred = pred.label_ids, np.argmax(pred.predictions, axis=1)
report = classification_report(y_true, y_pred, target_names=label_enc.classes_)
print(report)


## Save & Export predictions

In [None]:

import joblib, os, json
# Persist the fine-tuned model and its tokenizer side by side, plus the label
# encoder needed to turn predicted class ids back into label names.
model_dir = "artifacts/best_transformer"
os.makedirs(model_dir, exist_ok=True)
for save_to in (trainer.save_model, tokenizer.save_pretrained):
    save_to(model_dir)
joblib.dump(label_enc, "artifacts/label_encoder.joblib")

# Test inference
# build_text, tok_func and max_len from the tokenization cell are reused rather than
# redefined here, so the test inputs are always prepared exactly like the training inputs.
test_texts = [build_text(q, c, a) for q, c, a in zip(test_df.Question, test_df.Context, test_df.Answer)]

test_ds = Dataset.from_dict({"text": test_texts}).map(tok_func, batched=True)
test_ds.set_format(type="torch", columns=["input_ids", "attention_mask"])

logits = trainer.predict(test_ds).predictions
pred_ids = logits.argmax(axis=1)
pred_labels = label_enc.inverse_transform(pred_ids)
# zip() below would silently drop rows on a length mismatch, so check it explicitly.
assert len(pred_labels) == len(test_df), "Number of predictions does not match number of test rows"

if "ID" in test_df.columns:
    # .tolist() converts numpy scalars to plain Python values that json can serialise.
    test_ids = test_df["ID"].tolist()
else:
    # Keep the previous best-effort behaviour (null IDs) but make it visible.
    print("Warning: test data has no 'ID' column; writing null IDs")
    test_ids = [None] * len(test_df)

predictions = [
    {"ID": id_value, "Type": str(lab)}
    for id_value, lab in zip(test_ids, pred_labels)
]

os.makedirs("outputs", exist_ok=True)
with open("outputs/predictions.json","w", encoding="utf-8") as f:
    json.dump(predictions, f, ensure_ascii=False, indent=2)
print("Wrote", len(predictions), "to outputs/predictions.json")
