In [1]:
# =========================
# CELL 1 — Install + restart hint (Colab-safe)
# =========================
!pip -q install "transformers>=4.40" "datasets>=2.18" "accelerate>=0.27" "evaluate>=0.4" "scikit-learn>=1.3"

# If you previously broke numpy/pandas (errors importing sklearn), uncomment:
# !pip -q install -U "numpy==1.26.4" "pandas==2.2.2" "scikit-learn==1.4.2"
# Then do: Runtime -> Restart runtime
# =========================
# CELL 2 — Imports + config
# =========================
import os, glob
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score
from datasets import Dataset as HFDataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding, set_seed
)
import torch

TRAIN_DIR = "/content/train"
DEV_DIR   = "/content/dev"

MODEL_NAME = "xlm-roberta-base"
MAX_LENGTH = 128
SEED = 42
set_seed(SEED)

# training defaults (reasonable Colab baseline)
LR = 2e-5
EPOCHS = 3
TRAIN_BS = 16
EVAL_BS = 32
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
GRAD_ACCUM = 1
FP16 = True

USE_LANG_PREFIX = True
OUTPUT_DIR = "./outputs_xlmr"



In [2]:
# =========================
# CELL 3 — Load train/dev CSVs (robust labels) + DROP invalid labels for training/eval
# Produces:
#   train_df: id, text, lang, labels   (labels are ONLY 0/1)
#   dev_df_labeled: id, text, lang, labels (ONLY 0/1; may be empty if dev unlabeled)
#   dev_df_all: id, text, lang, labels (labels may be -1 if missing)
# =========================
import os, glob
import numpy as np
import pandas as pd

def detect_cols(df: pd.DataFrame):
    df = df.copy()
    df.columns = [c.strip() for c in df.columns]

    id_candidates = ["id", "ID", "tweet_id", "post_id", "sample_id"]
    text_candidates = ["text", "Text", "content", "sentence", "post"]
    label_candidates = ["labels", "label", "polarization", "gold", "y", "target"]

    id_col = next((c for c in id_candidates if c in df.columns), df.columns[0])

    text_col = next((c for c in text_candidates if c in df.columns), None)
    if text_col is None:
        # fallback: first non-id object column
        for c in df.columns:
            if c != id_col and df[c].dtype == "object":
                text_col = c
                break
    if text_col is None:
        raise ValueError(f"Could not detect text column. Columns: {list(df.columns)}")

    label_col = next((c for c in label_candidates if c in df.columns), None)
    return id_col, text_col, label_col


def normalize_label_series(s: pd.Series) -> pd.Series:
    """
    Converts labels to {0,1} where possible; missing/unknown -> -1.
    Accepts numeric, bool, and common strings.
    """
    if s is None:
        return pd.Series(dtype="int64")

    raw = s.copy()

    # Already bool?
    if raw.dtype == "bool":
        return raw.astype(int)

    # numeric coercion first
    num = pd.to_numeric(raw, errors="coerce")

    out = pd.Series(np.full(len(raw), -1, dtype=np.int64))

    mask_num = num.notna()
    if mask_num.any():
        out.loc[mask_num] = num.loc[mask_num].round().astype(int)

    # For non-numeric: string mapping
    mask_rest = ~mask_num
    if mask_rest.any():
        strv = raw.loc[mask_rest].astype(str).str.strip().str.lower()

        str_map = {
            "true": 1, "false": 0,
            "t": 1, "f": 0,
            "yes": 1, "no": 0,
            "y": 1, "n": 0,
            "polarized": 1, "non-polarized": 0, "nonpolarized": 0,
            "pos": 1, "neg": 0,
            "1": 1, "0": 0,
        }
        out.loc[mask_rest] = strv.map(str_map).fillna(-1).astype(int)

    # Ensure only 0/1 stay; anything else -> -1
    out.loc[~out.isin([0, 1])] = -1
    return out.astype(int)


def load_dir(data_dir: str) -> pd.DataFrame:
    paths = sorted(glob.glob(os.path.join(data_dir, "*.csv")))
    if not paths:
        raise FileNotFoundError(f"No CSVs found in {data_dir}")

    frames = []
    for p in paths:
        lang = os.path.splitext(os.path.basename(p))[0]
        df = pd.read_csv(p)

        id_col, text_col, label_col = detect_cols(df)

        out = pd.DataFrame({
            "id": df[id_col].astype(str),
            "text": df[text_col].astype(str),
            "lang": lang
        })

        if label_col is not None:
            out["labels"] = normalize_label_series(df[label_col])
        else:
            out["labels"] = -1

        frames.append(out)

    return pd.concat(frames, ignore_index=True)


# --- Load raw ---
train_df_raw = load_dir(TRAIN_DIR)
dev_df_all   = load_dir(DEV_DIR)

# --- Optional language prefix (NO special tokens needed) ---
if USE_LANG_PREFIX:
    train_df_raw["text"] = "[LANG=" + train_df_raw["lang"] + "] " + train_df_raw["text"]
    dev_df_all["text"]   = "[LANG=" + dev_df_all["lang"] + "] " + dev_df_all["text"]

# --- Drop invalid labels for TRAIN (ignore NA/invalid as you want) ---
train_df = train_df_raw[train_df_raw["labels"].isin([0, 1])].reset_index(drop=True)

# --- Build a labeled dev for evaluation (if dev has labels) ---
dev_df_labeled = dev_df_all[dev_df_all["labels"].isin([0, 1])].reset_index(drop=True)

# --- Print diagnostics ---
print("Train raw shape:", train_df_raw.shape, "-> cleaned train shape:", train_df.shape)
print("Dev raw shape:", dev_df_all.shape, "-> labeled dev shape:", dev_df_labeled.shape)

print("\nCleaned TRAIN label counts:")
print(train_df["labels"].value_counts(dropna=False))

print("\nLabeled DEV label counts:")
if len(dev_df_labeled) == 0:
    print("DEV appears unlabeled (no 0/1 labels found).")
else:
    print(dev_df_labeled["labels"].value_counts(dropna=False))

dev_labeled_ratio = (dev_df_all["labels"].isin([0, 1])).mean()
print(f"\nDEV labeled ratio: {dev_labeled_ratio*100:.1f}%")

if dev_labeled_ratio < 90:
    print("WARNING: Dev is partially/fully unlabeled. For evaluation, either:")
    print("  (a) use dev_df_labeled (only labeled rows), or")
    print("  (b) create your own validation split from train (recommended if dev mostly unlabeled).")

train_df.head()


Train raw shape: (73681, 4) -> cleaned train shape: (73681, 4)
Dev raw shape: (3687, 4) -> labeled dev shape: (0, 4)

Cleaned TRAIN label counts:
labels
1    39145
0    34536
Name: count, dtype: int64

Labeled DEV label counts:
DEV appears unlabeled (no 0/1 labels found).

DEV labeled ratio: 0.0%
  (a) use dev_df_labeled (only labeled rows), or
  (b) create your own validation split from train (recommended if dev mostly unlabeled).


Unnamed: 0,id,text,lang,labels
0,amh_6713e86058c564a4b874dd62227b7fbc,[LANG=amh] ወፈፌ ቀን አልፎ ዕብድ ቀን ሲመጣ፣ ሰይጣን ፀበል ገብቶ...,amh,1
1,amh_50c28694a056e584ee76da86ed1875ef,[LANG=amh] የአማራ ባንክ የምስጋና እና የዕውቅና መርሐ-ግብር አማራ...,amh,0
2,amh_3fe8faab2cf4c60b9bed28eed5f1c864,[LANG=amh] ራያ ግንባር ጎብዬ መከላከያ፣ የአማራ ልዩ ሀይል እና የ...,amh,0
3,amh_9b7badaab07f0e9e3dd77b99894bbb9d,[LANG=amh] ሩሲያ ቴርሞባሪክ ቦምብ ከመጠቀሟ ጋር ተያይዞ መላው አው...,amh,0
4,amh_be6049aa059a1ccfce6077d0cb8fd9f2,[LANG=amh] ዮኒ ማኛ ለማኝ ስግብግብ ሚዲዳዎችን ዱቄት በዱቄት። እው...,amh,1


In [3]:
# =========================
# CELL 4 — Build HF datasets + tokenize (FIXED)
# Uses:
#   train_df        -> labels are ONLY 0/1 (cleaned in Cell 3)
#   dev_df_labeled  -> ONLY labeled dev rows (0/1). If empty, we skip eval.
# =========================
from datasets import Dataset as HFDataset
from transformers import AutoTokenizer, DataCollatorWithPadding

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

# --- Train set (always labeled) ---
hf_train = HFDataset.from_pandas(train_df[["text", "labels"]], preserve_index=False)

# --- Dev set (ONLY labeled rows). If dev is unlabeled, set hf_dev=None ---
if "dev_df_labeled" in globals() and len(dev_df_labeled) > 0:
    hf_dev = HFDataset.from_pandas(dev_df_labeled[["text", "labels"]], preserve_index=False)
else:
    hf_dev = None
    print("NOTE: dev_df_labeled is empty -> Dev appears unlabeled. We'll train without eval (or create a split).")

def tokenize_fn(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

hf_train = hf_train.map(tokenize_fn, batched=True, remove_columns=["text"])

if hf_dev is not None:
    hf_dev = hf_dev.map(tokenize_fn, batched=True, remove_columns=["text"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

print("hf_train columns:", hf_train.column_names)
if hf_dev is not None:
    print("hf_dev columns:", hf_dev.column_names)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


NOTE: dev_df_labeled is empty -> Dev appears unlabeled. We'll train without eval (or create a split).


Map:   0%|          | 0/73681 [00:00<?, ? examples/s]

hf_train columns: ['labels', 'input_ids', 'attention_mask']


In [5]:
# =========================
# CELL 5 — Trainer setup (works whether hf_dev exists or not)
# =========================
import math
from transformers import Trainer, TrainingArguments, AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

def compute_metrics(eval_pred):
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    return {"macro_f1": f1_score(labels, preds, average="macro")}

steps_per_epoch = math.ceil(len(hf_train) / (TRAIN_BS * GRAD_ACCUM))
total_steps = steps_per_epoch * EPOCHS
warmup_steps = int(total_steps * WARMUP_RATIO)

# If hf_dev is None, disable eval
do_eval = (hf_dev is not None)

training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,

    eval_strategy="epoch" if do_eval else "no",
    save_strategy="epoch" if do_eval else "no",

    load_best_model_at_end=True if do_eval else False,
    metric_for_best_model="macro_f1" if do_eval else None,
    greater_is_better=True,

    learning_rate=LR,
    num_train_epochs=EPOCHS,
    per_device_train_batch_size=TRAIN_BS,
    per_device_eval_batch_size=EVAL_BS,
    gradient_accumulation_steps=GRAD_ACCUM,
    weight_decay=WEIGHT_DECAY,
    warmup_steps=warmup_steps,

    fp16=FP16,
    logging_steps=50,
    save_total_limit=2,
    report_to="none",
    seed=SEED,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=hf_train,
    eval_dataset=hf_dev if do_eval else None,
    data_collator=data_collator,
    compute_metrics=compute_metrics if do_eval else None,
)

trainer


Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mXLMRobertaForSequenceClassification LOAD REPORT[0m from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
lm_head.dense.weight        | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.bias       | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.out_proj.weight  | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


<transformers.trainer.Trainer at 0x7ff6a0da0230>

In [6]:
# =========================
# CELL 6 — Train + Evaluate
# =========================
train_out = trainer.train()
print(train_out)

metrics = trainer.evaluate()
print("DEV metrics:", metrics)


Step,Training Loss
50,0.785416
100,0.736479
150,0.719389
200,0.676095
250,0.658557
300,0.678762
350,0.632576
400,0.672012
450,0.65867
500,0.621212


TrainOutput(global_step=13818, training_loss=0.3948806841236445, metrics={'train_runtime': 1982.9735, 'train_samples_per_second': 111.47, 'train_steps_per_second': 6.968, 'total_flos': 1.015784269485726e+16, 'train_loss': 0.3948806841236445, 'epoch': 3.0})


ValueError: Trainer: evaluation requires an eval_dataset.

In [8]:
# Skip eval if there is no eval dataset
if hasattr(trainer, "eval_dataset") and trainer.eval_dataset is not None:
    print(trainer.evaluate())
else:
    print("No eval dataset attached. Training is done; skipping evaluation.")


No eval dataset attached. Training is done; skipping evaluation.


In [10]:
# =========================
# CELL 7 — Threshold tuning on a held-out validation split (works even if dev is unlabeled)
# Requires: train_df (clean 0/1), trainer, tokenizer
# =========================
from sklearn.model_selection import train_test_split
from datasets import Dataset as HFDataset
from sklearn.metrics import f1_score
import numpy as np

# 1) Make a labeled validation split
_, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=SEED,
    stratify=train_df["labels"]
)

hf_val = HFDataset.from_pandas(val_df[["text", "labels"]], preserve_index=False)

def tok(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

hf_val = hf_val.map(tok, batched=True, remove_columns=["text"])

# 2) Predict on val
pred = trainer.predict(hf_val)
logits = pred.predictions
y_true = pred.label_ids

def softmax(x):
    x = x - np.max(x, axis=1, keepdims=True)
    e = np.exp(x)
    return e / np.sum(e, axis=1, keepdims=True)

p1 = softmax(logits)[:, 1]

# 3) Sweep thresholds and pick the best Macro F1
best_t, best_f1 = 0.5, -1.0
for t in np.linspace(0.0, 1.0, 101):
    y_pred = (p1 >= t).astype(int)
    f1 = f1_score(y_true, y_pred, average="macro")
    if f1 > best_f1:
        best_f1 = f1
        best_t = float(t)

print(f"Best threshold (val) = {best_t:.2f}  MacroF1 = {best_f1:.4f}")


Map:   0%|          | 0/7369 [00:00<?, ? examples/s]

Step,Training Loss


Best threshold (val) = 0.42  MacroF1 = 0.9128


In [11]:
# =========================
# CELL 7 — Threshold tuning on a FIXED held-out split + compare to argmax
# =========================
from sklearn.model_selection import train_test_split
from datasets import Dataset as HFDataset
from sklearn.metrics import f1_score
import numpy as np

# Make a fixed split once (save it so reruns are consistent)
train_split_df, val_df = train_test_split(
    train_df,
    test_size=0.1,
    random_state=SEED,          # fixed
    stratify=train_df["labels"]
)

hf_val = HFDataset.from_pandas(val_df[["text", "labels"]], preserve_index=False)

def tok(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

hf_val = hf_val.map(tok, batched=True, remove_columns=["text"])

pred = trainer.predict(hf_val)
logits = pred.predictions
y_true = pred.label_ids

# Argmax baseline
y_pred_argmax = np.argmax(logits, axis=1)
f1_argmax = f1_score(y_true, y_pred_argmax, average="macro")

def softmax(x):
    x = x - np.max(x, axis=1, keepdims=True)
    e = np.exp(x)
    return e / np.sum(e, axis=1, keepdims=True)

p1 = softmax(logits)[:, 1]

best_t, best_f1 = 0.5, -1.0
for t in np.linspace(0.0, 1.0, 201):  # finer grid
    y_pred = (p1 >= t).astype(int)
    f1 = f1_score(y_true, y_pred, average="macro")
    if f1 > best_f1:
        best_f1 = f1
        best_t = float(t)

print(f"Macro F1 (argmax)   = {f1_argmax:.4f}")
print(f"Best threshold      = {best_t:.3f}")
print(f"Macro F1 (threshold)= {best_f1:.4f}")


Map:   0%|          | 0/7369 [00:00<?, ? examples/s]

Step,Training Loss


Macro F1 (argmax)   = 0.9124
Best threshold      = 0.445
Macro F1 (threshold)= 0.9132
