In [1]:
!pip -q install "transformers>=4.40" "datasets>=2.18" "accelerate>=0.27" "scikit-learn>=1.4" "pandas>=2.2"

In [2]:
import os, glob, math, shutil
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

from datasets import Dataset as HFDataset
from transformers import (
    AutoTokenizer, AutoModelForSequenceClassification,
    TrainingArguments, Trainer, DataCollatorWithPadding, set_seed
)

import torch

TRAIN_DIR = "/content/train"
MODEL_NAME = "xlm-roberta-base"
MAX_LENGTH = 128
SEED = 42
set_seed(SEED)

# per-language training settings (keep small to be practical)
LR = 2e-5
EPOCHS = 2           # start with 2; raise later if you have time
TRAIN_BS = 16
EVAL_BS = 32
WEIGHT_DECAY = 0.01
WARMUP_RATIO = 0.06
GRAD_ACCUM = 1
FP16 = True

RESULTS_DIR = "/content/per_lang_results"
MODELS_DIR  = "/content/per_lang_models"
os.makedirs(RESULTS_DIR, exist_ok=True)
os.makedirs(MODELS_DIR, exist_ok=True)

device = "cuda" if torch.cuda.is_available() else "cpu"
print("Device:", device)

Device: cuda


In [3]:
def load_language_csv(path: str, lang: str) -> pd.DataFrame:
    df = pd.read_csv(path)

    need = {"id", "text", "polarization"}
    if not need.issubset(df.columns):
        raise ValueError(f"{path} columns={df.columns.tolist()} expected {need}")

    out = pd.DataFrame({
        "id": df["id"].astype(str),
        "text": df["text"].astype(str),
        "labels": pd.to_numeric(df["polarization"], errors="coerce")
    })

    out = out[out["labels"].isin([0, 1])].copy()
    out["labels"] = out["labels"].astype(int)
    out["lang"] = lang
    return out

In [4]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize_batch(batch):
    return tokenizer(batch["text"], truncation=True, max_length=MAX_LENGTH)

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/615 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

sentencepiece.bpe.model:   0%|          | 0.00/5.07M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.10M [00:00<?, ?B/s]

In [5]:
def train_eval_one_language(lang: str, df: pd.DataFrame, save_model: bool = False):
    # if a language has only one class, Macro F1 is not meaningful
    if df["labels"].nunique() < 2:
        return {
            "lang": lang,
            "n_total": len(df),
            "n_train": len(df),
            "n_val": 0,
            "macro_f1": None,
            "note": "Only one label present; skipped training/eval."
        }

    train_df, val_df = train_test_split(
        df,
        test_size=0.1,
        random_state=SEED,
        stratify=df["labels"]
    )

    hf_train = HFDataset.from_pandas(train_df[["text", "labels"]], preserve_index=False)
    hf_val   = HFDataset.from_pandas(val_df[["text", "labels"]], preserve_index=False)

    hf_train = hf_train.map(tokenize_batch, batched=True, remove_columns=["text"])
    hf_val   = hf_val.map(tokenize_batch, batched=True, remove_columns=["text"])

    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

    # warmup_steps from ratio
    steps_per_epoch = math.ceil(len(hf_train) / (TRAIN_BS * GRAD_ACCUM))
    total_steps = steps_per_epoch * EPOCHS
    warmup_steps = int(total_steps * WARMUP_RATIO)

    out_dir = os.path.join(RESULTS_DIR, f"tmp_{lang}")
    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)

    training_args = TrainingArguments(
        output_dir=out_dir,
        eval_strategy="epoch",
        save_strategy="no",              # no checkpoint spam
        learning_rate=LR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=TRAIN_BS,
        per_device_eval_batch_size=EVAL_BS,
        gradient_accumulation_steps=GRAD_ACCUM,
        weight_decay=WEIGHT_DECAY,
        warmup_steps=warmup_steps,
        fp16=FP16,
        logging_steps=50,
        report_to="none",
        seed=SEED,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=hf_train,
        eval_dataset=hf_val,
        data_collator=data_collator,
    )

    trainer.train()

    # Predict + macro F1 (argmax)
    pred = trainer.predict(hf_val)
    logits = pred.predictions
    y_true = pred.label_ids
    y_pred = np.argmax(logits, axis=1)
    macro_f1 = f1_score(y_true, y_pred, average="macro")

    if save_model:
        model_dir = os.path.join(MODELS_DIR, lang)
        trainer.save_model(model_dir)
        tokenizer.save_pretrained(model_dir)

    return {
        "lang": lang,
        "n_total": len(df),
        "n_train": len(train_df),
        "n_val": len(val_df),
        "macro_f1": float(macro_f1),
        "note": ""
    }

In [6]:
# Choose which languages to run:
all_paths = sorted(glob.glob(os.path.join(TRAIN_DIR, "*.csv")))
all_langs = [os.path.splitext(os.path.basename(p))[0] for p in all_paths]

# If you want to test only a few first, set:
# LANGS_TO_RUN = ["eng", "deu", "urd", "zho"]
LANGS_TO_RUN = all_langs

results = []

for p in all_paths:
    lang = os.path.splitext(os.path.basename(p))[0]
    if lang not in LANGS_TO_RUN:
        continue

    print("\n==============================")
    print(f"Language: {lang}")
    df_lang = load_language_csv(p, lang)
    print("Rows:", len(df_lang), "Label counts:", dict(df_lang["labels"].value_counts()))

    r = train_eval_one_language(lang, df_lang, save_model=False)
    results.append(r)

    if r["macro_f1"] is not None:
        print(f"Macro F1 (argmax) for {lang}: {r['macro_f1']:.4f}")
    else:
        print(f"Skipped {lang}: {r['note']}")

res_df = pd.DataFrame(results).sort_values(
    by="macro_f1", ascending=False, na_position="last"
)

out_csv = os.path.join(RESULTS_DIR, "per_language_macro_f1.csv")
res_df.to_csv(out_csv, index=False)

print("\nSaved:", out_csv)
res_df


Language: amh
Rows: 3332 Label counts: {1: np.int64(2518), 0: np.int64(814)}


Map:   0%|          | 0/2998 [00:00<?, ? examples/s]

Map:   0%|          | 0/334 [00:00<?, ? examples/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.536255,0.625013
2,0.436686,0.529616


Macro F1 (argmax) for amh: 0.6328

Language: arb
Rows: 3380 Label counts: {0: np.int64(1868), 1: np.int64(1512)}


Map:   0%|          | 0/3042 [00:00<?, ? examples/s]

Map:   0%|          | 0/338 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.57826,0.523328
2,0.451556,0.44494


Macro F1 (argmax) for arb: 0.7740

Language: ben
Rows: 3333 Label counts: {0: np.int64(1909), 1: np.int64(1424)}


Map:   0%|          | 0/2999 [00:00<?, ? examples/s]

Map:   0%|          | 0/334 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.555353,0.481995
2,0.42015,0.462345


Macro F1 (argmax) for ben: 0.8158

Language: deu
Rows: 3180 Label counts: {0: np.int64(1668), 1: np.int64(1512)}


Map:   0%|          | 0/2862 [00:00<?, ? examples/s]

Map:   0%|          | 0/318 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.689529,0.642722
2,0.639389,0.607229


Macro F1 (argmax) for deu: 0.6697

Language: eng
Rows: 3222 Label counts: {0: np.int64(2047), 1: np.int64(1175)}


Map:   0%|          | 0/2899 [00:00<?, ? examples/s]

Map:   0%|          | 0/323 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.534079,0.501062
2,0.457138,0.463575


Macro F1 (argmax) for eng: 0.7820

Language: fas
Rows: 3295 Label counts: {1: np.int64(2440), 0: np.int64(855)}


Map:   0%|          | 0/2965 [00:00<?, ? examples/s]

Map:   0%|          | 0/330 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.595612,0.573597
2,0.583106,0.573812


Macro F1 (argmax) for fas: 0.4251

Language: hau
Rows: 3651 Label counts: {0: np.int64(3259), 1: np.int64(392)}


Map:   0%|          | 0/3285 [00:00<?, ? examples/s]

Map:   0%|          | 0/366 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.354005,0.24266
2,0.217589,0.211565


Macro F1 (argmax) for hau: 0.8306

Language: hin
Rows: 2744 Label counts: {1: np.int64(2346), 0: np.int64(398)}


Map:   0%|          | 0/2469 [00:00<?, ? examples/s]

Map:   0%|          | 0/275 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.40729,0.35213
2,0.357306,0.325383


Macro F1 (argmax) for hin: 0.7147

Language: ita
Rows: 3334 Label counts: {0: np.int64(1966), 1: np.int64(1368)}


Map:   0%|          | 0/3000 [00:00<?, ? examples/s]

Map:   0%|          | 0/334 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.679037,0.677535
2,0.683855,0.669102


Macro F1 (argmax) for ita: 0.3710

Language: khm
Rows: 6640 Label counts: {1: np.int64(6029), 0: np.int64(611)}


Map:   0%|          | 0/5976 [00:00<?, ? examples/s]

Map:   0%|          | 0/664 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.331471,0.330789
2,0.293188,0.312961


Macro F1 (argmax) for khm: 0.4759

Language: mya
Rows: 2889 Label counts: {1: np.int64(1682), 0: np.int64(1207)}


Map:   0%|          | 0/2600 [00:00<?, ? examples/s]

Map:   0%|          | 0/289 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.572646,0.413739
2,0.394153,0.40659


Macro F1 (argmax) for mya: 0.8201

Language: nep
Rows: 2005 Label counts: {1: np.int64(1008), 0: np.int64(997)}


Map:   0%|          | 0/1804 [00:00<?, ? examples/s]

Map:   0%|          | 0/201 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.668006,0.540144
2,0.406917,0.385883


Macro F1 (argmax) for nep: 0.8607

Language: ori
Rows: 2368 Label counts: {0: np.int64(1685), 1: np.int64(683)}


Map:   0%|          | 0/2131 [00:00<?, ? examples/s]

Map:   0%|          | 0/237 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.579642,0.492594
2,0.462465,0.438005


Macro F1 (argmax) for ori: 0.7310

Language: pan
Rows: 1700 Label counts: {0: np.int64(860), 1: np.int64(840)}


Map:   0%|          | 0/1530 [00:00<?, ? examples/s]

Map:   0%|          | 0/170 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.693557,0.655091
2,0.604564,0.578597


Macro F1 (argmax) for pan: 0.6428

Language: pol
Rows: 2391 Label counts: {0: np.int64(1388), 1: np.int64(1003)}


Map:   0%|          | 0/2151 [00:00<?, ? examples/s]

Map:   0%|          | 0/240 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.663567,0.572165
2,0.505389,0.458877


Macro F1 (argmax) for pol: 0.7989

Language: rus
Rows: 3348 Label counts: {0: np.int64(2325), 1: np.int64(1023)}


Map:   0%|          | 0/3013 [00:00<?, ? examples/s]

Map:   0%|          | 0/335 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.617445,0.485528
2,0.490982,0.430296


Macro F1 (argmax) for rus: 0.7748

Language: spa
Rows: 3305 Label counts: {1: np.int64(1660), 0: np.int64(1645)}


Map:   0%|          | 0/2974 [00:00<?, ? examples/s]

Map:   0%|          | 0/331 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.693087,0.652137
2,0.59229,0.571353


Macro F1 (argmax) for spa: 0.7066

Language: swa
Rows: 6991 Label counts: {1: np.int64(3504), 0: np.int64(3487)}


Map:   0%|          | 0/6291 [00:00<?, ? examples/s]

Map:   0%|          | 0/700 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.548819,0.60933
2,0.506113,0.50301


Macro F1 (argmax) for swa: 0.7643

Language: tel
Rows: 2366 Label counts: {1: np.int64(1274), 0: np.int64(1092)}


Map:   0%|          | 0/2129 [00:00<?, ? examples/s]

Map:   0%|          | 0/237 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.554763,0.530975
2,0.388803,0.414272


Macro F1 (argmax) for tel: 0.8136

Language: tur
Rows: 2364 Label counts: {0: np.int64(1209), 1: np.int64(1155)}


Map:   0%|          | 0/2127 [00:00<?, ? examples/s]

Map:   0%|          | 0/237 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.654473,0.626124
2,0.539475,0.58986


Macro F1 (argmax) for tur: 0.7046

Language: urd
Rows: 3563 Label counts: {1: np.int64(2476), 0: np.int64(1087)}


Map:   0%|          | 0/3206 [00:00<?, ? examples/s]

Map:   0%|          | 0/357 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.621669,0.56062
2,0.499359,0.544605


Macro F1 (argmax) for urd: 0.6670

Language: zho
Rows: 4280 Label counts: {0: np.int64(2159), 1: np.int64(2121)}


Map:   0%|          | 0/3852 [00:00<?, ? examples/s]

Map:   0%|          | 0/428 [00:00<?, ? examples/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

XLMRobertaForSequenceClassification LOAD REPORT from: xlm-roberta-base
Key                         | Status     | 
----------------------------+------------+-
roberta.pooler.dense.bias   | UNEXPECTED | 
lm_head.dense.weight        | UNEXPECTED | 
roberta.pooler.dense.weight | UNEXPECTED | 
lm_head.layer_norm.bias     | UNEXPECTED | 
lm_head.layer_norm.weight   | UNEXPECTED | 
lm_head.bias                | UNEXPECTED | 
lm_head.dense.bias          | UNEXPECTED | 
classifier.out_proj.weight  | MISSING    | 
classifier.out_proj.bias    | MISSING    | 
classifier.dense.weight     | MISSING    | 
classifier.dense.bias       | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss
1,0.442975,0.402317
2,0.351062,0.388851


Macro F1 (argmax) for zho: 0.8388

Saved: /content/per_lang_results/per_language_macro_f1.csv


Unnamed: 0,lang,n_total,n_train,n_val,macro_f1,note
11,nep,2005,1804,201,0.860665,
21,zho,4280,3852,428,0.838784,
6,hau,3651,3285,366,0.830556,
10,mya,2889,2600,289,0.820083,
2,ben,3333,2999,334,0.815778,
18,tel,2366,2129,237,0.813599,
14,pol,2391,2151,240,0.798855,
4,eng,3222,2899,323,0.781953,
15,rus,3348,3013,335,0.77483,
1,arb,3380,3042,338,0.774009,


In [7]:
import os, glob
import pandas as pd

TRAIN_DIR = "/content/train"

rows = []
for p in sorted(glob.glob(os.path.join(TRAIN_DIR, "*.csv"))):
    lang = os.path.splitext(os.path.basename(p))[0]
    df = pd.read_csv(p)

    if not {"polarization"}.issubset(df.columns):
        raise ValueError(f"{lang}: missing 'polarization' column. Columns={df.columns.tolist()}")

    y = pd.to_numeric(df["polarization"], errors="coerce")
    y = y[y.isin([0, 1])].astype(int)

    n = len(y)
    n1 = int((y == 1).sum())
    n0 = int((y == 0).sum())
    p1 = n1 / n if n else 0.0
    ratio_maj_min = (max(n0, n1) / max(1, min(n0, n1))) if n else None

    rows.append({
        "lang": lang,
        "n_total": n,
        "n_0": n0,
        "n_1": n1,
        "p(1)": round(p1, 4),
        "majority/minority": round(ratio_maj_min, 2) if ratio_maj_min is not None else None
    })

stats = pd.DataFrame(rows).sort_values("p(1)", ascending=False)
print(stats.to_string(index=False))

lang  n_total  n_0  n_1   p(1)  majority/minority
 khm     6640  611 6029 0.9080               9.87
 hin     2744  398 2346 0.8550               5.89
 amh     3332  814 2518 0.7557               3.09
 fas     3295  855 2440 0.7405               2.85
 urd     3563 1087 2476 0.6949               2.28
 mya     2889 1207 1682 0.5822               1.39
 tel     2366 1092 1274 0.5385               1.17
 nep     2005  997 1008 0.5027               1.01
 spa     3305 1645 1660 0.5023               1.01
 swa     6991 3487 3504 0.5012               1.00
 zho     4280 2159 2121 0.4956               1.02
 pan     1700  860  840 0.4941               1.02
 tur     2364 1209 1155 0.4886               1.05
 deu     3180 1668 1512 0.4755               1.10
 arb     3380 1868 1512 0.4473               1.24
 ben     3333 1909 1424 0.4272               1.34
 pol     2391 1388 1003 0.4195               1.38
 ita     3334 1966 1368 0.4103               1.44
 eng     3222 2047 1175 0.3647               1.74
