In [1]:
# CELL 1 — Install deps (lower bounds only; do NOT pin exact numpy/pandas versions in modern Colab)
!pip -q install -U "transformers>=4.40" "accelerate>=0.27" "sentencepiece" "sacremoses" \
               "scikit-learn>=1.4" "pandas>=2.2" "tqdm" "empath"


[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.5/79.5 kB[0m [31m4.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.7/57.7 kB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.6/57.6 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.3/10.3 MB[0m [31m46.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m897.5/897.5 kB[0m [31m16.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.9/8.9 MB[0m [31m35.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.9/10.9 MB[0m [31m44.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m78.4/78.4 kB[0m [31m3.8 MB/s[0m eta [36

In [2]:
import os, glob, json
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

import torch
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

from empath import Empath

# Directory scanned for per-language training CSVs (one "<lang>.csv" file per language).
TRAIN_DIR = "/content/train"
# Directory where per-language translation caches ("translate_<lang>.csv") are written.
CACHE_DIR = "/content/cache_translate"
# Create the cache directory up front so later cache writes do not fail.
os.makedirs(CACHE_DIR, exist_ok=True)

# Single seed reused for the train/val split and the classifier.
SEED = 42
# Seeds NumPy's legacy global RNG only; other libraries are not seeded here.
np.random.seed(SEED)


In [3]:
def load_all_train(train_dir: str) -> pd.DataFrame:
    """Read every ``*.csv`` in ``train_dir`` into one normalized DataFrame.

    Each file must provide the columns ``id``, ``text`` and ``polarization``.
    The file stem (e.g. ``urd`` for ``urd.csv``) becomes the ``lang`` value.
    Only rows whose ``polarization`` parses to 0 or 1 are kept.

    Args:
        train_dir: Directory containing one CSV per language.

    Returns:
        DataFrame with columns ``id`` (str), ``lang``, ``text`` (str) and
        ``labels`` (int, 0/1), files concatenated in sorted path order with a
        fresh 0..n-1 index.

    Raises:
        FileNotFoundError: If ``train_dir`` contains no CSV files.
        ValueError: If a CSV lacks one of the required columns.
    """
    required = {"id", "text", "polarization"}

    csv_files = glob.glob(os.path.join(train_dir, "*.csv"))
    csv_files.sort()
    if len(csv_files) == 0:
        raise FileNotFoundError(f"No CSVs found in {train_dir}")

    frames = []
    for csv_path in csv_files:
        raw = pd.read_csv(csv_path)

        # Minimal schema enforcement before touching any column
        if not required.issubset(raw.columns):
            raise ValueError(f"{csv_path} columns={raw.columns.tolist()} expected id,text,polarization")

        # Language code is the file name without its extension
        stem, _ext = os.path.splitext(os.path.basename(csv_path))

        tidy = pd.DataFrame({
            "id": raw["id"].astype(str),
            "lang": stem,
            "text": raw["text"].astype(str),
            # Non-numeric labels become NaN and are filtered out just below
            "labels": pd.to_numeric(raw["polarization"], errors="coerce"),
        })

        # Keep only rows labeled exactly 0 or 1, then store labels as ints
        is_binary = tidy["labels"].isin([0, 1])
        tidy = tidy.loc[is_binary].copy()
        tidy["labels"] = tidy["labels"].astype(int)

        frames.append(tidy)

    return pd.concat(frames, ignore_index=True)

# Load every language file from TRAIN_DIR into one frame (columns: id, lang, text, labels).
data = load_all_train(TRAIN_DIR)
# Quick overview: total size, which languages were found, and the 0/1 label balance.
print("Total rows:", len(data))
print("Languages:", sorted(data["lang"].unique()))
print("Label counts:\n", data["labels"].value_counts())
# Last expression so the notebook renders the first rows as a table.
data.head()


Total rows: 73681
Languages: ['amh', 'arb', 'ben', 'deu', 'eng', 'fas', 'hau', 'hin', 'ita', 'khm', 'mya', 'nep', 'ori', 'pan', 'pol', 'rus', 'spa', 'swa', 'tel', 'tur', 'urd', 'zho']
Label counts:
 labels
1    39145
0    34536
Name: count, dtype: int64


Unnamed: 0,id,lang,text,labels
0,amh_6713e86058c564a4b874dd62227b7fbc,amh,ወፈፌ ቀን አልፎ ዕብድ ቀን ሲመጣ፣ ሰይጣን ፀበል ገብቶ ሰው ከሰይጣን ወጣ።,1
1,amh_50c28694a056e584ee76da86ed1875ef,amh,የአማራ ባንክ የምስጋና እና የዕውቅና መርሐ-ግብር አማራ ባንክ ከባንክ ባ...,0
2,amh_3fe8faab2cf4c60b9bed28eed5f1c864,amh,ራያ ግንባር ጎብዬ መከላከያ፣ የአማራ ልዩ ሀይል እና የምስራቅ አማራ ፋኖ...,0
3,amh_9b7badaab07f0e9e3dd77b99894bbb9d,amh,ሩሲያ ቴርሞባሪክ ቦምብ ከመጠቀሟ ጋር ተያይዞ መላው አውሮፓ ላይ ጭንቀት ...,0
4,amh_be6049aa059a1ccfce6077d0cb8fd9f2,amh,ዮኒ ማኛ ለማኝ ስግብግብ ሚዲዳዎችን ዱቄት በዱቄት። እውነት ነው በጣም ያ...,1


In [4]:
def per_language_split(df: pd.DataFrame, val_ratio=0.1, seed=42):
    """Split ``df`` into train/validation parts, stratified by label within each language.

    Languages that cannot be split with stratification are placed entirely in
    the training set instead of raising:
      * only one label class is present,
      * the rarest class has a single row,
      * the validation (or training) slice would hold fewer rows than classes.

    Args:
        df: Frame with at least the columns ``lang`` and ``labels``.
        val_ratio: Validation size passed to ``train_test_split`` as
            ``test_size`` (a float fraction, or an int row count).
        seed: Random state for the shuffle.

    Returns:
        ``(train_df, val_df)`` with fresh 0..n-1 indexes. When no language
        contributes validation rows, ``val_df`` is an empty frame with the
        same columns and dtypes as ``df``.
    """
    # Empty slice keeps column names and dtypes for the "nothing to return" cases.
    empty = df.iloc[0:0]

    train_parts, val_parts = [], []
    for lang, g in df.groupby("lang"):
        class_counts = g["labels"].value_counts()
        n_classes = len(class_counts)

        # Validation size computed as ceil(fraction * n) for floats, or taken as-is for ints.
        if isinstance(val_ratio, (int, np.integer)):
            n_val = int(val_ratio)
        else:
            n_val = int(np.ceil(val_ratio * len(g)))
        n_train = len(g) - n_val

        splittable = (
            n_classes >= 2
            and class_counts.min() >= 2   # stratification needs >= 2 rows per class
            and n_val >= n_classes        # each side must be able to hold every class
            and n_train >= n_classes
        )
        if not splittable:
            # Cannot stratify this language: keep all of it for training.
            train_parts.append(g)
            continue

        tr, va = train_test_split(
            g, test_size=val_ratio, random_state=seed, stratify=g["labels"]
        )
        train_parts.append(tr)
        val_parts.append(va)

    # pd.concat([]) raises, so fall back to the typed empty frame.
    train_df = pd.concat(train_parts, ignore_index=True) if train_parts else empty.copy()
    val_df = pd.concat(val_parts, ignore_index=True) if val_parts else empty.copy()
    return train_df, val_df

# Hold out 10% of each language for validation, stratified by label, using the shared SEED.
train_df, val_df = per_language_split(data, val_ratio=0.1, seed=SEED)

# Report split sizes and label balance for both parts.
print("Train rows:", len(train_df), "Val rows:", len(val_df))
print("Train label counts:\n", train_df["labels"].value_counts())
print("Val label counts:\n", val_df["labels"].value_counts())


Train rows: 66303 Val rows: 7378
Train label counts:
 labels
1    35227
0    31076
Name: count, dtype: int64
Val label counts:
 labels
1    3918
0    3460
Name: count, dtype: int64


In [5]:
# Maps the dataset's language code (the CSV file stem stored in the "lang" column)
# to the language code set as the NLLB tokenizer's source language for translation.
# Several entries differ from the dataset code, see the inline notes.
LANG2NLLB = {
    "amh": "amh_Ethi",
    "arb": "arb_Arab",
    "ben": "ben_Beng",
    "deu": "deu_Latn",
    "eng": "eng_Latn",
    "fas": "pes_Arab",   # Persian in NLLB is typically 'pes_Arab'
    "hau": "hau_Latn",
    "hin": "hin_Deva",
    "ita": "ita_Latn",
    "khm": "khm_Khmr",
    "mya": "mya_Mymr",
    "nep": "npi_Deva",   # Nepali in NLLB often uses npi_Deva
    "ori": "ory_Orya",   # Odia uses ory_Orya
    "pan": "pan_Guru",   # Punjabi (Gurmukhi). If your text is Shahmukhi, adjust.
    "pol": "pol_Latn",
    "rus": "rus_Cyrl",
    "spa": "spa_Latn",
    "swa": "swh_Latn",   # Swahili code in NLLB is swh_Latn
    "tel": "tel_Telu",
    "tur": "tur_Latn",
    "urd": "urd_Arab",
    "zho": "zho_Hans",
}

# quick sanity check
# Lists dataset languages that have no entry above. This only prints; it does not stop
# execution, so an unmapped language would surface later as a KeyError on LANG2NLLB[lang].
missing = sorted(set(data["lang"].unique()) - set(LANG2NLLB.keys()))
print("Missing mappings:", missing)


Missing mappings: []


In [6]:
# Hugging Face model id of the translation model used for X -> English.
MT_MODEL = "facebook/nllb-200-distilled-600M"

# Load the matching tokenizer and seq2seq model (downloaded on first use).
tokenizer_mt = AutoTokenizer.from_pretrained(MT_MODEL)
model_mt = AutoModelForSeq2SeqLM.from_pretrained(MT_MODEL)

# Use the GPU when one is available, otherwise fall back to CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model_mt = model_mt.to(device)
# Inference only: switch the model to evaluation mode.
model_mt.eval()

print("Device:", device)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/846 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/564 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.3M [00:00<?, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]



pytorch_model.bin:   0%|          | 0.00/2.46G [00:00<?, ?B/s]

Loading weights:   0%|          | 0/512 [00:00<?, ?it/s]

model.safetensors:   0%|          | 0.00/2.46G [00:00<?, ?B/s]



generation_config.json:   0%|          | 0.00/189 [00:00<?, ?B/s]

Device: cuda


In [7]:
# =========================
# CELL 7b — Fast translation with progress bar (NLLB)
# =========================
# NOTE: tqdm is already imported in the imports cell; this re-import is redundant but harmless.
from tqdm.auto import tqdm

# Language code of the translation target (English, Latin script); every text is translated into it.
TARGET_LANG = "eng_Latn"

def get_bos_id(lang_code: str, tokenizer=None) -> int:
    """Return the token id of a language-code token, for use as ``forced_bos_token_id``.

    Args:
        lang_code: Language code token to look up (e.g. ``"eng_Latn"``).
        tokenizer: Tokenizer exposing ``convert_tokens_to_ids`` and
            ``unk_token_id``. Defaults to the notebook-level ``tokenizer_mt``.

    Returns:
        The integer id of ``lang_code`` in the tokenizer's vocabulary.

    Raises:
        ValueError: If the tokenizer does not know ``lang_code`` (the lookup
            returns ``None`` or the unknown-token id).
    """
    # Resolve the global lazily so callers can validate codes against any tokenizer.
    tok = tokenizer_mt if tokenizer is None else tokenizer
    bos_id = tok.convert_tokens_to_ids(lang_code)
    if bos_id is None or bos_id == tok.unk_token_id:
        raise ValueError(f"Could not get BOS id for lang_code={lang_code}.")
    return int(bos_id)

# Resolve the target-language token id once; it is passed as forced_bos_token_id to generate().
TARGET_BOS_ID = get_bos_id(TARGET_LANG)

def translate_texts_nllb_fast(texts, src_lang_code, batch_size=16, max_new_tokens=128, num_beams=1,
                              max_input_length=256):
    """Translate ``texts`` into the target language in batches, with a progress bar.

    Uses the notebook-level ``tokenizer_mt``, ``model_mt``, ``device`` and
    ``TARGET_BOS_ID``. Sets ``tokenizer_mt.src_lang`` as a side effect
    (skipped when ``texts`` is empty).

    Args:
        texts: Iterable of source strings (list, Series, generator, ...).
        src_lang_code: Source language code assigned to ``tokenizer_mt.src_lang``.
        batch_size: Number of texts per generation call; must be >= 1.
        max_new_tokens: Generation length cap; longer translations are cut off.
        num_beams: Beam width (1 = greedy decoding, since sampling is disabled).
        max_input_length: Inputs longer than this many tokens are truncated.

    Returns:
        List of translated strings, one per input, in input order.

    Raises:
        ValueError: If ``batch_size`` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    # Materialize once: guarantees len() and positional slicing for any iterable.
    texts = list(texts)
    if not texts:
        return []

    tokenizer_mt.src_lang = src_lang_code
    out = []

    # One progress-bar step per batch
    for start in tqdm(range(0, len(texts), batch_size), desc=f"MT {src_lang_code}→eng", leave=False):
        batch = texts[start:start + batch_size]

        inputs = tokenizer_mt(
            batch, return_tensors="pt", padding=True, truncation=True, max_length=max_input_length
        ).to(device)

        with torch.no_grad():
            gen = model_mt.generate(
                **inputs,
                forced_bos_token_id=TARGET_BOS_ID,  # first generated token selects the output language
                max_new_tokens=max_new_tokens,
                num_beams=num_beams,
                do_sample=False
            )
        out.extend(tokenizer_mt.batch_decode(gen, skip_special_tokens=True))

    return out


def translate_one_language_with_cache(df_lang: pd.DataFrame, lang: str, cache_dir=CACHE_DIR):
    """Add an English ``text_en`` column to one language's rows, using an on-disk cache.

    Translations are cached per language in ``<cache_dir>/translate_<lang>.csv``
    (columns ``id``, ``text_en``). Rows whose ``id`` is already cached are
    filled from the cache; only the remaining rows are translated, and the
    cache file is then rewritten with old and new entries merged.

    Args:
        df_lang: Rows of a single language; needs ``id`` and ``text`` columns.
        lang: Dataset language code; must be a key of ``LANG2NLLB``.
        cache_dir: Directory holding the cache files (created if missing).

    Returns:
        A copy of ``df_lang`` with ``id`` cast to str and a ``text_en`` column.

    Raises:
        KeyError: If ``lang`` has no entry in ``LANG2NLLB``.
        RuntimeError: If the translator returns a different number of texts
            than it was given.
    """
    src_code = LANG2NLLB[lang]
    # Make sure the cache location exists before doing any expensive work.
    os.makedirs(cache_dir, exist_ok=True)
    cache_path = os.path.join(cache_dir, f"translate_{lang}.csv")

    df_lang = df_lang.copy()
    df_lang["id"] = df_lang["id"].astype(str)

    # Load the existing cache once. Read everything as plain strings with NA parsing off,
    # so an empty translation stays "" and texts like "NA"/"null" are not turned into
    # NaN and then into the literal string "nan".
    cached = None
    cached_map = {}
    if os.path.exists(cache_path):
        cached = pd.read_csv(cache_path, dtype=str, keep_default_na=False)
        cached_map = dict(zip(cached["id"], cached["text_en"]))

    # Object dtype so strings can be assigned next to the not-yet-translated NaN slots.
    df_lang["text_en"] = df_lang["id"].map(cached_map).astype("object")

    todo_mask = df_lang["text_en"].isna()
    missing = df_lang[todo_mask]
    if len(missing) == 0:
        return df_lang

    print(f"Translating {lang}: {len(missing)} rows -> EN (src={src_code})")

    texts = missing["text"].astype(str).tolist()
    en_texts = translate_texts_nllb_fast(
        texts, src_code,
        batch_size=16,
        max_new_tokens=128,
        num_beams=1
    )
    if len(en_texts) != len(texts):
        raise RuntimeError(
            f"Translator returned {len(en_texts)} texts for {len(texts)} inputs (lang={lang})"
        )

    # Positional assignment through the boolean mask; safe even if index labels repeat.
    df_lang.loc[todo_mask, "text_en"] = en_texts

    # Merge new translations into the cache; on duplicate ids the newest entry wins.
    new_cache = pd.DataFrame({"id": missing["id"].tolist(), "text_en": en_texts})
    if cached is not None:
        merged = pd.concat([cached, new_cache], ignore_index=True).drop_duplicates("id", keep="last")
    else:
        merged = new_cache

    # Write to a temp file and swap it in, so an interrupted write cannot corrupt the cache.
    tmp_path = cache_path + ".tmp"
    merged.to_csv(tmp_path, index=False)
    os.replace(tmp_path, cache_path)

    return df_lang

def translate_df_with_cache_fast(df: pd.DataFrame, cache_dir=CACHE_DIR) -> pd.DataFrame:
    """Translate a multi-language frame to English, one language at a time, with caching.

    Args:
        df: Frame with ``id``, ``lang`` and ``text`` columns.
        cache_dir: Directory holding the per-language translation caches.

    Returns:
        Frame with an added ``text_en`` column and a fresh 0..n-1 index. Rows
        come back grouped by language (groupby order), not in the input order.
        An input without any language groups (e.g. an empty validation split)
        yields an empty copy that still carries the ``text_en`` column.
    """
    parts = [
        translate_one_language_with_cache(g, lang, cache_dir=cache_dir)
        for lang, g in df.groupby("lang")
    ]

    if not parts:
        # pd.concat([]) raises; return a typed empty result instead.
        out = df.iloc[0:0].copy()
        out["text_en"] = pd.Series(index=out.index, dtype="object")
        return out.reset_index(drop=True)

    return pd.concat(parts, ignore_index=True)


In [1]:
# Translate both splits to English; rows already present in the per-language cache are reused.
# NOTE: this cell needs the cells above to have been run in the same kernel first
# (they define translate_df_with_cache_fast, train_df and val_df).
train_tr = translate_df_with_cache_fast(train_df)
val_tr   = translate_df_with_cache_fast(val_df)

# Preview the translated text next to its language and label.
train_tr[["lang", "text_en", "labels"]].head()


NameError: name 'translate_df_with_cache_fast' is not defined

In [None]:
# Empath lexicon used to score the English translations.
lex = Empath()
# Sorted category names give a fixed column order for the feature matrix.
CATS = sorted(list(lex.cats.keys()))
print("Empath categories:", len(CATS))

def empath_features(texts):
    """Build a dense Empath feature matrix for a list of English texts.

    Uses the notebook-level ``lex`` (Empath lexicon) and ``CATS`` (sorted
    category names, which define the column order).

    Args:
        texts: Sequence of strings (must support ``len``).

    Returns:
        ``float32`` array of shape ``(len(texts), len(CATS))``; entry ``[i, j]``
        is the normalized Empath score of text ``i`` for category ``CATS[j]``.
        Texts for which Empath returns no scores get an all-zero row.
    """
    feats = np.zeros((len(texts), len(CATS)), dtype=np.float32)
    for i, t in enumerate(tqdm(texts, desc="Empath")):
        d = lex.analyze(t, normalize=True)  # normalize=True gives comparable scale
        if d is None:
            # With normalize=True, Empath returns None for a document with no tokens
            # (e.g. an empty translation); keep the zero row rather than failing on .get().
            continue
        feats[i] = [d.get(c, 0.0) for c in CATS]
    return feats

# Feature matrices from the English translations; missing translations are scored as empty text.
X_train = empath_features(train_tr["text_en"].fillna("").tolist())
y_train = train_tr["labels"].to_numpy()

X_val = empath_features(val_tr["text_en"].fillna("").tolist())
y_val = val_tr["labels"].to_numpy()

# Shapes are (rows, number of Empath categories).
print("X_train:", X_train.shape, "X_val:", X_val.shape)


In [None]:
# Logistic-regression baseline on the Empath features.
clf = LogisticRegression(
    max_iter=2000,
    class_weight="balanced",  # reweight classes to offset the 0/1 label imbalance
    solver="liblinear",   # stable for small/medium dense features
    random_state=SEED
)

clf.fit(X_train, y_train)

# Evaluate on the held-out validation rows with macro-averaged F1.
val_pred = clf.predict(X_val)
macro_f1 = f1_score(y_val, val_pred, average="macro")
print("Macro F1 (val):", round(macro_f1, 4))
