In [1]:
import torch
# Query CUDA availability once and reuse the answer for both report lines.
cuda_ready = torch.cuda.is_available()
print("GPU available:", cuda_ready)
print("GPU name:", torch.cuda.get_device_name(0) if cuda_ready else "None")

GPU available: True
GPU name: Tesla T4


In [2]:
import os
# Debugging aid: asks CUDA to run kernel launches synchronously so errors are
# reported at the call that caused them (this typically slows training).
# NOTE(review): this variable is normally read when CUDA is initialised; the
# previous cell already calls torch.cuda.get_device_name(0), so the setting may
# come too late to take effect — confirm, and move it above the first CUDA call
# (or remove it once debugging is finished).
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import os
import gc
from datasets import Dataset

from sklearn.metrics import confusion_matrix, classification_report, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

from sklearn.utils.class_weight import compute_class_weight

import torch.nn as nn
from transformers import Trainer
from torch.utils.data import DataLoader

In [4]:
# Switch off parallelism in the Hugging Face tokenizers library through its
# environment variable; this is set before any tokenizer is created in this notebook.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Configurations

In [5]:
# Tab-separated dataset files; each one is read by make_hf_dataset.
TRAIN_DATA_PATH = "PCL_train_dataset.tsv"
VAL_DATA_PATH = "PCL_val_dataset.tsv"
TEST_DATA_PATH = "PCL_test_dataset.tsv"

# Directory that receives the final model and tokenizer files.
# NOTE(review): absolute Colab-style path — adjust when running elsewhere.
SAVED_MODEL_PATH = "/content/best_model"

# Checkpoint name passed to AutoTokenizer and AutoModelForSequenceClassification.
MODEL_NAME = "roberta-base"
# Seed used when shuffling the training DataFrame.
RANDOM_SEED = 42
# Token cap used during tokenization; longer inputs are truncated to this length.
MAX_LENGTH = 256  # roberta-base limit: 512

# Data Loading and Preprocessing



In [6]:
# import sys
# if "google.colab" in sys.modules:
#     from google.colab import files
#     files.upload()

In [7]:
# def clean_df(df: pd.DataFrame, text_col: str = "text") -> pd.DataFrame:
#     # Use character length (as in your code). If you prefer word length, switch to .str.split().str.len()
#     sentence_len = df[text_col].astype(str).str.len()

#     Q1 = sentence_len.quantile(0.25)
#     Q3 = sentence_len.quantile(0.75)
#     IQR = Q3 - Q1

#     train_text_max_len = TRAIN_TEXT_MIN_LEN + 1.5 * IQR
#     outliers = df[(sentence_len < TRAIN_TEXT_MIN_LEN) | (sentence_len > train_text_max_len)]

#     outlier_percentage = (len(outliers) / len(df)) * 100
#     if outlier_percentage <= 5:
#         df = df.drop(outliers.index)

#     return df.reset_index(drop=True)

# def data_read_split(data_path: str, test_size: float = 0.2, seed: int = 42, clean: bool = False):
#     df = pd.read_csv(
#         data_path,
#         sep="\t",
#         skiprows=9,
#         engine="python",
#         index_col=0,
#         header=None,
#         names=["article_id", "keyword", "country", "text", "PCL_category"],
#     )

#     # binary label
#     df["is_PCL"] = (df["PCL_category"] >= 2).astype(int)

#     # basic NA handling
#     df["text"] = df["text"].astype(str).fillna("")
#     # df["text"] = df["text"].astype(str).dropna()

#     df_train, df_test = train_test_split(
#         df, test_size=test_size, random_state=seed, shuffle=True, stratify=df["is_PCL"]
#     )

#     if clean:
#         df_train = clean_df(df_train, "text")
#         # Usually do NOT clean test set to keep evaluation realistic
#         # df_test = clean_df(df_test, "text")

#     return df_train, df_test

# def roberta_filtering(df: pd.DataFrame) -> pd.DataFrame:
#     # Keep only what you need; Trainer expects "labels"
#     out = df[["text", "is_PCL"]].copy()
#     out = out.rename(columns={"is_PCL": "labels"})
#     out["labels"] = out["labels"].astype(int)
#     return out

# def make_hf_datasets(df_train: pd.DataFrame, df_test: pd.DataFrame, tokenizer, max_length: int = 256):
#     train_ds = Dataset.from_pandas(df_train, preserve_index=False)
#     test_ds  = Dataset.from_pandas(df_test, preserve_index=False)

#     def tokenize(batch):
#         return tokenizer(
#             batch["text"],
#             truncation=True,
#             padding=False,   # let DataCollatorWithPadding handle dynamic padding
#             max_length=max_length,
#         )

#     train_ds = train_ds.map(tokenize, batched=True)
#     test_ds  = test_ds.map(tokenize, batched=True)

#     # Set format for PyTorch
#     cols = ["input_ids", "attention_mask", "labels"]
#     train_ds.set_format(type="torch", columns=cols)
#     test_ds.set_format(type="torch", columns=cols)
#     return train_ds, test_ds

# data_path = "dontpatronizeme_pcl.tsv"
# df_train_raw, df_test_raw = data_read_split(data_path)

# df_train = roberta_filtering(df_train_raw)
# df_test  = roberta_filtering(df_test_raw)

# tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)
# train_ds, val_ds = make_hf_datasets(df_train, df_test, tokenizer, max_length=256)
# print("Length of training dataset:", len(train_ds))
# print("Length of validation dataset:", len(val_ds))


In [8]:
# Fast tokenizer belonging to the checkpoint named by MODEL_NAME.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize(batch):
  """Tokenize the ``text`` field of a batch, truncating to MAX_LENGTH tokens.

  Padding is deliberately left off here: DataCollatorWithPadding pads each
  batch dynamically when batches are assembled.
  """
  encode_options = dict(truncation=True, padding=False, max_length=MAX_LENGTH)
  return tokenizer(batch["text"], **encode_options)

def make_hf_dataset(data_path: str, is_train: bool = False, is_test: bool = False):
  """Load one TSV split and turn it into a tokenized Hugging Face dataset.

  Args:
    data_path: Path to the tab-separated file. The first 9 lines are skipped
      and the remaining rows are read without a header row.
    is_train: When True, the rows are shuffled using RANDOM_SEED.
    is_test: When True, the file is read without a ``PCL_category`` column
      and no ``labels`` column is produced.

  Returns:
    A ``(ds, df)`` tuple: the tokenized ``Dataset`` in torch format (exposing
    ``input_ids``, ``attention_mask`` and, unless ``is_test``, ``labels``) and
    the pandas DataFrame it was built from.
  """
  columns = ["article_id", "keyword", "country", "text"]
  if not is_test:
    columns += ["PCL_category"]

  df = pd.read_csv(
        data_path,
        sep="\t",
        skiprows=9,
        engine="python",
        index_col=0,
        header=None,
        names=columns,
    )

  if is_train:
    # Deterministic shuffle of the training rows.
    df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

  if not is_test:
    # Binary target: categories 2 and above count as the positive class.
    df["labels"] = (df["PCL_category"] >= 2).astype("int64")

  # Fill missing texts BEFORE casting to str: casting first would turn NaN into
  # the literal string "nan", leaving fillna with nothing to replace.
  df["text"] = df["text"].fillna("").astype(str)

  ds = Dataset.from_pandas(df, preserve_index=False)
  ds = ds.map(tokenize, batched=True)

  cols = ["input_ids", "attention_mask"]
  if not is_test:
    cols += ["labels"]

  # Only these columns are exposed (as torch tensors) when the dataset is indexed.
  ds.set_format(type="torch", columns=cols)

  return ds, df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


In [9]:
# Build the three tokenized splits; only the training DataFrame is kept,
# because the class-weight computation reads its labels.
train_ds, df_train = make_hf_dataset(TRAIN_DATA_PATH, is_train=True)
val_ds = make_hf_dataset(VAL_DATA_PATH)[0]
test_ds = make_hf_dataset(TEST_DATA_PATH, is_test=True)[0]

for caption, split in (
    ("Length of training dataset:", train_ds),
    ("Length of validation dataset:", val_ds),
):
    print(caption, len(split))

Map:   0%|          | 0/8367 [00:00<?, ? examples/s]

Map:   0%|          | 0/2086 [00:00<?, ? examples/s]

Map:   0%|          | 0/3823 [00:00<?, ? examples/s]

Length of training dataset: 8367
Length of validation dataset: 2086


# Class Weights

In [10]:
# Extra multiplier applied to the positive class on top of the balanced weight.
POSITIVE_CLASS_BOOST = 2

train_labels = df_train["labels"].values
labels = np.array(train_labels)  # 0/1 labels

# "balanced" weights for the two classes, computed from the training labels.
balanced_weights = compute_class_weight(
    class_weight="balanced", classes=np.array([0, 1]), y=labels
)
balanced_weights[1] *= POSITIVE_CLASS_BOOST

class_weights = torch.tensor(balanced_weights, dtype=torch.float)

print("Class weights:", class_weights)

Class weights: tensor([ 0.5524, 10.5378])


In [11]:
class WeightedTrainer(Trainer):
    """Trainer whose loss is a class-weighted cross-entropy over the model logits.

    Args:
        class_weights: Tensor of per-class weights passed as ``weight`` to the
            cross-entropy loss. All other arguments go to ``Trainer`` unchanged.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and return the weighted cross-entropy.

        ``labels`` is removed from ``inputs`` before the forward pass. Returns
        ``(loss, outputs)`` when ``return_outputs`` is true, else just ``loss``.
        """
        targets = inputs.pop("labels").long()
        outputs = model(**inputs)
        raw_logits = outputs.logits

        # Logits and weights are both cast to float32 for the loss computation,
        # with the weights moved onto the logits' device.
        weights = self.class_weights.to(device=raw_logits.device, dtype=torch.float32)
        loss = torch.nn.functional.cross_entropy(
            raw_logits.float(), targets, weight=weights
        )

        if return_outputs:
            return loss, outputs
        return loss

# Evaluation Metrics

In [12]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, average_precision_score

def compute_metrics(eval_pred):
    """Turn ``(logits, labels)`` into accuracy, precision, recall and F1.

    Predictions are the argmax over the last logits axis. Precision, recall and
    F1 use ``average="binary"`` and are reported as 0 where undefined.
    """
    logits, labels = eval_pred
    hard_preds = np.argmax(logits, axis=-1)

    prec, rec, f_score, _support = precision_recall_fscore_support(
        labels, hard_preds, average="binary", zero_division=0
    )

    return dict(
        accuracy=accuracy_score(labels, hard_preds),
        precision=prec,
        recall=rec,
        f1=f_score,
    )

# Model Training

In [13]:
# def check_labels(ds, name="dataset", n=2000):
#     # sample a subset to be quick
#     n = min(n, len(ds))
#     labels = [ds[i]["labels"] for i in range(n)]
#     uniq = sorted(set(labels))
#     print(f"{name}: sample={n}, unique labels={uniq[:50]}{'...' if len(uniq)>50 else ''}")
#     # show suspicious values
#     bad = [x for x in uniq if x is None or (isinstance(x, float) and (x != x)) or x not in (0, 1)]
#     if bad:
#         print(f"❌ {name} has bad labels:", bad)
#     else:
#         print(f"✅ {name} labels look OK (0/1).")

# check_labels(train_ds, "train_ds")
# check_labels(val_ds, "val_ds")

train_ds: sample=2000, unique labels=[tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0)]...
✅ train_ds labels look OK (0/1).
val_ds: sample=2000, unique labels=[tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0), tensor(0),

In [14]:
# bad_idxs = []
# for i in range(len(train_ds)):
#     y = train_ds[i]["label"] if "label" in train_ds.column_names else train_ds[i]["labels"]
#     if y not in (0, 1):
#         bad_idxs.append(i)
#         if len(bad_idxs) >= 5:
#             break

# print("Bad indices:", bad_idxs)
# for i in bad_idxs:
#     print(i, train_ds[i])

Bad indices: []


In [15]:
# max_len_seen = max(len(x) for x in train_ds["input_ids"])
# print("Max tokenized length:", max_len_seen)

Max tokenized length: 256


In [16]:
# Pads each batch dynamically; the datasets were tokenized with padding=False.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Seed torch BEFORE loading the model: the classification head is not part of
# the roberta-base checkpoint and is randomly initialised at load time, so
# without this the starting weights (and the results) differ between runs.
torch.manual_seed(RANDOM_SEED)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

args = TrainingArguments(
    output_dir="./out",
    seed=RANDOM_SEED,  # use the notebook seed explicitly, not the library default
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    num_train_epochs=3,

    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

    # "Best" means the highest validation F1 reported by compute_metrics.
    metric_for_best_model="eval_f1",
    greater_is_better=True,

    report_to="none",
    fp16=False,
    bf16=False,
    max_grad_norm=1.0,
)


trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    class_weights=class_weights,
    compute_metrics=compute_metrics,
)

trainer.train()

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.904078,0.598063,0.920422,0.605042,0.376963,0.464516
2,0.644127,0.790982,0.92953,0.729167,0.366492,0.487805
3,0.398823,0.819425,0.916587,0.544974,0.539267,0.542105


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

TrainOutput(global_step=6276, training_loss=0.7024538491165813, metrics={'train_runtime': 972.1794, 'train_samples_per_second': 25.819, 'train_steps_per_second': 6.456, 'total_flos': 1142737049529900.0, 'train_loss': 0.7024538491165813, 'epoch': 3.0})

# Results

In [17]:
# Score the training split first, then the trainer's default eval dataset (val_ds).
train_metrics = trainer.evaluate(train_ds)
print("TRAIN:", train_metrics)

val_metrics = trainer.evaluate()
print("VAL:", val_metrics)

TRAIN: {'eval_loss': 0.41968220472335815, 'eval_accuracy': 0.9770527070634636, 'eval_precision': 0.9067567567567567, 'eval_recall': 0.845088161209068, 'eval_f1': 0.8748370273794003, 'eval_runtime': 70.2553, 'eval_samples_per_second': 119.094, 'eval_steps_per_second': 7.444, 'epoch': 3.0}
VAL: {'eval_loss': 0.8194250464439392, 'eval_accuracy': 0.9165867689357622, 'eval_precision': 0.544973544973545, 'eval_recall': 0.5392670157068062, 'eval_f1': 0.5421052631578948, 'eval_runtime': 17.0692, 'eval_samples_per_second': 122.209, 'eval_steps_per_second': 7.675, 'epoch': 3.0}


In [18]:
# Predict on the validation split and reduce the logits to hard class labels.
pred_val = trainer.predict(val_ds)
y_true, y_pred = pred_val.label_ids, np.argmax(pred_val.predictions, axis=-1)

val_confusion = confusion_matrix(y_true, y_pred)
val_report = classification_report(y_true, y_pred, digits=4, zero_division=0)

print("Confusion Matrix:\n\n", val_confusion)
print("\nClassification Report\n")
print(val_report)

Confusion Matrix:

 [[1809   86]
 [  88  103]]

Classification Report

              precision    recall  f1-score   support

           0     0.9536    0.9546    0.9541      1895
           1     0.5450    0.5393    0.5421       191

    accuracy                         0.9166      2086
   macro avg     0.7493    0.7469    0.7481      2086
weighted avg     0.9162    0.9166    0.9164      2086



# Save Model

In [20]:
# Write the trainer's current model, then the tokenizer, into the same folder.
# The tokenizer call stays last so its returned file paths are displayed.
trainer.save_model(output_dir=SAVED_MODEL_PATH)
tokenizer.save_pretrained(save_directory=SAVED_MODEL_PATH)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('/content/best_model/tokenizer_config.json',
 '/content/best_model/tokenizer.json')