In [1]:
import torch

# Report whether a CUDA device is visible and, if so, which one.
cuda_ready = torch.cuda.is_available()
if cuda_ready:
    gpu_name = torch.cuda.get_device_name(0)
else:
    gpu_name = "None"

print("GPU available:", cuda_ready)
print("GPU name:", gpu_name)

GPU available: True
GPU name: Tesla T4


In [2]:
import os
# Sets CUDA_LAUNCH_BLOCKING=1 for this process.
# NOTE(review): this is normally a debugging switch (synchronous kernel launches
# for clearer error locations) and typically slows training -- confirm it is
# still wanted. It is also set after the previous cell already queried the GPU,
# so verify that it actually takes effect at this point.
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"

In [3]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
import os
import gc
from datasets import Dataset

from sklearn.metrics import confusion_matrix, classification_report, f1_score
from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)

from sklearn.utils.class_weight import compute_class_weight

import torch.nn as nn
from transformers import Trainer
from torch.utils.data import DataLoader

In [4]:
# Turn off parallelism in the Hugging Face tokenizers library for this process
# (presumably to avoid its fork-related warnings -- confirm).
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Configuration

In [5]:
# Tab-separated input files, resolved relative to the working directory.
TRAIN_DATA_PATH = "PCL_train_dataset.tsv"
VAL_DATA_PATH = "PCL_val_dataset.tsv"
TEST_DATA_PATH = "PCL_test_dataset.tsv"

# Directory the fine-tuned model and tokenizer are saved to.
# NOTE(review): absolute path under /content, i.e. this assumes a Google Colab runtime.
SAVED_MODEL_PATH = "/content/best_model"

# Hugging Face checkpoint used for both the tokenizer and the classifier.
MODEL_NAME = "roberta-base"
# Seed used when shuffling the training rows.
RANDOM_SEED = 42
# Maximum number of tokens kept per example; longer inputs are truncated.
MAX_LENGTH = 256  # roberta-base limit: 512

# Data Loading and Preprocessing



In [None]:
import sys
# Only prompt for a file upload when running inside Google Colab; elsewhere the
# TSV files are expected to already be present at the configured paths.
if "google.colab" in sys.modules:
    from google.colab import files
    # Opens the browser upload dialog; the recorded output shows the three
    # PCL_*.tsv files being saved into the working directory.
    files.upload()

Saving PCL_test_dataset.tsv to PCL_test_dataset.tsv
Saving PCL_train_dataset.tsv to PCL_train_dataset.tsv
Saving PCL_val_dataset.tsv to PCL_val_dataset.tsv


In [7]:
# Module-level tokenizer for MODEL_NAME (fast implementation requested); it is
# used by tokenize() and later saved alongside the model.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, use_fast=True)

def tokenize(batch):
    """Tokenize the ``text`` entries of a batch with the module-level tokenizer.

    Sequences are truncated to ``MAX_LENGTH`` tokens and deliberately left
    unpadded so that ``DataCollatorWithPadding`` can pad each batch dynamically.
    """
    encode_options = {
        "truncation": True,
        "padding": False,
        "max_length": MAX_LENGTH,
    }
    return tokenizer(batch["text"], **encode_options)

def make_hf_dataset(data_path: str, is_train: bool = False, is_test: bool = False):
    """Load one TSV split and convert it into a tokenized Hugging Face ``Dataset``.

    Args:
        data_path: Path to a tab-separated file. The first 9 lines are skipped
            and the file is parsed without a header row.
        is_train: When True, the rows are shuffled (seeded with ``RANDOM_SEED``)
            before tokenization.
        is_test: When True, the file is read without a ``PCL_category`` column
            and no ``labels`` column is created.

    Returns:
        A ``(ds, df)`` tuple: the tokenized dataset, formatted to yield torch
        tensors for ``input_ids`` and ``attention_mask`` (plus ``labels`` unless
        ``is_test``), and the pandas DataFrame it was built from.
    """
    columns = ["article_id", "keyword", "country", "text"]
    if not is_test:
        columns.append("PCL_category")

    # index_col=0 turns the first parsed column into the index; the index is
    # discarded later (preserve_index=False), so it never reaches the dataset.
    df = pd.read_csv(
        data_path,
        sep="\t",
        skiprows=9,
        engine="python",
        index_col=0,
        header=None,
        names=columns,
    )

    if is_train:
        df = df.sample(frac=1, random_state=RANDOM_SEED).reset_index(drop=True)

    if not is_test:
        # Binary target: 1 when PCL_category is 2 or higher, otherwise 0.
        df["labels"] = (df["PCL_category"] >= 2).astype("int64")

    # Fill missing texts BEFORE casting to str: casting first turns NaN into the
    # literal string "nan", which makes the subsequent fillna a no-op.
    df["text"] = df["text"].fillna("").astype(str)

    ds = Dataset.from_pandas(df, preserve_index=False)
    ds = ds.map(tokenize, batched=True)

    tensor_columns = ["input_ids", "attention_mask"]
    if not is_test:
        tensor_columns.append("labels")

    # Only these columns are exposed (as torch tensors) when the dataset is indexed.
    ds.set_format(type="torch", columns=tensor_columns)

    return ds, df

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/481 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [8]:
# Build the three splits. Only the training frame is kept, because the class
# weights are derived from its labels.
train_ds, df_train = make_hf_dataset(TRAIN_DATA_PATH, is_train=True)
val_ds, _ = make_hf_dataset(VAL_DATA_PATH)
test_ds, _ = make_hf_dataset(TEST_DATA_PATH, is_test=True)

n_train, n_val = len(train_ds), len(val_ds)
print("Length of training dataset:", n_train)
print("Length of validation dataset:", n_val)

Map:   0%|          | 0/8367 [00:00<?, ? examples/s]

Map:   0%|          | 0/2086 [00:00<?, ? examples/s]

Map:   0%|          | 0/3823 [00:00<?, ? examples/s]

Length of training dataset: 8367
Length of validation dataset: 2086


# Class Weights

In [9]:
# Extra multiplier applied to the positive class on top of the balanced weights.
POSITIVE_CLASS_BOOST = 2

train_labels = df_train["labels"].to_numpy()
labels = np.array(train_labels)  # 0/1 labels

# "balanced" weights are inversely proportional to class frequency.
balanced_weights = compute_class_weight(
    "balanced",
    classes=np.array([0, 1]),
    y=labels,
)
balanced_weights[1] = balanced_weights[1] * POSITIVE_CLASS_BOOST
class_weights = torch.tensor(balanced_weights, dtype=torch.float)

print("Class weights:", class_weights)

Class weights: tensor([ 0.5524, 10.5378])


In [10]:
class WeightedTrainer(Trainer):
    """``Trainer`` variant whose loss is a class-weighted cross-entropy.

    Args:
        class_weights: Tensor of per-class weights passed to the loss.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Run the model without labels and score its logits with weighted cross-entropy.

        Returns ``(loss, outputs)`` when ``return_outputs`` is true, otherwise
        just ``loss``.
        """
        # Labels are removed from the inputs so the model does not compute its own loss.
        targets = inputs.pop("labels").long()
        outputs = model(**inputs)
        raw_logits = outputs.logits

        # The loss is evaluated in float32 regardless of the model's compute dtype.
        weight_vector = self.class_weights.to(
            device=raw_logits.device, dtype=torch.float32
        )
        loss = torch.nn.functional.cross_entropy(
            raw_logits.float(), targets, weight=weight_vector
        )

        if return_outputs:
            return loss, outputs
        return loss

# Evaluation Metrics

In [11]:
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, average_precision_score

def compute_metrics(eval_pred):
    """Compute accuracy plus binary precision, recall and F1 from an evaluation pass.

    Args:
        eval_pred: Pair of ``(logits, labels)``; the predicted class is the
            argmax of the logits over the last axis.

    Returns:
        Dict with the keys ``accuracy``, ``precision``, ``recall`` and ``f1``.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    # zero_division=0 reports 0 when a metric's denominator is zero.
    prf = precision_recall_fscore_support(
        gold, predicted, average="binary", zero_division=0
    )

    metrics = {"accuracy": accuracy_score(gold, predicted)}
    metrics.update(zip(("precision", "recall", "f1"), prf[:3]))
    return metrics

# Model Training

In [15]:
from transformers import set_seed

# Pads every batch to its own longest sequence (tokenize() leaves inputs unpadded).
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Seed BEFORE building the model: the classifier head is newly initialized
# (see the MISSING keys in the load report), and Trainer only applies its seed
# once it is constructed, which is after the head has already been drawn.
set_seed(RANDOM_SEED)
model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)

args = TrainingArguments(
    output_dir="./out",
    learning_rate=5e-6,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=16,
    # NOTE(review): the recorded output of this cell shows 5 epochs (epoch 5.0),
    # so it predates this value; re-run the notebook to refresh the outputs.
    num_train_epochs=4,

    # Evaluate and checkpoint once per epoch so the best epoch can be restored.
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,

    # The "best" checkpoint is the one with the highest validation F1.
    metric_for_best_model="eval_f1",
    greater_is_better=True,

    report_to="none",
    fp16=False,
    bf16=False,
    max_grad_norm=1.0,
    # Tie the Trainer's seed to the notebook-wide constant.
    seed=RANDOM_SEED,
)


trainer = WeightedTrainer(
    model=model,
    args=args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    data_collator=data_collator,
    class_weights=class_weights,
    compute_metrics=compute_metrics,
)

trainer.train()

model.safetensors:   0%|          | 0.00/499M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

RobertaForSequenceClassification LOAD REPORT from: roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.dense.bias              | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.968381,0.701948,0.922339,0.716418,0.251309,0.372093
2,0.736073,0.760628,0.928092,0.675214,0.413613,0.512987
3,0.620075,0.743868,0.924257,0.588235,0.575916,0.582011
4,0.445515,0.726351,0.925216,0.588832,0.60733,0.597938
5,0.344042,0.789524,0.923778,0.583333,0.586387,0.584856


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.encoder.layer.3.attention.output.LayerNorm.bias', 'roberta.encoder.layer.3.output.LayerNorm.weight', 'roberta.encoder.layer.3.output.Laye

TrainOutput(global_step=10460, training_loss=0.6373866633748917, metrics={'train_runtime': 1464.3881, 'train_samples_per_second': 28.568, 'train_steps_per_second': 7.143, 'total_flos': 1903869541029840.0, 'train_loss': 0.6373866633748917, 'epoch': 5.0})

# Results

In [16]:
# Score the model on the training split, to gauge how far it is from the
# validation numbers.
train_metrics = trainer.evaluate(train_ds)
print("TRAIN:", train_metrics)

# With no argument, evaluate() uses the eval_dataset the trainer was built with.
val_metrics = trainer.evaluate()
print("VAL:", val_metrics)

TRAIN: {'eval_loss': 0.4210270047187805, 'eval_accuracy': 0.9783673957212861, 'eval_precision': 0.9147496617050067, 'eval_recall': 0.8513853904282116, 'eval_f1': 0.8819308545335942, 'eval_runtime': 64.7207, 'eval_samples_per_second': 129.278, 'eval_steps_per_second': 8.081, 'epoch': 5.0}
VAL: {'eval_loss': 0.7265282869338989, 'eval_accuracy': 0.925215723873442, 'eval_precision': 0.5888324873096447, 'eval_recall': 0.6073298429319371, 'eval_f1': 0.5979381443298969, 'eval_runtime': 15.7465, 'eval_samples_per_second': 132.474, 'eval_steps_per_second': 8.319, 'epoch': 5.0}


In [17]:
# Raw validation predictions: logits plus the gold label ids.
pred_val = trainer.predict(val_ds)
y_true = pred_val.label_ids
y_pred = np.argmax(pred_val.predictions, axis=-1)

val_confusion = confusion_matrix(y_true, y_pred)
val_report = classification_report(y_true, y_pred, digits=4, zero_division=0)

print("Confusion Matrix:\n\n", val_confusion)
print("\nClassification Report\n")
print(val_report)

Confusion Matrix:

 [[1814   81]
 [  75  116]]

Classification Report

              precision    recall  f1-score   support

           0     0.9603    0.9573    0.9588      1895
           1     0.5888    0.6073    0.5979       191

    accuracy                         0.9252      2086
   macro avg     0.7746    0.7823    0.7784      2086
weighted avg     0.9263    0.9252    0.9257      2086



# Save Model

In [18]:
# Persist the trainer's current model to SAVED_MODEL_PATH.
trainer.save_model(SAVED_MODEL_PATH)
# Store the tokenizer files next to the weights so the directory is
# self-contained; as the last expression, the written paths are displayed.
tokenizer.save_pretrained(SAVED_MODEL_PATH)

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

('/content/best_model/tokenizer_config.json',
 '/content/best_model/tokenizer.json')

In [19]:
# Archive the saved model directory; {SAVED_MODEL_PATH} is interpolated by
# IPython so the archive always follows the configured save location.
!zip -r hf_model.zip {SAVED_MODEL_PATH}

  adding: content/best_model/ (stored 0%)
  adding: content/best_model/config.json (deflated 50%)
  adding: content/best_model/training_args.bin (deflated 53%)
  adding: content/best_model/tokenizer_config.json (deflated 50%)
  adding: content/best_model/model.safetensors (deflated 11%)
  adding: content/best_model/tokenizer.json (deflated 82%)


In [20]:
import sys

# The browser download helper only exists inside Google Colab. Guard it the same
# way as the upload cell so that Run All does not fail with ImportError elsewhere.
if "google.colab" in sys.modules:
    from google.colab import files
    files.download("hf_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>