In [None]:
# Mount Google Drive at /content/drive (this import only exists on Colab).
from google.colab import drive
drive.mount('/content/drive')

# Clone the project repository into the current working directory and cd
# into it so that relative paths in later cells resolve inside the repo.
# NOTE(review): later cells hard-code /content/nlp_final_project, which
# assumes this cell is run from /content — confirm.
!git clone https://github.com/atremante26/nlp_final_project.git
%cd nlp_final_project

In [None]:
!pip install -q transformers torch scikit-learn pandas tqdm gensim

In [None]:
# IMPORTS
import os, random, sys
import numpy as np
import pandas as pd
from tqdm.auto import tqdm
from google.colab import files

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.optim import AdamW

from transformers import AutoTokenizer, get_linear_schedule_with_warmup
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score,
    average_precision_score, roc_auc_score
)

In [None]:
# SET SEED
def set_seed(seed=42):
    """Seed every random number generator used by this notebook.

    Applies ``seed`` to Python's ``random`` module, NumPy's global generator,
    and PyTorch's CPU and CUDA generators, in that order.

    Args:
        seed: Integer seed shared by all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)

# Seed all RNGs before any data loading or model construction below.
set_seed(42)

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [None]:
# PROJECT PATH SETUP + PROJECT-LOCAL IMPORTS
# Define project path
project_path = '/content/nlp_final_project'
# Put the repo root at the front of sys.path so `models` can be imported below.
sys.path.insert(0, project_path)

# Import model
from models import MultiTaskRoBERTa, MultiTaskDataset

# Put <repo>/src at the front of sys.path so `data_preprocessing` can be imported.
sys.path.insert(0, os.path.join(project_path, 'src'))
from data_preprocessing import load_multi_task_data
import data_preprocessing
# Point the module-level DATA_PATH at the processed CSV inside the cloned repo.
# NOTE(review): this only takes effect if load_multi_task_data reads
# data_preprocessing.DATA_PATH at call time — confirm in src/data_preprocessing.py.
data_preprocessing.DATA_PATH = '/content/nlp_final_project/data/processed/dices_350_binary.csv'

In [None]:
# LOAD DATA
# Load the train/val/test splits without class balancing.
splits = load_multi_task_data(balance=False)

# Give every split a fresh 0..n-1 index, discarding the original one.
train_df, val_df, test_df = (
    splits[split_name].reset_index(drop=True)
    for split_name in ("train", "val", "test")
)

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))
train_df.head()

In [None]:
# DEFINE TASKS
# Two label sets: the two-task set, and the four-task set that extends it.
TASKS_2 = ["Q_overall", "Q2_harmful"]
TASKS_4 = [*TASKS_2, "Q3_bias", "Q6_policy"]

# The task set used by every later cell.
TASKS = TASKS_4
print("Active tasks:", TASKS)

Active tasks: ['Q_overall', 'Q2_harmful', 'Q3_bias', 'Q6_policy']


In [None]:
# TASK TO COLUMN MAPPING
# Maps each task name to the DataFrame column that holds its binary label.
task_to_col = dict(
    Q_overall="Q_overall_binary",
    Q2_harmful="Q2_harmful_binary",
    Q3_bias="Q3_bias_binary",
    Q6_policy="Q6_policy_binary",
)

# Show the training-label distribution (missing values included) per active task.
for task_name in TASKS:
    label_col = task_to_col[task_name]
    label_counts = train_df[label_col].value_counts(dropna=False)
    print(f"\n{task_name} ({label_col}) train distribution:")
    print(label_counts)

In [None]:
# DEFINE TOKENIZER + DATALOADERS
MODEL_NAME = "roberta-base"
MAX_LEN = 256
BATCH_SIZE = 16

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

# Every split is tokenized with the same settings.
dataset_kwargs = dict(max_length=MAX_LEN, tasks=TASKS)
train_ds = MultiTaskDataset(train_df, tokenizer, **dataset_kwargs)
val_ds = MultiTaskDataset(val_df, tokenizer, **dataset_kwargs)
test_ds = MultiTaskDataset(test_df, tokenizer, **dataset_kwargs)

# Only the training loader shuffles; val/test keep the DataFrame row order.
loader_kwargs = dict(batch_size=BATCH_SIZE, num_workers=2)
train_loader = DataLoader(train_ds, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_ds, shuffle=False, **loader_kwargs)
test_loader = DataLoader(test_ds, shuffle=False, **loader_kwargs)

# Pull one batch and display the shape and dtype of each entry.
batch = next(iter(train_loader))
{key: (tensor.shape, tensor.dtype) for key, tensor in batch.items()}

In [None]:
# WEIGHTING
# Per-task positive-class weight = (#negatives / #positives) on the training
# split; the denominator is floored at 1 so a task without positives does not
# divide by zero.
pos_weights = {}
for task in TASKS:
    labels = train_df[task_to_col[task]].astype(int).values
    n_pos = (labels == 1).sum()
    n_neg = (labels == 0).sum()
    weight = n_neg / max(n_pos, 1)
    pos_weights[task] = float(weight)
    print(f"{task}: pos={n_pos} neg={n_neg} pos_weight={weight:.3f}")

print(pos_weights)

In [None]:
# DEFINE MODEL
model = MultiTaskRoBERTa(model_name=MODEL_NAME, tasks=TASKS).to(device)

# One BCE-with-logits loss per task, each carrying that task's positive-class
# weight as a float32 tensor on the training device.
loss_fns = {
    task: nn.BCEWithLogitsLoss(
        pos_weight=torch.tensor(pos_weights[task], dtype=torch.float32, device=device)
    )
    for task in TASKS
}

print(model)

In [None]:
# OPTIMIZER + SCHEDULER
LR = 1e-5
EPOCHS = 5

optimizer = AdamW(model.parameters(), lr=LR)

# The scheduler is stepped once per training batch, so the schedule length is
# (batches per epoch) x (epochs); the first 10% of those steps are warmup.
steps_per_epoch = len(train_loader)
total_steps = steps_per_epoch * EPOCHS
warmup_steps = int(0.1 * total_steps)

schedule_kwargs = dict(
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)
scheduler = get_linear_schedule_with_warmup(optimizer, **schedule_kwargs)

print("total_steps:", total_steps, "warmup_steps:", warmup_steps)

In [13]:
def compute_binary_metrics(y_true, y_prob, threshold=0.5):
    """Compute threshold-based and ranking metrics for one binary task.

    Args:
        y_true: Array-like of ground-truth labels; cast to ``int``.
        y_prob: Array-like of predicted scores; cast to ``float``.
        threshold: Scores greater than or equal to this value are predicted
            as the positive class.

    Returns:
        Dict with keys ``accuracy``, ``precision_pos``, ``recall_pos``,
        ``f1_pos``, ``f1_macro``, ``pr_auc``, ``pred_pos_rate``,
        ``true_pos_rate`` and ``roc_auc``. ``roc_auc`` is ``nan`` unless
        ``y_true`` contains exactly two distinct values.
    """
    labels = np.asarray(y_true).astype(int)
    scores = np.asarray(y_prob).astype(float)
    preds = (scores >= threshold).astype(int)

    # Keyword arguments shared by the positive-class metrics.
    positive_only = dict(pos_label=1, zero_division=0)

    metrics = dict(
        accuracy=accuracy_score(labels, preds),
        precision_pos=precision_score(labels, preds, **positive_only),
        recall_pos=recall_score(labels, preds, **positive_only),
        f1_pos=f1_score(labels, preds, **positive_only),
        f1_macro=f1_score(labels, preds, average="macro", zero_division=0),
        pr_auc=average_precision_score(labels, scores),
        pred_pos_rate=float(preds.mean()),
        true_pos_rate=float(labels.mean()),
    )

    # ROC-AUC is only computed when both classes are present in the labels.
    both_classes_present = len(np.unique(labels)) == 2
    metrics["roc_auc"] = roc_auc_score(labels, scores) if both_classes_present else np.nan

    return metrics

In [None]:
# TRAINING
def train(model, loader, tasks, epoch):
    """Train ``model`` for one pass over ``loader``.

    Uses the notebook-level ``device``, ``loss_fns``, ``optimizer`` and
    ``scheduler``. The optimizer and the scheduler are both stepped once per
    batch.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; its output is
            indexed by task name.
        loader: Iterable of batches holding ``input_ids``, ``attention_mask``
            and one ``labels_<task>`` entry per task.
        tasks: Task names whose losses are summed into the batch loss.
        epoch: Zero-based epoch index, used only for the progress-bar label.

    Returns:
        Mean batch loss over the epoch (0.0 when the loader is empty).
    """
    model.train()
    epoch_loss = 0.0
    progress = tqdm(loader, desc=f"Train epoch {epoch+1}", leave=False)

    for batch in progress:
        outputs = model(
            batch["input_ids"].to(device),
            batch["attention_mask"].to(device),
        )

        # Unweighted sum of the per-task losses; labels are cast to float
        # before being passed to the loss.
        task_losses = [
            loss_fns[task](outputs[task], batch[f"labels_{task}"].to(device).float())
            for task in tasks
        ]
        loss = sum(task_losses, 0.0)

        optimizer.zero_grad()
        loss.backward()
        # Clip the global gradient norm to 1.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        epoch_loss += loss.item()

    n_batches = max(len(loader), 1)
    return epoch_loss / n_batches

In [None]:
# EVALUATION
@torch.no_grad()
def evaluate(model, loader, tasks):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Uses the notebook-level ``device`` and ``loss_fns``.

    Args:
        model: Called as ``model(input_ids, attention_mask)``; its output is
            indexed by task name.
        loader: Iterable of batches holding ``input_ids``, ``attention_mask``
            and one ``labels_<task>`` entry per task.
        tasks: Task names to score.

    Returns:
        Tuple ``(mean_batch_loss, metrics)`` where ``metrics`` maps each task
        name to the dict returned by ``compute_binary_metrics``.
    """
    model.eval()
    running_loss = 0.0

    all_true = {t: [] for t in tasks}
    all_prob = {t: [] for t in tasks}

    for batch in tqdm(loader, desc="Eval", leave=False):
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids, attention_mask)

        loss = 0.0
        for task in tasks:
            # Cast labels to float exactly as train() does: BCEWithLogitsLoss
            # needs a floating-point target, so integer labels would fail here.
            labels = batch[f"labels_{task}"].to(device).float()
            logits = outputs[task]

            loss = loss + loss_fns[task](logits, labels)

            # Flatten to one value per sample; reshape(-1) stays 1-D even for
            # a final batch of size one.
            all_prob[task].extend(torch.sigmoid(logits).reshape(-1).cpu().tolist())
            all_true[task].extend(labels.reshape(-1).cpu().tolist())

        running_loss += loss.item()

    metrics = {t: compute_binary_metrics(all_true[t], all_prob[t]) for t in tasks}
    return running_loss / max(len(loader), 1), metrics

In [None]:
# TRAIN MODEL
BEST_PATH = "best_multitask_4.pt"
best_score = -1.0  # lower than any valid PR-AUC / F1 value
history = []

for epoch in range(EPOCHS):
    train_loss = train(model, train_loader, TASKS, epoch)
    val_loss, val_metrics = evaluate(model, val_loader, TASKS)

    # Checkpoint-selection metric: validation PR-AUC of Q2_harmful when that
    # task is active, otherwise positive-class F1 of Q_overall.
    select_on_q2 = "Q2_harmful" in TASKS
    score_name = "Q2_pr_auc" if select_on_q2 else "Q_overall_f1_pos"
    score = (
        val_metrics["Q2_harmful"]["pr_auc"]
        if select_on_q2
        else val_metrics["Q_overall"]["f1_pos"]
    )

    print(f"\nEpoch {epoch+1}/{EPOCHS}")
    print(f"  train_loss={train_loss:.4f}  val_loss={val_loss:.4f}  {score_name}={score:.4f}")

    for task_name in TASKS:
        task_metrics = val_metrics[task_name]
        print(f"  {task_name}: f1_pos={task_metrics['f1_pos']:.3f} pr_auc={task_metrics['pr_auc']:.3f} "
              f"pred_pos_rate={task_metrics['pred_pos_rate']:.3f} true_pos_rate={task_metrics['true_pos_rate']:.3f}")

    epoch_record = {
        "epoch": epoch+1,
        "train_loss": train_loss,
        "val_loss": val_loss,
        score_name: score,
        "val_metrics": val_metrics,
    }
    history.append(epoch_record)

    # Only a strict improvement of the selection metric overwrites the checkpoint.
    if not score > best_score:
        continue

    best_score = score
    checkpoint = {
        "model_state_dict": model.state_dict(),
        "tasks": TASKS,
        "model_name": MODEL_NAME,
        "max_len": MAX_LEN,
    }
    torch.save(checkpoint, BEST_PATH)
    print(f"  Saved best checkpoint: {BEST_PATH} ({score_name}={best_score:.4f})")

In [None]:
# LOAD BEST MODEL
# Restore the weights stored at BEST_PATH by the training loop, mapping the
# tensors onto the current device.
ckpt = torch.load(BEST_PATH, map_location=device)
model.load_state_dict(ckpt["model_state_dict"])

# Score the restored model once on the test split.
test_loss, test_metrics = evaluate(model, test_loader, TASKS)
print(f"Test loss: {test_loss:.4f}")

# Per-task summary; the thresholded metrics use compute_binary_metrics'
# default threshold of 0.5.
for t in TASKS:
    m = test_metrics[t]
    print(f"{t}: accuracy={m['accuracy']:.3f} f1_pos={m['f1_pos']:.3f} "
          f"f1_macro={m['f1_macro']:.3f} pr_auc={m['pr_auc']:.3f} "
          f"pred_pos_rate={m['pred_pos_rate']:.3f}")

In [None]:
# PREDICT
@torch.no_grad()
def predict(model, loader, df_source, tasks):
    """Build a per-row prediction table for ``df_source``.

    Uses the notebook-level ``device`` and ``task_to_col``. Rows are matched
    to model outputs purely by position: the i-th sample yielded by ``loader``
    is paired with the i-th row of ``df_source``. The loader must therefore
    iterate ``df_source`` in order (i.e. be built with ``shuffle=False``).

    Args:
        model: Called as ``model(input_ids, attention_mask)``; its output is
            indexed by task name and must hold one logit per sample.
        loader: Iterable of batches holding ``input_ids`` and ``attention_mask``.
        df_source: DataFrame with a ``text`` column and one label column per
            task (looked up through ``task_to_col``).
        tasks: Task names to export.

    Returns:
        DataFrame with a ``text`` column plus ``<task>_true``, ``<task>_logit``
        and ``<task>_prob`` columns for every task.
    """
    model.eval()
    rows = []
    idx = 0
    df_source = df_source.reset_index(drop=True)

    for batch in tqdm(loader, desc="Predict", leave=False):
        bs = batch["input_ids"].shape[0]
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)

        outputs = model(input_ids, attention_mask)

        # Move each task's logits and probabilities to the host once per batch
        # instead of calling .item() per sample, which forces a device sync
        # for every element. reshape(bs) insists on exactly one logit per row.
        batch_logits = {}
        batch_probs = {}
        for t in tasks:
            flat_logits = outputs[t].reshape(bs)
            batch_logits[t] = flat_logits.cpu().tolist()
            batch_probs[t] = torch.sigmoid(flat_logits).cpu().tolist()

        for i in range(bs):
            row = {"text": df_source.loc[idx, "text"]}
            for t in tasks:
                col = task_to_col[t]
                row[f"{t}_true"] = int(df_source.loc[idx, col])
                row[f"{t}_logit"] = float(batch_logits[t][i])
                row[f"{t}_prob"] = float(batch_probs[t][i])
            rows.append(row)
            idx += 1

    return pd.DataFrame(rows)

# Run inference on the test split and write the per-row predictions to CSV.
# test_loader was built with shuffle=False, so its samples arrive in test_df
# row order, which is what predict() relies on.
preds = predict(model, test_loader, test_df, TASKS)
preds.to_csv("test_predictions_4.csv", index=False)
preds.head()

In [None]:
# DOWNLOAD FROM COLAB
# Download the checkpoint and the predictions CSV through Colab's file helper.
# NOTE: both file names are hard-coded here; keep them in sync with BEST_PATH
# and with the path passed to preds.to_csv.
files.download('best_multitask_4.pt')
files.download('test_predictions_4.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>