# Install and import



In [None]:
# Colab-only: mount Google Drive at /content/drive; the dataset CSV paths used
# in later cells live under this mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install evaluate rouge_score bert-score

Collecting evaluate
  Downloading evaluate-0.4.4-py3-none-any.whl.metadata (9.5 kB)
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Downloading evaluate-0.4.4-py3-none-any.whl (84 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m4.7 MB/s[0m eta [36m0:00:00[0m
[?25hBuilding wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py) ... [?25l[?25hdone
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24934 sha256=ee3aeb18d92483fbb27a1705789de3f94f65a740d26af31c952f31e48d2c4941
  Stored in directory: /root/.cache/pip/wheels/1e/19/43/8a442dc83660ca25e163e1bd1f89919284ab0d0c1475475148
Successfully built rouge_score
Installing collected packages: rouge_score, evaluate
Successfully installed evaluate-0.4.4 rouge_score-0.1.2


In [None]:

import os, math, random, re, json, gc, pathlib, itertools
import pandas as pd
import numpy as np
import torch
from torch import nn
from torch.utils.data import Dataset, DataLoader, random_split
from nltk.tokenize import sent_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split
from evaluate import load as load_metric
import nltk
import random
from tqdm.auto import tqdm
from transformers import T5TokenizerFast

# Download the Punkt sentence-tokenizer tables (presumably for sent_tokenize,
# which is imported above but not called in the visible cells).
nltk.download('punkt_tab')
# NOTE(review): rouge_eval is not referenced again in the visible notebook;
# the evaluation cell loads its own ROUGE metric object.
rouge_eval = load_metric("rouge")
# Seed Python, NumPy and PyTorch RNGs.
# NOTE(review): the GRU cell later reassigns SEED = 42 and reseeds everything.
SEED = 2025
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Downloading builder script: 0.00B [00:00, ?B/s]

<torch._C.Generator at 0x7e854f7f8290>

#  Dataset


In [None]:
# Read the cleaned WikiHow table and discard the index column that was saved
# alongside the data, then report the row count and preview three rows.
CSV_PATH = '/content/drive/MyDrive/nlp project/project/wikihow_clean.csv'
wiki_df = pd.read_csv(CSV_PATH).drop(columns='Unnamed: 0')
print(f"➡️  Righe totali: {len(wiki_df):,}")
wiki_df.head(3)

➡️  Righe totali: 126,613


Unnamed: 0,text,summary
0,So you're a new or aspiring artist and your cr...,Sell Fine Art Online
1,"If you want to be well-read, then, in the word...",Be Well Read
2,Stage names are used by all types of performer...,Pick a Stage Name


In [None]:

# 80/10/10 split: hold out 20% of the rows, then halve the hold-out into
# validation and test. Both calls reuse SEED so the split is repeatable.
train_df, temp_df = train_test_split(wiki_df, test_size=0.2, random_state=SEED)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)
# Only the first 100 rows of the test portion are kept.
test_df = test_df.iloc[:100]
print(f'📊 Split -> train: {len(train_df)}, valid: {len(valid_df)}, test: {len(test_df)}')


📊 Split -> train: 101290, valid: 12661, test: 100


# GRU

In [None]:
# Seed & hyper-param
# NOTE(review): SEED and CSV_PATH are reassigned here with values that differ
# from the earlier cells (SEED = 2025 and a path under 'nlp project/project');
# confirm which ones are intended.

SEED = 42
random.seed(SEED); np.random.seed(SEED); torch.manual_seed(SEED)
DEVICE      = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # GPU when available
CSV_PATH    = "/content/drive/MyDrive/wikihow_clean.csv"
EMB_DIM     = 256     # token-embedding width (encoder and decoder)
HID_DIM     = 512     # GRU hidden size; the encoder is bidirectional, so its outputs are 2 * HID_DIM wide
BATCH_SZ    = 8       # DataLoader batch size
ACC_STEPS   = 4       # micro-batches accumulated per optimiser step
EPOCHS      = 2       # passes over the training loader
MAX_SRC_LEN = 256     # article length in tokens after truncation/padding
MAX_TGT_LEN = 32      # summary length in tokens after truncation/padding; also the default decode length
LABEL_SMOOTH= 0.1     # label_smoothing passed to CrossEntropyLoss
COPY_MECH   = True    # mix the vocabulary distribution with a copy-from-source distribution in the decoder
WARM_STEPS  = 1_000   # length of the linear learning-rate warm-up, in optimiser steps
DROPOUT     = 0.3     # dropout probability used in the encoder and decoder


# Data, tokenizer, loader

# Pretrained T5 tokenizer; its pad and eos ids are reused as PAD and EOS by
# the embeddings, the loss (ignore_index) and the decoding utilities.
tokenizer = T5TokenizerFast.from_pretrained("t5-small")
PAD, EOS  = tokenizer.pad_token_id, tokenizer.eos_token_id

# NOTE(review): this re-reads the CSV from the CSV_PATH defined in this cell
# and rebinds train_df / valid_df / test_df created by the earlier split cell.
# Rows with a missing article or summary are dropped before splitting.
df_all = pd.read_csv(CSV_PATH).dropna(subset=["text", "summary"])
train_df, temp_df = train_test_split(df_all, test_size=0.2, random_state=SEED)
valid_df, test_df = train_test_split(temp_df, test_size=0.5, random_state=SEED)


# Only the first 300 rows of the test portion are evaluated.
test_df = test_df.iloc[:300]

class WikiDataset(Dataset):
    """Tokenised (article, summary) pairs for the seq2seq model.

    Each item is a dict with:
      * ``src_ids``  - article token ids, padded/truncated to MAX_SRC_LEN
      * ``src_mask`` - boolean mask, True on real (non-pad) article tokens
      * ``tgt_ids``  - summary token ids of length MAX_TGT_LEN whose first
                       position is a start token (PAD), followed by the
                       summary tokens, EOS and padding.
    """

    def __init__(self, frame):
        # Positional access below relies on a clean 0..n-1 index.
        self.data = frame.reset_index(drop=True)

    def __len__(self): return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        enc = tokenizer(row["text"],
                        padding="max_length", truncation=True,
                        max_length=MAX_SRC_LEN, return_tensors="pt")
        # Reserve one slot for the start token so the target keeps its
        # MAX_TGT_LEN width.
        dec = tokenizer(row["summary"],
                        padding="max_length", truncation=True,
                        max_length=MAX_TGT_LEN - 1, return_tensors="pt")
        # The T5 tokenizer does not emit a BOS token, but the training loss
        # (targets = tgt[:, 1:]) and the decoders (which start from PAD) both
        # treat column 0 as a start symbol. Without this prefix the model is
        # never trained to produce the first summary token.
        start = torch.full((1,), PAD, dtype=torch.long)
        tgt_ids = torch.cat([start, dec["input_ids"].squeeze(0)])
        return {
            "src_ids":  enc["input_ids"].squeeze(0),
            "src_mask": enc["attention_mask"].squeeze(0).bool(),
            "tgt_ids":  tgt_ids
        }

def make_loader(df, shuffle):
    """Wrap a dataframe in a WikiDataset and return a batched DataLoader."""
    dataset = WikiDataset(df)
    return DataLoader(
        dataset,
        batch_size=BATCH_SZ,
        shuffle=shuffle,
        num_workers=2,
        pin_memory=True,
    )

# Only the training loader is shuffled.
train_loader = make_loader(train_df, shuffle=True)
val_loader   = make_loader(valid_df, shuffle=False)
test_loader  = make_loader(test_df, shuffle=False)


# Model: Encoder, Attention, Decoder

class Encoder(nn.Module):
    """Bidirectional GRU encoder over embedded source tokens.

    Args:
        vocab:   vocabulary size for the embedding table.
        emb_dim: embedding width.
        hid_dim: GRU hidden size per direction (outputs are 2 * hid_dim wide).
        dropout: dropout probability applied to the embeddings and outputs.
    """

    def __init__(self, vocab, emb_dim, hid_dim, dropout):
        super().__init__()
        self.embed   = nn.Embedding(vocab, emb_dim, padding_idx=PAD)
        self.rnn     = nn.GRU(emb_dim, hid_dim, batch_first=True,
                              bidirectional=True)
        self.drop_in = nn.Dropout(dropout)
        self.drop_out= nn.Dropout(dropout)

    def forward(self, ids, mask):
        """Encode ``ids`` and return per-token states.

        Args:
            ids:  (batch, seq) token ids.
            mask: (batch, seq) boolean mask, True on real tokens, or None.
                  Assumed to be right-padded (real tokens form a prefix), as
                  produced by the tokenizer with padding="max_length".

        Returns:
            Tensor of shape (batch, seq, 2 * hid_dim); padded positions are
            zero when a mask is given.
        """
        x = self.drop_in(self.embed(ids))
        if mask is None:
            out, _ = self.rnn(x)
        else:
            # Pack so the backward direction starts at the last real token
            # instead of running through the padding first.
            # pack_padded_sequence needs CPU lengths that are >= 1.
            lengths = mask.sum(dim=1).clamp(min=1).cpu()
            packed = nn.utils.rnn.pack_padded_sequence(
                x, lengths, batch_first=True, enforce_sorted=False)
            packed_out, _ = self.rnn(packed)
            # total_length restores the original width so the attention mask
            # and the copy scatter still line up with src_ids.
            out, _ = nn.utils.rnn.pad_packed_sequence(
                packed_out, batch_first=True, total_length=ids.size(1))
        return self.drop_out(out)


class ScaledDotAttention(nn.Module):
    """Dot-product attention with a learned query projection.

    The query is linearly mapped to the key width, scored against every key,
    divided by sqrt(key_dim) and normalised with a softmax.
    """

    def __init__(self, query_dim, key_dim):
        super().__init__()
        self.q_proj = nn.Linear(query_dim, key_dim)
        self.scale  = math.sqrt(key_dim)

    def forward(self, query, keys, values, pad_mask=None):
        """Return ``(context, weights)`` for ``query`` attending over ``keys``.

        ``pad_mask`` is True on positions that may be attended to; masked-out
        positions receive a score of -1e9 before the softmax. The weights are
        returned with dimension 1 squeezed.
        """
        projected = self.q_proj(query)
        scores = torch.bmm(projected, keys.transpose(1, 2)) / self.scale
        if pad_mask is not None:
            blocked = ~pad_mask.unsqueeze(1)
            scores = scores.masked_fill(blocked, -1e9)
        weights = torch.softmax(scores, dim=-1)
        context = torch.bmm(weights, values)
        return context, weights.squeeze(1)

class Decoder(nn.Module):
    """Attentional GRU decoder, optionally mixing in a copy distribution.

    When COPY_MECH is enabled, each step blends the vocabulary softmax with
    the attention weights scattered onto the source token ids, gated by a
    learned sigmoid, and returns the log of that mixture instead of raw
    logits.
    """

    def __init__(self, vocab, emb_dim, hid_dim, dropout):
        super().__init__()
        ctx_dim = hid_dim * 2  # width of the bidirectional encoder states
        self.vocab_size = vocab
        self.embed   = nn.Embedding(vocab, emb_dim, padding_idx=PAD)

        self.attn    = ScaledDotAttention(hid_dim, ctx_dim)
        self.gru     = nn.GRU(emb_dim + ctx_dim, hid_dim, batch_first=True)

        self.fc_out  = nn.Linear(hid_dim, vocab)
        self.drop_in  = nn.Dropout(dropout)
        self.drop_out = nn.Dropout(dropout)

        if COPY_MECH:
            # Gate input: decoder output, attention context, token embedding.
            self.p_lin = nn.Linear(hid_dim + ctx_dim + emb_dim, 1)

    def _step(self, inp_tok, h, enc_out, src_ids, src_mask):
        """Advance one time step.

        Args:
            inp_tok:  (batch,) ids of the tokens fed at this step.
            h:        (1, batch, hid_dim) GRU state from the previous step.
            enc_out:  encoder states used as attention keys and values.
            src_ids:  source token ids (scatter index for the copy path).
            src_mask: boolean mask forwarded to the attention module.

        Returns:
            ``(scores, h, alpha)``: per-token scores over the vocabulary, the
            updated GRU state and the attention weights over the source.
        """
        token_emb = self.drop_in(self.embed(inp_tok)).unsqueeze(1)
        # The attention query is the state *before* this step's GRU update.
        query = h[-1].unsqueeze(1)
        context, alpha_t = self.attn(query, enc_out, enc_out, src_mask)
        rnn_out, h = self.gru(torch.cat((token_emb, context), dim=-1), h)

        features = self.drop_out(rnn_out.squeeze(1))
        logit_t = self.fc_out(features)
        if not COPY_MECH:
            return logit_t, h, alpha_t

        gate_in = torch.cat(
            (features, context.squeeze(1), token_emb.squeeze(1)), dim=-1)
        p_gen = torch.sigmoid(self.p_lin(gate_in))
        vocab_dist = torch.softmax(logit_t, dim=-1)
        # Accumulate attention mass onto the vocabulary ids of source tokens.
        copy_dist = torch.zeros_like(vocab_dist).scatter_add_(
            1, src_ids, alpha_t)
        mixture = p_gen * vocab_dist + (1 - p_gen) * copy_dist
        # Small epsilon keeps the log finite for zero-probability tokens.
        return torch.log(mixture + 1e-8), h, alpha_t

    def forward(self, tgt_ids, enc_out, src_ids, src_mask, tf_ratio=1.0):
        """Run the decoder over a target sequence.

        Feeds ``tgt_ids[:, 0]`` first, then for positions 1 .. T-2 feeds either
        the gold token (with probability ``tf_ratio``, drawn per sample) or the
        previous step's argmax. The last target position is never fed, so for
        T >= 2 the output has T-1 steps; for T == 1 it has one.

        Returns:
            ``(scores, attention)`` concatenated along the time dimension.
        """
        batch, steps = tgt_ids.size()
        device = tgt_ids.device
        h = torch.zeros(1, batch, self.gru.hidden_size, device=device)

        logit_t, h, alpha_t = self._step(tgt_ids[:, 0], h, enc_out,
                                         src_ids, src_mask)
        logits = [logit_t.unsqueeze(1)]
        attns = [alpha_t.unsqueeze(1)]

        # range(1, steps - 1) is empty for steps <= 2, so no guard is needed.
        for t in range(1, steps - 1):
            keep_gold = torch.rand(batch, device=device) < tf_ratio
            inp_tok = torch.where(keep_gold, tgt_ids[:, t],
                                  logit_t.argmax(-1))
            logit_t, h, alpha_t = self._step(inp_tok, h, enc_out,
                                             src_ids, src_mask)
            logits.append(logit_t.unsqueeze(1))
            attns.append(alpha_t.unsqueeze(1))

        return torch.cat(logits, dim=1), torch.cat(attns, dim=1)


class Seq2Seq(nn.Module):
    """Encoder-decoder wrapper built from the module-level hyper-parameters."""

    def __init__(self, vocab):
        super().__init__()
        self.enc = Encoder(vocab, EMB_DIM, HID_DIM, DROPOUT)
        self.dec = Decoder(vocab, EMB_DIM, HID_DIM, DROPOUT)

    def forward(self, src_ids, src_mask, tgt_ids, tf_ratio=1.0):
        """Encode the source and return the decoder's per-step scores.

        The attention weights produced by the decoder are discarded.
        """
        memory = self.enc(src_ids, src_mask)
        scores, _attn = self.dec(tgt_ids, memory, src_ids, src_mask, tf_ratio)
        return scores

# Loss, optimiser, scheduler

model = Seq2Seq(len(tokenizer)).to(DEVICE)
criterion = nn.CrossEntropyLoss(ignore_index=PAD, label_smoothing=LABEL_SMOOTH)

BASE_LR, MIN_LR = 1e-3, 1e-5
optimiser  = torch.optim.Adam(model.parameters(), lr=BASE_LR, betas=(0.9, 0.98))

# Optimiser updates per epoch: one per ACC_STEPS micro-batches, plus one for
# a trailing partial group (flushed at the end of run_epoch).
UPDATES_PER_EPOCH = math.ceil(len(train_loader) / ACC_STEPS)
TOTAL_UPDATES     = max(1, EPOCHS * UPDATES_PER_EPOCH)

def lr_lambda(step):
    """Learning-rate multiplier: linear warm-up, then cosine decay.

    Rises linearly from 0 to 1 over WARM_STEPS updates, then follows a
    half-cosine down to MIN_LR / BASE_LR at TOTAL_UPDATES.
    """
    if step < WARM_STEPS:
        return step / WARM_STEPS
    span = max(1, TOTAL_UPDATES - WARM_STEPS)
    progress = min(1.0, (step - WARM_STEPS) / span)
    floor = MIN_LR / BASE_LR
    return floor + (1.0 - floor) * 0.5 * (1.0 + math.cos(math.pi * progress))

# A single closed-form schedule. Stepping a LambdaLR warm-up together with a
# CosineAnnealingLR on the same optimiser does not compose: LambdaLR rewrites
# the learning rate from its base value on every step, which cancels the
# recursive cosine decay and leaves the rate at ~BASE_LR after warm-up.
scheduler = torch.optim.lr_scheduler.LambdaLR(optimiser, lr_lambda)

# Training & validation loop

def teacher_ratio(step, k=7_000):
    """Inverse-sigmoid decay of the teacher-forcing probability.

    Returns k / (k + exp(step / k)), which starts just below 1 and decreases
    as ``step`` (the number of optimiser updates so far) grows.
    """
    return k / (k + math.exp(step / k))

def run_epoch(loader, train=True):
    """Run one pass over ``loader`` and return the mean per-batch loss.

    In training mode, gradients are accumulated over ACC_STEPS micro-batches,
    clipped to norm 1.0 and applied; the scheduler and
    ``run_epoch.global_step`` advance once per optimiser update. In
    evaluation mode, the model runs without teacher forcing and with
    gradient tracking disabled.
    """
    model.train(train)
    tot_loss, step = 0.0, 0
    bar = tqdm(loader, leave=False, unit="batch")
    optimiser.zero_grad()

    def apply_update():
        # One optimiser update from the currently accumulated gradients.
        nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimiser.step()
        scheduler.step()
        optimiser.zero_grad()
        run_epoch.global_step += 1

    # Skip autograd bookkeeping entirely during validation.
    with torch.set_grad_enabled(train):
        for batch in bar:
            step += 1
            src  = batch["src_ids"].to(DEVICE)
            sm   = batch["src_mask"].to(DEVICE)
            tgt  = batch["tgt_ids"].to(DEVICE)

            tf_r = teacher_ratio(run_epoch.global_step) if train else 0.0
            out  = model(src, sm, tgt, tf_ratio=tf_r)
            # The decoder emits T-1 steps; step i is scored against tgt[:, i+1].
            loss = criterion(out.reshape(-1, out.size(-1)),
                             tgt[:, 1:].reshape(-1))
            if train:
                (loss / ACC_STEPS).backward()
                if step % ACC_STEPS == 0:
                    apply_update()
            tot_loss += loss.item()
            bar.set_description(f"{'Train' if train else 'Val'}  "
                                f"loss={tot_loss/step:.3f}  "
                                f"lr={optimiser.param_groups[0]['lr']:.1e}")

        # Apply gradients from a trailing partial accumulation group instead
        # of silently dropping them at the next zero_grad().
        if train and step % ACC_STEPS != 0:
            apply_update()

    # max() guards against an empty loader.
    return tot_loss / max(step, 1)
run_epoch.global_step = 0

# One training pass followed by one validation pass per epoch; the value
# printed for each is the mean per-batch loss returned by run_epoch.
for ep in range(1, EPOCHS+1):
    tr = run_epoch(train_loader, True)
    vl = run_epoch(val_loader, False)
    print(f"Epoch {ep}: 🏋️ {tr:.3f} | 🔍 {vl:.3f}")

# Decoding utilities

def length_penalty(score, length, alpha=0.6):
    """Length-normalise a hypothesis score.

    Divides ``score`` by ((5 + length) ** alpha) / (6 ** alpha), so a
    hypothesis of length 1 is left unchanged and longer ones are divided by
    a larger factor.
    """
    numerator = (5 + length) ** alpha
    denominator = 6 ** alpha
    return score / (numerator / denominator)

@torch.inference_mode()
def beam_decode(net, src, src_mask, beam=4, max_len=MAX_TGT_LEN):
    """Beam-search decoding, one source sequence at a time.

    Each hypothesis carries its own GRU state and is advanced with
    ``net.dec._step`` on the token it actually ends with. (Calling
    ``net.dec(prefix, ..., tf_ratio=0.0)`` instead makes the decoder ignore
    the prefix and follow its own argmax, so every hypothesis would receive
    the same scores.)

    Args:
        net:      Seq2Seq model.
        src:      (batch, src_len) source token ids.
        src_mask: (batch, src_len) boolean mask of real tokens.
        beam:     number of hypotheses kept per source.
        max_len:  maximum number of generated tokens.

    Returns:
        LongTensor (batch, L) with the best hypothesis per source (start
        token removed), right-padded with PAD.
    """
    net.eval()
    B = src.size(0)
    enc = net.enc(src, src_mask)
    hid = net.dec.gru.hidden_size

    def ranked(hyp):
        # hyp = (tokens, summed log-prob, GRU state)
        return length_penalty(hyp[1], len(hyp[0]))

    summaries = []
    for b in range(B):
        enc_b, src_b, mask_b = enc[b:b+1], src[b:b+1], src_mask[b:b+1]
        live = [([PAD], 0.0, torch.zeros(1, 1, hid, device=src.device))]
        finished = []

        for _ in range(max_len):
            cand = []
            for seq, score, h in live:
                tok = torch.tensor([seq[-1]], device=src.device)
                logit, h_next, _ = net.dec._step(tok, h, enc_b, src_b, mask_b)
                logp = torch.log_softmax(logit, -1).squeeze(0)
                topk = torch.topk(logp, beam)
                for nxt, lp in zip(topk.indices.tolist(),
                                   topk.values.tolist()):
                    cand.append((seq + [nxt], score + lp, h_next))

            cand.sort(key=ranked, reverse=True)
            live = []
            for hyp in cand[:beam]:
                # Hypotheses that just emitted EOS leave the beam.
                (finished if hyp[0][-1] == EOS else live).append(hyp)
            if not live:
                break

        best = max(finished or live, key=ranked)
        summaries.append(best[0][1:])  # drop the PAD start token

    maxlen = max(len(s) for s in summaries)
    padded = [s + [PAD] * (maxlen - len(s)) for s in summaries]
    return torch.tensor(padded, dtype=torch.long, device=src.device)

@torch.inference_mode()
def greedy_decode(net, src, src_mask, max_len=MAX_TGT_LEN):
    """Greedy (argmax) decoding with an incrementally carried GRU state.

    Starts from the PAD start token and feeds each chosen token back through
    ``net.dec._step``. Rows that have produced EOS are padded with PAD from
    then on, and decoding stops once every row has finished or ``max_len``
    tokens have been generated.

    Stepping directly avoids two problems of re-running ``net.dec`` on the
    growing prefix: the decoder returns T-1 steps for T inputs, so the second
    call re-read the first step's scores and duplicated the first token; and
    each call recomputed the whole prefix.

    Returns:
        LongTensor (batch, L), L <= max_len, without the start token.
    """
    net.eval()
    B = src.size(0)
    enc = net.enc(src, src_mask)

    h = torch.zeros(1, B, net.dec.gru.hidden_size, device=src.device)
    tok = torch.full((B,), PAD, dtype=torch.long, device=src.device)
    done = torch.zeros(B, dtype=torch.bool, device=src.device)

    out = []
    for _ in range(max_len):
        logit, h, _ = net.dec._step(tok, h, enc, src, src_mask)
        # Rows that already ended keep emitting PAD.
        tok = logit.argmax(-1).masked_fill(done, PAD)
        out.append(tok)
        done = done | (tok == EOS)
        if done.all():
            break

    if not out:  # max_len == 0
        return torch.empty((B, 0), dtype=torch.long, device=src.device)
    return torch.stack(out, dim=1)

@torch.inference_mode()
def sample_decode(net, src, src_mask, k=0, p=1.0, temp=1.0,
                  max_len=MAX_TGT_LEN):
    """Stochastic decoding with optional top-k and nucleus (top-p) filtering.

    This helper was referenced by topk_decode / topp_decode but not defined
    in the notebook, so those functions raised NameError on a fresh kernel.

    Args:
        net:      Seq2Seq model.
        src:      (batch, src_len) source token ids.
        src_mask: (batch, src_len) boolean mask of real tokens.
        k:        keep only the k highest-scoring tokens (0 disables).
        p:        keep the smallest set of tokens whose cumulative
                  probability reaches p (1.0 disables).
        temp:     softmax temperature; must be > 0.
        max_len:  maximum number of generated tokens.

    Returns:
        LongTensor (batch, L), L <= max_len; rows are padded with PAD after
        their EOS.
    """
    net.eval()
    B = src.size(0)
    enc = net.enc(src, src_mask)

    h = torch.zeros(1, B, net.dec.gru.hidden_size, device=src.device)
    tok = torch.full((B,), PAD, dtype=torch.long, device=src.device)
    done = torch.zeros(B, dtype=torch.bool, device=src.device)
    neg_inf = float("-inf")

    out = []
    for _ in range(max_len):
        logit, h, _ = net.dec._step(tok, h, enc, src, src_mask)
        logit = logit / temp

        if k > 0:
            # Threshold at the k-th largest score of each row.
            kth = torch.topk(logit, min(k, logit.size(-1)), dim=-1).values[:, -1:]
            logit = logit.masked_fill(logit < kth, neg_inf)

        if p < 1.0:
            sorted_logit, sorted_idx = torch.sort(logit, dim=-1, descending=True)
            sorted_prob = torch.softmax(sorted_logit, dim=-1)
            # Drop a token when the mass of strictly better tokens already
            # reaches p; the best token is always kept.
            mass_before = sorted_prob.cumsum(dim=-1) - sorted_prob
            drop = mass_before >= p
            drop[:, 0] = False
            sorted_logit = sorted_logit.masked_fill(drop, neg_inf)
            logit = torch.full_like(logit, neg_inf).scatter(
                1, sorted_idx, sorted_logit)

        probs = torch.softmax(logit, dim=-1)
        tok = torch.multinomial(probs, 1).squeeze(1).masked_fill(done, PAD)
        out.append(tok)
        done = done | (tok == EOS)
        if done.all():
            break

    if not out:  # max_len == 0
        return torch.empty((B, 0), dtype=torch.long, device=src.device)
    return torch.stack(out, dim=1)

def topk_decode(net, src, src_mask, k=50, max_len=MAX_TGT_LEN):
    """Sample each token from the k most likely candidates."""
    return sample_decode(net, src, src_mask, k=k, p=1.0, temp=1.0,
                         max_len=max_len)

def topp_decode(net, src, src_mask, p=0.9, max_len=MAX_TGT_LEN):
    """Sample each token from the nucleus holding probability mass p."""
    return sample_decode(net, src, src_mask, k=0,  p=p,  temp=1.0,
                         max_len=max_len)

  0%|          | 0/12662 [00:00<?, ?batch/s]

  0%|          | 0/1583 [00:00<?, ?batch/s]

Epoch 1: 🏋️ 5.374 | 🔍 6.307


  0%|          | 0/12662 [00:00<?, ?batch/s]

  0%|          | 0/1583 [00:00<?, ?batch/s]

Epoch 2: 🏋️ 4.661 | 🔍 6.185


In [None]:
# Decoding strategies to compare. Every entry takes (net, src_ids, src_mask)
# and returns a tensor of generated token ids.
DECODERS = {
    "greedy": lambda n,s,m: greedy_decode(n,s,m),
    "beam4":  lambda n,s,m: beam_decode(n,s,m, beam=4),
    "topk50": lambda n,s,m: topk_decode(n,s,m, k=50),
    "topp90": lambda n,s,m: topp_decode(n,s,m, p=0.9),
}

# Evaluation loop (ROUGE + BERTScore)

# The import cell only binds `load` from the evaluate package (as
# `load_metric`); the module name `evaluate` itself is never imported, so
# `evaluate.load(...)` raises NameError on a fresh kernel.
rouge_metric = load_metric("rouge")
bert_metric  = load_metric("bertscore")

def evaluate_strategy(name, decode_fn):
    """Generate summaries for the test loader and score them.

    Args:
        name:      label used in the progress bar and the printed report.
        decode_fn: callable ``(model, src_ids, src_mask) -> token ids``.

    Returns:
        ``(rouge, bert)``: the raw result dicts from the ROUGE and BERTScore
        metrics. Mean BERTScore precision/recall/F1 and ROUGE-1/2/L are
        printed as a side effect.
    """
    gen, ref = [], []
    for batch in tqdm(test_loader, desc=f"⏩ {name}", leave=False):
        src = batch["src_ids"].to(DEVICE)
        sm  = batch["src_mask"].to(DEVICE)

        pred_ids = decode_fn(model, src, sm)
        gen.extend(tokenizer.batch_decode(pred_ids, skip_special_tokens=True))
        # Decode the whole target row. Special tokens (PAD/EOS, including a
        # leading PAD start token if present) are removed by
        # skip_special_tokens. Slicing off column 0 would discard a real word
        # whenever the targets carry no start token (the T5 tokenizer adds
        # none).
        ref.extend(tokenizer.batch_decode(batch["tgt_ids"],
                                          skip_special_tokens=True))

    rouge = rouge_metric.compute(predictions=gen, references=ref)
    bert  = bert_metric.compute(
                predictions=gen, references=ref,
                model_type="microsoft/deberta-xlarge-mnli", lang="en")
    p = np.mean(bert["precision"])
    r = np.mean(bert["recall"])
    f = np.mean(bert["f1"])

    print(f"\n=== {name.upper()} ===")
    print("ROUGE-1/2/L:", round(rouge['rouge1'],4),
          round(rouge['rouge2'],4), round(rouge['rougeL'],4))
    print(f"BERTScore   : P = {p:.4f}  R = {r:.4f}  F1 = {f:.4f}")
    return rouge, bert

# Score every decoding strategy on the test loader; all_scores maps the
# strategy name to the (rouge, bert) pair returned by evaluate_strategy.
all_scores = {}
model.eval()
for name, fn in DECODERS.items():
    all_scores[name] = evaluate_strategy(name, fn)

⏩ greedy:   0%|          | 0/38 [00:00<?, ?it/s]


=== GREEDY ===
ROUGE-1/2/L: 0.2963 0.1065 0.292
BERTScore   : P = 0.5653  R = 0.5382  F1 = 0.5487


⏩ beam4:   0%|          | 0/38 [00:00<?, ?it/s]


=== BEAM4 ===
ROUGE-1/2/L: 0.2736 0.1 0.2698
BERTScore   : P = 0.4940  R = 0.4616  F1 = 0.4750




⏩ topk50:   0%|          | 0/38 [00:00<?, ?it/s]


=== TOPK50 ===
ROUGE-1/2/L: 0.1478 0.0199 0.1429
BERTScore   : P = 0.4479  R = 0.4886  F1 = 0.4650


⏩ topp90:   0%|          | 0/38 [00:00<?, ?it/s]


=== TOPP90 ===
ROUGE-1/2/L: 0.1215 0.0171 0.1172
BERTScore   : P = 0.4297  R = 0.4893  F1 = 0.4555
