In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os, json

In [3]:
train_j_path = "/content/drive/MyDrive/Colab Notebooks/bootcamb/train.json"
test_j_path =  "/content/drive/MyDrive/Colab Notebooks/bootcamb/test.json"

In [4]:
assert os.path.exists(train_j_path), f"train.json not found at: {train_j_path}"
print("train.json size (MB):", round(os.path.getsize(train_j_path)/1e6, 2))

# quick peek
sample = json.load(open(train_j_path, "r"))[:1]
print("Sample keys:", list(sample[0].keys()))
print("Tokens ex:", sample[0]["tokens"][:10])
print("Labels ex:", sample[0]["labels"][:10])


train.json size (MB): 109.5
Sample keys: ['document', 'full_text', 'tokens', 'trailing_whitespace', 'labels']
Tokens ex: ['Design', 'Thinking', 'for', 'innovation', 'reflexion', '-', 'Avril', '2021', '-', 'Nathalie']
Labels ex: ['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-NAME_STUDENT']


In [None]:
###################################################################
###################################################################
###################################################################
###################################################################
###################################################################
###################################################################

In [1]:
# Clean setup: disable TF/Flax to avoid extra imports, keep environment minimal
import os
os.environ["TRANSFORMERS_NO_TF"] = "1"
os.environ["TRANSFORMERS_NO_FLAX"] = "1"
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Minimal installs (no-deps to avoid dragging conflicting packages)
!pip -q install --no-deps "transformers==4.43.3" "tokenizers==0.19.1" "sentencepiece==0.1.99"

import torch, transformers
print("torch:", torch.__version__)
print("transformers:", transformers.__version__)


torch: 2.8.0+cu126
transformers: 4.43.3


In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os, json, random

train_j_path = "/content/drive/MyDrive/Colab Notebooks/bootcamb/train.json"
test_j_path  = "/content/drive/MyDrive/Colab Notebooks/bootcamb/test.json"

assert os.path.exists(train_j_path), f"train.json not found at: {train_j_path}"
docs_raw = json.load(open(train_j_path, "r"))
docs = [{"tokens": d["tokens"], "labels": d["labels"]}
        for d in docs_raw if d.get("tokens") and d.get("labels") and len(d["tokens"])==len(d["labels"])]

label_set = sorted({lab for d in docs for lab in d["labels"]})
label2id = {l:i for i,l in enumerate(label_set)}
id2label = {i:l for l,i in label2id.items()}

random.Random(42).shuffle(docs)
cut = int(0.9*len(docs))
train_docs, val_docs = docs[:cut], docs[cut:]
print("Labels:", label_set)
print(f"Train: {len(train_docs)} | Val: {len(val_docs)}")


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Labels: ['B-EMAIL', 'B-ID_NUM', 'B-NAME_STUDENT', 'B-PHONE_NUM', 'B-STREET_ADDRESS', 'B-URL_PERSONAL', 'B-USERNAME', 'I-ID_NUM', 'I-NAME_STUDENT', 'I-PHONE_NUM', 'I-STREET_ADDRESS', 'I-URL_PERSONAL', 'O']
Train: 6126 | Val: 681


In [3]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer

base_model = "microsoft/deberta-v3-base"
tokenizer = AutoTokenizer.from_pretrained(base_model, use_fast=True)

def make_windows(tokens, labels, window_len=384, stride=64):
    n = len(tokens)
    if n <= window_len:
        return [(tokens, labels)]
    out, start = [], 0
    while start < n:
        end = min(start+window_len, n)
        out.append((tokens[start:end], labels[start:end]))
        if end == n: break
        start += (window_len - stride)
    return out

train_wins, val_wins = [], []
for d in train_docs: train_wins += make_windows(d["tokens"], d["labels"])
for d in val_docs:   val_wins   += make_windows(d["tokens"], d["labels"])
print(f"Train windows: {len(train_wins)} | Val windows: {len(val_wins)}")

def align_labels_to_subwords(tokens, bio_labels):
    enc = tokenizer(tokens, is_split_into_words=True, truncation=True, max_length=512)
    word_ids = enc.word_ids()
    y, prev = [], None
    for wid in word_ids:
        if wid is None: y.append(-100)
        elif wid != prev: y.append(label2id[bio_labels[wid]])
        else:
            lab = bio_labels[wid]; y.append(label2id[lab] if lab!="O" else -100)
        prev = wid
    return enc["input_ids"], enc["attention_mask"], y

class PiiDataset(Dataset):
    def __init__(self, wins): self.wins = wins
    def __len__(self): return len(self.wins)
    def __getitem__(self, i):
        toks, labs = self.wins[i]
        ids, attn, y = align_labels_to_subwords(toks, labs)
        return {"input_ids": torch.tensor(ids), "attention_mask": torch.tensor(attn), "labels": torch.tensor(y)}

def collate_fn(batch):
    pad_id = tokenizer.pad_token_id
    mx = max(len(x["input_ids"]) for x in batch)
    def pad(t, val): return torch.cat([t, torch.full((mx-len(t),), val, dtype=t.dtype)])
    input_ids      = torch.stack([pad(x["input_ids"], pad_id) for x in batch])
    attention_mask = torch.stack([pad(x["attention_mask"], 0) for x in batch])
    labels         = torch.stack([pad(x["labels"], -100) for x in batch])
    return {"input_ids": input_ids.long(), "attention_mask": attention_mask.long(), "labels": labels.long()}

train_ds, val_ds = PiiDataset(train_wins), PiiDataset(val_wins)
train_dl = DataLoader(train_ds, batch_size=8, shuffle=True,  collate_fn=collate_fn, num_workers=0)
val_dl   = DataLoader(val_ds,   batch_size=8, shuffle=False, collate_fn=collate_fn, num_workers=0)

batch = next(iter(train_dl))
print({k:v.shape for k,v in batch.items()})


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Train windows: 15830 | Val windows: 1798
{'input_ids': torch.Size([8, 373]), 'attention_mask': torch.Size([8, 373]), 'labels': torch.Size([8, 373])}


In [4]:
import torch, os
from transformers import AutoModelForTokenClassification

os.environ["TOKENIZERS_PARALLELISM"] = "false"
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

num_labels = len(label2id)
model = AutoModelForTokenClassification.from_pretrained(
    base_model, num_labels=num_labels, id2label=id2label, label2id=label2id
).to(device)

optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())

def bio_to_spans(seq):
    spans, cur, start = set(), None, None
    for i, tag in enumerate(seq):
        if tag=="O":
            if cur is not None: spans.add((cur,start,i)); cur=None
            continue
        p, ent = (tag.split("-",1)+[""])[:2] if "-" in tag else ("O","")
        if p=="B":
            if cur is not None: spans.add((cur,start,i))
            cur, start = ent, i
        elif p=="I":
            if cur!=ent:
                if cur is not None: spans.add((cur,start,i))
                cur, start = ent, i
        else:
            if cur is not None: spans.add((cur,start,i)); cur=None
    if cur is not None: spans.add((cur,start,len(seq)))
    return spans

def eval_f1():
    model.eval()
    TP=FP=FN=0
    with torch.no_grad():
        for batch in val_dl:
            ids = batch["input_ids"].to(device)
            msk = batch["attention_mask"].to(device)
            lbl = batch["labels"]

            logits = model(input_ids=ids, attention_mask=msk).logits.cpu()
            pred = logits.argmax(-1)

            for p_row, l_row in zip(pred, lbl):
                p_seq, l_seq = [], []
                for pi, li in zip(p_row.tolist(), l_row.tolist()):
                    if li == -100: continue
                    p_seq.append(id2label[pi]); l_seq.append(id2label[li])
                P, T = bio_to_spans(p_seq), bio_to_spans(l_seq)
                TP += len(P & T); FP += len(P - T); FN += len(T - P)
    prec = TP/(TP+FP) if (TP+FP) else 0.0
    rec  = TP/(TP+FN) if (TP+FN) else 0.0
    f1   = 2*prec*rec/(prec+rec) if (prec+rec) else 0.0
    return prec, rec, f1

epochs = 3
best_f1 = 0.0
for ep in range(1, epochs+1):
    model.train(); running=0.0
    for step, batch in enumerate(train_dl, start=1):
        ids = batch["input_ids"].to(device)
        msk = batch["attention_mask"].to(device)
        lbl = batch["labels"].to(device)

        optimizer.zero_grad(set_to_none=True)
        with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):
            out = model(input_ids=ids, attention_mask=msk, labels=lbl)
            loss = out.loss
        scaler.scale(loss).backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        scaler.step(optimizer); scaler.update()

        running += float(loss.detach().cpu())
        if step % 200 == 0:
            print(f"Epoch {ep} | step {step}/{len(train_dl)} | loss {running/step:.4f}")

    P,R,F1 = eval_f1()
    print(f"Epoch {ep} done | avg_loss={running/len(train_dl):.4f} | P={P:.4f} R={R:.4f} F1={F1:.4f}")
    if F1 > best_f1:
        best_f1 = F1
        torch.save(model.state_dict(), "/content/best_pii_ner.pt")
        print("Saved best → /content/best_pii_ner.pt")

print("Best F1:", round(best_f1,4))


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  scaler = torch.cuda.amp.GradScaler(enabled=torch.cuda.is_available())
  with torch.cuda.amp.autocast(enabled=torch.cuda.is_available()):


Epoch 1 | step 200/1979 | loss 0.1703
Epoch 1 | step 400/1979 | loss 0.0869
Epoch 1 | step 600/1979 | loss 0.0592
Epoch 1 | step 800/1979 | loss 0.0449
Epoch 1 | step 1000/1979 | loss 0.0365
Epoch 1 | step 1200/1979 | loss 0.0307
Epoch 1 | step 1400/1979 | loss 0.0265
Epoch 1 | step 1600/1979 | loss 0.0233
Epoch 1 | step 1800/1979 | loss 0.0208
Epoch 1 done | avg_loss=0.0192 | P=0.9216 R=0.9307 F1=0.9261
Saved best → /content/best_pii_ner.pt
Epoch 2 | step 200/1979 | loss 0.0006
Epoch 2 | step 400/1979 | loss 0.0004
Epoch 2 | step 600/1979 | loss 0.0005
Epoch 2 | step 800/1979 | loss 0.0005
Epoch 2 | step 1000/1979 | loss 0.0006
Epoch 2 | step 1200/1979 | loss 0.0007
Epoch 2 | step 1400/1979 | loss 0.0007
Epoch 2 | step 1600/1979 | loss 0.0006
Epoch 2 | step 1800/1979 | loss 0.0006
Epoch 2 done | avg_loss=0.0006 | P=0.9386 R=0.9455 F1=0.9420
Saved best → /content/best_pii_ner.pt
Epoch 3 | step 200/1979 | loss 0.0004
Epoch 3 | step 400/1979 | loss 0.0002
Epoch 3 | step 600/1979 | loss 0

In [5]:
# Cell 5 — Load best checkpoint, export HF folder, copy to Drive, and list files
import os, shutil, torch
from transformers import AutoModelForTokenClassification

best_path  = "/content/best_pii_ner.pt"                     # from training cell
local_dir  = "/content/pii_model_deberta_base"              # HF export folder
drive_dir  = "/content/drive/MyDrive/pii_model_deberta_base"  # copy to Drive

assert "model" in globals(), "Model not in memory. Run in the same notebook where you trained."
assert os.path.exists(best_path), "best_pii_ner.pt not found. Run the training cell first."

# 1) Load best weights into the current model
model.load_state_dict(torch.load(best_path, map_location="cpu"))
model.eval()
print("✓ Best weights loaded into the model.")

# 2) Export HF folder (includes config with id2label/label2id)
if os.path.exists(local_dir):
    shutil.rmtree(local_dir)
model.save_pretrained(local_dir)
tokenizer.save_pretrained(local_dir)
print("✓ Exported HF model to:", local_dir)

# 3) Copy to Google Drive
if os.path.exists(drive_dir):
    shutil.rmtree(drive_dir)
shutil.copytree(local_dir, drive_dir)
print("✓ Copied to Drive:", drive_dir)

# 4) List contents
print("\nLocal HF folder:")
os.system(f'ls -lh "{local_dir}"')
print("\nDrive HF folder:")
os.system(f'ls -lh "{drive_dir}"')


✓ Best weights loaded into the model.
✓ Exported HF model to: /content/pii_model_deberta_base
✓ Copied to Drive: /content/drive/MyDrive/pii_model_deberta_base

Local HF folder:

Drive HF folder:


0

In [6]:
# Save best checkpoint + export HF folder directly into your Drive path

import os, shutil, torch
from pathlib import Path
from transformers import AutoModelForTokenClassification

# 0) Target Drive folder (make sure Drive is mounted)
DRIVE_BASE = Path("/content/drive/MyDrive/Colab Notebooks/bootcamb")
DRIVE_BASE.mkdir(parents=True, exist_ok=True)

BEST_LOCAL = Path("/content/best_pii_ner.pt")                   # from training
BEST_DRIVE = DRIVE_BASE / "best_pii_ner.pt"                      # where to store in Drive
HF_DRIVE   = DRIVE_BASE / "pii_model_deberta_base"               # HF export folder in Drive

# 1) Ensure we have a checkpoint: if not found locally, save current in-memory model
if not BEST_LOCAL.exists():
    assert "model" in globals(), "Model not in memory. Run this in the training notebook."
    torch.save(model.state_dict(), BEST_LOCAL)
    print("Saved current model state to:", BEST_LOCAL)

# 2) Copy best checkpoint to Drive (or save directly if needed)
if BEST_DRIVE.exists():
    BEST_DRIVE.unlink()
shutil.copy2(BEST_LOCAL, BEST_DRIVE)
print("✓ Copied best checkpoint to:", BEST_DRIVE)

# 3) Export HuggingFace folder directly into Drive (config + tokenizer + weights)
assert "model" in globals() and "tokenizer" in globals(), "Missing model/tokenizer in memory."
if HF_DRIVE.exists():
    shutil.rmtree(HF_DRIVE)

# If you want to guarantee id2label/label2id in config, re-wrap once (optional but nice):
assert "label2id" in globals() and "id2label" in globals(), "Missing label maps."
wrapped = AutoModelForTokenClassification.from_pretrained(
    "microsoft/deberta-v3-base",
    num_labels=len(label2id),
    id2label=id2label,
    label2id=label2id
)
wrapped.load_state_dict(torch.load(BEST_LOCAL, map_location="cpu"))
wrapped.save_pretrained(str(HF_DRIVE))
tokenizer.save_pretrained(str(HF_DRIVE))
print("✓ Exported HF model to:", HF_DRIVE)

# 4) List saved files
print("\n== Drive contents ==")
os.system(f'ls -lh "{DRIVE_BASE}"')
print("\n== HF folder contents ==")
os.system(f'ls -lh "{HF_DRIVE}"')


✓ Copied best checkpoint to: /content/drive/MyDrive/Colab Notebooks/bootcamb/best_pii_ner.pt


Some weights of DebertaV2ForTokenClassification were not initialized from the model checkpoint at microsoft/deberta-v3-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


✓ Exported HF model to: /content/drive/MyDrive/Colab Notebooks/bootcamb/pii_model_deberta_base

== Drive contents ==

== HF folder contents ==


0

In [7]:
from pathlib import Path

drive_base = Path("/content/drive/MyDrive/Colab Notebooks/bootcamb")
best_pt    = drive_base / "best_pii_ner.pt"
hf_dir     = drive_base / "pii_model_deberta_base"

def human_size(p: Path):
    try:
        return f"{p.stat().st_size/1e6:.2f} MB"
    except Exception:
        return "?"

print("== Checkpoint file ==")
print("Exists:", best_pt.exists(), "| Path:", best_pt)
if best_pt.exists():
    print("Size:", human_size(best_pt))

print("\n== HF folder ==")
print("Exists:", hf_dir.exists(), "| Path:", hf_dir)
if hf_dir.exists():
    files = sorted(list(hf_dir.glob("*")))
    print("Files:", len(files))
    for f in files:
        print(" -", f.name, "|", human_size(f))

== Checkpoint file ==
Exists: True | Path: /content/drive/MyDrive/Colab Notebooks/bootcamb/best_pii_ner.pt
Size: 735.45 MB

== HF folder ==
Exists: True | Path: /content/drive/MyDrive/Colab Notebooks/bootcamb/pii_model_deberta_base
Files: 7
 - added_tokens.json | 0.00 MB
 - config.json | 0.00 MB
 - model.safetensors | 735.39 MB
 - special_tokens_map.json | 0.00 MB
 - spm.model | 2.46 MB
 - tokenizer.json | 8.66 MB
 - tokenizer_config.json | 0.00 MB
