# Part D — Fine-tune PatentSBERTa (READY)

This notebook loads the parquet, builds train/eval splits safely, applies gold overrides (if available), fine-tunes, evaluates, and saves the model.

In [None]:
# ✅ Environment check (needed for HuggingFace Trainer)
import sys, subprocess, pkgutil

def _pip_install(pkgs):
    """Install or upgrade the given requirements via pip.

    Args:
        pkgs: iterable of pip requirement strings, e.g. ["accelerate>=0.26.0"].

    Raises:
        subprocess.CalledProcessError: if the pip process exits with a non-zero status.
    """
    print("Installing:", pkgs)
    # Invoke pip through sys.executable so the install uses the same interpreter
    # that is running this code.
    pip_command = [sys.executable, "-m", "pip", "install", "-U"]
    pip_command.extend(pkgs)
    subprocess.check_call(pip_command)

# Set to True whenever a package had to be installed/upgraded; already-imported
# modules are not refreshed, so the kernel must be restarted in that case.
need_restart = False

# accelerate
# The try block only *detects* a missing or outdated install. The install itself
# runs outside of it so that a pip failure surfaces once, instead of being caught
# by the handler below and triggering a second identical install attempt.
try:
    import accelerate
    from packaging import version
    accelerate_ok = version.parse(accelerate.__version__) >= version.parse("0.26.0")
except Exception:
    # Best effort: any failure to import or version-check counts as "not usable".
    accelerate_ok = False

if not accelerate_ok:
    _pip_install(["accelerate>=0.26.0"])
    need_restart = True

# transformers + torch extras
try:
    import transformers
except Exception:
    _pip_install(["transformers[torch]"])
    need_restart = True

if need_restart:
    print("\n✅ Packages updated. Please RESTART the kernel, then run all cells again.")
    # Stop execution here so the remaining cells do not run against stale modules.
    raise SystemExit("Restart kernel and rerun.")
else:
    print("✅ Environment looks good.")

In [None]:
import sys, os
sys.path.append(os.path.abspath(".."))

import numpy as np
import pandas as pd
from datasets import Dataset
from sklearn.metrics import precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    set_seed,
)

from src.config import CFG
from src.data_tools import load_parquet_or_dummy

# -------- Load data --------
df = load_parquet_or_dummy(CFG.parquet_path)
print("DF shape:", df.shape)

# -------- Pick train/eval safely --------
# Every CFG attribute read here is optional; an absent one resolves to None.
split_col, train_split, eval_split = (
    getattr(CFG, attr, None) for attr in ("split_col", "train_split", "eval_split")
)
label_col, text_col = (
    getattr(CFG, attr, None) for attr in ("silver_label_col", "text_col")
)

# Basic sanity: the label and text columns must be configured and present in df.
missing = [col for col in (label_col, text_col) if not (col is not None and col in df.columns)]
if missing:
    raise KeyError(f"Missing required columns in df: {missing}. Check CFG.silver_label_col / CFG.text_col.")

# Use the CFG-defined split only when the split column exists and both split
# names are configured; otherwise start from empty frames.
use_cfg_split = split_col in df.columns and not (train_split is None or eval_split is None)
if not use_cfg_split:
    train_df = df.head(0).copy()
    eval_df  = df.head(0).copy()
else:
    print("Split counts:\n", df[split_col].value_counts(dropna=False))
    train_df = df.loc[df[split_col] == train_split].copy()
    eval_df  = df.loc[df[split_col] == eval_split].copy()

def has_two_classes(series):
    """Return True when `series` holds at least two distinct integer labels.

    Missing values are dropped first, and the remaining values are cast to int
    before distinct values are counted.
    """
    labels = pd.Series(series).dropna().astype(int)
    return labels.nunique() >= 2

# If CFG split failed / empty / one-class, do stratified split.
# Both sides are checked: a one-class eval set makes binary precision/recall
# meaningless, just as a one-class train set makes training meaningless.
cfg_split_usable = (
    len(train_df) > 0
    and len(eval_df) > 0
    and has_two_classes(train_df[label_col])
    and has_two_classes(eval_df[label_col])
)
if not cfg_split_usable:
    print("⚠️ CFG split not usable (empty or one-class). Creating stratified split from full df...")
    # Rows without a label can neither be cast to int nor stratified on, so they
    # are dropped before splitting (a no-op when every row is labeled).
    labeled_df = df.dropna(subset=[label_col])
    y_all = labeled_df[label_col].astype(int)
    train_df, eval_df = train_test_split(
        labeled_df,
        test_size=0.25,
        random_state=getattr(CFG, "seed", 42),
        stratify=y_all
    )
    # Copies so that later column assignments do not operate on slices of df.
    train_df = train_df.copy()
    eval_df  = eval_df.copy()

print("Train size:", len(train_df), "| Eval size:", len(eval_df))
print("Train label counts:", train_df[label_col].astype(int).value_counts().to_dict())
print("Eval  label counts:", eval_df[label_col].astype(int).value_counts().to_dict())

# -------- Load gold labels if available --------
# `gold` is a DataFrame when the CSV exists and None otherwise.
gold_path = "../data/hitl_green_100_labeled.csv"
gold = pd.read_csv(gold_path) if os.path.exists(gold_path) else None
if gold is None:
    print(f"Gold file not found at {gold_path}. Proceeding without gold overrides.")
else:
    print("Loaded gold:", gold.shape)

DF shape: (450, 4)
Split counts:
 split
train_silver      200
pool_unlabeled    150
eval_silver       100
Name: count, dtype: int64
⚠️ CFG split not usable (empty or one-class). Creating stratified split from full df...
Train size: 337 | Eval size: 113
Train label counts: {1: 225, 0: 112}
Eval  label counts: {1: 75, 0: 38}
Gold file not found at ../data/hitl_green_100_labeled.csv. Proceeding without gold overrides.


In [None]:
# Create is_green_gold (gold overrides silver, if gold file is present)
label_col = CFG.silver_label_col
text_col  = CFG.text_col
doc_id_col = getattr(CFG, "doc_id_col", None)

# Baseline: every training row starts with its silver label.
train_df["is_green_gold"] = train_df[label_col].astype(int)

# Overrides need the gold frame plus a shared id column and the human label column.
overrides_possible = (
    gold is not None
    and doc_id_col in train_df.columns
    and doc_id_col in gold.columns
    and "is_green_human" in gold.columns
)
if overrides_possible:
    # Gold rows with a blank id or blank human label carry no usable override and
    # would make the int cast below raise, so they are skipped.
    gold_usable = gold.dropna(subset=[doc_id_col, "is_green_human"])
    gold_map = dict(zip(gold_usable[doc_id_col], gold_usable["is_green_human"].astype(int)))
    mask = train_df[doc_id_col].isin(list(gold_map))
    # Only rows whose id appears in the gold map are overwritten.
    train_df.loc[mask, "is_green_gold"] = train_df.loc[mask, doc_id_col].map(gold_map)
    print("Gold overrides applied to", int(mask.sum()), "rows.")
elif gold is not None:
    print("Gold loaded, but required columns missing for overrides. Skipping overrides.")

# Build Hugging Face datasets with the column names used downstream ("text", "label").
# preserve_index=False keeps the pandas index (non-contiguous after the split)
# from being stored as an extra `__index_level_0__` feature.
# Training uses the gold-overridden labels.
train_hf = Dataset.from_pandas(
    train_df[[text_col, "is_green_gold"]]
    .rename(columns={text_col: "text", "is_green_gold": "label"}),
    preserve_index=False,
)

# Evaluation uses the silver label column as-is.
eval_hf = Dataset.from_pandas(
    eval_df[[text_col, label_col]]
    .rename(columns={text_col: "text", label_col: "label"}),
    preserve_index=False,
)

print("HF train:", train_hf)
print("HF eval :", eval_hf)

HF train: Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 337
})
HF eval : Dataset({
    features: ['text', 'label', '__index_level_0__'],
    num_rows: 113
})


In [None]:
# Seed the RNGs *before* the model is built: the checkpoint has no classification
# head, so that head is freshly initialised at load time and would otherwise
# differ from run to run. Uses the same CFG.seed (default 42) as the data split.
seed = getattr(CFG, "seed", 42)
if seed is not None:
    set_seed(seed)

tokenizer = AutoTokenizer.from_pretrained(CFG.encoder_name)
model = AutoModelForSequenceClassification.from_pretrained(CFG.encoder_name, num_labels=2)

def tok(batch):
    """Tokenize a batch's "text" entries.

    Sequences are truncated and padded to a fixed length taken from
    CFG.max_length (256 when that attribute is absent).
    """
    return tokenizer(batch["text"], truncation=True, padding="max_length", max_length=getattr(CFG, "max_length", 256))

train_tok = train_hf.map(tok, batched=True)
eval_tok  = eval_hf.map(tok, batched=True)

# Expose only the model inputs and the label, as torch tensors.
train_tok.set_format(type="torch", columns=["input_ids","attention_mask","label"])
eval_tok.set_format(type="torch", columns=["input_ids","attention_mask","label"])

def compute_metrics(eval_pred):
    """Compute binary precision, recall and F1 from a (logits, labels) pair.

    The predicted class is the argmax over axis 1 of the logits. Scores that
    would divide by zero are reported as 0 (zero_division=0).

    Returns:
        dict with float values under the keys "precision", "recall" and "f1".
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=1)
    precision, recall, f1, _support = precision_recall_fscore_support(
        labels, predicted, average="binary", zero_division=0
    )
    scores = {"precision": precision, "recall": recall, "f1": f1}
    return {name: float(value) for name, value in scores.items()}

# ---- TrainingArguments: compatible with older/newer transformers ----
import inspect
# Parameter names accepted by the installed TrainingArguments, used below to
# pick whichever spelling of the evaluation-strategy argument exists.
ta_params = set(inspect.signature(TrainingArguments).parameters.keys())

ta_kwargs = dict(
    output_dir="../models/finetuned_patentsberta",
    num_train_epochs=getattr(CFG, "ft_epochs", 1),
    learning_rate=getattr(CFG, "ft_lr", 1e-5),
    per_device_train_batch_size=getattr(CFG, "ft_batch_size", 8),
    per_device_eval_batch_size=getattr(CFG, "ft_eval_batch_size", 16),
    save_strategy="no",
    logging_steps=50,
    report_to="none",
)

# Tie the training seed to the same CFG.seed used for the train/eval split so a
# single config value controls the whole run. A None seed is left to the
# TrainingArguments default.
train_seed = getattr(CFG, "seed", 42)
if train_seed is not None:
    ta_kwargs["seed"] = train_seed

# evaluation strategy name differs across versions
if "eval_strategy" in ta_params:
    ta_kwargs["eval_strategy"] = "epoch"
elif "evaluation_strategy" in ta_params:
    ta_kwargs["evaluation_strategy"] = "epoch"

training_args = TrainingArguments(**ta_kwargs)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tok,
    eval_dataset=eval_tok,
    compute_metrics=compute_metrics
)

trainer.train()
# Final evaluation on eval_tok; the bare name on the last line displays the dict.
eval_metrics = trainer.evaluate()
eval_metrics

Loading weights:   0%|          | 0/197 [00:00<?, ?it/s]

[1mMPNetForSequenceClassification LOAD REPORT[0m from: AI-Growth-Lab/PatentSBERTa
Key                        | Status     | 
---------------------------+------------+-
embeddings.position_ids    | UNEXPECTED | 
pooler.dense.weight        | UNEXPECTED | 
pooler.dense.bias          | UNEXPECTED | 
classifier.dense.weight    | MISSING    | 
classifier.dense.bias      | MISSING    | 
classifier.out_proj.weight | MISSING    | 
classifier.out_proj.bias   | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Map:   0%|          | 0/337 [00:00<?, ? examples/s]

Map:   0%|          | 0/113 [00:00<?, ? examples/s]

  super().__init__(loader)


Epoch,Training Loss,Validation Loss,Precision,Recall,F1
1,No log,0.240657,1.0,1.0,1.0


  super().__init__(loader)


{'eval_loss': 0.24065658450126648,
 'eval_precision': 1.0,
 'eval_recall': 1.0,
 'eval_f1': 1.0,
 'eval_runtime': 48.7209,
 'eval_samples_per_second': 2.319,
 'eval_steps_per_second': 0.164,
 'epoch': 1.0}

In [None]:
# Save the fine-tuned model
# The Trainer's model and the tokenizer are written to the same directory.
out_dir = "../models/finetuned_patentsberta"
trainer.save_model(output_dir=out_dir)
tokenizer.save_pretrained(save_directory=out_dir)
print(f"Saved model to {out_dir}")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Saved model to ../models/finetuned_patentsberta


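In this part, I fine-tuned the PatentSBERTa model on the silver-labeled training data, with the human-in-the-loop (HITL) gold labels overriding the silver labels wherever the gold file is available.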

Unlike Part A where the language model was frozen, here I allowed the model weights to update during training. This helps the model learn domain-specific patterns related to green patents.

The model was trained with the Hugging Face `Trainer` on separate training and evaluation datasets. After training, the fine-tuned model and its tokenizer were saved and later uploaded to the Hugging Face Hub.

Fine-tuning helps improve performance because the model learns directly from the task-specific data instead of relying only on general language knowledge.