In [1]:
!pip install --upgrade transformers datasets evaluate accelerate seqeval



In [2]:
# run in a notebook cell
!pip install -q transformers datasets evaluate seqeval accelerate
!pip install -q pandas scikit-learn


In [3]:
# Notebook cell 2
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"   # quiet warning
os.environ["HF_HOME"] = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))

import json
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import evaluate
from pprint import pprint


In [4]:
# Notebook cell 3
csv_path = "../data/annotated/job_ner_annotations_full_20_jds.csv"
df = pd.read_csv(csv_path)
print("CSV rows:", len(df))

# rebuild sequences grouped by sentence_id (sorted)
sentences = []
labels = []
ids = []
for sid, grp in df.groupby("sentence_id", sort=True):
    toks = grp["token"].tolist()
    labs = grp["label"].tolist()
    sentences.append(toks)
    labels.append(labs)
    ids.append(int(sid))

print("Loaded sequences:", len(sentences))
# Basic sanity: each seq should have equal tokens/labels
bad = [(i, len(t), len(l)) for i,(t,l) in enumerate(zip(sentences, labels)) if len(t)!=len(l)]
assert len(bad)==0, f"Alignment errors found: {bad}"


CSV rows: 569
Loaded sequences: 20


In [5]:
# Notebook cell 4
# 80/10/10 split by sequence count (already used earlier)
train_tokens = sentences[:16]
train_labels = labels[:16]

val_tokens = train_tokens[-2:]
val_labels = train_labels[-2:]

train_tokens = train_tokens[:-2]
train_labels = train_labels[:-2]

test_tokens = sentences[16:]
test_labels = labels[16:]

print("Train/Val/Test counts:", len(train_tokens), len(val_tokens), len(test_tokens))

def build_hf_dataset(token_seqs, label_seqs):
    return Dataset.from_list([{"tokens": t, "labels": l} for t,l in zip(token_seqs, label_seqs)])

dataset = DatasetDict({
    "train": build_hf_dataset(train_tokens, train_labels),
    "validation": build_hf_dataset(val_tokens, val_labels),
    "test": build_hf_dataset(test_tokens, test_labels),
})
dataset


Train/Val/Test counts: 14 2 4


DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 2
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 4
    })
})

In [6]:
# Notebook cell 5
# derive BIO label set from dataset (keeps order deterministic)
unique_labels = sorted({lab for seq in labels for lab in seq})
# ensure O present
if "O" not in unique_labels:
    unique_labels.append("O")

label_list = unique_labels
label_to_id = {l:i for i,l in enumerate(label_list)}
id_to_label = {i:l for l,i in label_to_id.items()}

print("Num labels:", len(label_list))
pprint(label_list)


Num labels: 16
['B-COMPANY',
 'B-DEGREE_MAJOR',
 'B-EDUCATION_LEVEL',
 'B-EMPLOYEMENT_TYPE',
 'B-FRAMEWORK',
 'B-JOB_TITLE',
 'B-LOCATION',
 'B-PROGRAMMING_LANGUAGE',
 'B-SKILL_TECH',
 'B-TOOL',
 'I-DEGREE_MAJOR',
 'I-FRAMEWORK',
 'I-JOB_TITLE',
 'I-SKILL_TECH',
 'I-TOOL',
 'O']


In [7]:
# Notebook cell 6
# For a fast smoke test use a tiny model; later swap to distilbert/bert
model_checkpoint = "sshleifer/tiny-distilbert-base-cased"  # tiny, quick download
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def hf_tokenize_align(batch):
    # tokenizes with is_split_into_words=True and aligns labels -> token ids
    tokenized = tokenizer(batch["tokens"], is_split_into_words=True, truncation=True)
    all_labels = []
    for i, label_seq in enumerate(batch["labels"]):
        word_ids = tokenized.word_ids(batch_index=i)
        aligned = []
        for word_idx in word_ids:
            if word_idx is None:
                aligned.append(-100)  # ignore in loss
            else:
                aligned.append(label_to_id[label_seq[word_idx]])
        all_labels.append(aligned)
    tokenized["labels"] = all_labels
    return tokenized

# Apply mapping
tokenized_datasets = dataset.map(hf_tokenize_align, batched=True, remove_columns=["tokens","labels"])
tokenized_datasets


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4
    })
})

In [8]:
# Notebook cell 7
# Inspect one example (converted token ids -> tokens) to confirm alignment
i = 0
ex = tokenized_datasets["train"][i]
tokens_wp = tokenizer.convert_ids_to_tokens(ex["input_ids"])
labels_ids = ex["labels"]
# show pairs (first 60)
pairs = list(zip(tokens_wp[:60], labels_ids[:60]))
print("token-piece : label-id (first 60)")
for t,l in pairs:
    print(f"{t:12s} -> {l}")
# quick label id range check
all_lab_ids = []
for ex in tokenized_datasets["train"]:
    all_lab_ids.extend([x for x in ex["labels"] if x!=-100])
print("label id range:", min(all_lab_ids), max(all_lab_ids), "num labels:", len(label_list))


token-piece : label-id (first 60)
[CLS]        -> -100
We           -> 15
are          -> 15
looking      -> 15
for          -> 15
a            -> 15
Data         -> 5
Scientist    -> 12
to           -> 15
join         -> 15
our          -> 15
Bangalore    -> 6
office       -> 15
.            -> 15
The          -> 15
ideal        -> 15
candidate    -> 15
has          -> 15
experience   -> 15
with         -> 15
Python       -> 7
,            -> 15
S            -> 9
##QL         -> 9
,            -> 15
and          -> 15
machine      -> 8
learning     -> 13
framework    -> 15
##s          -> 15
like         -> 15
Ten          -> 4
##sor        -> 4
##F          -> 4
##low        -> 4
or           -> 15
P            -> 4
##y          -> 4
##T          -> 4
##or         -> 4
##ch         -> 4
.            -> 15
Employment   -> 15
type         -> 15
is           -> 15
full         -> 3
-            -> 3
time         -> 3
.            -> 15
[SEP]        -> -100
label id range: 0 15 num label

In [9]:
# Notebook cell 8
from transformers import AutoModelForTokenClassification

# load model; ignore_mismatched_sizes helps if checkpoint head doesn't match our num_labels
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True
)

data_collator = DataCollatorForTokenClassification(tokenizer)
print("Model loaded. num_labels:", model.config.num_labels)


Some weights of the model checkpoint at sshleifer/tiny-distilbert-base-cased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at sshleifer/tiny-distilbert-base-cased and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 2]) in the checkpoint and torch.Size([16, 2]) in the model instantiated
- classifie

Model loaded. num_labels: 16


In [13]:
import torch.nn as nn

# 1) load the base model WITHOUT forcing a classifier shape
base_model = AutoModelForTokenClassification.from_pretrained(model_checkpoint)

# 2) create a new classification head sized to your labels
hidden_size = base_model.config.hidden_size
num_labels = len(label_list)

new_classifier = nn.Linear(hidden_size, num_labels)

# initialize new head (Xavier init is common)
nn.init.xavier_uniform_(new_classifier.weight)
if new_classifier.bias is not None:
    nn.init.zeros_(new_classifier.bias)

# 3) assign the new head to the model
# common attribute names: `classifier` or `score` depending on model type.
# Many BERT-like models use `classifier`. If not, inspect model children.
if hasattr(base_model, "classifier"):
    base_model.classifier = new_classifier
else:
    # fallback: find a linear layer name to replace (less common)
    for name, module in base_model.named_modules():
        # look for first Linear in top-level modules (not perfect but works)
        if isinstance(module, nn.Linear):
            parent_name = name.rsplit(".", 1)[0]
            setattr(base_model, parent_name, new_classifier)
            break

# 4) update config label maps
base_model.config.num_labels = num_labels
base_model.config.id2label = id_to_label
base_model.config.label2id = label_to_id

# Use base_model as `model` from now on
model = base_model
print("Replaced classifier. hidden_size:", hidden_size, "num_labels:", num_labels)


Some weights of the model checkpoint at sshleifer/tiny-distilbert-base-cased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Replaced classifier. hidden_size: 2 num_labels: 16


In [14]:
# 1) config check
print("model.config.num_labels:", model.config.num_labels)
print("len(label_list):", len(label_list))
print("id2label keys sample:", list(model.config.id2label.items())[:5])

# 2) show classifier shape
print("Classifier module:", type(model.classifier), getattr(model.classifier, "weight", None).shape)


model.config.num_labels: 16
len(label_list): 16
id2label keys sample: [(0, 'B-COMPANY'), (1, 'B-DEGREE_MAJOR'), (2, 'B-EDUCATION_LEVEL'), (3, 'B-EMPLOYEMENT_TYPE'), (4, 'B-FRAMEWORK')]
Classifier module: <class 'torch.nn.modules.linear.Linear'> torch.Size([16, 2])


In [18]:
# --- fixed TrainingArguments + Trainer (use this cell instead) ---
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="models/finetuned-small-debug",
    evaluation_strategy="epoch",        # EVALUATE every epoch
    save_strategy="epoch",              # SAVE every epoch (must match evaluation_strategy)
    learning_rate=2e-5,
    per_device_train_batch_size=1,      # small to avoid OOM (use 2 if you have GPU memory)
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=4,      # effective batch = 1 * 4
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,        # now valid because eval & save strategies match
    metric_for_best_model="f1",
    push_to_hub=False,
    seed=42,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

# Smoke test: run a short train
trainer.train()


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [12]:
# Notebook cell 11
metrics = trainer.evaluate(tokenized_datasets["test"])
print(metrics)

trainer.save_model("models/finetuned-small-debug")
tokenizer.save_pretrained("models/finetuned-small-debug")
print("Saved model to models/finetuned-small-debug")


NameError: name 'trainer' is not defined