In [None]:
# run in a notebook cell
!pip install -q transformers datasets evaluate seqeval accelerate
!pip install -q pandas scikit-learn

In [None]:
!pip install --upgrade transformers datasets evaluate accelerate seqeval -q


In [None]:
# Notebook cell 2
import os
os.environ["TOKENIZERS_PARALLELISM"] = "false"   # quiet warning
os.environ["HF_HOME"] = os.environ.get("HF_HOME", os.path.expanduser("~/.cache/huggingface"))

import json
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer
)
import evaluate
from pprint import pprint


In [None]:
# Notebook cell 3
csv_path = "/content/job_ner_annotations_full_20_jds.csv"
df = pd.read_csv(csv_path)
print("CSV rows:", len(df))

# rebuild sequences grouped by sentence_id (sorted)
sentences = []
labels = []
ids = []
for sid, grp in df.groupby("sentence_id", sort=True):
    toks = grp["token"].tolist()
    labs = grp["label"].tolist()
    sentences.append(toks)
    labels.append(labs)
    ids.append(int(sid))

print("Loaded sequences:", len(sentences))
# Basic sanity: each seq should have equal tokens/labels
bad = [(i, len(t), len(l)) for i,(t,l) in enumerate(zip(sentences, labels)) if len(t)!=len(l)]
assert len(bad)==0, f"Alignment errors found: {bad}"


CSV rows: 569
Loaded sequences: 20


In [None]:
# Notebook cell 4
# 80/10/10 split by sequence count (already used earlier)
train_tokens = sentences[:16]
train_labels = labels[:16]

val_tokens = train_tokens[-2:]
val_labels = train_labels[-2:]

train_tokens = train_tokens[:-2]
train_labels = train_labels[:-2]

test_tokens = sentences[16:]
test_labels = labels[16:]

print("Train/Val/Test counts:", len(train_tokens), len(val_tokens), len(test_tokens))

def build_hf_dataset(token_seqs, label_seqs):
    return Dataset.from_list([{"tokens": t, "labels": l} for t,l in zip(token_seqs, label_seqs)])

dataset = DatasetDict({
    "train": build_hf_dataset(train_tokens, train_labels),
    "validation": build_hf_dataset(val_tokens, val_labels),
    "test": build_hf_dataset(test_tokens, test_labels),
})
dataset


Train/Val/Test counts: 14 2 4


DatasetDict({
    train: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 2
    })
    test: Dataset({
        features: ['tokens', 'labels'],
        num_rows: 4
    })
})

In [None]:
# Notebook cell 5
# derive BIO label set from dataset (keeps order deterministic)
unique_labels = sorted({lab for seq in labels for lab in seq})
# ensure O present
if "O" not in unique_labels:
    unique_labels.append("O")

label_list = unique_labels
label_to_id = {l:i for i,l in enumerate(label_list)}
id_to_label = {i:l for l,i in label_to_id.items()}

print("Num labels:", len(label_list))
pprint(label_list)


Num labels: 16
['B-COMPANY',
 'B-DEGREE_MAJOR',
 'B-EDUCATION_LEVEL',
 'B-EMPLOYEMENT_TYPE',
 'B-FRAMEWORK',
 'B-JOB_TITLE',
 'B-LOCATION',
 'B-PROGRAMMING_LANGUAGE',
 'B-SKILL_TECH',
 'B-TOOL',
 'I-DEGREE_MAJOR',
 'I-FRAMEWORK',
 'I-JOB_TITLE',
 'I-SKILL_TECH',
 'I-TOOL',
 'O']


In [None]:
# Notebook cell 6
# For a fast smoke test use a tiny model; later swap to distilbert/bert
model_checkpoint = "sshleifer/tiny-distilbert-base-cased"  # tiny, quick download
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

def hf_tokenize_align(batch):
    # tokenizes with is_split_into_words=True and aligns labels -> token ids
    tokenized = tokenizer(batch["tokens"], is_split_into_words=True, truncation=True)
    all_labels = []
    for i, label_seq in enumerate(batch["labels"]):
        word_ids = tokenized.word_ids(batch_index=i)
        aligned = []
        for word_idx in word_ids:
            if word_idx is None:
                aligned.append(-100)  # ignore in loss
            else:
                aligned.append(label_to_id[label_seq[word_idx]])
        all_labels.append(aligned)
    tokenized["labels"] = all_labels
    return tokenized

# Apply mapping
tokenized_datasets = dataset.map(hf_tokenize_align, batched=True, remove_columns=["tokens","labels"])
tokenized_datasets


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4
    })
})

In [None]:
# Notebook cell 7
# Inspect one example (converted token ids -> tokens) to confirm alignment
i = 0
ex = tokenized_datasets["train"][i]
tokens_wp = tokenizer.convert_ids_to_tokens(ex["input_ids"])
labels_ids = ex["labels"]
# show pairs (first 60)
pairs = list(zip(tokens_wp[:60], labels_ids[:60]))
print("token-piece : label-id (first 60)")
for t,l in pairs:
    print(f"{t:12s} -> {l}")
# quick label id range check
all_lab_ids = []
for ex in tokenized_datasets["train"]:
    all_lab_ids.extend([x for x in ex["labels"] if x!=-100])
print("label id range:", min(all_lab_ids), max(all_lab_ids), "num labels:", len(label_list))


token-piece : label-id (first 60)
[CLS]        -> -100
We           -> 15
are          -> 15
looking      -> 15
for          -> 15
a            -> 15
Data         -> 5
Scientist    -> 12
to           -> 15
join         -> 15
our          -> 15
Bangalore    -> 6
office       -> 15
.            -> 15
The          -> 15
ideal        -> 15
candidate    -> 15
has          -> 15
experience   -> 15
with         -> 15
Python       -> 7
,            -> 15
S            -> 9
##QL         -> 9
,            -> 15
and          -> 15
machine      -> 8
learning     -> 13
framework    -> 15
##s          -> 15
like         -> 15
Ten          -> 4
##sor        -> 4
##F          -> 4
##low        -> 4
or           -> 15
P            -> 4
##y          -> 4
##T          -> 4
##or         -> 4
##ch         -> 4
.            -> 15
Employment   -> 15
type         -> 15
is           -> 15
full         -> 3
-            -> 3
time         -> 3
.            -> 15
[SEP]        -> -100
label id range: 0 15 num label

In [None]:
# Notebook cell 8
from transformers import AutoModelForTokenClassification

# load model; ignore_mismatched_sizes helps if checkpoint head doesn't match our num_labels
model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True
)

data_collator = DataCollatorForTokenClassification(tokenizer)
print("Model loaded. num_labels:", model.config.num_labels)


Some weights of the model checkpoint at sshleifer/tiny-distilbert-base-cased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at sshleifer/tiny-distilbert-base-cased and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 2]) in the checkpoint and torch.Size([16, 2]) in the model instantiated
- classifie

Model loaded. num_labels: 16


In [None]:
# Notebook cell 9
seqeval = evaluate.load("seqeval")

def compute_metrics(p):
    logits, labels = p
    preds = np.argmax(logits, axis=2)

    true_labels = []
    true_preds = []
    for lab_seq, pred_seq in zip(labels, preds):
        t_lab = []
        t_prd = []
        for lab_id, pr_id in zip(lab_seq, pred_seq):
            if lab_id == -100:
                continue
            t_lab.append(id_to_label[int(lab_id)])
            t_prd.append(id_to_label[int(pr_id)])
        true_labels.append(t_lab)
        true_preds.append(t_prd)

    results = seqeval.compute(predictions=true_preds, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results.get("overall_accuracy", 0.0)
    }


In [None]:
# Notebook cell 10
training_args = TrainingArguments(
    output_dir="models/finetuned-small-debug",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=2,   # small to avoid OOM
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,   # effective batch = 2*4
    num_train_epochs=3,
    weight_decay=0.01,
    logging_steps=10,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    push_to_hub=False,
    report_to=[],
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# smoke test: train for a tiny number of steps to verify pipeline
trainer.train()


  trainer = Trainer(


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.768777,0.043478,0.107143,0.061856,0.042857
2,No log,2.768712,0.043478,0.107143,0.061856,0.042857
3,No log,2.768684,0.043478,0.107143,0.061856,0.042857


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=6, training_loss=2.765071233113607, metrics={'train_runtime': 1.3091, 'train_samples_per_second': 32.083, 'train_steps_per_second': 4.583, 'total_flos': 1745280.0, 'train_loss': 2.765071233113607, 'epoch': 3.0})

In [None]:
# Diagnostic: run this and paste back the printed output if you want me to inspect it.
import transformers, inspect, sys
print("python:", sys.version.splitlines()[0])
print("transformers version:", transformers.__version__)

from transformers import TrainingArguments
supported_params = TrainingArguments.__init__.__code__.co_varnames
print("\nSupported TrainingArguments params (first 120 shown):")
print(supported_params[:120])

# Quick check: does the modern keyword exist?
print("\nHas 'evaluation_strategy' in TrainingArguments.__init__?:", "evaluation_strategy" in supported_params)
print("Has 'save_strategy'?:", "save_strategy" in supported_params)
print("Has 'load_best_model_at_end'?:", "load_best_model_at_end" in supported_params)


python: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
transformers version: 4.57.2

Supported TrainingArguments params (first 120 shown):
('self', 'output_dir', 'overwrite_output_dir', 'do_train', 'do_eval', 'do_predict', 'eval_strategy', 'prediction_loss_only', 'per_device_train_batch_size', 'per_device_eval_batch_size', 'per_gpu_train_batch_size', 'per_gpu_eval_batch_size', 'gradient_accumulation_steps', 'eval_accumulation_steps', 'eval_delay', 'torch_empty_cache_steps', 'learning_rate', 'weight_decay', 'adam_beta1', 'adam_beta2', 'adam_epsilon', 'max_grad_norm', 'num_train_epochs', 'max_steps', 'lr_scheduler_type', 'lr_scheduler_kwargs', 'warmup_ratio', 'warmup_steps', 'log_level', 'log_level_replica', 'log_on_each_node', 'logging_dir', 'logging_strategy', 'logging_first_step', 'logging_steps', 'logging_nan_inf_filter', 'save_strategy', 'save_steps', 'save_total_limit', 'save_safetensors', 'save_on_each_node', 'save_only_model', 'restore_callback_states_from_checkpoint', 'no_cu

In [None]:
!huggingface-cli logout


Not logged in!


In [None]:
#remove any stale repo info
!rm -rf ~/.cache/huggingface/hub


In [None]:
#see which keywords Trainer.__init__ accepts:
from transformers import Trainer
print("Trainer init params:", Trainer.__init__.__code__.co_varnames[:150])


Trainer init params: ('args', 'kwargs', 'func_name', 'minimum_action', 'message')


In [None]:
# === TRAIN ===
train_output = trainer.train()
print(train_output)




Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,2.768676,0.043478,0.107143,0.061856,0.042857
2,No log,2.768612,0.043478,0.107143,0.061856,0.042857
3,No log,2.768584,0.043478,0.107143,0.061856,0.042857


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


TrainOutput(global_step=6, training_loss=2.7651850382486978, metrics={'train_runtime': 0.787, 'train_samples_per_second': 53.366, 'train_steps_per_second': 7.624, 'total_flos': 1745280.0, 'train_loss': 2.7651850382486978, 'epoch': 3.0})


In [None]:
# === VALIDATION EVALUATION ===
val_metrics = trainer.evaluate()
print("Validation Metrics:")
print(val_metrics)




Validation Metrics:
{'eval_loss': 2.768676280975342, 'eval_precision': 0.043478260869565216, 'eval_recall': 0.10714285714285714, 'eval_f1': 0.06185567010309278, 'eval_accuracy': 0.04285714285714286, 'eval_runtime': 0.0327, 'eval_samples_per_second': 61.105, 'eval_steps_per_second': 30.553, 'epoch': 3.0}


  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# === TEST EVALUATION ===
test_metrics = trainer.evaluate(tokenized_datasets["test"])
print("Test Metrics:")
print(test_metrics)


Test Metrics:
{'eval_loss': 2.7672743797302246, 'eval_precision': 0.04827586206896552, 'eval_recall': 0.1206896551724138, 'eval_f1': 0.06896551724137931, 'eval_accuracy': 0.04794520547945205, 'eval_runtime': 0.0337, 'eval_samples_per_second': 118.747, 'eval_steps_per_second': 59.374, 'epoch': 3.0}


In [None]:
import numpy as np

def predict_labels(text_tokens):
    # tokenize (word-piece level)
    encoded = tokenizer(text_tokens, is_split_into_words=True, return_tensors="pt", truncation=True)

    with torch.no_grad():
        logits = model(**encoded).logits

    preds = torch.argmax(logits, dim=-1).squeeze().tolist()
    word_ids = encoded.word_ids()

    # align predictions back to words
    word_level_preds = []
    prev_word = None
    for pred_id, word_id in zip(preds, word_ids):
        if word_id is None:
            continue
        if word_id != prev_word:
            word_level_preds.append(id_to_label[pred_id])
            prev_word = word_id

    return word_level_preds


def extract_entities(tokens, labels):
    entities = []
    current_entity = []
    current_label = None

    for token, label in zip(tokens, labels):
        if label.startswith("B-"):
            if current_entity:
                entities.append((current_label, " ".join(current_entity)))
            current_entity = [token]
            current_label = label[2:]
        elif label.startswith("I-") and current_label == label[2:]:
            current_entity.append(token)
        else:
            if current_entity:
                entities.append((current_label, " ".join(current_entity)))
                current_entity = []
                current_label = None

    if current_entity:
        entities.append((current_label, " ".join(current_entity)))

    return entities


In [None]:
tokens_example = test_tokens[0]
pred_labels = predict_labels(tokens_example)
entities = extract_entities(tokens_example, pred_labels)

print("TOKENS:", tokens_example)
print("PRED LABELS:", pred_labels)
print("ENTITIES:", entities)


TOKENS: ['We', 'need', 'a', 'Mobile', 'App', 'Developer', 'with', 'Flutter', ',', 'Dart', ',', 'and', 'Firebase', 'experience', '.', 'Company', ':', 'Swiggy', '.', 'Location', ':', 'Bangalore', '.', 'Employment', 'type', ':', 'full-time', '.']
PRED LABELS: ['B-COMPANY', 'B-COMPANY', 'I-TOOL', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'I-TOOL', 'I-TOOL', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'I-TOOL', 'I-TOOL', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY', 'B-COMPANY']
ENTITIES: [('COMPANY', 'We'), ('COMPANY', 'need'), ('COMPANY', 'Mobile'), ('COMPANY', 'App'), ('COMPANY', 'Developer'), ('COMPANY', 'with'), ('COMPANY', 'Flutter'), ('COMPANY', ','), ('COMPANY', 'and'), ('COMPANY', 'Firebase'), ('COMPANY', 'experience'), ('COMPANY', '.'), ('COMPANY', 'Company'), ('COMPANY', ':'), ('COMPANY', 'Location'), ('COMPANY', ':'), ('COMPANY', 'Bangalore'), ('COMPANY

In [None]:
model_save_path = "models/final-job-ner-model"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print("Model saved to:", model_save_path)


Model saved to: models/final-job-ner-model
