In [2]:
# Notebook cell 1
# Install required libs (run in a notebook cell)
!pip install -q transformers datasets seqeval evaluate accelerate
!pip install -q tokenizers

# If you haven't already:
!pip install -q pandas scikit-learn

# Then imports
import os
import pandas as pd
from collections import defaultdict
from datasets import Dataset, DatasetDict
import evaluate
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    DataCollatorForTokenClassification,
    TrainingArguments,
    Trainer,
    set_seed
)
import numpy as np


In [3]:
# Notebook cell 2
csv_path = "../data/annotated/job_ner_annotations_full_20_jds.csv"  # adjust path if needed
df = pd.read_csv(csv_path)
df.head()

# Reconstruct tokens & labels per sentence_id (job id).
sentences = []
labels = []
ids = []

for sid, grp in df.groupby("sentence_id", sort=True):
    toks = grp["token"].tolist()
    labs = grp["label"].tolist()
    sentences.append(toks)
    labels.append(labs)
    ids.append(int(sid))

print(f"Loaded {len(sentences)} sequences")


Loaded 20 sequences


In [4]:
# Notebook cell 3
train_tokens = sentences[:16]
train_labels = labels[:16]

test_tokens = sentences[16:]
test_labels = labels[16:]

# Optionally create a small validation from train (e.g., last 2 of train)
val_tokens = train_tokens[-2:]
val_labels = train_labels[-2:]
train_tokens = train_tokens[:-2]
train_labels = train_labels[:-2]

print("Train/Val/Test sizes (sequences):", len(train_tokens), len(val_tokens), len(test_tokens))

# Make HF datasets from lists
def build_hf_dataset(token_seqs, label_seqs):
    records = []
    for toks, labs in zip(token_seqs, label_seqs):
        records.append({"tokens": toks, "labels": labs})
    return Dataset.from_list(records)

hf_train = build_hf_dataset(train_tokens, train_labels)
hf_val = build_hf_dataset(val_tokens, val_labels)
hf_test = build_hf_dataset(test_tokens, test_labels)

dataset = DatasetDict({"train": hf_train, "validation": hf_val, "test": hf_test})
dataset

# quick check to ensure every sequence has same number of tokens and labels
def check_alignment(token_seqs, label_seqs):
    bad = []
    for i, (t, l) in enumerate(zip(token_seqs, label_seqs)):
        if len(t) != len(l):
            bad.append((i, len(t), len(l)))
    return bad

print("train alignment issues:", check_alignment(train_tokens, train_labels))
print("val alignment issues:", check_alignment(val_tokens, val_labels))
print("test alignment issues:", check_alignment(test_tokens, test_labels))

from pprint import pprint
pprint({"tokens": train_tokens[0], "labels": train_labels[0]})


#quick verify on HF datasets
for split in ["train","validation","test"]:
    ds = dataset[split]
    # check that each row has tokens and labels and lengths match
    for i in range(min(3, len(ds))):
        t = ds[i]["tokens"]; l = ds[i]["labels"]
        assert len(t) == len(l), f"Mismatch in {split} row {i}: {len(t)} vs {len(l)}"
print("HF datasets built and validated.")


Train/Val/Test sizes (sequences): 14 2 4
train alignment issues: []
val alignment issues: []
test alignment issues: []
{'labels': ['O',
            'O',
            'O',
            'O',
            'O',
            'B-JOB_TITLE',
            'I-JOB_TITLE',
            'O',
            'O',
            'O',
            'B-LOCATION',
            'O',
            'O',
            'O',
            'O',
            'O',
            'O',
            'O',
            'O',
            'B-PROGRAMMING_LANGUAGE',
            'O',
            'B-TOOL',
            'O',
            'O',
            'B-SKILL_TECH',
            'I-SKILL_TECH',
            'O',
            'O',
            'B-FRAMEWORK',
            'O',
            'B-FRAMEWORK',
            'O',
            'O',
            'O',
            'O',
            'B-EMPLOYEMENT_TYPE',
            'O'],
 'tokens': ['We',
            'are',
            'looking',
            'for',
            'a',
            'Data',
            'Scientis

In [5]:
# Notebook cell 4
# Collect all unique labels in BIO form from dataset
unique_labels = sorted({lab for labs in labels for lab in labs})
# Ensure 'O' present
if "O" not in unique_labels:
    unique_labels.append("O")
unique_labels = sorted(unique_labels, key=lambda x: (x == "O", x))  # put O last or first as you like

# But for token-classification we need unique entity tag types (B-xxx, I-xxx).
label_list = unique_labels
label_list
# Create maps
label_to_id = {l: i for i, l in enumerate(label_list)}
id_to_label = {i: l for l, i in label_to_id.items()}

print("Num labels:", len(label_list))
print(label_list)


Num labels: 16
['B-COMPANY', 'B-DEGREE_MAJOR', 'B-EDUCATION_LEVEL', 'B-EMPLOYEMENT_TYPE', 'B-FRAMEWORK', 'B-JOB_TITLE', 'B-LOCATION', 'B-PROGRAMMING_LANGUAGE', 'B-SKILL_TECH', 'B-TOOL', 'I-DEGREE_MAJOR', 'I-FRAMEWORK', 'I-JOB_TITLE', 'I-SKILL_TECH', 'I-TOOL', 'O']


In [6]:
# run in a notebook cell
!rm -rf ~/.cache/huggingface/transformers/*


zsh:1: no matches found: /Users/yajatchowdary/.cache/huggingface/transformers/*


In [7]:
# Notebook cell 5
model_checkpoint = "bert-base-cased"   # change to 'distilbert-base-cased' or other if needed
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)

# alignment function
def tokenize_and_align_labels(example):
    tokens = example["tokens"]
    labels = example["labels"]
    # join tokens with space for tokenizer but we will use token-level alignment via word_ids
    # encode with is_split_into_words=True to preserve original tokenization
    tokenized_inputs = tokenizer(tokens, is_split_into_words=True, truncation=True, padding=False)
    word_ids = tokenized_inputs.word_ids(batch_index=0)  # list of word_id per tokenized token

    aligned_labels = []
    previous_word_idx = None
    for word_idx in word_ids:
        if word_idx is None:
            aligned_labels.append(label_to_id["O"])
        else:
            # if new word, use B- or I- as provided (we have token-level BIO already)
            lab = labels[word_idx]
            aligned_labels.append(label_to_id[lab])
    tokenized_inputs["labels"] = aligned_labels
    return tokenized_inputs

# Test alignment on a single example (debug)
print("Example tokens:", dataset["train"][0]["tokens"])
print("Example labels:", dataset["train"][0]["labels"])
print(tokenize_and_align_labels(dataset["train"][0]))


Example tokens: ['We', 'are', 'looking', 'for', 'a', 'Data', 'Scientist', 'to', 'join', 'our', 'Bangalore', 'office', '.', 'The', 'ideal', 'candidate', 'has', 'experience', 'with', 'Python', ',', 'SQL', ',', 'and', 'machine', 'learning', 'frameworks', 'like', 'TensorFlow', 'or', 'PyTorch', '.', 'Employment', 'type', 'is', 'full-time', '.']
Example labels: ['O', 'O', 'O', 'O', 'O', 'B-JOB_TITLE', 'I-JOB_TITLE', 'O', 'O', 'O', 'B-LOCATION', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-PROGRAMMING_LANGUAGE', 'O', 'B-TOOL', 'O', 'O', 'B-SKILL_TECH', 'I-SKILL_TECH', 'O', 'O', 'B-FRAMEWORK', 'O', 'B-FRAMEWORK', 'O', 'O', 'O', 'O', 'B-EMPLOYEMENT_TYPE', 'O']
{'input_ids': [101, 1284, 1132, 1702, 1111, 170, 7154, 22985, 1106, 2866, 1412, 14560, 1701, 119, 1109, 7891, 3234, 1144, 2541, 1114, 23334, 117, 156, 22825, 117, 1105, 3395, 3776, 8297, 1116, 1176, 5157, 21484, 2271, 6737, 1137, 153, 1183, 1942, 1766, 1732, 119, 18340, 2076, 1110, 1554, 118, 1159, 119, 102], 'token_type_ids': [0, 0, 0, 0, 

In [11]:
# Notebook cell 6
def hf_tokenize_align(batch):
    tokenized = tokenizer(batch["tokens"], is_split_into_words=True, truncation=True)
    all_labels = []
    for i, label_seq in enumerate(batch["labels"]):
        word_ids = tokenized.word_ids(batch_index=i)
        aligned = []
        for word_idx in word_ids:
            if word_idx is None:
                aligned.append(-100)                         # ignore in loss
            else:
                aligned.append(label_to_id[label_seq[word_idx]])
        all_labels.append(aligned)
    tokenized["labels"] = all_labels
    return tokenized

tokenized_datasets = dataset.map(hf_tokenize_align, batched=True, remove_columns=["tokens","labels"])

tokenized_datasets


Map:   0%|          | 0/14 [00:00<?, ? examples/s]

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 14
    })
    validation: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 2
    })
    test: Dataset({
        features: ['labels', 'input_ids', 'token_type_ids', 'attention_mask'],
        num_rows: 4
    })
})

In [17]:
from transformers import AutoModelForTokenClassification

model = AutoModelForTokenClassification.from_pretrained(
    model_checkpoint,
    num_labels=len(label_list),
    id2label=id_to_label,
    label2id=label_to_id,
    ignore_mismatched_sizes=True,   # <- try this first
)


Some weights of the model checkpoint at sshleifer/tiny-distilbert-base-cased were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at sshleifer/tiny-distilbert-base-cased and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([9, 2]) in the checkpoint and torch.Size([16, 2]) in the model instantiated
- classifie

In [None]:
#!df -h

In [None]:
#!ping huggingface.co

In [18]:
# Notebook cell 8
import evaluate

seqeval = evaluate.load("seqeval")

# Convert predicted ids to label strings and compute seqeval metrics
def compute_metrics(p):
    predictions, labels = p
    # predictions: [batch_size, seq_len, num_labels] -> argmax
    preds = np.argmax(predictions, axis=2)

    true_labels = []
    true_preds = []

    for i in range(len(labels)):
        lab = labels[i]
        pred = preds[i]
        # iterate tokens and skip label == -100 if present (but we didn't set -100 earlier)
        seq_true = []
        seq_pred = []
        for j, lab_id in enumerate(lab):
            # if label is -100 (ignored), skip. In our pipeline, we used label_to_id only.
            if lab_id == -100:
                continue
            seq_true.append(id_to_label[int(lab_id)])
            seq_pred.append(id_to_label[int(pred[j])])
        true_labels.append(seq_true)
        true_preds.append(seq_pred)

    results = seqeval.compute(predictions=true_preds, references=true_labels)
    # results is dict with per-entity results and overall
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results.get("overall_accuracy", 0.0)
    }


In [19]:
# Notebook cell 9
training_args = TrainingArguments(
    output_dir="models/finetuned-bert-job-ner",
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=4,
    weight_decay=0.01,
    logging_steps=20,
    push_to_hub=False,
    seed=42,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
    greater_is_better=True
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics
)

# Start training
trainer.train()


TypeError: TrainingArguments.__init__() got an unexpected keyword argument 'evaluation_strategy'

In [None]:
# Notebook cell 10
metrics = trainer.evaluate(tokenized_datasets["test"])
print(metrics)

trainer.save_model("models/finetuned-bert-job-ner")
tokenizer.save_pretrained("models/finetuned-bert-job-ner")


In [None]:
# Notebook cell 11
from transformers import pipeline

nlp_pipeline = pipeline("token-classification", model="models/finetuned-bert-job-ner", tokenizer="models/finetuned-bert-job-ner", aggregation_strategy="none")

def predict_on_raw_text(raw_tokens):
    # raw_tokens: list of tokens (same style as dataset tokens)
    # For simplicity we join and set is_split_into_words=True when tokenizing
    encoding = tokenizer(raw_tokens, is_split_into_words=True, return_offsets_mapping=True, truncation=True, padding=True, return_tensors="pt")
    outputs = model(**{k:v.to(model.device) for k,v in encoding.items()})
    logits = outputs.logits.cpu().numpy()
    preds = np.argmax(logits, axis=-1)[0]

    word_ids = encoding.word_ids(batch_index=0)
    bio_preds = []
    for idx, wid in enumerate(word_ids):
        if wid is None:
            bio_preds.append(("","O"))
        else:
            label = id_to_label[int(preds[idx])]
            bio_preds.append((raw_tokens[wid], label))
    # compress to one label per original token (skip duplicates from wordpiece)
    final = []
    last_wid = None
    for (tok, lab), pos in zip(bio_preds, range(len(bio_preds))):
        # this simplifies â€” better to iterate word ids and pick first occurrence per word
        pass

# A simpler approach: use our tokenized pipeline above per sentence via the Trainer's predict function:
def predict_sentence(tokens_list):
    # tokens_list: list of tokens
    enc = tokenizer(tokens_list, is_split_into_words=True, return_tensors="pt", truncation=True)
    with torch.no_grad():
        out = model(**{k:v.to(model.device) for k,v in enc.items()})
    logits = out.logits.detach().cpu().numpy()
    pred_ids = np.argmax(logits, axis=2)[0]
    word_ids = enc.word_ids(batch_index=0)
    pred_labels = []
    last_word_idx = None
    for idx, word_idx in enumerate(word_ids):
        if word_idx is None:
            continue
        if word_idx != last_word_idx:
            pred_labels.append(id_to_label[int(pred_ids[idx])])
            last_word_idx = word_idx
    # pred_labels now aligned 1:1 with tokens_list
    return list(zip(tokens_list, pred_labels))

# Example usage:
# tokens = dataset["test"][0]["tokens"]
# print(predict_sentence(tokens))
