In [None]:
import json
import numpy as np
from datasets import Dataset
from transformers import DistilBertForTokenClassification, AutoTokenizer, DataCollatorForTokenClassification, Trainer, TrainingArguments,AutoModelForTokenClassification
import evaluate


In [None]:
def load_maccrobat(json_path):
    """Read a MACCROBAT JSON export and return what is stored under its "data" key.

    Args:
        json_path: Path of the JSON file to read (decoded as UTF-8).

    Returns:
        The object found under the top-level ``"data"`` key of the file.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file content is not valid JSON.
        KeyError: If the top-level JSON object has no ``"data"`` key.
    """
    with open(json_path, mode="r", encoding="utf-8") as handle:
        payload = json.loads(handle.read())
    return payload["data"]

In [None]:
def build_label_map(data):
    """Derive the BIO tag vocabulary and its id mappings from annotated examples.

    Every entity label found in any example's ``"ner_info"`` list contributes a
    ``B-<label>`` and an ``I-<label>`` tag; the outside tag ``"O"`` is always
    present, even when ``data`` is empty.

    Args:
        data: Iterable of examples, each a mapping with a ``"ner_info"`` list
            whose items carry a ``"label"`` key.

    Returns:
        A ``(label_list, label2id, id2label)`` tuple where ``label_list`` is the
        sorted list of tags, ``label2id`` maps tag -> index in that list, and
        ``id2label`` is the inverse mapping.
    """
    tags = {
        f"{prefix}-{ent['label']}"
        for example in data
        for ent in example["ner_info"]
        for prefix in ("B", "I")
    }
    tags.add("O")

    # Sorting makes the tag -> id assignment independent of set iteration order.
    label_list = sorted(tags)
    label2id = dict(zip(label_list, range(len(label_list))))
    id2label = dict(enumerate(label_list))
    return label_list, label2id, id2label

In [None]:
def encode_example(example, tokenizer, label2id, max_length=512):
    """Tokenize one example and attach a BIO label id to every token.

    A token is assigned to an entity when its offset span lies entirely inside
    the entity's ``[start, end]`` span. The first such token of each entity is
    tagged ``B-<label>`` and the remaining ones ``I-<label>``. Tagging the
    first contained token (rather than requiring its start offset to equal the
    entity start) keeps every entity opened by a ``B-`` tag even when the
    annotated span begins on whitespace or in the middle of a token.

    Tokens with offsets ``(0, 0)`` are treated as special tokens and keep the
    ``"O"`` label. When entity spans overlap, entities later in ``ner_info``
    overwrite the labels written by earlier ones.

    Args:
        example: Mapping with ``"full_text"`` (the text to tokenize) and
            ``"ner_info"`` (list of mappings with ``"start"``, ``"end"`` and
            ``"label"``). ``start``/``end`` are compared directly against the
            tokenizer's offset mapping.
        tokenizer: Callable invoked as ``tokenizer(text, return_offsets_mapping=True,
            truncation=True, max_length=..., padding=False)`` that returns a
            mutable mapping holding ``"input_ids"`` and ``"offset_mapping"``.
        label2id: Mapping from BIO tag to integer id; must contain ``"O"``.
            Tags missing from the mapping fall back to the id of ``"O"``.
        max_length: Maximum sequence length passed to the tokenizer.

    Returns:
        The tokenizer's encoding with a ``"labels"`` entry added (one id per
        token) and the ``"offset_mapping"`` entry removed.
    """
    text = example["full_text"]
    entities = example["ner_info"]

    encoding = tokenizer(
        text,
        return_offsets_mapping=True,
        truncation=True,
        max_length=max_length,
        padding=False,
    )

    offsets = encoding["offset_mapping"]
    labels = ["O"] * len(encoding["input_ids"])

    for ent in entities:
        ent_start, ent_end, ent_label = ent["start"], ent["end"], ent["label"]
        entity_opened = False  # becomes True once this entity's B- tag is placed
        for idx, (start, end) in enumerate(offsets):
            if start == 0 and end == 0:  # special tokens
                continue
            if start >= ent_start and end <= ent_end:
                labels[idx] = f"I-{ent_label}" if entity_opened else f"B-{ent_label}"
                entity_opened = True

    # convert to ids
    outside_id = label2id["O"]
    encoding["labels"] = [label2id.get(tag, outside_id) for tag in labels]

    # remove offset mapping (not needed for training)
    encoding.pop("offset_mapping")
    return encoding

In [None]:
def build_dataset(json_path, tokenizer_path, train_split=0.7):
    """Load the annotated corpus, encode it and split it into train/test parts.

    Args:
        json_path: Path of the JSON file handed to ``load_maccrobat``.
        tokenizer_path: Name or path passed to ``AutoTokenizer.from_pretrained``.
        train_split: Fraction of examples kept for training; the remaining
            ``1 - train_split`` is used as the test size of the split.

    Returns:
        A tuple ``(dataset, label_list, label2id, id2label, tokenizer,
        data_collator)`` where ``dataset`` is the result of
        ``Dataset.train_test_split`` (seeded with 42) and ``data_collator`` is a
        ``DataCollatorForTokenClassification`` built from the tokenizer.
    """
    examples = load_maccrobat(json_path)
    label_list, label2id, id2label = build_label_map(examples)
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_path)

    # Encode example by example; each item carries input ids plus label ids.
    encoded_examples = []
    for example in examples:
        encoded_examples.append(encode_example(example, tokenizer, label2id))

    splits = Dataset.from_list(encoded_examples).train_test_split(
        test_size=1 - train_split,
        seed=42,
    )

    # The collator is returned so the caller can pad inputs and labels per batch.
    collator = DataCollatorForTokenClassification(tokenizer)

    return splits, label_list, label2id, id2label, tokenizer, collator


In [None]:
seqeval = evaluate.load("seqeval")

def compute_metrics(p, id2label):
    """Score token-classification predictions with the seqeval metric.

    Args:
        p: Pair ``(predictions, labels)``. ``predictions`` is reduced with
            ``argmax`` over axis 2 to get one class id per position; ``labels``
            holds the reference ids, with ``-100`` marking positions to ignore.
        id2label: Mapping from class id to tag string.

    Returns:
        Dict with ``"precision"``, ``"recall"``, ``"f1"`` and ``"accuracy"``
        taken from seqeval's ``overall_*`` results.
    """
    ignore_id = -100
    scores, gold_ids = p
    predicted_ids = np.argmax(scores, axis=2)

    # Reference tags, skipping ignored positions.
    true_labels = []
    for gold_row in gold_ids:
        true_labels.append([id2label[gold] for gold in gold_row if gold != ignore_id])

    # Predicted tags at exactly the positions kept above.
    true_predictions = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept = [
            id2label[predicted]
            for predicted, gold in zip(predicted_row, gold_row)
            if gold != ignore_id
        ]
        true_predictions.append(kept)

    results = seqeval.compute(predictions=true_predictions, references=true_labels)

    reported = (
        ("precision", "overall_precision"),
        ("recall", "overall_recall"),
        ("f1", "overall_f1"),
        ("accuracy", "overall_accuracy"),
    )
    return {name: results[key] for name, key in reported}

In [None]:
# Input corpus and the checkpoint directory used for both the tokenizer and
# the initial model weights.
DATA_PATH = "../data/normalize/MACCROBAT_biomedical_ner/MACCROBAT2020-V2.json"
MODEL_PATH = "../artifacts/distilled_model"

dataset, label_list, label2id, id2label, tokenizer, data_collator = build_dataset(
    DATA_PATH,
    MODEL_PATH,
)

# Turn the column-oriented training split into one record per example so the
# encoded data can be inspected on disk.
train_data = dataset["train"].to_dict()
column_names = list(train_data)
train_records = [
    dict(zip(column_names, row))
    for row in zip(*train_data.values())
]

# Save to JSON
with open("train_dataset.json", "w", encoding="utf-8") as f:
    json.dump(train_records, f, ensure_ascii=False, indent=4)

# The classification head is sized from the label vocabulary built above.
model = DistilBertForTokenClassification.from_pretrained(
    MODEL_PATH,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

# NOTE(review): eval_steps is not set; per the TrainingArguments documentation
# it should fall back to logging_steps (50) — confirm for the installed version.
training_args = TrainingArguments(
    output_dir="./results",
    # optimisation
    learning_rate=2e-5,
    num_train_epochs=30,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # evaluation / checkpoint / logging cadence
    eval_strategy="steps",
    save_steps=100,
    logging_steps=50,
    # reload the checkpoint with the lowest eval_loss when training ends
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)


def _trainer_metrics(p):
    """Adapter giving ``compute_metrics`` the single-argument form Trainer calls."""
    return compute_metrics(p, id2label=id2label)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["test"],
    tokenizer=tokenizer,
    data_collator=data_collator,
    compute_metrics=_trainer_metrics,
)

trainer.train()

# Persist the fine-tuned weights together with the tokenizer files.
print("Saving model...")
trainer.save_model("./maccrobat_ner_model")
tokenizer.save_pretrained("./maccrobat_ner_model")

In [None]:
# Run inference on the held-out split. `predictions` first holds the raw model
# scores and is then reduced to the arg-max class id per position.
predictions, labels, _ = trainer.predict(dataset["test"])
predictions = np.argmax(predictions, axis=2)

# Convert input_ids to tokens
all_tokens = [tokenizer.convert_ids_to_tokens(ids) for ids in dataset["test"]["input_ids"]]

# Align predictions and true labels: positions whose reference label is -100
# are dropped from both sequences.
true_labels = [
    [id2label[l] for l in label if l != -100]
    for label in labels
]

true_predictions = [
    [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

# Look the special tokens up once, as a set, instead of re-reading
# `tokenizer.all_special_tokens` for every token in the loops below.
special_tokens = set(tokenizer.all_special_tokens)

# Save results as JSON
# The zip below pairs tokens with filtered predictions/labels by position, so
# it relies on every token in `input_ids` (special tokens included) carrying a
# label other than -100, which is how encode_example builds the labels.
all_results = []
for tokens, preds, golds in zip(all_tokens, true_predictions, true_labels):
    sentence_result = []
    for tok, pred, gold in zip(tokens, preds, golds):
        if tok in special_tokens:
            continue
        sentence_result.append({
            "token": tok,
            "prediction": pred,
            "label": gold
        })
    all_results.append(sentence_result)

with open("ner_predictions.json", "w", encoding="utf-8") as f:
    json.dump(all_results, f, ensure_ascii=False, indent=4)

# Print metrics
# NOTE(review): this runs a second pass over the Trainer's eval_dataset; the
# metrics returned by trainer.predict above are discarded as `_` — confirm
# whether the duplicate pass is intended.
results = trainer.evaluate()
print("Evaluation results:", results)
print("Predictions saved to ner_predictions.json")