In [1]:
!pip install -q transformers evaluate seqeval wandb bioc datasets==3.6.0 --no-deps

In [3]:
# Clean minimal code to download WandB artifacts and extract model_checkpoint
import warnings
warnings.filterwarnings('ignore')
import os
import shutil
import glob
import gc
from pathlib import Path
import wandb
from kaggle_secrets import UserSecretsClient

# ---------- constants ----------
PROJECT_NAME = "clinical-ner-maccrobat"
OUTPUT_DIR = "/kaggle/working/ner_runs"

fine_tuned_model_config = {
    "DISTILBERT": {
        "model_name": "kshitij-nevrekar-nmims/clinical-ner-maccrobat/DISTILBERT_20251116_191254-artifacts:v0"
    },
    "MODERNBERT": {
        "model_name": "kshitij-nevrekar-nmims/clinical-ner-maccrobat/MODERNBERT_20251117_064645-artifacts:v0"
    },
    "BIOMEDBERT": {
        "model_name": "kshitij-nevrekar-nmims/clinical-ner-maccrobat/BIOMEDBERT_20251117_085850-artifacts:v0"
    },
    "BIOFORMER": {
        "model_name": "kshitij-nevrekar-nmims/clinical-ner-maccrobat/BIOFORMER_20251117_112852-artifacts:v0"
    },
    "BIOBERT": {
        "model_name": "kshitij-nevrekar-nmims/clinical-ner-maccrobat/BIOBERT_20251117_121753-artifacts:v0"
    }
}

In [None]:
# ---------- W&B login ----------
WANDB_KEY = UserSecretsClient().get_secret("WANDB_API_KEY")
wandb.login(key=WANDB_KEY)
print("W&B logged in.")

# ---------- helper ----------
def ensure_dir(p):
    Path(p).mkdir(parents=True, exist_ok=True)

def find_checkpoint_dir(root_path):
    """
    Find checkpoint-* folder inside artifact.
    """
    for d in os.listdir(root_path):
        full = os.path.join(root_path, d)
        if os.path.isdir(full) and d.startswith("checkpoint-"):
            return full
    return None

# ---------- download ----------
api = wandb.Api()

tmp_root = "/kaggle/working/_wandb_tmp"
shutil.rmtree(tmp_root, ignore_errors=True)
ensure_dir(tmp_root)
ensure_dir(OUTPUT_DIR)

for key, v in fine_tuned_model_config.items():
    artifact_ref = v["model_name"]
    print(f"\n=== Processing {key} ===")
    print("Artifact:", artifact_ref)

    try:
        # dir for this model temp extraction
        tmp_dir = os.path.join(tmp_root, key)
        shutil.rmtree(tmp_dir, ignore_errors=True)
        ensure_dir(tmp_dir)

        # fetch artifact
        artifact = api.artifact(artifact_ref)
        downloaded_path = artifact.download(root=tmp_dir)
        print("Downloaded to:", downloaded_path)

        # locate checkpoint
        ckpt_dir = find_checkpoint_dir(downloaded_path)
        if not ckpt_dir:
            raise RuntimeError("ERROR: No checkpoint-* folder found in artifact.")

        # output final folder
        out_model_dir = os.path.join(OUTPUT_DIR, key)
        shutil.rmtree(out_model_dir, ignore_errors=True)
        ensure_dir(out_model_dir)

        # rename to model_checkpoint
        target_dir = os.path.join(out_model_dir, "model_checkpoint")
        shutil.move(ckpt_dir, target_dir)

        print("Saved:", target_dir)

    except Exception as e:
        print(f"[ERROR] {key}:", e)

# cleanup tmp
shutil.rmtree(tmp_root, ignore_errors=True)
gc.collect()

print("\nDone. Available model dirs:")
for key in fine_tuned_model_config:
    print(key, "->", os.listdir(os.path.join(OUTPUT_DIR, key)))

In [2]:
from datasets import load_dataset
import pandas as pd

# ---- dataset IDs ----
MACCROBAT = "ktgiahieu/maccrobat2018_2020"
NCBI = "bigbio/ncbi_disease"
BC5 = "bigbio/bc5cdr"

print("Loading datasets...")

# these load the bigbio schema versions
maccrobat = load_dataset(MACCROBAT)
ncbi = load_dataset(NCBI, trust_remote_code=True)
bc5 = load_dataset(BC5, trust_remote_code=True)

print("Loaded:")
print(" - MACCROBAT:", maccrobat)
print(" - NCBI:", ncbi)
print(" - BC5:", bc5)

Loading datasets...


README.md: 0.00B [00:00, ?B/s]

data.jsonl: 0.00B [00:00, ?B/s]

Generating train split:   0%|          | 0/400 [00:00<?, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

ncbi_disease.py: 0.00B [00:00, ?B/s]

bigbiohub.py: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/325k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/60.2k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/57.7k [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

README.md: 0.00B [00:00, ?B/s]

bc5cdr.py: 0.00B [00:00, ?B/s]

bigbiohub.py: 0.00B [00:00, ?B/s]

CDR_Data.zip:   0%|          | 0.00/20.4M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Loaded:
 - MACCROBAT: DatasetDict({
    train: Dataset({
        features: ['tokens', 'tags'],
        num_rows: 400
    })
})
 - NCBI: DatasetDict({
    train: Dataset({
        features: ['pmid', 'title', 'abstract', 'mentions'],
        num_rows: 593
    })
    test: Dataset({
        features: ['pmid', 'title', 'abstract', 'mentions'],
        num_rows: 100
    })
    validation: Dataset({
        features: ['pmid', 'title', 'abstract', 'mentions'],
        num_rows: 100
    })
})
 - BC5: DatasetDict({
    train: Dataset({
        features: ['passages'],
        num_rows: 500
    })
    test: Dataset({
        features: ['passages'],
        num_rows: 500
    })
    validation: Dataset({
        features: ['passages'],
        num_rows: 500
    })
})


In [None]:
# Load sample
ncbi['train'][0]

In [None]:
# Load sample
bc5['train'][0]['passages'][1]

In [5]:
import re
from datasets import Dataset, DatasetDict

# ==========================================================
# Helper: simple whitespace tokenizer keeping offsets aligned
# ==========================================================
def simple_tokenize_with_offsets(text):
    """
    Tokenizes similar to MACCROBAT style:
    - splits on whitespace
    - splits punctuation into separate tokens
    - keeps hyphens as separate tokens
    """
    tokens = []
    # Match: word chars, numbers, hyphenated pieces OR any single punctuation
    pattern = r"[A-Za-z0-9]+|[\-]+|[^\w\s]"
    for m in re.finditer(pattern, text):
        tok = m.group(0)
        start = m.start()
        end = m.end()
        tokens.append((tok, start, end))
    return tokens



# ==========================================================
# Convert NCBI format → token-level dataset
# ==========================================================
def convert_ncbi_to_token_tags(ncbi_ds):
    all_examples = []

    for ex in ncbi_ds:
        # Combine title + abstract
        text = ""
        if ex["title"]:
            text += ex["title"].strip() + "\n"
        if ex["abstract"]:
            text += ex["abstract"].strip()

        tokens_with_offsets = simple_tokenize_with_offsets(text)
        tokens = [t[0] for t in tokens_with_offsets]

        # create tags
        tags = ["O"] * len(tokens)

        # apply NCBI mentions
        for m in ex["mentions"]:
            ent_type = m["type"]   # DiseaseClass / SpecificDisease / etc
            start_char, end_char = m["offsets"]

            # labels: BI tagging “Disease”
            # unify them: everything becomes disease
            first = True
            for i, (_, s, e) in enumerate(tokens_with_offsets):
                if e <= start_char or s >= end_char:
                    continue
                if first:
                    tags[i] = "B-DISEASE"
                    first = False
                else:
                    tags[i] = "I-DISEASE"

        all_examples.append({"tokens": tokens, "tags": tags})

    return Dataset.from_dict({
        "tokens": [x["tokens"] for x in all_examples],
        "tags":   [x["tags"]   for x in all_examples]
    })


# ==========================================================
# Convert BC5 format → token-level dataset
# ==========================================================
def convert_bc5_to_token_tags(bc5_ds):
    all_examples = []

    for ex in bc5_ds:
        # BC5 passages: find the abstract (type=abstract)
        text_parts = []
        ents = []

        for p in ex["passages"]:
            if p["type"] == "abstract":
                text_parts.append(p["text"])
                if "entities" in p:
                    ents.extend(p["entities"])

        if not text_parts:
            continue

        text = "\n".join(text_parts)

        tokens_with_offsets = simple_tokenize_with_offsets(text)
        tokens = [t[0] for t in tokens_with_offsets]

        tags = ["O"] * len(tokens)

        # entities contain type: "Chemical" | "Disease"
        for ent in ents:
            ent_type = ent["type"]
            begin, end = ent["offsets"][0]

            if ent_type == "Chemical":
                label = "CHEMICAL"
            else:
                label = "DISEASE"

            first = True
            for i, (_, s, e) in enumerate(tokens_with_offsets):
                if e <= begin or s >= end:
                    continue
                if first:
                    tags[i] = f"B-{label}"
                    first = False
                else:
                    tags[i] = f"I-{label}"

        all_examples.append({"tokens": tokens, "tags": tags})

    return Dataset.from_dict({
        "tokens": [x["tokens"] for x in all_examples],
        "tags":   [x["tags"]   for x in all_examples]
    })


# ==========================================================
# Merge splits and convert everything
# ==========================================================

# ---- MACCROBAT already token-level, but flatten train only ----
maccrobat_flat = maccrobat["train"]

# ---- NCBI: merge train + val + test ----
ncbi_merged = Dataset.from_dict({
    "pmid":      ncbi["train"]["pmid"]      + ncbi["validation"]["pmid"]      + ncbi["test"]["pmid"],
    "title":     ncbi["train"]["title"]     + ncbi["validation"]["title"]     + ncbi["test"]["title"],
    "abstract":  ncbi["train"]["abstract"]  + ncbi["validation"]["abstract"]  + ncbi["test"]["abstract"],
    "mentions":  ncbi["train"]["mentions"]  + ncbi["validation"]["mentions"]  + ncbi["test"]["mentions"],
})

ncbi_token_ds = convert_ncbi_to_token_tags(ncbi_merged)


# ---- BC5: merge train + val + test ----
bc5_merged = Dataset.from_dict({
    "passages": bc5["train"]["passages"] + bc5["validation"]["passages"] + bc5["test"]["passages"],
})

bc5_token_ds = convert_bc5_to_token_tags(bc5_merged)


print("MACCROBAT:", len(maccrobat_flat))
print("NCBI tokenized:", len(ncbi_token_ds))
print("BC5 tokenized:", len(bc5_token_ds))

print("\nSample MACCROBAT:", maccrobat_flat[0])
print("\nSample NCBI:", ncbi_token_ds[0])
print("\nSample BC5:", bc5_token_ds[0])

MACCROBAT: 400
NCBI tokenized: 793
BC5 tokenized: 1500

Sample MACCROBAT: {'tokens': ['A', '68', '-', 'year', '-', 'old', 'female', 'nonsmoker', ',', 'nondrinker', 'with', 'a', 'medical', 'history', 'of', 'hypertension', 'presented', 'with', 'new', '-', 'onset', 'painless', 'jaundice', 'and', 'pruritus', ',', 'a', 'three', '-', 'month', 'history', 'of', '9.9', 'kg', 'weight', 'loss', 'and', 'chronic', 'diarrhea', 'with', 'four', 'to', 'five', 'loose', 'bowel', 'movements', 'per', 'day', '.', '\n', 'Medications', 'included', 'vitamin', 'D', ',', 'amlodipine', 'and', 'eprosartan', '.', '\n', 'Physical', 'examination', 'was', 'normal', 'except', 'for', 'jaundice', 'and', 'muscle', 'wasting', '.', '\n', 'Recent', 'colonoscopy', 'had', 'been', 'normal', '.', '\n', 'Total', 'and', 'direct', 'bilirubin', 'levels', 'were', '6.84', 'mg', '/', 'dL', '(', '116.96', 'μmol', '/', 'L', ')', 'and', '9.18', 'mg', '/', 'dL', '(', '156.98', 'μmol', '/', 'L', ')', ',', 'respectively', '.', '\n', 'Other',

In [6]:
import torch
import gc
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
from seqeval.metrics import f1_score, precision_score, recall_score

In [7]:
# NCBI only needs: O, B-DISEASE, I-DISEASE
def map_maccrobat_to_ncbi(label: str):
    if label == "O":
        return "O"
    base = label.split("-", 1)[-1].lower()

    # disease-ish maccrobat labels
    disease_like = {
        "disease_disorder",
        "sign_symptom",
    }

    if base in disease_like:
        prefix = label[0]  # B or I
        return f"{prefix}-DISEASE"

    return "O"

In [8]:
def map_maccrobat_to_bc5(label: str):
    if label == "O":
        return "O"
    base = label.split("-", 1)[-1].lower()
    prefix = label[0]  # B or I

    disease_like = {
        "disease_disorder",
        "sign_symptom",
    }
    chem_like = {
        "medication",
    }

    if base in chem_like:
        return f"{prefix}-CHEMICAL"
    if base in disease_like:
        return f"{prefix}-DISEASE"

    return "O"

In [9]:
label_all_tokens = False

def tokenize_and_align_labels(examples, tokenizer, label_all_tokens=label_all_tokens):
    # Tokenize
    tokenized_inputs = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        max_length=512,
    )

    all_labels = examples["tags"]
    new_labels = []

    for i, labels in enumerate(all_labels):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []

        for word_idx in word_ids:
            # Special tokens (CLS, SEP, PAD) → word_idx = None
            if word_idx is None:
                label_ids.append(-100)

            # First wordpiece of a word → assign label
            elif word_idx != previous_word_idx:
                label_ids.append(labels[word_idx])

            # Subword tokens (continuation) → assign or ignore
            else:
                if label_all_tokens:
                    label_ids.append(labels[word_idx])
                else:
                    label_ids.append(-100)

            previous_word_idx = word_idx

        new_labels.append(label_ids)

    tokenized_inputs["labels"] = new_labels
    return tokenized_inputs

In [10]:
from transformers import AutoTokenizer, AutoConfig, AutoModelForTokenClassification
import torch, gc

def load_model(model_path):
    config = AutoConfig.from_pretrained(model_path)
    tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)

    model = AutoModelForTokenClassification.from_pretrained(
        model_path,
        config=config
    )
    model.eval()

    return tokenizer, model, config.id2label

In [11]:
def predict_word_level(model, tokenizer, word_tokens):
    """
    word_tokens: list[str], e.g. ["A", "common", "skin", "tumour"]
    returns: list[str] predicted word-level labels (same length as word_tokens)
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Use model's max length or default to 512
    max_length = getattr(model.config, 'max_position_embeddings', 512)
    
    enc = tokenizer(
        word_tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )

    with torch.no_grad():
        logits = model(
            input_ids=enc["input_ids"].to(device),
            attention_mask=enc["attention_mask"].to(device)
        ).logits  # [1, seq_len, num_labels]

    pred_ids = logits.argmax(dim=-1).squeeze(0).cpu().tolist()
    word_ids = enc.word_ids(batch_index=0)

    # Collapse subwords → one prediction per original word
    preds = []
    last_word = None

    for pid, wid in zip(pred_ids, word_ids):
        if wid is None:
            continue
        if wid != last_word:
            preds.append(model.config.id2label[pid])
            last_word = wid

    # Handle cases where tokenization was truncated
    # Pad with 'O' labels for any missing tokens
    while len(preds) < len(word_tokens):
        preds.append('O')
    
    # If somehow we have more predictions than tokens (shouldn't happen), truncate
    if len(preds) > len(word_tokens):
        preds = preds[:len(word_tokens)]

    return preds

In [12]:
def run_model_on_dataset(model_path, dataset, mapper_fn):
    """
    dataset: list/dict with fields
      - tokens: list[str]
      - tags:   list[str] (gold word-level labels - ALREADY in target format!)
    mapper_fn: function mapping MACCROBAT label → NCBI/BC5 label
    """
    tokenizer, model, id2label = load_model(model_path)

    preds_all = []
    gts_all = []
    
    mismatches = 0

    for i, ex in enumerate(dataset):
        tokens = ex["tokens"]
        gold   = ex["tags"]  # Already in target format (B-DISEASE, etc.)

        pred_macc = predict_word_level(model, tokenizer, tokens)

        if len(gold) != len(pred_macc):
            mismatches += 1
            # Only show first 3 mismatches for debugging
            if mismatches <= 3:
                print(f"[WARNING] Length mismatch at sample {i}: gold={len(gold)} pred={len(pred_macc)}")
                print(f"  First 10 tokens: {tokens[:10]}")
                print(f"  Last 10 tokens: {tokens[-10:]}")

        # ✅ IMPORTANT: Only map predictions (MACCROBAT → target format)
        # Ground truth is ALREADY in target format, so use it directly!
        pred_mapped = [mapper_fn(p) for p in pred_macc]
        
        preds_all.append(pred_mapped)
        gts_all.append(gold)  # ✅ Use gold directly, don't map!

    if mismatches > 0:
        print(f"[INFO] Total samples with length mismatches: {mismatches}/{len(dataset)}")

    # Free VRAM
    model.to("cpu")
    del model
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

    return preds_all, gts_all

In [14]:
import evaluate, pandas as pd
seqeval_metric = evaluate.load("seqeval")

def evaluate_predictions(preds, gts, name="eval"):
    report = seqeval_metric.compute(
        predictions=preds,
        references=gts,
        zero_division=0
    )

    print(f"\n{name}:")
    print(f"Precision: {report['overall_precision']:.4f}")
    print(f"Recall:    {report['overall_recall']:.4f}")
    print(f"F1-score:  {report['overall_f1']:.4f}")

    rows = []
    for label, stats in report.items():
        if isinstance(stats, dict):
            rows.append([
                label,
                stats.get("f1", 0.0),
                stats.get("precision", 0.0),
                stats.get("recall", 0.0),
                stats.get("number", 0),
            ])

    df = pd.DataFrame(rows, columns=["Entity", "F1", "Precision", "Recall", "Support"])
    df = df.sort_values("F1", ascending=False)
    display(df)

    return df

In [15]:
def predict_labels_ensemble(model_paths, tokenizers, models, tokens):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # tokenize ONCE (using tokenizer of first model)
    tokenizer = tokenizers[0]
    
    # Use model's max length or default to 512
    max_length = getattr(models[0].config, 'max_position_embeddings', 512)
    
    enc = tokenizer(
        tokens,
        is_split_into_words=True,
        return_tensors="pt",
        truncation=True,
        max_length=max_length,
    )
    enc_on_gpu = {k: v.to(device) for k, v in enc.items()}

    # collect per-model logits
    logits_list = []
    for model in models:
        model.to(device)
        with torch.no_grad():
            logits = model(**enc_on_gpu).logits.cpu()
        logits_list.append(logits)
        model.to("cpu")
        torch.cuda.empty_cache()

    # average
    avg_logits = torch.mean(torch.stack(logits_list), dim=0).squeeze(0)

    pred_ids = avg_logits.argmax(-1).tolist()
    word_ids = enc.word_ids(batch_index=0)

    per_token_preds = []
    last_word = -1
    for pred_id, w in zip(pred_ids, word_ids):
        if w is None:
            continue
        if w != last_word:
            per_token_preds.append(models[0].config.id2label[pred_id])
            last_word = w

    # ✅ Handle truncation by padding with 'O' labels
    while len(per_token_preds) < len(tokens):
        per_token_preds.append('O')
    
    # ✅ Safety check: truncate if somehow too long
    if len(per_token_preds) > len(tokens):
        per_token_preds = per_token_preds[:len(tokens)]

    return per_token_preds

In [16]:
def run_ensemble(model_paths, dataset, mapper_fn):
    # load all models
    tokenizers = []
    models = []
    for p in model_paths:
        tok, m, id2label = load_model(p)
        tokenizers.append(tok)
        models.append(m)

    preds = []
    gts = []
    for ex in dataset:
        tokens = ex["tokens"]
        labels = ex["tags"]  # Already in target format (B-DISEASE, etc.)

        pred_macc = predict_labels_ensemble(model_paths, tokenizers, models, tokens)

        # ✅ IMPORTANT: Only map predictions (MACCROBAT → target format)
        # Ground truth is ALREADY in target format, so use it directly!
        mapped_preds = [mapper_fn(l) for l in pred_macc]

        preds.append(mapped_preds)
        gts.append(labels)  # ✅ Use labels directly, don't map!

    # unload
    for m in models:
        m.to("cpu")
        del m
    gc.collect()
    torch.cuda.empty_cache()
    torch.cuda.ipc_collect()

    return preds, gts

## NCBI

In [17]:
ncbi_preds, ncbi_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/DISTILBERT/model_checkpoint",
    dataset=ncbi_token_ds,
    mapper_fn=map_maccrobat_to_ncbi
)

evaluate_predictions(ncbi_preds, ncbi_gts, "DISTILBERT on NCBI")


DISTILBERT on NCBI:
Precision: 0.4007
Recall:    0.4009
F1-score:  0.4008


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.400812,0.400725,0.4009,6892


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.400812,0.400725,0.4009,6892


In [18]:
ncbi_preds, ncbi_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/MODERNBERT/model_checkpoint",
    dataset=ncbi_token_ds,
    mapper_fn=map_maccrobat_to_ncbi
)

evaluate_predictions(ncbi_preds, ncbi_gts, "MODERNBERT on NCBI")

W1118 08:30:48.164000 200 torch/_inductor/utils.py:1137] [1/0] Not enough SMs to use max_autotune_gemm mode



MODERNBERT on NCBI:
Precision: 0.0661
Recall:    0.1737
F1-score:  0.0958


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.09576,0.066103,0.17368,6892


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.09576,0.066103,0.17368,6892


In [19]:
ncbi_preds, ncbi_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/BIOMEDBERT/model_checkpoint",
    dataset=ncbi_token_ds,
    mapper_fn=map_maccrobat_to_ncbi
)

evaluate_predictions(ncbi_preds, ncbi_gts, "BIOMEDBERT on NCBI")


BIOMEDBERT on NCBI:
Precision: 0.6162
Recall:    0.3217
F1-score:  0.4227


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.422688,0.616176,0.321677,6892


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.422688,0.616176,0.321677,6892


In [20]:
ncbi_preds, ncbi_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/BIOFORMER/model_checkpoint",
    dataset=ncbi_token_ds,
    mapper_fn=map_maccrobat_to_ncbi
)

evaluate_predictions(ncbi_preds, ncbi_gts, "BIOFORMER on NCBI")


BIOFORMER on NCBI:
Precision: 0.5357
Recall:    0.3092
F1-score:  0.3921


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.392088,0.535696,0.309199,6892


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.392088,0.535696,0.309199,6892


In [21]:
ncbi_preds, ncbi_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/BIOBERT/model_checkpoint",
    dataset=ncbi_token_ds,
    mapper_fn=map_maccrobat_to_ncbi
)

evaluate_predictions(ncbi_preds, ncbi_gts, "BIOBERT on NCBI")


BIOBERT on NCBI:
Precision: 0.6016
Recall:    0.4317
F1-score:  0.5027


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.502661,0.601618,0.43166,6892


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,DISEASE,0.502661,0.601618,0.43166,6892


## BC5CDR

In [22]:
bc5_preds, bc5_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/DISTILBERT/model_checkpoint",
    dataset=bc5_token_ds,
    mapper_fn=map_maccrobat_to_bc5
)

evaluate_predictions(bc5_preds, bc5_gts, "DISTILBERT on BC5CDR")


DISTILBERT on BC5CDR:
Precision: 0.0164
Recall:    0.0160
F1-score:  0.0162


Unnamed: 0,Entity,F1,Precision,Recall,Support
1,DISEASE,0.016198,0.014478,0.018382,10173
0,CHEMICAL,0.016174,0.018921,0.014124,12815


Unnamed: 0,Entity,F1,Precision,Recall,Support
1,DISEASE,0.016198,0.014478,0.018382,10173
0,CHEMICAL,0.016174,0.018921,0.014124,12815


In [23]:
bc5_preds, bc5_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/MODERNBERT/model_checkpoint",
    dataset=bc5_token_ds,
    mapper_fn=map_maccrobat_to_bc5
)

evaluate_predictions(bc5_preds, bc5_gts, "MODERNBERT on BC5CDR")


MODERNBERT on BC5CDR:
Precision: 0.0154
Recall:    0.0320
F1-score:  0.0208


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,CHEMICAL,0.029271,0.020968,0.048459,12815
1,DISEASE,0.008083,0.006321,0.011206,10173


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,CHEMICAL,0.029271,0.020968,0.048459,12815
1,DISEASE,0.008083,0.006321,0.011206,10173


In [24]:
bc5_preds, bc5_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/BIOMEDBERT/model_checkpoint",
    dataset=bc5_token_ds,
    mapper_fn=map_maccrobat_to_bc5
)

evaluate_predictions(bc5_preds, bc5_gts, "BIOMEDBERT on BC5CDR")


BIOMEDBERT on BC5CDR:
Precision: 0.0169
Recall:    0.0096
F1-score:  0.0122


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,CHEMICAL,0.01249,0.020536,0.008974,12815
1,DISEASE,0.011932,0.014138,0.010321,10173


Unnamed: 0,Entity,F1,Precision,Recall,Support
0,CHEMICAL,0.01249,0.020536,0.008974,12815
1,DISEASE,0.011932,0.014138,0.010321,10173


In [25]:
bc5_preds, bc5_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/BIOFORMER/model_checkpoint",
    dataset=bc5_token_ds,
    mapper_fn=map_maccrobat_to_bc5
)

evaluate_predictions(bc5_preds, bc5_gts, "BIOFORMER on BC5CDR")


BIOFORMER on BC5CDR:
Precision: 0.0156
Recall:    0.0094
F1-score:  0.0118


Unnamed: 0,Entity,F1,Precision,Recall,Support
1,DISEASE,0.012065,0.013202,0.011108,10173
0,CHEMICAL,0.011475,0.019578,0.008115,12815


Unnamed: 0,Entity,F1,Precision,Recall,Support
1,DISEASE,0.012065,0.013202,0.011108,10173
0,CHEMICAL,0.011475,0.019578,0.008115,12815


In [26]:
bc5_preds, bc5_gts = run_model_on_dataset(
    model_path="/kaggle/working/ner_runs/BIOBERT/model_checkpoint",
    dataset=bc5_token_ds,
    mapper_fn=map_maccrobat_to_bc5
)

evaluate_predictions(bc5_preds, bc5_gts, "BIOBERT on BC5CDR")


BIOBERT on BC5CDR:
Precision: 0.0168
Recall:    0.0098
F1-score:  0.0124


Unnamed: 0,Entity,F1,Precision,Recall,Support
1,DISEASE,0.01347,0.015204,0.012091,10173
0,CHEMICAL,0.011354,0.019332,0.008037,12815


Unnamed: 0,Entity,F1,Precision,Recall,Support
1,DISEASE,0.01347,0.015204,0.012091,10173
0,CHEMICAL,0.011354,0.019332,0.008037,12815


## Ensemble

In [27]:
model_paths = [
    "/kaggle/working/ner_runs/DISTILBERT/model_checkpoint",
    "/kaggle/working/ner_runs/MODERNBERT/model_checkpoint",
    "/kaggle/working/ner_runs/BIOMEDBERT/model_checkpoint",
    "/kaggle/working/ner_runs/BIOFORMER/model_checkpoint",
    "/kaggle/working/ner_runs/BIOBERT/model_checkpoint",
]

ncbi_preds, ncbi_gts = run_ensemble(
    model_paths,
    ncbi_token_ds,
    mapper_fn=map_maccrobat_to_ncbi
)

evaluate_predictions(ncbi_preds, ncbi_gts, "Ensemble on NCBI")

/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [65,0,0], thread: [0,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [65,0,0], thread: [1,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [65,0,0], thread: [2,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [65,0,0], thread: [3,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [65,0,0], thread: [4,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: indexSelectLargeIndex: block: [65,0,0], thread: [5,0,0] Assertion `srcIndex < srcSelectDimSize` failed.
/pytorch/aten/src/ATen/native/cuda/Indexing.cu:1422: index

RuntimeError: CUDA error: device-side assert triggered
CUDA kernel errors might be asynchronously reported at some other API call, so the stacktrace below might be incorrect.
For debugging consider passing CUDA_LAUNCH_BLOCKING=1
Compile with `TORCH_USE_CUDA_DSA` to enable device-side assertions.


In [None]:
model_paths = [
    "/kaggle/working/ner_runs/DISTILBERT/model_checkpoint",
    "/kaggle/working/ner_runs/MODERNBERT/model_checkpoint",
    "/kaggle/working/ner_runs/BIOMEDBERT/model_checkpoint",
    "/kaggle/working/ner_runs/BIOFORMER/model_checkpoint",
    "/kaggle/working/ner_runs/BIOBERT/model_checkpoint",
]

bc5_preds, bc5_gts = run_ensemble(
    model_paths,
    bc5_token_ds,
    mapper_fn=map_maccrobat_to_bc5
)

evaluate_predictions(bc5_preds, bc5_gts, "Ensemble on BC5CDR")