In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [3]:
import os
import random
import numpy as np
import pandas as pd
from tqdm import tqdm

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

from transformers import (
    AutoTokenizer,
    AutoModelForSequenceClassification,
    get_scheduler
)

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import (
    accuracy_score,
    f1_score,
    classification_report
)

In [4]:
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [5]:
DATASET_MAP = {
    "46985": "/content/drive/MyDrive/WoS/original_dataset/46985_xydata_l1_l2.csv",
    "11967": "/content/drive/MyDrive/WoS/original_dataset/11967_xydata_l1_l2.csv",
    "5736": "/content/drive/MyDrive/WoS/original_dataset/5736_xydata_l1_l2.csv"
}

In [6]:
DATASET_KEY = "46985"

SAVE_DIR = "/content/drive/MyDrive/WoS/final_two_stage_pipeline"
os.makedirs(SAVE_DIR, exist_ok=True)

MAX_LEN = 256
BATCH_SIZE = 16
EPOCHS_STAGE1 = 20
EPOCHS_STAGE2 = 20
LR_LIST = [2e-5]
PATIENCE = 5

SCIBERT = "allenai/scibert_scivocab_uncased"
BIOBERT = "dmis-lab/biobert-base-cased-v1.2"

TRAIN_ALPHA = 0.7

In [7]:
df = pd.read_csv(DATASET_MAP[DATASET_KEY])
text_col = "X"

le_l1 = LabelEncoder()
le_l2 = LabelEncoder()

df["YL1_enc"] = le_l1.fit_transform(df["YL1"])
df["YL2_enc"] = le_l2.fit_transform(df["YL2"])

num_l1 = df["YL1_enc"].nunique()
num_l2 = df["YL2_enc"].nunique()

print("L1 classes:", num_l1)
print("L2 classes:", num_l2)

df["_strat"] = df["YL1_enc"].astype(str) + "_" + df["YL2_enc"].astype(str)

train_df, test_df = train_test_split(
    df, test_size=0.2, random_state=SEED, stratify=df["_strat"]
)
val_df, _ = train_test_split(
    test_df, test_size=0.5, random_state=SEED, stratify=test_df["_strat"]
)

print("Train:", len(train_df), "Val:", len(val_df), "Test:", len(test_df))

L1 classes: 7
L2 classes: 53
Train: 37588 Val: 4698 Test: 9397


In [8]:
class DualEncoderDataset(Dataset):
    def __init__(self, texts, labels, tok1, tok2):
        self.enc1 = tok1(
            list(texts),
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        self.enc2 = tok2(
            list(texts),
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        self.labels = torch.tensor(labels.to_numpy(), dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids_1": self.enc1["input_ids"][idx],
            "attention_mask_1": self.enc1["attention_mask"][idx],
            "input_ids_2": self.enc2["input_ids"][idx],
            "attention_mask_2": self.enc2["attention_mask"][idx],
            "labels": self.labels[idx]
        }


In [9]:
print("\n STAGE 1: YL1 ")

tok_sci = AutoTokenizer.from_pretrained(SCIBERT)
tok_bio = AutoTokenizer.from_pretrained(BIOBERT)

train_ds = DualEncoderDataset(
    train_df[text_col], train_df["YL1_enc"], tok_sci, tok_bio
)
val_ds = DualEncoderDataset(
    val_df[text_col], val_df["YL1_enc"], tok_sci, tok_bio
)
test_ds = DualEncoderDataset(
    test_df[text_col], test_df["YL1_enc"], tok_sci, tok_bio
)

train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=BATCH_SIZE)
test_loader = DataLoader(test_ds, batch_size=BATCH_SIZE)




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

In [None]:

best_stage1_f1 = 0.0
best_alpha = TRAIN_ALPHA

for lr in LR_LIST:
    print(f"\n--- Training Stage-1 with LR={lr} ---")

    sci = AutoModelForSequenceClassification.from_pretrained(SCIBERT, num_labels=num_l1).to(device)
    bio = AutoModelForSequenceClassification.from_pretrained(BIOBERT, num_labels=num_l1).to(device)

    opt_sci = AdamW(sci.parameters(), lr=lr)
    opt_bio = AdamW(bio.parameters(), lr=lr)

    scheduler_sci = get_scheduler("linear", opt_sci, num_warmup_steps=0, num_training_steps=EPOCHS_STAGE1*len(train_loader))
    scheduler_bio = get_scheduler("linear", opt_bio, num_warmup_steps=0, num_training_steps=EPOCHS_STAGE1*len(train_loader))

    patience_ctr = 0
    best_local_f1 = 0.0

    for epoch in range(EPOCHS_STAGE1):
        sci.train()
        bio.train()

        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")
        for batch in loop:
            batch = {k: v.to(device) for k, v in batch.items()}

            #  Forward
            loss_sci = sci(
                input_ids=batch["input_ids_1"],
                attention_mask=batch["attention_mask_1"],
                labels=batch["labels"]
            ).loss

            loss_bio = bio(
                input_ids=batch["input_ids_2"],
                attention_mask=batch["attention_mask_2"],
                labels=batch["labels"]
            ).loss

            #  Combined loss
            loss = TRAIN_ALPHA * loss_sci + (1 - TRAIN_ALPHA) * loss_bio

            #  Backward
            opt_sci.zero_grad()
            opt_bio.zero_grad()
            loss.backward()
            opt_sci.step()
            opt_bio.step()
            scheduler_sci.step()
            scheduler_bio.step()

            loop.set_postfix(loss=loss.item())

        #  Validation
        sci.eval()
        bio.eval()

        logits_sci, logits_bio, labels_list = [], [], []
        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                logits_sci.append(
                    sci(input_ids=batch["input_ids_1"], attention_mask=batch["attention_mask_1"]).logits.cpu().numpy()
                )
                logits_bio.append(
                    bio(input_ids=batch["input_ids_2"], attention_mask=batch["attention_mask_2"]).logits.cpu().numpy()
                )
                labels_list.append(batch["labels"].cpu().numpy())

        logits_sci = np.vstack(logits_sci)
        logits_bio = np.vstack(logits_bio)
        labels_val = np.concatenate(labels_list)

        # Adaptive alpha search
        best_epoch_f1 = 0.0
        best_epoch_alpha = TRAIN_ALPHA
        best_epoch_preds = None
        for a in [0.3, 0.5, 0.7]:
            preds = (a * logits_sci + (1 - a) * logits_bio).argmax(axis=1)
            f1 = f1_score(labels_val, preds, average="micro")
            if round(f1,2) > round(best_epoch_f1,2):
                best_epoch_f1 = f1
                best_epoch_alpha = a
                best_epoch_preds = preds

        acc = accuracy_score(labels_val, best_epoch_preds)
        print(f"Epoch {epoch+1} | Val Acc={acc:.4f} | F1={best_epoch_f1:.4f} (alpha={best_epoch_alpha})")

        #  Save local best
        if round(best_epoch_f1,2) > round(best_local_f1,2):
            best_local_f1 = best_epoch_f1
            patience_ctr = 0
            torch.save(
                {"sci": sci.state_dict(),
                 "bio": bio.state_dict(),
                 "alpha": best_epoch_alpha},
                f"{SAVE_DIR}/stage1_best.pt"
            )
        else:
            patience_ctr += 1
            if patience_ctr >= PATIENCE:
                print("Early stopping triggered.")
                break

        #  Update global best
        if round(best_epoch_f1,2) > round(best_stage1_f1,2):
            best_stage1_f1 = best_epoch_f1
            best_alpha = best_epoch_alpha
            torch.save(
                {"sci": sci.state_dict(),
                 "bio": bio.state_dict(),
                 "alpha": best_alpha},
                f"{SAVE_DIR}/stage1_GLOBAL_best.pt"
            )

print(f"\n Stage-1 Training Complete ")
print(f"Best Stage-1 F1: {best_stage1_f1:.4f} with alpha={best_alpha}")



--- Training Stage-1 with LR=2e-05 ---


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Epoch 1: 100%|██████████| 2350/2350 [11:44<00:00,  3.33it/s, loss=0.205]


Epoch 1 | Val Acc=0.8868 | F1=0.8868 (alpha=0.3)


Epoch 2: 100%|██████████| 2350/2350 [11:43<00:00,  3.34it/s, loss=0.00915]


Epoch 2 | Val Acc=0.9068 | F1=0.9068 (alpha=0.5)


Epoch 3: 100%|██████████| 2350/2350 [11:44<00:00,  3.34it/s, loss=0.0149]


Epoch 3 | Val Acc=0.9072 | F1=0.9072 (alpha=0.5)


Epoch 4: 100%|██████████| 2350/2350 [11:44<00:00,  3.34it/s, loss=0.0265]


Epoch 4 | Val Acc=0.9014 | F1=0.9014 (alpha=0.3)


Epoch 5: 100%|██████████| 2350/2350 [11:44<00:00,  3.33it/s, loss=0.0308]


Epoch 5 | Val Acc=0.8989 | F1=0.8989 (alpha=0.3)


Epoch 6: 100%|██████████| 2350/2350 [11:44<00:00,  3.33it/s, loss=0.00123]


Epoch 6 | Val Acc=0.8993 | F1=0.8993 (alpha=0.3)


Epoch 7: 100%|██████████| 2350/2350 [11:44<00:00,  3.33it/s, loss=0.00352]


Epoch 7 | Val Acc=0.9059 | F1=0.9059 (alpha=0.5)
Early stopping triggered.

=== Stage-1 Training Complete ===
Best Stage-1 F1: 0.9068 with alpha=0.5


In [10]:
print("\n STAGE 2: YL2 ")

class SingleEncoderDataset(Dataset):
    def __init__(self, texts, labels, tokenizer):
        self.enc = tokenizer(
            list(texts),
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.enc["input_ids"][idx],
            "attention_mask": self.enc["attention_mask"][idx],
            "labels": self.labels[idx]
        }

children = {
    p: sorted(df[df["YL1_enc"] == p]["YL2_enc"].unique())
    for p in range(num_l1)
}

tok = AutoTokenizer.from_pretrained(SCIBERT)

for parent, child_list in children.items():
    if len(child_list) <= 1:
        continue

    print(f"\nTraining YL2 for parent {parent} ({len(child_list)} classes)")

    sub = train_df[train_df["YL1_enc"] == parent]
    val_sub = val_df[val_df["YL1_enc"] == parent]

    # Local label encoding
    le_local = {g: i for i, g in enumerate(child_list)}
    sub_labels = sub["YL2_enc"].map(le_local)
    val_labels = val_sub["YL2_enc"].map(le_local)

    train_ds = SingleEncoderDataset(
        sub[text_col], sub_labels, tok
    )
    val_ds = SingleEncoderDataset(
        val_sub[text_col], val_labels, tok
    )

    train_loader = DataLoader(
        train_ds, batch_size=BATCH_SIZE, shuffle=True
    )
    val_loader = DataLoader(
        val_ds, batch_size=BATCH_SIZE
    )

    model = AutoModelForSequenceClassification.from_pretrained(
        SCIBERT, num_labels=len(child_list)
    ).to(device)

    optimizer = AdamW(model.parameters(), lr=2e-5)

    best_acc = 0.0

    for epoch in range(EPOCHS_STAGE2):
        #  TRAIN
        model.train()
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            loss = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"],
                labels=batch["labels"]
            ).loss

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        #  VALIDATE
        model.eval()
        preds, labels = [], []

        with torch.no_grad():
            for batch in val_loader:
                batch = {k: v.to(device) for k, v in batch.items()}
                logits = model(
                    input_ids=batch["input_ids"],
                    attention_mask=batch["attention_mask"]
                ).logits

                preds.extend(logits.argmax(dim=1).cpu().numpy())
                labels.extend(batch["labels"].cpu().numpy())

        acc = accuracy_score(labels, preds)
        print(f"Epoch {epoch+1} | Val Acc={acc:.4f}")

        if round(acc, 2) > round(best_acc, 2):
            best_acc = acc
            torch.save(
                model.state_dict(),
                f"{SAVE_DIR}/stage2_parent_{parent}.pt"
            )

print("\n STAGE 2 TRAINING COMPLETE ")




Training YL2 for parent 0 (17 classes)


pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

Epoch 1 | Val Acc=0.8657
Epoch 2 | Val Acc=0.9105
Epoch 3 | Val Acc=0.9059
Epoch 4 | Val Acc=0.9012
Epoch 5 | Val Acc=0.8935
Epoch 6 | Val Acc=0.9074
Epoch 7 | Val Acc=0.8997
Epoch 8 | Val Acc=0.8920
Epoch 9 | Val Acc=0.8935
Epoch 10 | Val Acc=0.8889
Epoch 11 | Val Acc=0.8889
Epoch 12 | Val Acc=0.8781
Epoch 13 | Val Acc=0.9012
Epoch 14 | Val Acc=0.8704
Epoch 15 | Val Acc=0.8935
Epoch 16 | Val Acc=0.8873
Epoch 17 | Val Acc=0.8796
Epoch 18 | Val Acc=0.8827
Epoch 19 | Val Acc=0.8966
Epoch 20 | Val Acc=0.8904

Training YL2 for parent 1 (15 classes)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Val Acc=0.8816
Epoch 2 | Val Acc=0.9089
Epoch 3 | Val Acc=0.9089
Epoch 4 | Val Acc=0.9071
Epoch 5 | Val Acc=0.9126
Epoch 6 | Val Acc=0.8925
Epoch 7 | Val Acc=0.9016
Epoch 8 | Val Acc=0.9199
Epoch 9 | Val Acc=0.9144
Epoch 10 | Val Acc=0.9144
Epoch 11 | Val Acc=0.9271
Epoch 12 | Val Acc=0.9199
Epoch 13 | Val Acc=0.9089
Epoch 14 | Val Acc=0.9199
Epoch 15 | Val Acc=0.9107
Epoch 16 | Val Acc=0.9089
Epoch 17 | Val Acc=0.9144
Epoch 18 | Val Acc=0.9180
Epoch 19 | Val Acc=0.9089
Epoch 20 | Val Acc=0.9180

Training YL2 for parent 2 (19 classes)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Val Acc=0.7700
Epoch 2 | Val Acc=0.8219
Epoch 3 | Val Acc=0.8177
Epoch 4 | Val Acc=0.8036
Epoch 5 | Val Acc=0.8219
Epoch 6 | Val Acc=0.8050
Epoch 7 | Val Acc=0.8022
Epoch 8 | Val Acc=0.8135
Epoch 9 | Val Acc=0.8050
Epoch 10 | Val Acc=0.8135
Epoch 11 | Val Acc=0.8163
Epoch 12 | Val Acc=0.8079
Epoch 13 | Val Acc=0.8121
Epoch 14 | Val Acc=0.8317
Epoch 15 | Val Acc=0.8247
Epoch 16 | Val Acc=0.8219
Epoch 17 | Val Acc=0.8008
Epoch 18 | Val Acc=0.8008
Epoch 19 | Val Acc=0.8093
Epoch 20 | Val Acc=0.8163

Training YL2 for parent 3 (9 classes)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Val Acc=0.9058
Epoch 2 | Val Acc=0.9240
Epoch 3 | Val Acc=0.9271
Epoch 4 | Val Acc=0.9179
Epoch 5 | Val Acc=0.9362
Epoch 6 | Val Acc=0.9088
Epoch 7 | Val Acc=0.9149
Epoch 8 | Val Acc=0.9392
Epoch 9 | Val Acc=0.9362
Epoch 10 | Val Acc=0.9271
Epoch 11 | Val Acc=0.9301
Epoch 12 | Val Acc=0.9362
Epoch 13 | Val Acc=0.9392
Epoch 14 | Val Acc=0.9362
Epoch 15 | Val Acc=0.9362
Epoch 16 | Val Acc=0.9362
Epoch 17 | Val Acc=0.9362
Epoch 18 | Val Acc=0.9362
Epoch 19 | Val Acc=0.9362
Epoch 20 | Val Acc=0.9149

Training YL2 for parent 4 (11 classes)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Val Acc=0.9408
Epoch 2 | Val Acc=0.9597
Epoch 3 | Val Acc=0.9479
Epoch 4 | Val Acc=0.9408
Epoch 5 | Val Acc=0.9360
Epoch 6 | Val Acc=0.9336
Epoch 7 | Val Acc=0.9479
Epoch 8 | Val Acc=0.9479
Epoch 9 | Val Acc=0.9408
Epoch 10 | Val Acc=0.9479
Epoch 11 | Val Acc=0.9479
Epoch 12 | Val Acc=0.9455
Epoch 13 | Val Acc=0.9455
Epoch 14 | Val Acc=0.9171
Epoch 15 | Val Acc=0.9336
Epoch 16 | Val Acc=0.9431
Epoch 17 | Val Acc=0.9313
Epoch 18 | Val Acc=0.9431
Epoch 19 | Val Acc=0.9360
Epoch 20 | Val Acc=0.9336

Training YL2 for parent 5 (53 classes)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Val Acc=0.7886
Epoch 2 | Val Acc=0.8001
Epoch 3 | Val Acc=0.7981
Epoch 4 | Val Acc=0.8158
Epoch 5 | Val Acc=0.8103
Epoch 6 | Val Acc=0.7933
Epoch 7 | Val Acc=0.7913
Epoch 8 | Val Acc=0.7859
Epoch 9 | Val Acc=0.7879
Epoch 10 | Val Acc=0.7865
Epoch 11 | Val Acc=0.7811
Epoch 12 | Val Acc=0.7804
Epoch 13 | Val Acc=0.7920
Epoch 14 | Val Acc=0.7831
Epoch 15 | Val Acc=0.7852
Epoch 16 | Val Acc=0.7940
Epoch 17 | Val Acc=0.7893
Epoch 18 | Val Acc=0.7723
Epoch 19 | Val Acc=0.7797
Epoch 20 | Val Acc=0.7913

Training YL2 for parent 6 (9 classes)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1 | Val Acc=0.8587
Epoch 2 | Val Acc=0.8693
Epoch 3 | Val Acc=0.8657
Epoch 4 | Val Acc=0.8922
Epoch 5 | Val Acc=0.8657
Epoch 6 | Val Acc=0.8728
Epoch 7 | Val Acc=0.8746
Epoch 8 | Val Acc=0.8675
Epoch 9 | Val Acc=0.8763
Epoch 10 | Val Acc=0.8622
Epoch 11 | Val Acc=0.8657
Epoch 12 | Val Acc=0.8763
Epoch 13 | Val Acc=0.8763
Epoch 14 | Val Acc=0.8604
Epoch 15 | Val Acc=0.8622
Epoch 16 | Val Acc=0.8799
Epoch 17 | Val Acc=0.8640
Epoch 18 | Val Acc=0.8799
Epoch 19 | Val Acc=0.8728
Epoch 20 | Val Acc=0.8816



In [10]:
print("\n STAGE 1 TEST REPORT (YL1) ")

# load the saved
checkpoint = torch.load(f"{SAVE_DIR}/stage1_best.pt", map_location=device)

# load models
sci = AutoModelForSequenceClassification.from_pretrained(
    SCIBERT, num_labels=num_l1
).to(device)
bio = AutoModelForSequenceClassification.from_pretrained(
    BIOBERT, num_labels=num_l1
).to(device)

sci.load_state_dict(checkpoint["sci"])
bio.load_state_dict(checkpoint["bio"])
alpha = checkpoint["alpha"]

sci.eval()
bio.eval()

all_preds, all_labels = [], []

with torch.no_grad():
    for batch in test_loader:
        batch = {k: v.to(device) for k, v in batch.items()}

        logits_sci = sci(
            input_ids=batch["input_ids_1"],
            attention_mask=batch["attention_mask_1"]
        ).logits

        logits_bio = bio(
            input_ids=batch["input_ids_2"],
            attention_mask=batch["attention_mask_2"]
        ).logits

        # adaptive ensemble
        logits = alpha * logits_sci + (1 - alpha) * logits_bio
        preds = logits.argmax(dim=1)

        all_preds.extend(preds.cpu().numpy())
        all_labels.extend(batch["labels"].cpu().numpy())

# metrics
acc = accuracy_score(all_labels, all_preds)
print("Stage-1 Accuracy:", acc)

target_names = [str(c) for c in le_l1.classes_]

report_l1 = classification_report(
    all_labels,
    all_preds,
    target_names=target_names,
    digits=4
)

print(report_l1)

# classification report
with open(f"{SAVE_DIR}/stage1_test_report.txt", "w") as f:
    f.write(report_l1)





pytorch_model.bin:   0%|          | 0.00/442M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/442M [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Stage-1 Accuracy: 0.9109290198999681
              precision    recall  f1-score   support

           0     0.9408    0.9271    0.9339      1303
           1     0.9334    0.9572    0.9451      1098
           2     0.8925    0.8662    0.8792      1428
           3     0.9220    0.9164    0.9192       658
           4     0.9392    0.9093    0.9240       849
           5     0.8857    0.9326    0.9086      2925
           6     0.9196    0.8460    0.8812      1136

    accuracy                         0.9109      9397
   macro avg     0.9190    0.9078    0.9130      9397
weighted avg     0.9114    0.9109    0.9107      9397



In [13]:
print("\n STAGE 2 TEST REPORT (YL2) ")

# parent -> child mapping
children = {
    p: sorted(train_df[train_df["YL1_enc"] == p]["YL2_enc"].unique())
    for p in sorted(train_df["YL1_enc"].unique())
}

y_true_l2 = []
y_pred_l2 = []

tok = AutoTokenizer.from_pretrained(SCIBERT)

class SingleEncoderDataset(Dataset):
    def __init__(self, texts, labels):
        self.enc = tok(
            list(texts),
            padding=True,
            truncation=True,
            max_length=MAX_LEN,
            return_tensors="pt"
        )
        self.labels = torch.tensor(labels.values, dtype=torch.long)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        return {
            "input_ids": self.enc["input_ids"][idx],
            "attention_mask": self.enc["attention_mask"][idx],
            "labels": self.labels[idx]
        }

for parent, child_list in children.items():
    if len(child_list) <= 1:
        continue

    model_path = f"{SAVE_DIR}/stage2_parent_{parent}.pt"
    if not os.path.exists(model_path):
        continue

    model = AutoModelForSequenceClassification.from_pretrained(
        SCIBERT, num_labels=len(child_list)
    ).to(device)
    model.load_state_dict(torch.load(model_path, map_location=device))
    model.eval()

    subset = test_df[test_df["YL1_enc"] == parent]
    if subset.empty:
        continue

    # global <-> local mapping
    label_map = {g: i for i, g in enumerate(child_list)}
    inv_map = {i: g for g, i in label_map.items()}

    test_ds = SingleEncoderDataset(
        subset[text_col],
        subset["YL2_enc"].map(label_map)
    )
    test_loader_local = DataLoader(test_ds, batch_size=BATCH_SIZE)

    with torch.no_grad():
        for batch in test_loader_local:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(
                input_ids=batch["input_ids"],
                attention_mask=batch["attention_mask"]
            ).logits

            preds_local = logits.argmax(dim=1).cpu().numpy()
            labels_local = batch["labels"].cpu().numpy()

            y_pred_l2.extend([inv_map[p] for p in preds_local])
            y_true_l2.extend([inv_map[t] for t in labels_local])

#  metrics
acc_l2 = accuracy_score(y_true_l2, y_pred_l2)
print("Stage-2 Accuracy:", acc_l2)

report_l2 = classification_report(
    y_true_l2,
    y_pred_l2,
    digits=4
)

print(report_l2)

with open(f"{SAVE_DIR}/stage2_test_report.txt", "w") as f:
    f.write(report_l2)





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the 

Stage-2 Accuracy: 0.8630413961902735
              precision    recall  f1-score   support

           0     0.8668    0.8486    0.8576       621
           1     0.8613    0.8646    0.8629       517
           2     0.9061    0.8891    0.8975       586
           3     0.8685    0.9043    0.8860       533
           4     0.8657    0.8126    0.8383       571
           5     0.9104    0.8912    0.9007       570
           6     0.8287    0.8821    0.8546       543
           7     0.8739    0.8341    0.8536       615
           8     0.8911    0.8930    0.8920       458
           9     0.9151    0.8920    0.9034       435
          10     0.8617    0.8351    0.8482       388
          11     0.8333    0.8293    0.8313       205
          12     0.8967    0.9205    0.9084       264
          13     0.9114    0.9392    0.9251       263
          14     0.8641    0.8930    0.8783       299
          15     0.8349    0.8590    0.8468       312
          16     0.8835    0.8667    0.8750 

In [26]:
print("\n HIERARCHICAL TEST REPORT ")

# load stage-1 checkpoint
ckpt = torch.load(f"{SAVE_DIR}/stage1_best.pt", map_location="cpu")

sci = AutoModelForSequenceClassification.from_pretrained(
    SCIBERT, num_labels=num_l1
)
bio = AutoModelForSequenceClassification.from_pretrained(
    BIOBERT, num_labels=num_l1
)

sci.load_state_dict(ckpt["sci"])
bio.load_state_dict(ckpt["bio"])
alpha = ckpt["alpha"]

# models to device
sci.to(device).eval()
bio.to(device).eval()


sci_tok = AutoTokenizer.from_pretrained(SCIBERT)
bio_tok = AutoTokenizer.from_pretrained(BIOBERT)
tok = AutoTokenizer.from_pretrained(SCIBERT)  # stage-2 tokenizer

# prepare stage-2 models
stage2_models = {}
for parent, child_list in children.items():
    if len(child_list) <= 1:
        continue

    path = f"{SAVE_DIR}/stage2_parent_{parent}.pt"
    if not os.path.exists(path):
        continue

    m = AutoModelForSequenceClassification.from_pretrained(
        SCIBERT, num_labels=len(child_list)
    )
    m.load_state_dict(torch.load(path, map_location="cpu"))
    m.to(device).eval()

    stage2_models[parent] = {
        "model": m,
        "inv_map": {i: g for i, g in enumerate(child_list)}
    }

#  Inference
y_true_l1, y_pred_l1 = [], []
y_true_l2, y_pred_l2 = [], []

with torch.no_grad():
    for idx, row in test_df.iterrows():
        text = row[text_col]
        true_l1 = row["YL1_enc"]
        true_l2 = row["YL2_enc"]

        # stage-1 tokenization (individual for each model)
        enc_sci = sci_tok(
            text, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt"
        )
        enc_bio = bio_tok(
            text, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt"
        )

        enc_sci = {k: v.to(device) for k, v in enc_sci.items()}
        enc_bio = {k: v.to(device) for k, v in enc_bio.items()}

        # stage-1 inference
        logits_sci = sci(**enc_sci).logits
        logits_bio = bio(**enc_bio).logits
        logits_l1 = alpha * logits_sci + (1 - alpha) * logits_bio
        pred_l1 = logits_l1.argmax(dim=1).item()

        # stage-2 inference
        if pred_l1 in stage2_models:
            m = stage2_models[pred_l1]["model"]
            inv_map = stage2_models[pred_l1]["inv_map"]

            enc2 = tok(
                text, padding=True, truncation=True, max_length=MAX_LEN, return_tensors="pt"
            )
            enc2 = {k: v.to(device) for k, v in enc2.items()}

            logits_l2 = m(**enc2).logits
            pred_local = logits_l2.argmax(dim=1).item()
            pred_l2 = inv_map[pred_local]
        else:
            # for single-child parents
            pred_l2 = true_l2

        #  Store
        y_true_l1.append(true_l1)
        y_pred_l1.append(pred_l1)
        y_true_l2.append(true_l2)
        y_pred_l2.append(pred_l2)



# hierarchical accuracy : both levels correct
hier_acc = ((np.array(y_true_l1) == np.array(y_pred_l1)) &
            (np.array(y_true_l2) == np.array(y_pred_l2))).mean()
print(f"\nHierarchical Accuracy: {hier_acc:.4f}")

# stage-1 accuracy
acc_l1 = accuracy_score(y_true_l1, y_pred_l1)
print(f"YL1 Accuracy: {acc_l1:.4f}")

# stage-2 accuracy: only where YL1 is correct
mask = np.array(y_true_l1) == np.array(y_pred_l1)
acc_l2_cond = accuracy_score(np.array(y_true_l2)[mask], np.array(y_pred_l2)[mask])
print(f"YL2 Accuracy | YL1 Correct: {acc_l2_cond:.4f}")

# classification reports
report_l1 = classification_report(y_true_l1, y_pred_l1, digits=4)
report_l2 = classification_report(np.array(y_true_l2)[mask],
                                  np.array(y_pred_l2)[mask], digits=4)

print("\nStage-1 Classification Report:\n", report_l1)
print("\nStage-2 Classification Report (Conditional on YL1 Correct):\n", report_l2)





Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at allenai/scibert_scivocab_uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForSequenceClassification were not initialized from the 


Hierarchical Accuracy: 0.8040
YL1 Accuracy: 0.9109
YL2 Accuracy | YL1 Correct: 0.8826

Stage-1 Classification Report:
               precision    recall  f1-score   support

           0     0.9408    0.9271    0.9339      1303
           1     0.9334    0.9572    0.9451      1098
           2     0.8925    0.8662    0.8792      1428
           3     0.9220    0.9164    0.9192       658
           4     0.9392    0.9093    0.9240       849
           5     0.8857    0.9326    0.9086      2925
           6     0.9196    0.8460    0.8812      1136

    accuracy                         0.9109      9397
   macro avg     0.9190    0.9078    0.9130      9397
weighted avg     0.9114    0.9109    0.9107      9397


Stage-2 Classification Report (Conditional on YL1 Correct):
               precision    recall  f1-score   support

           0     0.8917    0.8713    0.8814       567
           1     0.8922    0.8961    0.8942       462
           2     0.9240    0.9017    0.9127       539
    