In [None]:
# Mount Google Drive at /content/drive; the model archive, the input CSV and the
# output directory used by later cells all live under this mount.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# !pip install scispacy
!pip install /content/drive/MyDrive/eq_5d/scispacy/en_core_sci_sm-0.5.4.tar.gz

Processing ./drive/MyDrive/eq_5d/scispacy/en_core_sci_sm-0.5.4.tar.gz
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting spacy<3.8.0,>=3.7.4 (from en_core_sci_sm==0.5.4)
  Downloading spacy-3.7.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (27 kB)
Collecting thinc<8.3.0,>=8.2.2 (from spacy<3.8.0,>=3.7.4->en_core_sci_sm==0.5.4)
  Downloading thinc-8.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (15 kB)
Collecting blis<0.8.0,>=0.7.8 (from thinc<8.3.0,>=8.2.2->spacy<3.8.0,>=3.7.4->en_core_sci_sm==0.5.4)
  Downloading blis-0.7.11-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.4 kB)
Collecting numpy>=1.19.0 (from spacy<3.8.0,>=3.7.4->en_core_sci_sm==0.5.4)
  Downloading numpy-1.26.4-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m5.8 MB/s[0m eta [36m0:00:00[0m
Downloading spacy-3.7.5-cp311-

In [None]:

import os
import re
import random
import numpy as np
import pandas as pd
import spacy
from tqdm import tqdm
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, f1_score

import torch
from torch.utils.data import DataLoader, TensorDataset, RandomSampler, SequentialSampler
from torch.optim import AdamW

from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup



In [None]:
# --- Data -------------------------------------------------------------------
# NOTE(review): DATA_CSV and OUTPUT_DIR are relative paths, while Drive is mounted at the
# absolute path /content/drive; they only resolve when the working directory is /content.
DATA_CSV = "drive/MyDrive/eq_5d/eq-5d-200-records.csv"
TEXT_COL = "Abstract"   # column holding the text that is split into sentences
LABEL_COL = "Label"     # column holding the integer class label
ID_COL = "No"           # column identifying the record; used to aggregate sentence predictions

# --- Model / training -------------------------------------------------------
MODEL_NAME = "bert-base-uncased"
MAX_LEN = 256           # every encoded sentence is padded/truncated to this many tokens
BATCH_SIZE = 16
EPOCHS = 20             # upper bound on epochs per learning rate
EARLY_STOP = 5          # stop after this many consecutive epochs without a new best validation score
LEARNING_RATES = [2e-5, 5e-6, 2e-6, 1e-6]  # each value is trained from scratch; the best is kept
SEED = 42
OUTPUT_DIR = "drive/MyDrive/eq_5d/bert_optimized_sm/"
# Create the output directory up front so later saves (checkpoint, prediction CSVs) cannot fail on it.
os.makedirs(OUTPUT_DIR, exist_ok=True)



In [None]:

def set_seed(seed=42):
    """Seed every random number generator this notebook relies on.

    Seeds, in order: Python's ``random``, NumPy's legacy global generator,
    PyTorch's default generator and the generators of all CUDA devices.

    Args:
        seed: Integer seed applied to all generators.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(seed)



In [None]:
# Seed all RNGs before any data split or model initialisation happens.
set_seed(SEED)
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")




Using device: cuda


In [None]:
# Load the en_core_sci_sm pipeline installed from the Drive archive; it is used for both
# sentence splitting and entity extraction in the cells below.
nlp = spacy.load("en_core_sci_sm")



  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [None]:
def enrich_sentence(sent_text, nlp):
    """Append the entities found in a sentence to the sentence itself.

    The sentence is run through ``nlp``; every entity with non-blank text is
    rendered as ``text|label``. Duplicates are dropped (first occurrence wins)
    and at most 30 entries are kept.

    Args:
        sent_text: The sentence to enrich.
        nlp: Callable returning an object with an ``ents`` iterable whose
            items expose ``text`` and ``label_``.

    Returns:
        ``sent_text`` followed by `` [ENTS: a|X; b|Y]`` when at least one
        entity was found, otherwise ``sent_text`` unchanged.
    """
    # A dict keeps insertion order, which gives order-preserving de-duplication.
    tagged = {}
    for entity in nlp(sent_text).ents:
        surface = entity.text.strip()
        if not surface:
            continue
        tagged.setdefault(f"{surface}|{entity.label_}", None)

    if not tagged:
        return sent_text

    kept = list(tagged)[:30]
    return sent_text + " [ENTS: " + "; ".join(kept) + "]"



In [None]:
def split_and_enrich(df, nlp, text_col, id_col, label_col):
    """Explode each record into one row per sentence, with entity enrichment.

    Every non-empty text in ``df[text_col]`` is segmented with ``nlp``; each
    non-blank sentence becomes one output row that inherits the record's id
    and label.

    Args:
        df: Input frame with at least ``text_col``, ``id_col`` and ``label_col``.
        nlp: spaCy pipeline used for sentence splitting (and passed on to
            ``enrich_sentence`` for entity extraction).
        text_col: Name of the column holding the text.
        id_col: Name of the record-id column; copied to every sentence row.
        label_col: Name of the label column; copied (as ``int``) to every
            sentence row under the same name.

    Returns:
        DataFrame with columns ``[id_col, "Sentence", "Enriched", label_col]``.
        The columns are present even when no sentence was produced.
    """
    rows = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        text = str(row[text_col]) if pd.notna(row[text_col]) else ""
        if not text:
            continue
        doc = nlp(text)
        for sent in doc.sents:
            s = sent.text.strip()
            if s:
                rows.append({
                    id_col: row[id_col],
                    "Sentence": s,
                    "Enriched": enrich_sentence(s, nlp),
                    # Keyed by `label_col` (not a hard-coded "Label") so that callers
                    # can look the label up under the same name they passed in.
                    label_col: int(row[label_col]),
                })
    # Explicit columns keep the schema stable when `rows` is empty.
    return pd.DataFrame(rows, columns=[id_col, "Sentence", "Enriched", label_col])




In [None]:
# Tokenizer matching MODEL_NAME; shared by training, validation and test encoding.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [None]:
def encode_dataset(df, tokenizer, text_col, label_col, max_len):
    """Tokenize a text column and bundle it with its labels.

    Args:
        df: Frame containing ``text_col`` and ``label_col``.
        tokenizer: Callable tokenizer returning a mapping with ``input_ids``
            and ``attention_mask`` tensors.
        text_col: Name of the column with the texts to encode.
        label_col: Name of the column with the labels.
        max_len: Length every sequence is padded or truncated to.

    Returns:
        ``TensorDataset`` of ``(input_ids, attention_mask, labels)``.
    """
    texts = df[text_col].tolist()
    encoded = tokenizer(
        texts,
        max_length=max_len,
        padding="max_length",
        truncation=True,
        return_tensors="pt",
    )
    targets = torch.tensor(df[label_col].values)
    return TensorDataset(encoded["input_ids"], encoded["attention_mask"], targets)


In [None]:

def train_eval(lr, train_loader, val_loader, device):
    """Fine-tune a fresh classifier at one learning rate and keep its best epoch.

    A new ``MODEL_NAME`` sequence classifier (2 labels) is trained for up to
    ``EPOCHS`` epochs with AdamW and a linear schedule (10% warm-up). After
    every epoch the model is scored on ``val_loader``; training stops early
    once ``EARLY_STOP`` consecutive epochs fail to beat the best score.

    Args:
        lr: Learning rate for AdamW.
        train_loader: DataLoader yielding ``(input_ids, attention_mask, labels)``.
        val_loader: DataLoader with the same batch layout, used for selection.
        device: Device the model and batches are moved to.

    Returns:
        Tuple ``(best_state, best_f1)``: a CPU copy of the ``state_dict`` from
        the best-scoring epoch and that epoch's validation score.
    """
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2).to(device)
    optimizer = AdamW(model.parameters(), lr=lr, eps=1e-8)
    # The schedule is laid out over the full epoch budget, even if early stopping cuts it short.
    total_steps = len(train_loader) * EPOCHS
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=int(0.1 * total_steps),
        num_training_steps=total_steps,
    )

    best_f1, best_state = -1, None
    patience = 0

    for epoch in range(EPOCHS):
        # Training
        model.train()
        total_loss = 0
        for batch in train_loader:
            input_ids, attn_mask, labels = [t.to(device) for t in batch]
            model.zero_grad()
            outputs = model(input_ids, attention_mask=attn_mask, labels=labels)
            loss = outputs.loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        # Validation
        model.eval()
        preds, gold = [], []
        with torch.no_grad():
            for batch in val_loader:
                input_ids, attn_mask, labels = [t.to(device) for t in batch]
                logits = model(input_ids, attention_mask=attn_mask).logits
                preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
                gold.extend(labels.cpu().numpy())
        # NOTE(review): micro-averaged F1 over both classes of a single-label task is
        # numerically the same as accuracy; consider average="macro" if the minority
        # class matters for model selection.
        f1 = f1_score(gold, preds, average="micro")
        print(f"LR={lr:.1e} | Epoch {epoch+1} | Loss={total_loss/len(train_loader):.4f} | Val F1={f1:.4f}")

        if f1 > best_f1:
            best_f1 = f1
            # Take a real copy: `.cpu()` is a no-op on tensors that already live on the
            # CPU, so without `copy=True` the snapshot would alias the live parameters
            # and be silently overwritten by later epochs when training on CPU.
            best_state = {
                k: v.detach().to("cpu", copy=True)
                for k, v in model.state_dict().items()
            }
            patience = 0
        else:
            patience += 1
            if patience >= EARLY_STOP:
                print("Early stopping.")
                break
    return best_state, best_f1




In [None]:
def predict_and_evaluate(model, df, tokenizer, text_col, label_col, id_col, device, output_prefix):
    """Score sentences, aggregate them per record, and report/save both levels.

    Sentence level: every row of ``df`` is classified; the predicted class and
    the softmax probability of each class are written back into ``df`` as
    ``Pred``, ``Conf_Label_0`` and ``Conf_Label_1`` (``df`` is modified in
    place) and saved to ``{output_prefix}_sentence_preds_5.csv``.

    Record level: rows are grouped by ``id_col``; the class with the higher
    mean probability becomes the record's prediction. The summary is saved to
    ``{output_prefix}_study_preds_5.csv``.

    A classification report and confusion matrix are printed for both levels.
    Files are written under the module-level ``OUTPUT_DIR``.

    Args:
        model: Sequence classifier already placed on ``device``.
        df: Sentence-level frame containing ``text_col``, ``label_col`` and ``id_col``.
        tokenizer: Tokenizer used to encode ``df[text_col]``.
        text_col: Column with the text fed to the model.
        label_col: Column with the gold labels.
        id_col: Column identifying the record each sentence belongs to.
        device: Device batches are moved to.
        output_prefix: Prefix for the two CSV file names.
    """
    ds = encode_dataset(df, tokenizer, text_col, label_col, MAX_LEN)
    loader = DataLoader(ds, batch_size=BATCH_SIZE, sampler=SequentialSampler(ds))

    model.eval()
    all_probs, preds, gold = [], [], []
    with torch.no_grad():
        for batch in loader:
            input_ids, attn_mask, labels = [t.to(device) for t in batch]
            # Softmax turns the raw logits into per-class probabilities.
            probs = torch.softmax(model(input_ids, attention_mask=attn_mask).logits, dim=1)
            all_probs.extend(probs.cpu().numpy())
            preds.extend(torch.argmax(probs, dim=1).cpu().numpy())
            gold.extend(labels.cpu().numpy())

    # Sentence-level results (the sequential sampler keeps predictions aligned with df rows)
    df["Pred"] = preds
    df["Conf_Label_0"] = [p[0] for p in all_probs]
    df["Conf_Label_1"] = [p[1] for p in all_probs]
    df.to_csv(os.path.join(OUTPUT_DIR, f"{output_prefix}_sentence_preds_5.csv"), index=False)

    print("\n=== Sentence-level Report ===")
    print(classification_report(gold, preds))
    print(confusion_matrix(gold, preds))

    # Study-level aggregation (confidence-based).
    # Named aggregation replaces groupby().apply(): it avoids the pandas deprecation
    # warning about operating on the grouping column and keeps True_Label an integer
    # instead of coercing it to float alongside the averaged confidences.
    summary = (
        df.groupby(id_col)
        .agg(
            True_Label=(label_col, lambda s: s.mode().iloc[0]),
            Avg_Conf_0=("Conf_Label_0", "mean"),
            Avg_Conf_1=("Conf_Label_1", "mean"),
        )
        .reset_index()
    )
    summary["Pred_Label"] = (summary["Avg_Conf_1"] > summary["Avg_Conf_0"]).astype(int)
    summary.to_csv(os.path.join(OUTPUT_DIR, f"{output_prefix}_study_preds_5.csv"), index=False)

    print("\n=== Study-level Report ===")
    print(classification_report(summary["True_Label"], summary["Pred_Label"]))
    print(confusion_matrix(summary["True_Label"], summary["Pred_Label"]))


In [None]:

if __name__ == "__main__":
    # Load data
    df = pd.read_csv(DATA_CSV)[[ID_COL, TEXT_COL, LABEL_COL]].dropna()
    df[LABEL_COL] = df[LABEL_COL].astype(int)

    # Stratified 70 / 15 / 15 split at the record level. The 30% hold-out is divided
    # into DISJOINT test and validation halves: validation drives learning-rate and
    # epoch selection, so none of its records may also appear in the final test report.
    train_df, holdout_df = train_test_split(
        df, test_size=0.3, stratify=df[LABEL_COL], random_state=SEED
    )
    test_df, val_df = train_test_split(
        holdout_df, test_size=0.5, stratify=holdout_df[LABEL_COL], random_state=SEED
    )
    assert set(train_df.index).isdisjoint(val_df.index)
    assert set(train_df.index).isdisjoint(test_df.index)
    assert set(val_df.index).isdisjoint(test_df.index)

    # Sentence splitting + enrichment
    print("Processing data with SciSpaCy...")
    train_sents = split_and_enrich(train_df, nlp, TEXT_COL, ID_COL, LABEL_COL)
    val_sents   = split_and_enrich(val_df,   nlp, TEXT_COL, ID_COL, LABEL_COL)
    test_sents  = split_and_enrich(test_df,  nlp, TEXT_COL, ID_COL, LABEL_COL)

    # Encode datasets
    train_ds = encode_dataset(train_sents, tokenizer, "Enriched", LABEL_COL, MAX_LEN)
    val_ds   = encode_dataset(val_sents,   tokenizer, "Enriched", LABEL_COL, MAX_LEN)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, sampler=RandomSampler(train_ds))
    val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE, sampler=SequentialSampler(val_ds))

    # Train with multiple learning rates; keep the weights with the best validation score
    best_overall, best_lr = -1, None
    best_state = None
    for lr in LEARNING_RATES:
        state, f1 = train_eval(lr, train_loader, val_loader, device)
        if f1 > best_overall:
            best_overall, best_lr, best_state = f1, lr, state

    print(f"\nBest LR={best_lr:.1e} | Val F1={best_overall:.4f}")
    best_model_path = os.path.join(OUTPUT_DIR, f"best_bert_lr{best_lr:.0e}_5.pth")
    torch.save(best_state, best_model_path)

    # Load best model (round-trips through the saved file so the checkpoint is verified as loadable)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=2)
    model.load_state_dict(torch.load(best_model_path, map_location="cpu"))
    model.to(device)

    # Final evaluation on the held-out test set (never seen during training or selection)
    predict_and_evaluate(model, test_sents, tokenizer, "Enriched", LABEL_COL, ID_COL, device, "test")


Processing data with SciSpaCy...


100%|██████████| 140/140 [00:22<00:00,  6.36it/s]
100%|██████████| 30/30 [00:04<00:00,  6.13it/s]
100%|██████████| 60/60 [00:09<00:00,  6.41it/s]


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LR=2.0e-05 | Epoch 1 | Loss=0.6772 | Val F1=0.6514
LR=2.0e-05 | Epoch 2 | Loss=0.6497 | Val F1=0.6946
LR=2.0e-05 | Epoch 3 | Loss=0.5295 | Val F1=0.6324
LR=2.0e-05 | Epoch 4 | Loss=0.2937 | Val F1=0.6622
LR=2.0e-05 | Epoch 5 | Loss=0.1174 | Val F1=0.6189
LR=2.0e-05 | Epoch 6 | Loss=0.0414 | Val F1=0.6297
LR=2.0e-05 | Epoch 7 | Loss=0.0279 | Val F1=0.6486
Early stopping.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LR=5.0e-06 | Epoch 1 | Loss=0.7099 | Val F1=0.6378
LR=5.0e-06 | Epoch 2 | Loss=0.6651 | Val F1=0.6568
LR=5.0e-06 | Epoch 3 | Loss=0.6351 | Val F1=0.5676
LR=5.0e-06 | Epoch 4 | Loss=0.5811 | Val F1=0.6216
LR=5.0e-06 | Epoch 5 | Loss=0.4906 | Val F1=0.5730
LR=5.0e-06 | Epoch 6 | Loss=0.4050 | Val F1=0.6405
LR=5.0e-06 | Epoch 7 | Loss=0.2986 | Val F1=0.5784
Early stopping.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LR=2.0e-06 | Epoch 1 | Loss=0.6740 | Val F1=0.6541
LR=2.0e-06 | Epoch 2 | Loss=0.6753 | Val F1=0.6486
LR=2.0e-06 | Epoch 3 | Loss=0.6672 | Val F1=0.6649
LR=2.0e-06 | Epoch 4 | Loss=0.6499 | Val F1=0.6514
LR=2.0e-06 | Epoch 5 | Loss=0.6237 | Val F1=0.6568
LR=2.0e-06 | Epoch 6 | Loss=0.5880 | Val F1=0.6622
LR=2.0e-06 | Epoch 7 | Loss=0.5594 | Val F1=0.6486
LR=2.0e-06 | Epoch 8 | Loss=0.5390 | Val F1=0.6568
Early stopping.


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


LR=1.0e-06 | Epoch 1 | Loss=0.7263 | Val F1=0.5054
LR=1.0e-06 | Epoch 2 | Loss=0.6789 | Val F1=0.6459
LR=1.0e-06 | Epoch 3 | Loss=0.6670 | Val F1=0.6486
LR=1.0e-06 | Epoch 4 | Loss=0.6589 | Val F1=0.6486
LR=1.0e-06 | Epoch 5 | Loss=0.6523 | Val F1=0.6541
LR=1.0e-06 | Epoch 6 | Loss=0.6453 | Val F1=0.6514
LR=1.0e-06 | Epoch 7 | Loss=0.6363 | Val F1=0.6514
LR=1.0e-06 | Epoch 8 | Loss=0.6228 | Val F1=0.6622
LR=1.0e-06 | Epoch 9 | Loss=0.6099 | Val F1=0.6649
LR=1.0e-06 | Epoch 10 | Loss=0.6016 | Val F1=0.6486
LR=1.0e-06 | Epoch 11 | Loss=0.5963 | Val F1=0.6297
LR=1.0e-06 | Epoch 12 | Loss=0.5762 | Val F1=0.6622
LR=1.0e-06 | Epoch 13 | Loss=0.5736 | Val F1=0.6541
LR=1.0e-06 | Epoch 14 | Loss=0.5682 | Val F1=0.6649
Early stopping.

Best LR=2.0e-05 | Val F1=0.6946


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.



=== Sentence-level Report ===
              precision    recall  f1-score   support

           0       0.85      0.23      0.37       265
           1       0.68      0.98      0.80       449

    accuracy                           0.70       714
   macro avg       0.77      0.60      0.59       714
weighted avg       0.74      0.70      0.64       714

[[ 62 203]
 [ 11 438]]

=== Study-level Report ===
              precision    recall  f1-score   support

         0.0       1.00      0.17      0.29        24
         1.0       0.64      1.00      0.78        36

    accuracy                           0.67        60
   macro avg       0.82      0.58      0.53        60
weighted avg       0.79      0.67      0.58        60

[[ 4 20]
 [ 0 36]]


  df.groupby(id_col).apply(
