In [None]:
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_scheduler
from torch.utils.data import DataLoader
from sklearn.metrics import precision_recall_fscore_support, roc_auc_score
from datasets import Dataset
from sklearn.model_selection import train_test_split
import csv
import torch.nn.utils.prune as prune

# === Configuration ===
# Hugging Face checkpoint name passed to every `from_pretrained` call below
# (tokenizer and all model variants).
model_name = "emilyalsentzer/Bio_ClinicalBERT"
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Number of examples per DataLoader batch (train and eval).
batch_size = 16
# Token length each text is padded/truncated to by the tokenizer.
max_length = 256
# Number of passes over the training loader for each model variant.
epochs = 5
# Keyword whose case-insensitive presence in a note's text defines label 1.
symptom = "fever"

# === Load and preprocess data ===
notes = pd.read_csv("NOTEEVENTS_random_chatgpt.csv")
notes = notes.dropna(subset=["TEXT"])
# Shuffle all rows with a fixed seed so repeated runs see the same order.
notes = notes.sample(frac=1, random_state=42)
# Keep only the first 1000 characters of each note; the label below is
# derived from this truncated text, so label and model input stay consistent.
notes["TEXT"] = notes["TEXT"].str.slice(0, 1000)
# Keyword label: 1 when the symptom occurs in the lower-cased text.
# regex=False forces a literal substring match, so a symptom containing regex
# metacharacters (e.g. "(", "+", ".") cannot mislabel rows or raise.
notes["label"] = (
    notes["TEXT"].str.lower().str.contains(symptom.lower(), regex=False).astype(int)
)

# Rename to the column names used downstream ("text" for the tokenizer,
# "labels" for the batches) and hold out 10% of the rows for evaluation.
train_df, test_df = train_test_split(
    notes[["TEXT", "label"]].rename(columns={"TEXT": "text", "label": "labels"}),
    test_size=0.1,
    random_state=42,
)
train_dataset = Dataset.from_pandas(train_df.reset_index(drop=True))
eval_dataset = Dataset.from_pandas(test_df.reset_index(drop=True))

# === Tokenization ===
tokenizer = AutoTokenizer.from_pretrained(model_name)


def tokenize_fn(examples):
    """Tokenize a batch of examples, padding/truncating to ``max_length``."""
    return tokenizer(
        examples["text"],
        padding="max_length",
        truncation=True,
        max_length=max_length,
    )


def _encode_split(split):
    """Tokenize one dataset split and expose its model columns as tensors."""
    encoded = split.map(tokenize_fn, batched=True, remove_columns=["text"])
    encoded.set_format("torch", columns=["input_ids", "attention_mask", "labels"])
    return encoded


train_dataset = _encode_split(train_dataset)
eval_dataset = _encode_split(eval_dataset)

# Only the training loader shuffles; evaluation keeps the dataset order.
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=batch_size)

# === Evaluation ===
def evaluate_model(model, loader):
    """Compute binary-classification metrics for ``model`` over ``loader``.

    Batches are moved to the module-level ``device`` before the forward pass.
    A sigmoid is applied to the logits and predictions are thresholded at 0.5.

    Args:
        model: Model whose forward output exposes ``.logits`` with a single
            logit per example (the models in this file use ``num_labels=1``).
        loader: Iterable of batches (dicts of tensors) containing
            ``input_ids``, ``attention_mask`` and ``labels``.

    Returns:
        Tuple ``(precision, recall, f1, auroc)``. ``auroc`` is 0.0 when
        ``roc_auc_score`` raises ``ValueError`` (for example when the labels
        contain only one class).
    """
    model.eval()
    all_preds, all_labels, all_probs = [], [], []
    with torch.no_grad():
        for batch in loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            logits = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits
            # squeeze(-1) drops only the label dimension. A bare squeeze()
            # would collapse a single-example batch to a 0-d array, which
            # list.extend() cannot iterate over.
            probs = torch.sigmoid(logits).squeeze(-1).cpu().numpy()
            preds = (probs > 0.5).astype(int)
            labels = batch["labels"].reshape(-1).cpu().numpy()
            all_preds.extend(preds.tolist())
            all_probs.extend(probs.tolist())
            all_labels.extend(labels.tolist())
    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", zero_division=0
    )
    try:
        auroc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        # Keep the original best-effort fallback, but no longer swallow
        # unrelated errors such as KeyboardInterrupt.
        auroc = 0.0
    return precision, recall, f1, auroc

# === Training ===
def train_model(model, method_name):
    """Fine-tune ``model`` and log evaluation metrics after every epoch.

    Uses the module-level ``train_loader``, ``eval_loader``, ``epochs`` and
    ``device``. Optimizes with AdamW (lr=2e-5) under a cosine schedule without
    warmup, using binary cross-entropy on the model's single logit.

    Args:
        model: Model to train in place; its forward output must expose
            ``.logits`` with one logit per example.
        method_name: Tag used in the console output and in the metrics file
            name ``symptom_{method_name}_metrics.csv`` (overwritten per call).
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    scheduler = get_scheduler(
        "cosine",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=epochs * len(train_loader),
    )
    csv_file = f"symptom_{method_name}_metrics.csv"
    with open(csv_file, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Epoch", "Precision", "Recall", "F1", "AUROC"])

    for epoch in range(1, epochs + 1):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            outputs = model(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"])
            # squeeze(-1) removes only the label dimension. A bare squeeze()
            # would turn a single-example batch into a 0-d tensor whose shape
            # no longer matches the (1,) label tensor.
            logits = outputs.logits.squeeze(-1)
            loss = F.binary_cross_entropy_with_logits(logits, batch["labels"].float())
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        precision, recall, f1, auroc = evaluate_model(model, eval_loader)
        # Append after every epoch so partial results survive an interrupted run.
        with open(csv_file, mode="a", newline="") as file:
            writer = csv.writer(file)
            writer.writerow([epoch, precision, recall, f1, auroc])
        print(f"{method_name} | Epoch {epoch}/{epochs} - Loss: {total_loss/len(train_loader):.4f} - P: {precision:.4f} R: {recall:.4f} F1: {f1:.4f} AUROC: {auroc:.4f}")

# === Models ===
def base_model():
    """Fine-tune the unmodified checkpoint; metrics are tagged "base"."""
    classifier = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
    classifier = classifier.to(device)
    train_model(classifier, "base")

def pruning_model():
    """Apply 30% L1 unstructured pruning to every linear layer, then fine-tune.

    Metrics are tagged "pruning".
    """
    network = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
    network = network.to(device)
    linear_layers = [layer for layer in network.modules() if isinstance(layer, nn.Linear)]
    for layer in linear_layers:
        prune.l1_unstructured(layer, name="weight", amount=0.3)
    train_model(network, "pruning")

def lowrank_model():
    """Replace each linear weight with a low-rank approximation, then fine-tune.

    Every ``nn.Linear`` weight is overwritten in place with its randomized-SVD
    reconstruction of rank at most 8 (``torch.svd_lowrank``). Layers whose
    factorization fails are left unchanged and reported on stdout. Metrics
    are tagged "lowrank".

    NOTE(review): the weights stay dense and are trained without any rank
    constraint afterwards, so this only changes the starting point of
    fine-tuning — confirm that is the intended experiment.
    """
    target_rank = 8
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
    for name, module in model.named_modules():
        if not isinstance(module, nn.Linear):
            continue
        weight = module.weight.data
        # A matrix's rank cannot exceed its smaller dimension (the classifier
        # head has a single output row), so clamp the requested rank.
        rank = min(target_rank, *weight.shape)
        try:
            u, s, v = torch.svd_lowrank(weight, q=rank)
        except (RuntimeError, ValueError) as err:
            # Best effort, as before, but skip visibly and no longer swallow
            # unrelated exceptions such as KeyboardInterrupt.
            print(f"lowrank | skipping layer {name}: {err}")
            continue
        # (u * s) scales the columns of u by the singular values, which is
        # u @ diag(s) without materializing the diagonal matrix.
        module.weight.data.copy_((u * s) @ v.t())
    train_model(model, "lowrank")

def quantization_model():
    """Fine-tune in float32, then quantize to int8 and evaluate on the CPU.

    Dynamic quantization replaces ``nn.Linear`` with int8 modules intended for
    CPU inference; they cannot be trained and do not run on CUDA. The previous
    order (quantize, move to ``device``, train) could therefore not work. The
    model is now fine-tuned first (per-epoch float metrics are tagged
    "quantization_fp32"), then quantized on the CPU and evaluated once.

    The quantized model's metrics are written as a single row to
    ``symptom_quantization_metrics.csv`` and printed.
    """
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
    train_model(model, "quantization_fp32")

    # Quantization and all later inference happen on the CPU.
    model.to("cpu")
    model.eval()
    model_quantized = torch.quantization.quantize_dynamic(model, {nn.Linear}, dtype=torch.qint8)

    # Evaluate inline: evaluate_model() moves batches to the global `device`,
    # which may be CUDA, whereas the quantized model must stay on the CPU.
    all_preds, all_labels, all_probs = [], [], []
    with torch.no_grad():
        for batch in eval_loader:
            logits = model_quantized(
                input_ids=batch["input_ids"].to("cpu"),
                attention_mask=batch["attention_mask"].to("cpu"),
            ).logits
            # squeeze(-1) keeps a 1-D array even for a single-example batch.
            probs = torch.sigmoid(logits).squeeze(-1).numpy()
            all_probs.extend(probs.tolist())
            all_preds.extend((probs > 0.5).astype(int).tolist())
            all_labels.extend(batch["labels"].reshape(-1).cpu().numpy().tolist())

    precision, recall, f1, _ = precision_recall_fscore_support(
        all_labels, all_preds, average="binary", zero_division=0
    )
    try:
        auroc = roc_auc_score(all_labels, all_probs)
    except ValueError:
        # Same fallback value evaluate_model() reports when AUROC is undefined.
        auroc = 0.0

    csv_file = "symptom_quantization_metrics.csv"
    with open(csv_file, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Epoch", "Precision", "Recall", "F1", "AUROC"])
        # One row: the int8 model derived from the weights after the last epoch.
        writer.writerow([epochs, precision, recall, f1, auroc])
    print(f"quantization | int8 after {epochs} epochs - P: {precision:.4f} R: {recall:.4f} F1: {f1:.4f} AUROC: {auroc:.4f}")

def distillation_model():
    """Fine-tune a teacher, then train a student on labels plus teacher outputs.

    The teacher is fine-tuned first (metrics tagged "distillation_teacher")
    because its classification head is newly initialized when the checkpoint
    is loaded, so an untrained teacher's probabilities carry no task signal.
    The student minimizes ``0.1 * BCE(labels) + 0.9 * BCE(teacher probs)``.
    Student metrics are written per epoch to
    ``symptom_distillation_metrics.csv`` and printed.

    NOTE(review): teacher and student load the same checkpoint, so the
    student is not smaller than the teacher — confirm this is intended.
    """
    teacher = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
    train_model(teacher, "distillation_teacher")
    teacher.eval()
    student = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1).to(device)
    optimizer = torch.optim.AdamW(student.parameters(), lr=2e-5)
    scheduler = get_scheduler(
        "cosine",
        optimizer=optimizer,
        num_warmup_steps=0,
        num_training_steps=epochs * len(train_loader),
    )

    csv_file = "symptom_distillation_metrics.csv"
    with open(csv_file, mode="w", newline="") as file:
        writer = csv.writer(file)
        writer.writerow(["Epoch", "Precision", "Recall", "F1", "AUROC"])

    for epoch in range(1, epochs + 1):
        student.train()
        total_loss = 0.0
        for batch in train_loader:
            batch = {k: v.to(device) for k, v in batch.items()}
            optimizer.zero_grad()
            with torch.no_grad():
                teacher_logits = teacher(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits
            # squeeze(-1) removes only the label dimension, so single-example
            # batches keep shape (1,) and still match the labels.
            teacher_probs = torch.sigmoid(teacher_logits.squeeze(-1))
            student_logits = student(input_ids=batch["input_ids"], attention_mask=batch["attention_mask"]).logits.squeeze(-1)
            ce_loss = F.binary_cross_entropy_with_logits(student_logits, batch["labels"].float())
            # Soft-target BCE equals the Bernoulli KL(teacher || student) plus
            # the teacher's entropy, which is constant for the student. The
            # previous kl_div over sigmoid outputs dropped the (1 - p) term,
            # so it was minimized by always predicting the positive class.
            kd_loss = F.binary_cross_entropy_with_logits(student_logits, teacher_probs)
            loss = 0.1 * ce_loss + 0.9 * kd_loss
            loss.backward()
            optimizer.step()
            scheduler.step()
            total_loss += loss.item()

        precision, recall, f1, auroc = evaluate_model(student, eval_loader)
        with open(csv_file, mode="a", newline="") as file:
            writer = csv.writer(file)
            writer.writerow([epoch, precision, recall, f1, auroc])
        print(f"Distillation | Epoch {epoch}/{epochs} - Loss: {total_loss/len(train_loader):.4f} - P: {precision:.4f} R: {recall:.4f} F1: {f1:.4f} AUROC: {auroc:.4f}")

def run_all():
    """Run every experiment once, in a fixed order.

    Order: base, pruning, low-rank, distillation, quantization.
    """
    experiments = (
        base_model,
        pruning_model,
        lowrank_model,
        distillation_model,
        quantization_model,
    )
    for experiment in experiments:
        experiment()

# Run the full experiment suite only when this file is executed as a script.
if __name__ == "__main__":
    run_all()

Map:   0%|          | 0/4500 [00:00<?, ? examples/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
