In [1]:
#importing dataset
from huggingface_hub import login
login("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")

In [2]:
from huggingface_hub import hf_hub_download

file_path = hf_hub_download(
    repo_id="AGBonnet/augmented-clinical-notes",
    filename="augmented_notes_30K.jsonl",                                              
    repo_type="dataset"
    )

In [3]:
import pandas as pd
import json

df = pd.read_json(file_path, lines=True)

In [4]:
#extracting the clinical note and condition (target) to be used
import json
import pandas as pd

def extract_note_and_label(row):
    summary = row.get("summary", {})
    if isinstance(summary, str):
        try:
            summary = json.loads(summary)
        except json.JSONDecodeError:
            cleaned = summary.replace("“", '"').replace("”", '"').replace("\n", "")
            try:
                summary = json.loads(cleaned)
            except json.JSONDecodeError:
                summary = {}

    text = row.get("note", "")
    text = text.strip() if isinstance(text, str) else ""

    label = ""
    if isinstance(summary, dict):
        for test_entry in summary.get("diagnosis tests", []):
            if not isinstance(test_entry, dict):
                continue

            cond_val = test_entry.get("condition", "")

            if not isinstance(cond_val, str):
                continue

            #check if the condition is none 
            cond = cond_val.strip().lower()
            if cond and cond not in ["none", "not specified", "not mentioned"]:
                label = cond
                break

    return pd.Series({"note": text, "label": label})

df_processed = df.apply(extract_note_and_label, axis=1)
df_processed = df_processed[df_processed["label"] != ""]



In [5]:
df_processed.head(10)

Unnamed: 0,note,label
2,A 36-year old female patient visited our hospi...,idiopathic osteonecrosis of the femoral head
3,A 49-year-old male presented with a complaint ...,"proximal ulnar shaft fracture, hypertrophic no..."
4,A 47-year-old male patient was referred to the...,bone marrow edema
5,A 24-year-old Yemeni female presented to the e...,osteomalacia
6,We report a 24-day-old female baby who present...,diaphragmatic defect
7,A 16 years old female patient presented to us ...,polyostotic fibrous dysplasia
9,A 23-year-old female patient was admitted to a...,simple skin lesion
12,"The patient was a healthy 13-year-old female, ...",spt of the pancreas
13,A 60-year-old Kashmiri female presented to our...,fracture
14,A 47 year old gentleman presented to his gener...,initially suspected acute disc prolapse


In [6]:
N = 30  
top_labels = df_processed["label"].value_counts().nlargest(N).index
df_filtered = df_processed[df_processed["label"].isin(top_labels)].copy()

In [7]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

#labelencoding 


label = LabelEncoder()
df_filtered["label"] = label.fit_transform(df_filtered["label"])

label_map = dict(zip(label.classes_, label.transform(label.classes_)))

train_note, test_note, train_label, test_label = train_test_split(
    df_filtered["note"].tolist(),
    df_filtered["label"].tolist(),
    test_size = 0.2,
    stratify = df_filtered["label"],
    random_state = 123
)

num_labels = len(label.classes_)
num_labels

30

In [18]:
from sklearn.model_selection import train_test_split

train_note, val_note, train_label_split, val_label_split = train_test_split(
    train_note, train_label, test_size=0.1, stratify=train_label, random_state=42
)

In [19]:
from transformers import AutoTokenizer

#tokenize medical terminology
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_ClinicalBERT")

#tokenize for nlp
train_encodings = tokenizer(train_note, truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(test_note, truncation=True, padding=True, max_length=512)
val_encodings = tokenizer(val_note, truncation=True, padding=True, max_length=512)


In [20]:
from torch.utils.data import Dataset
import torch

#Ensure dataset is compatible for torch

class ClinicalNotes(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels
    
    def __getitem__(self, id):
        #dictionary for labels/inputs
        item = {
            "labels": torch.tensor(self.labels[id]), #get label
            "attention_mask": torch.tensor(self.encodings["attention_mask"][id]),
            "input_ids": torch.tensor(self.encodings["input_ids"][id]) #gets token at poisition id
        }

        return item
    def __len__(self):
        return len(self.labels)

#apply the formatted dataset to existing one
train_dataset = ClinicalNotes(train_encodings, train_label_split)
test_dataset = ClinicalNotes(test_encodings, test_label)
val_dataset = ClinicalNotes(val_encodings, val_label_split)

In [21]:
from transformers import AutoModelForSequenceClassification

# Load Bio_ClinicalBERT 
model = AutoModelForSequenceClassification.from_pretrained(
    "emilyalsentzer/Bio_ClinicalBERT",
    num_labels=len(label_map)  # Number of disease categories you're predicting
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",
    num_train_epochs=3,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    logging_dir="./logs",
    logging_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

In [14]:
import numpy as np 
from sklearn.metrics import accuracy_score, f1_score

def metrics(eval_pred):
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis = -1)
    accuracy = accuracy_score(labels, predictions)
    f1 = f1_score(labels, predictions, average = "weighted")
    return {"accuracy": accuracy, "f1": f1}

In [23]:
from transformers import Trainer
trainer = Trainer(
    model = model,
    args = training_args,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    compute_metrics = metrics
)

In [24]:
#Train the model

trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,3.1168,2.475745,0.388889,0.262455
2,1.9682,1.521078,0.666667,0.59301
3,1.2276,1.167195,0.788889,0.759642


TrainOutput(global_step=306, training_loss=2.1041993035210504, metrics={'train_runtime': 5026.3786, 'train_samples_per_second': 0.483, 'train_steps_per_second': 0.061, 'total_flos': 639520600043520.0, 'train_loss': 2.1041993035210504, 'epoch': 3.0})

In [25]:
model.save_pretrained("./clinicalbert-finetuned")
tokenizer.save_pretrained("./clinicalbert-finetuned")

('./clinicalbert-finetuned\\tokenizer_config.json',
 './clinicalbert-finetuned\\special_tokens_map.json',
 './clinicalbert-finetuned\\vocab.txt',
 './clinicalbert-finetuned\\added_tokens.json',
 './clinicalbert-finetuned\\tokenizer.json')

In [26]:
!pip install optuna



[notice] A new release of pip is available: 24.3.1 -> 25.1.1
[notice] To update, run: C:\Users\User\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip


Collecting optuna
  Downloading optuna-4.3.0-py3-none-any.whl.metadata (17 kB)
Collecting alembic>=1.5.0 (from optuna)
  Downloading alembic-1.16.1-py3-none-any.whl.metadata (7.3 kB)
Collecting colorlog (from optuna)
  Downloading colorlog-6.9.0-py3-none-any.whl.metadata (10 kB)
Collecting sqlalchemy>=1.4.2 (from optuna)
  Downloading sqlalchemy-2.0.41-cp311-cp311-win_amd64.whl.metadata (9.8 kB)
Collecting Mako (from alembic>=1.5.0->optuna)
  Downloading mako-1.3.10-py3-none-any.whl.metadata (2.9 kB)
Collecting greenlet>=1 (from sqlalchemy>=1.4.2->optuna)
  Downloading greenlet-3.2.3-cp311-cp311-win_amd64.whl.metadata (4.2 kB)
Downloading optuna-4.3.0-py3-none-any.whl (386 kB)
Downloading alembic-1.16.1-py3-none-any.whl (242 kB)
Downloading sqlalchemy-2.0.41-cp311-cp311-win_amd64.whl (2.1 MB)
   ---------------------------------------- 0.0/2.1 MB ? eta -:--:--
   ---------------------------------------- 2.1/2.1 MB 16.8 MB/s eta 0:00:00
Downloading colorlog-6.9.0-py3-none-any.whl (11 kB

In [None]:
#hyperparameter tuning
def model_init():
    return AutoModelForSequenceClassification.from_pretrained(
        "emilyalsentzer/Bio_ClinicalBERT", 
        num_labels=len(label_map)
    )

def hp_space(trial):
    return {
        "learning_rate": trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [4, 8, 16]),
        "weight_decay": trial.suggest_float("weight_decay", 0.0, 0.3)
    }


In [34]:
trainer2 = Trainer(
    model_init=model_init,                      # ✅ NEW
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=metrics
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
best_run = trainer2.hyperparameter_search(
    direction="maximize",
    hp_space=hp_space,
    compute_objective=lambda metrics: metrics["eval_f1"],
    n_trials=10,
)
#no improvements to initial trained model

[I 2025-06-09 20:47:06,757] A new study created in memory with name: no-name-5b02fd2b-b59d-4d7c-ac46-236298d3d03f


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at emilyalsentzer/Bio_ClinicalBERT and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss,Accuracy,F1
1,3.2153,2.86587,0.244444,0.135366
2,2.6023,2.300642,0.377778,0.254988


[W 2025-06-09 21:54:07,818] Trial 0 failed with parameters: {'learning_rate': 1.4358491902381033e-05, 'per_device_train_batch_size': 4, 'weight_decay': 0.1487858940263024} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\integrations\integration_utils.py", line 255, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "C:\Users\User\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.11_qbz5n2kfra8p0\LocalCache\local-packages\Python311\site-packages\transformers\trainer.py", line 2240, in tra

KeyboardInterrupt: 

In [37]:
predictions = trainer.predict(test_dataset)
predicted_ids = predictions.predictions.argmax(axis=1)


In [None]:
from sklearn.metrics import classification_report
 
#map scores back to label

id_to_label = {v: k for k, v in label_map.items()}
target_names = [id_to_label[i] for i in sorted(id_to_label)]

print(classification_report(test_label, predicted_ids, target_names=target_names))

                         precision    recall  f1-score   support

     acute appendicitis       1.00      1.00      1.00         6
     acute pancreatitis       1.00      1.00      1.00         5
         adenocarcinoma       0.58      0.70      0.64        10
                 anemia       0.67      0.84      0.74        19
    atrial fibrillation       1.00      0.50      0.67         4
          breast cancer       0.71      1.00      0.83         5
         chondrosarcoma       0.71      1.00      0.83         5
coronary artery disease       0.69      0.90      0.78        10
               covid-19       0.62      0.62      0.62         8
               epilepsy       1.00      1.00      1.00         6
               fracture       0.80      0.67      0.73         6
                    hiv       0.00      0.00      0.00         5
              infection       0.71      0.85      0.77        20
     malignant melanoma       0.77      1.00      0.87        10
     metastatic disease 

  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])
  _warn_prf(average, modifier, f"{metric.capitalize()} is", result.shape[0])


In [43]:
#prediction test
import torch

note = """
A 54-year-old male presented to the emergency department complaining of severe chest pain radiating to the left arm, 
shortness of breath, and dizziness. ECG showed ST-segment elevation in the anterior leads. Troponin levels were elevated. 
Patient has a history of hypertension and high cholesterol. He was immediately taken for coronary angiography, 
which revealed a significant occlusion in the left anterior descending artery.
"""

inputs = tokenizer(note, return_tensors="pt", truncation=True, padding=True)

model.eval()
with torch.no_grad():
    outputs = model(**inputs)
    logits = outputs.logits
    predicted_class_id = logits.argmax(dim=1).item()

predicted_label = {v: k for k, v in label_map.items()}[predicted_class_id]
print("Predicted diagnosis:", predicted_label)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


Predicted diagnosis: coronary artery disease
