In [1]:
import pandas as pd
from sklearn.preprocessing import MultiLabelBinarizer

# Step 1 - read the small training sample (path is relative to the working directory).
df = pd.read_csv('tiny_train_sample.csv')

# Step 2 - MultiLabelBinarizer consumes one iterable of labels per row, so every
# single pathology string is wrapped in a one-element list ("URTI" -> ["URTI"]).
df['TARGET_LIST'] = df['PATHOLOGY'].map(lambda pathology: [pathology])

# Step 3 - learn the label vocabulary and encode all rows in one call.
mlb = MultiLabelBinarizer()
binary_labels = mlb.fit_transform(df['TARGET_LIST'])

# Step 4 - show the learned vocabulary and how the first row was encoded.
print(
    f"Total unique pathologies found: {len(mlb.classes_)}",
    f"Alphabetical Classes: {mlb.classes_[:5]}... (showing first 5)",
    "-" * 50,
    f"Original Label (Row 0): {df['PATHOLOGY'].iloc[0]}",
    f"Machine-Readable Vector (Row 0):\n{binary_labels[0]}",
    f"Shape of the binary vector: {binary_labels[0].shape}",
    sep="\n",
)

Total unique pathologies found: 49
Alphabetical Classes: ['Acute COPD exacerbation / infection' 'Acute dystonic reactions'
 'Acute laryngitis' 'Acute otitis media' 'Acute pulmonary edema']... (showing first 5)
--------------------------------------------------
Original Label (Row 0): PSVT
Machine-Readable Vector (Row 0):
[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0]
Shape of the binary vector: (49,)


In [2]:
# Colab usually has transformers pre-installed, but let's be safe
!pip install -q transformers

from transformers import AutoTokenizer

# Step 1 - fetch the tokenizer published with the ClinicalBERT checkpoint.
print("Downloading ClinicalBERT Tokenizer...")
model_checkpoint = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

# Step 2 - show the raw narrative stored in the first row.
sample_text = df['NARRATIVE'].iloc[0]
print("\n--- ORIGINAL TEXT ---", sample_text, sep="\n")

# Step 3 - encode the narrative. Everything is padded or truncated to exactly
# 128 tokens and returned as PyTorch tensors rather than Python lists.
tokens = tokenizer(
    sample_text,
    max_length=128,
    truncation=True,
    padding='max_length',
    return_tensors='pt',
)

# Step 4 - inspect the numeric encoding.
print(
    "\n--- THE MATHEMATICAL TRANSLATION ---",
    f"Shape of Input IDs: {tokens['input_ids'].shape}",
    f"First 15 Input IDs:\n{tokens['input_ids'][0][:15]}",
    f"First 15 Attention Mask values:\n{tokens['attention_mask'][0][:15]}",
    sep="\n",
)

# Step 5 - decode the first 20 ids back into text to see what the model receives.
print(
    "\n--- REVERSE TRANSLATION (What the model reads) ---",
    tokenizer.decode(tokens['input_ids'][0][:20]),
    sep="\n",
)

Downloading ClinicalBERT Tokenizer...


Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]


--- ORIGINAL TEXT ---
I am an 27-year-old female. I came into the clinic today because I feel your heart is beating fast (racing), irregularly (missing a beat) or do you feel palpitations. I would describe the pain as haunting, tedious, a knife stroke, tugging, and heavy. The pain is specifically located in my. forehead. On a scale of 1 to 10, the intensity is 3. On a scale of 1 to 10, the precision of the pain location is a 4. Regarding how fast the pain appeared, on a scale of 1 to 10, it was a 5. No, I am not traveling out of the country recently. To give you more context: I have pain somewhere, related to your reason for consulting. I regularly take stimulant drugs. I am experiencing shortness of breath or difficulty breathing in a significant way. I feel lightheaded and dizzy or do you feel like you are about to faint. Regarding the question 'Have you recently taken decongestants or other substances that may have stimulant effects', my answer is

--- THE MATHEMATICAL TRANSLATION 

In [3]:
import torch
from torch.utils.data import Dataset

class DDXPlusDataset(Dataset):
    """Map-style dataset pairing tokenized narratives with multi-hot label vectors.

    Args:
        dataframe: pandas DataFrame with a ``NARRATIVE`` column (text) and a
            ``TARGET_LIST`` column (one iterable of labels per row).
        tokenizer: Callable invoked as ``tokenizer(text, add_special_tokens=...,
            max_length=..., padding=..., truncation=..., return_attention_mask=...,
            return_tensors='pt')``. It must return a mapping holding
            ``input_ids`` and ``attention_mask`` tensors.
        multilabel_binarizer: Already-fitted object exposing ``transform``; it is
            applied once to the whole ``TARGET_LIST`` column.
        max_len: Sequence length every narrative is padded / truncated to.
    """

    def __init__(self, dataframe, tokenizer, multilabel_binarizer, max_len=128):
        # Drop the incoming index so positional lookups run from 0 to len-1,
        # even for frames produced by a shuffled train/validation split.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.mlb = multilabel_binarizer
        self.max_len = max_len

        # Encode every label list up front; __getitem__ only indexes into this.
        self.labels = self.mlb.transform(self.data['TARGET_LIST'])

        # Cache the texts as a plain list. `DataFrame.iloc[i]` materialises a
        # full row Series on every access, which is needless per-sample work
        # in the DataLoader hot path.
        self.narratives = [str(text) for text in self.data['NARRATIVE']]

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors.

        Returns:
            dict with 1-D ``input_ids`` and ``attention_mask`` tensors (the
            tokenizer output flattened) and a float ``targets`` tensor built
            from the row's binarized label vector.
        """
        narrative = self.narratives[index]
        label_vector = self.labels[index]

        # Tokenize lazily, one narrative per call, by invoking the tokenizer directly.
        encoding = self.tokenizer(
            narrative,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )

        return {
            # The tokenizer output carries a leading batch axis; flatten removes it.
            'input_ids': encoding['input_ids'].flatten(),
            'attention_mask': encoding['attention_mask'].flatten(),
            # Float targets are what BCE-with-logits style losses expect.
            'targets': torch.tensor(label_vector, dtype=torch.float)
        }

# --- SMOKE TEST FOR THE DATASET CLASS ---
print("Initializing the PyTorch Dataset...")
train_dataset = DDXPlusDataset(df, tokenizer, mlb, max_len=128)

print(f"Total items in dataset: {len(train_dataset)}")

# Fetch a single sample by position to inspect what one item looks like.
sample_item = train_dataset[500]

print(
    "\n--- WHAT THE GPU RECEIVES FOR PATIENT #500 ---",
    f"Input IDs Shape: {sample_item['input_ids'].shape}",
    f"Attention Mask Shape: {sample_item['attention_mask'].shape}",
    f"Targets Shape: {sample_item['targets'].shape}",
    f"Target Vector snippet: {sample_item['targets'][:10]}",
    sep="\n",
)

Initializing the PyTorch Dataset...
Total items in dataset: 10000

--- WHAT THE GPU RECEIVES FOR PATIENT #500 ---
Input IDs Shape: torch.Size([128])
Attention Mask Shape: torch.Size([128])
Targets Shape: torch.Size([49])
Target Vector snippet: tensor([0., 0., 0., 0., 0., 0., 0., 0., 1., 0.])


In [4]:
import torch
from transformers import AutoModelForSequenceClassification

# 1. One output neuron per pathology known to the fitted binarizer
#    (49 for the tiny sample, see the printed status below).
num_classes = len(mlb.classes_)

print("Downloading ClinicalBERT Model Weights...")
# 2. Load the encoder weights and attach a fresh classification head.
#    `model_checkpoint` is the identifier the tokenizer was loaded from, so the
#    tokenizer and the model cannot silently drift apart.
model = AutoModelForSequenceClassification.from_pretrained(
    model_checkpoint,
    num_labels=num_classes,
    # Records the multi-label intent on the model config. The notebook computes
    # its own loss with nn.BCEWithLogitsLoss instead of passing labels to the model.
    problem_type="multi_label_classification"
)

# 3. Use the GPU when one is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

print("\n--- MODEL ARCHITECTURE STATUS ---")
print(f"Device connected: {device}")
print(f"Number of output neurons in the final layer: {model.num_labels}")

Downloading ClinicalBERT Model Weights...


pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/199 [00:00<?, ?it/s]

BertForSequenceClassification LOAD REPORT from: emilyalsentzer/Bio_ClinicalBERT
Key                                        | Status     | 
-------------------------------------------+------------+-
cls.predictions.transform.dense.bias       | UNEXPECTED | 
cls.predictions.transform.LayerNorm.weight | UNEXPECTED | 
cls.seq_relationship.bias                  | UNEXPECTED | 
cls.seq_relationship.weight                | UNEXPECTED | 
cls.predictions.decoder.weight             | UNEXPECTED | 
cls.predictions.transform.LayerNorm.bias   | UNEXPECTED | 
cls.predictions.transform.dense.weight     | UNEXPECTED | 
cls.predictions.bias                       | UNEXPECTED | 
classifier.weight                          | MISSING    | 
classifier.bias                            | MISSING    | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING	:those params were newly initialized because missing from the checkpoint. Conside

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]


--- MODEL ARCHITECTURE STATUS ---
Device connected: cuda
Number of output neurons in the final layer: 49


In [5]:
from torch.utils.data import DataLoader
from sklearn.model_selection import train_test_split

# Step 1 - hold out 20% of the sample for validation. Stratifying on PATHOLOGY
# keeps the per-disease proportions the same in both parts; the fixed seed
# makes the split repeatable.
print("Splitting data into Training and Validation sets...")
df_train, df_val = train_test_split(
    df, test_size=0.2, stratify=df['PATHOLOGY'], random_state=42
)

# Step 2 - wrap each part in the custom Dataset.
print("Initializing Datasets...")
train_dataset = DDXPlusDataset(df_train, tokenizer, mlb, max_len=128)
val_dataset = DDXPlusDataset(df_val, tokenizer, mlb, max_len=128)

print(
    f"Training patients: {len(train_dataset)}",
    f"Validation patients: {len(val_dataset)}",
    sep="\n",
)

# Step 3 - batch the datasets. Training batches are reshuffled every epoch so the
# model never sees the patients in a fixed order; validation order stays fixed.
BATCH_SIZE = 16
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)

# Step 4 - pull one batch and confirm the tensor shapes.
first_batch = next(iter(train_loader))
print(
    "\n--- FIRST BATCH TENSORS (TRAY OF 16 PATIENTS) ---",
    f"Input IDs shape: {first_batch['input_ids'].shape}",
    f"Attention Mask shape: {first_batch['attention_mask'].shape}",
    f"Targets shape: {first_batch['targets'].shape}",
    sep="\n",
)

Splitting data into Training and Validation sets...
Initializing Datasets...
Training patients: 8000
Validation patients: 2000

--- FIRST BATCH TENSORS (TRAY OF 16 PATIENTS) ---
Input IDs shape: torch.Size([16, 128])
Attention Mask shape: torch.Size([16, 128])
Targets shape: torch.Size([16, 49])


In [6]:
import torch.nn as nn
from torch.optim import AdamW

# Step 1 - optimiser and loss used for fine-tuning.
# 2e-5 is a conventional small learning rate for fine-tuning BERT.
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCEWithLogitsLoss()

# Step 2 - sanity check: a single forward pass over the batch fetched earlier.
print("Passing the first batch of 16 patients into the untrained model...")

# Put all three batch tensors on the same device as the model.
input_ids, attention_mask, targets = (
    first_batch[field].to(device)
    for field in ('input_ids', 'attention_mask', 'targets')
)

# No gradients are needed for a pure prediction.
with torch.no_grad():
    outputs = model(input_ids=input_ids, attention_mask=attention_mask)
    # Raw, unnormalised scores: one per class for every patient in the batch.
    logits = outputs.logits

# Step 3 - loss of the not-yet-fine-tuned classifier head on this batch.
initial_loss = criterion(logits, targets)

# Step 4 - report shapes, loss and a peek at the raw numbers.
print(
    "\n--- SANITY CHECK RESULTS ---",
    f"Logits Shape (Should be 16, 49): {logits.shape}",
    f"Initial Untrained Loss: {initial_loss.item():.4f}",
    sep="\n",
)

print(f"\nRaw Outputs (Logits) for Patient 0 (First 5 diseases):\n{logits[0][:5]}")

# Sigmoid squashes each logit independently into the (0, 1) range.
probabilities = logits.sigmoid()
print(f"\nActual Probabilities for Patient 0 (First 5 diseases):\n{probabilities[0][:5]}")

Passing the first batch of 16 patients into the untrained model...

--- SANITY CHECK RESULTS ---
Logits Shape (Should be 16, 49): torch.Size([16, 49])
Initial Untrained Loss: 0.6967

Raw Outputs (Logits) for Patient 0 (First 5 diseases):
tensor([-0.1223,  0.1772, -0.7391, -0.1911, -0.2373], device='cuda:0')

Actual Probabilities for Patient 0 (First 5 diseases):
tensor([0.4695, 0.5442, 0.3232, 0.4524, 0.4410], device='cuda:0')


In [7]:
from tqdm import tqdm
import numpy as np

# Fine-tune for two full passes over `train_loader`, validating on `val_loader`
# after each pass. Relies on `model`, `optimizer`, `criterion`, `device` and both
# loaders created by earlier cells.
EPOCHS = 2

for epoch in range(EPOCHS):
    print(f"\n======== Epoch {epoch+1} / {EPOCHS} ========")

    # ==========================================
    #               TRAINING PHASE
    # ==========================================
    model.train() # Training mode
    total_train_loss = 0

    # Progress bar over the training batches
    train_loop = tqdm(train_loader, leave=True, desc="Training Batches")

    for batch in train_loop:
        # 1. Clear the gradients left over from the previous step
        optimizer.zero_grad()

        # 2. Move the batch tensors to the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        targets = batch['targets'].to(device)

        # 3. Forward pass
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # 4. Loss between the logits and the multi-hot targets
        loss = criterion(logits, targets)
        total_train_loss += loss.item()

        # 5. Backward pass (compute gradients)
        loss.backward()

        # 6. Optimizer step (apply the gradients to the weights)
        optimizer.step()

        # Show the current batch loss next to the progress bar
        train_loop.set_postfix(loss=loss.item())

    # Mean of the per-batch losses (each batch counts equally, whatever its size)
    avg_train_loss = total_train_loss / len(train_loader)
    print(f"\n>>> Average Training Loss: {avg_train_loss:.4f}")

    # ==========================================
    #              VALIDATION PHASE
    # ==========================================
    model.eval() # Evaluation mode (e.g. dropout off). This alone does not stop learning; no optimizer step runs below
    total_val_loss = 0

    val_loop = tqdm(val_loader, leave=True, desc="Validation Batches")

    with torch.no_grad(): # No gradient tracking during validation, which saves memory
        for batch in val_loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            targets = batch['targets'].to(device)

            # Forward pass only; the loss is measured but never back-propagated
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, targets)

            total_val_loss += loss.item()
            val_loop.set_postfix(loss=loss.item())

    # Mean of the per-batch validation losses
    avg_val_loss = total_val_loss / len(val_loader)
    print(f"\n>>> Average Validation Loss: {avg_val_loss:.4f}")





Training Batches:   0%|          | 0/500 [00:00<?, ?it/s][A
Training Batches:   0%|          | 0/500 [00:01<?, ?it/s, loss=0.697][A
Training Batches:   0%|          | 1/500 [00:01<10:52,  1.31s/it, loss=0.697][A
Training Batches:   0%|          | 1/500 [00:01<10:52,  1.31s/it, loss=0.685][A
Training Batches:   0%|          | 2/500 [00:01<06:57,  1.19it/s, loss=0.685][A
Training Batches:   0%|          | 2/500 [00:02<06:57,  1.19it/s, loss=0.673][A
Training Batches:   1%|          | 3/500 [00:02<05:57,  1.39it/s, loss=0.673][A
Training Batches:   1%|          | 3/500 [00:02<05:57,  1.39it/s, loss=0.659][A
Training Batches:   1%|          | 4/500 [00:02<05:23,  1.53it/s, loss=0.659][A
Training Batches:   1%|          | 4/500 [00:03<05:23,  1.53it/s, loss=0.649][A
Training Batches:   1%|          | 5/500 [00:03<04:44,  1.74it/s, loss=0.649][A
Training Batches:   1%|          | 5/500 [00:03<04:44,  1.74it/s, loss=0.629][A
Training Batches:   1%|          | 6/500 [00:03<04:23, 


>>> Average Training Loss: 0.1606


Validation Batches: 100%|██████████| 125/125 [00:17<00:00,  7.11it/s, loss=0.0919]



>>> Average Validation Loss: 0.0907



Training Batches: 100%|██████████| 500/500 [03:31<00:00,  2.36it/s, loss=0.0472]



>>> Average Training Loss: 0.0723


Validation Batches: 100%|██████████| 125/125 [00:17<00:00,  7.17it/s, loss=0.0532]


>>> Average Validation Loss: 0.0510





In [8]:
import numpy as np
from sklearn.metrics import classification_report
import warnings
# Classes that are never predicted have undefined precision. That case is handled
# explicitly with `zero_division=0` in the report call below, rather than by
# silencing every warning in the kernel for all later cells.

# 1. Put model in Evaluation Mode
model.eval()

# 2. Lists to store all predictions and actual answers (one row per patient)
all_predictions = []
all_actuals = []

print("Running final evaluation on the Validation Set...")

with torch.no_grad():
    for batch in val_loader:
        # Only the model inputs need to live on the model's device
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        # Targets are only handed to scikit-learn, so they stay on the CPU
        targets = batch['targets']

        # Get raw logits
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        logits = outputs.logits

        # Step 1: Apply Sigmoid to get independent per-class probabilities
        probabilities = torch.sigmoid(logits)

        # Step 2: Apply the 0.5 threshold to get a 0/1 vector per patient.
        # NOTE: every TARGET_LIST holds exactly one label, but a patient whose
        # highest probability is <= 0.5 gets no predicted label at all here.
        binary_predictions = (probabilities > 0.5).int()

        # Move predictions back to the CPU and store row by row
        all_predictions.extend(binary_predictions.cpu().numpy())
        all_actuals.extend(targets.numpy())

# Convert lists to NumPy arrays for scikit-learn
all_predictions = np.array(all_predictions)
all_actuals = np.array(all_actuals)

# Step 3: Generate the Report Card
print("\n================= FINAL CLASSIFICATION REPORT =================")
print(classification_report(
    all_actuals,
    all_predictions,
    target_names=mlb.classes_,
    digits=3, # Show 3 decimal places
    zero_division=0 # Report undefined precision/recall as 0 without emitting a warning
))

Running final evaluation on the Validation Set...

                                          precision    recall  f1-score   support

     Acute COPD exacerbation / infection      0.000     0.000     0.000        34
                Acute dystonic reactions      1.000     0.745     0.854        51
                        Acute laryngitis      0.000     0.000     0.000        47
                      Acute otitis media      0.000     0.000     0.000        51
                   Acute pulmonary edema      0.000     0.000     0.000        37
                    Acute rhinosinusitis      0.000     0.000     0.000        26
                      Allergic sinusitis      1.000     0.980     0.990        51
                             Anaphylaxis      0.000     0.000     0.000        54
                                  Anemia      0.980     0.980     0.980        99
                     Atrial fibrillation      0.000     0.000     0.000        41
                               Boerhaave      

In [9]:
import torch

def diagnose_patient(model, tokenizer, mlb, text, threshold=0.5, max_length=128):
    """Return every class whose sigmoid probability exceeds ``threshold``.

    Args:
        model: Sequence-classification model; called with ``input_ids`` and
            ``attention_mask`` keyword arguments and expected to return an
            object with a ``logits`` attribute. It is switched to eval mode
            and left there.
        tokenizer: Callable that turns ``text`` into a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        mlb: Fitted binarizer; ``mlb.classes_[i]`` names output neuron ``i``.
        text: The patient narrative to classify.
        threshold: Probability (0-1) a class must exceed to be reported.
        max_length: Token length the narrative is padded / truncated to.

    Returns:
        List of ``(class_name, confidence_percent)`` tuples, highest confidence
        first. The list is empty when no class clears the threshold.
    """
    # 1. Put model in Evaluation mode
    model.eval()

    # 2. Tokenize the input text
    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Take the device from the model itself rather than from a notebook global,
    # so the function works wherever the model lives.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    # 3. Make Prediction
    with torch.no_grad():
        logits = model(input_ids=input_ids, attention_mask=attention_mask).logits
        probabilities = torch.sigmoid(logits)[0]  # first (and only) row of the batch

    # 4. Keep the classes above the threshold
    above_threshold = (probabilities > threshold).tolist()
    predictions = [
        (mlb.classes_[i], probabilities[i].item() * 100)
        for i, keep in enumerate(above_threshold)
        if keep
    ]

    # Sort by highest confidence
    predictions.sort(key=lambda pair: pair[1], reverse=True)
    return predictions

# --- LIVE DEMO ---
# A free-text story describing upper-respiratory symptoms (sore throat, runny nose, cough, fever).
custom_patient_story = "I am a 22-year-old male. I have a really bad sore throat, a runny nose, and I have been coughing all day. I feel a bit feverish too."

results = diagnose_patient(model, tokenizer, mlb, custom_patient_story, threshold=0.5)

print(
    f"PATIENT STORY: '{custom_patient_story}'\n",
    "=== HEAL BRIDGE AI DIAGNOSIS ===",
    sep="\n",
)
if results:
    for disease, confidence in results:
        print(f"-> {disease}: {confidence:.2f}% confidence")
else:
    print("No diseases detected above the confidence threshold.")

PATIENT STORY: 'I am a 22-year-old male. I have a really bad sore throat, a runny nose, and I have been coughing all day. I feel a bit feverish too.'

=== HEAL BRIDGE AI DIAGNOSIS ===
No diseases detected above the confidence threshold.


In [10]:
# --- THE DUAL INFERENCE TEST ---
# The same complaint is phrased twice and both versions are scored with a very
# low threshold (0.05) so that even weak predictions are listed.

# Story 1: phrased in the template style of the training narratives
exact_story = "I am a 22-year-old male. I came into the clinic today because I have a fever (either felt or measured with a thermometer). I have a sore throat. I have a cough."

# Story 2: the conversational phrasing
conversational_story = "I am a 22-year-old male. I have a really bad sore throat, a runny nose, and I have been coughing all day. I feel a bit feverish too."

results_exact = diagnose_patient(model, tokenizer, mlb, exact_story, threshold=0.05)
results_conv = diagnose_patient(model, tokenizer, mlb, conversational_story, threshold=0.05)

for section_title, section_results in (
    ("=== TEST 1: EXACT VOCABULARY ===", results_exact),
    ("\n=== TEST 2: CONVERSATIONAL VOCABULARY ===", results_conv),
):
    print(section_title)
    for disease, confidence in section_results:
        print(f"-> {disease}: {confidence:.2f}% confidence")

=== TEST 1: EXACT VOCABULARY ===
-> Allergic sinusitis: 7.49% confidence
-> Tuberculosis: 5.58% confidence

=== TEST 2: CONVERSATIONAL VOCABULARY ===
-> Allergic sinusitis: 18.01% confidence


In [11]:
def diagnose_patient_top_k(model, tokenizer, mlb, text, top_k=3, max_length=128):
    """Return the ``top_k`` most probable classes regardless of any threshold.

    Args:
        model: Sequence-classification model; called with ``input_ids`` and
            ``attention_mask`` keyword arguments and expected to return an
            object with a ``logits`` attribute. It is switched to eval mode
            and left there.
        tokenizer: Callable that turns ``text`` into a mapping with
            ``input_ids`` and ``attention_mask`` tensors.
        mlb: Fitted binarizer; ``mlb.classes_[i]`` names output neuron ``i``.
        text: The patient narrative to classify.
        top_k: Number of classes to return. Values larger than the number of
            output classes are clamped to that number.
        max_length: Token length the narrative is padded / truncated to.

    Returns:
        List of ``(class_name, confidence_percent)`` tuples, highest
        probability first.
    """
    model.eval()

    encoding = tokenizer(
        text,
        add_special_tokens=True,
        max_length=max_length,
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        return_tensors='pt'
    )

    # Take the device from the model itself rather than from a notebook global.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    input_ids = encoding['input_ids'].to(target_device)
    attention_mask = encoding['attention_mask'].to(target_device)

    with torch.no_grad():
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # Independent per-class probabilities for the single input row
        probabilities = torch.sigmoid(outputs.logits)[0]

    # torch.topk raises when k exceeds the number of elements, so clamp it.
    k = min(top_k, probabilities.numel())
    top_indices = torch.topk(probabilities, k).indices.tolist()

    return [
        (mlb.classes_[idx], probabilities[idx].item() * 100)
        for idx in top_indices
    ]

# --- THE TOP-3 FORCED INFERENCE TEST ---
# Score the two stories defined earlier again, this time always listing the
# three most probable classes, however low their probability is.
results_exact = diagnose_patient_top_k(model, tokenizer, mlb, exact_story, top_k=3)
results_conv = diagnose_patient_top_k(model, tokenizer, mlb, conversational_story, top_k=3)

for section_title, section_results in (
    ("=== TEST 1: EXACT VOCABULARY ===", results_exact),
    ("\n=== TEST 2: CONVERSATIONAL VOCABULARY ===", results_conv),
):
    print(section_title)
    for disease, confidence in section_results:
        print(f"-> {disease}: {confidence:.4f}% confidence")

=== TEST 1: EXACT VOCABULARY ===
-> Allergic sinusitis: 7.4905% confidence
-> Tuberculosis: 5.5840% confidence
-> Bronchiectasis: 3.9299% confidence

=== TEST 2: CONVERSATIONAL VOCABULARY ===
-> Allergic sinusitis: 18.0101% confidence
-> Tuberculosis: 3.9962% confidence
-> Bronchospasm / acute asthma exacerbation: 3.5851% confidence


In [15]:
import pandas as pd

# Load the full training CSV: mount Drive, unzip the archive onto the Colab
# local disk, then read the extracted file with pandas.

# 1. Mount Google Drive (It will ask for permission again if you restarted)
from google.colab import drive
drive.mount('/content/drive')

print("Unzipping the 1-Million row dataset into Colab's fast local storage...")
# 2. Extract the archive into /content/
# (-q = quiet, so the file list is not printed; -o = overwrite existing files
#  without prompting, which makes the cell safe to re-run)
# NOTE(review): the read below assumes the archive holds translated_train_data.csv
# at its top level - confirm against the zip contents.
!unzip -q -o /content/drive/MyDrive/healbridge/translated_train_data.zip -d /content/

# 3. Load the unzipped CSV into Pandas
print("Loading the dataset into memory...")
df_full = pd.read_csv('/content/translated_train_data.csv')

# 4. Verify the load: row / column counts plus a preview of the first rows
print("\n--- DATASET STATUS ---")
print(f"Total Rows: {df_full.shape[0]}")
print(f"Total Columns: {df_full.shape[1]}")
print("First 3 rows:")
print(df_full.head(3))

Mounted at /content/drive
Unzipping the 1-Million row dataset into Colab's fast local storage...
Loading the dataset into memory...

--- DATASET STATUS ---
Total Rows: 1025602
Total Columns: 2
First 3 rows:
                 PATHOLOGY                                          NARRATIVE
0                     URTI  I am an 18-year-old male. I came into the clin...
1  HIV (initial infection)  I am an 21-year-old male. I came into the clin...
2                Pneumonia  I am an 19-year-old female. I came into the cl...


In [16]:
from sklearn.model_selection import train_test_split
from torch.utils.data import DataLoader

print("1. Re-fitting the MultiLabelBinarizer on the FULL dataset...")
# Wrap every single pathology in a one-element list, as the binarizer expects
df_full['TARGET_LIST'] = df_full['PATHOLOGY'].apply(lambda x: [x])

# `mlb` is shared with the model: output neuron i is named by mlb.classes_[i].
# Remember the vocabulary the existing model head was sized against, so a
# change caused by re-fitting is caught here rather than silently mislabeling
# predictions (or failing much later inside the loss).
previous_classes = list(getattr(mlb, 'classes_', []))
mlb.fit(df_full['TARGET_LIST'])
if previous_classes and list(mlb.classes_) != previous_classes:
    raise ValueError(
        "Label vocabulary changed after re-fitting on the full dataset "
        f"({len(previous_classes)} -> {len(mlb.classes_)} classes); the model's "
        "output layer no longer lines up with mlb.classes_. Rebuild the model "
        "with the new number of labels before training."
    )

print("2. Splitting 1 Million Rows into Train and Validation (80/20)...")
# Stratified on PATHOLOGY with a fixed seed, mirroring the tiny-sample split
df_full_train, df_full_val = train_test_split(
    df_full,
    test_size=0.2,
    stratify=df_full['PATHOLOGY'],
    random_state=42
)

print("3. Initializing the massive Datasets (this takes a moment)...")
# Reuses the DDXPlusDataset class defined earlier in the notebook
train_dataset_full = DDXPlusDataset(df_full_train, tokenizer, mlb, max_len=128)
val_dataset_full = DDXPlusDataset(df_full_val, tokenizer, mlb, max_len=128)

print("4. Creating the Production DataLoaders...")
BATCH_SIZE = 16 # Same batch size as the tiny-sample run
# The training loop copies batches with non_blocking=True; those copies can only
# overlap with compute when the batches come from pinned (page-locked) memory.
use_pinned_memory = torch.cuda.is_available()
train_loader_full = DataLoader(
    train_dataset_full, batch_size=BATCH_SIZE, shuffle=True, pin_memory=use_pinned_memory
)
val_loader_full = DataLoader(
    val_dataset_full, batch_size=BATCH_SIZE, shuffle=False, pin_memory=use_pinned_memory
)

print("\n--- PRODUCTION DATA PIPELINE READY ---")
print(f"Total Training Patients: {len(train_dataset_full)}")
print(f"Total Validation Patients: {len(val_dataset_full)}")
print(f"Total Training Batches per Epoch: {len(train_loader_full)}")

1. Re-fitting the MultiLabelBinarizer on the FULL dataset...
2. Splitting 1 Million Rows into Train and Validation (80/20)...
3. Initializing the massive Datasets (this takes a moment)...
4. Creating the Production DataLoaders...

--- PRODUCTION DATA PIPELINE READY ---
Total Training Patients: 820481
Total Validation Patients: 205121
Total Training Batches per Epoch: 51281


In [17]:
import os
import torch
import torch.nn as nn
from torch.optim import AdamW
from tqdm import tqdm

# Resume mixed-precision fine-tuning on the full dataset from the best checkpoint
# on Google Drive. Relies on `model`, `device`, `train_loader_full` and
# `val_loader_full` created by earlier cells.

# 1. Setup the Engine
checkpoint_path = '/content/drive/MyDrive/healbridge/checkpoints/heal_bridge_best_model.pt'
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCEWithLogitsLoss()
scaler = torch.cuda.amp.GradScaler() # Loss scaler for the AMP loop (state restored below when available)

# 2. Load the Checkpoint
print(f"Loading checkpoint from: {checkpoint_path}")
# map_location keeps the load working when the tensors were saved from a
# different device than the one this session is running on.
checkpoint = torch.load(checkpoint_path, map_location=device)

# 3. Restore the model weights, the optimizer state and (if saved) the scaler state
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
# Older checkpoints have no scaler entry, and a disabled scaler saves an empty
# dict; in both cases the fresh scaler created above is kept.
if checkpoint.get('scaler_state_dict'):
    scaler.load_state_dict(checkpoint['scaler_state_dict'])

# 'epoch' holds the number of epochs already completed, so it doubles as the
# zero-based index of the next epoch to run.
start_epoch = checkpoint['epoch']
best_val_loss = checkpoint['loss']
EPOCHS = 3

print("-" * 40)
print("✅ Successfully restored!")
print(f"Resuming from Epoch: {start_epoch + 1}")
print(f"Previous Best Validation Loss: {best_val_loss:.4f}")

# 4. The Resumed High-Speed AMP Loop
for epoch in range(start_epoch, EPOCHS):
    print(f"\n======== Epoch {epoch+1} / {EPOCHS} ========")

    # --- TRAINING ---
    model.train()
    total_train_loss = 0
    train_loop = tqdm(train_loader_full, leave=True, desc="Training")

    for batch in train_loop:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        targets = batch['targets'].to(device, non_blocking=True)

        # Forward pass and loss under float16 autocast
        with torch.autocast(device_type='cuda', dtype=torch.float16):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask)
            loss = criterion(outputs.logits, targets)

        # Read the loss once: every .item() call forces a GPU -> CPU sync
        loss_value = loss.item()
        total_train_loss += loss_value

        # Scale the loss before backward, then let the scaler drive the
        # optimizer step and adjust its scale factor.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        train_loop.set_postfix(loss=loss_value)

    avg_train_loss = total_train_loss / len(train_loader_full)
    print(f"\n>>> Avg Training Loss: {avg_train_loss:.4f}")

    # --- VALIDATION ---
    model.eval()
    total_val_loss = 0
    val_loop = tqdm(val_loader_full, leave=True, desc="Validation")

    with torch.no_grad():
        for batch in val_loop:
            input_ids = batch['input_ids'].to(device, non_blocking=True)
            attention_mask = batch['attention_mask'].to(device, non_blocking=True)
            targets = batch['targets'].to(device, non_blocking=True)

            with torch.autocast(device_type='cuda', dtype=torch.float16):
                outputs = model(input_ids=input_ids, attention_mask=attention_mask)
                loss = criterion(outputs.logits, targets)

            loss_value = loss.item()
            total_val_loss += loss_value
            val_loop.set_postfix(loss=loss_value)

    avg_val_loss = total_val_loss / len(val_loader_full)
    print(f"\n>>> Avg Validation Loss: {avg_val_loss:.4f}")

    # --- CHECKPOINTING ---
    # Only an epoch that beats the best validation loss so far is persisted.
    if avg_val_loss < best_val_loss:
        print(f"🌟 Validation loss improved from {best_val_loss:.4f} to {avg_val_loss:.4f}!")
        print("💾 Saving the brain to Google Drive...")

        checkpoint = {
            'epoch': epoch + 1,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'loss': avg_val_loss
        }
        # Write to a temporary file and rename it over the old checkpoint, so an
        # interrupted save (e.g. a dropped Colab session) cannot leave the only
        # copy of the best model half-written.
        tmp_checkpoint_path = checkpoint_path + '.tmp'
        torch.save(checkpoint, tmp_checkpoint_path)
        os.replace(tmp_checkpoint_path, checkpoint_path)
        best_val_loss = avg_val_loss
        print("✅ Save complete.\n")
    else:
        print("No improvement this epoch. Skipping save.\n")

Loading checkpoint from: /content/drive/MyDrive/healbridge/checkpoints/heal_bridge_best_model.pt
----------------------------------------
✅ Successfully restored!
Resuming from Epoch: 2
Previous Best Validation Loss: 0.0021



Training: 100%|██████████| 51281/51281 [2:08:14<00:00,  6.66it/s, loss=2.28e-5]



>>> Avg Training Loss: 0.0021


Validation: 100%|██████████| 12821/12821 [09:52<00:00, 21.62it/s, loss=8.92e-6]



>>> Avg Validation Loss: 0.0020
🌟 Validation loss improved from 0.0021 to 0.0020!
💾 Saving the brain to Google Drive...
✅ Save complete.




Training: 100%|██████████| 51281/51281 [2:07:41<00:00,  6.69it/s, loss=0.0169]



>>> Avg Training Loss: 0.0020


Validation: 100%|██████████| 12821/12821 [10:00<00:00, 21.35it/s, loss=4.72e-6]



>>> Avg Validation Loss: 0.0019
🌟 Validation loss improved from 0.0020 to 0.0019!
💾 Saving the brain to Google Drive...
✅ Save complete.

