In [None]:
# Cell 1: Install all required packages
!pip install -q transformers==4.30.0 datasets==2.12.0 torch rouge-score mlflow google-cloud-storage pandas

print("="*60)
print("SETUP COMPLETE")
print("="*60)
print("✅ Dependencies installed!")

import torch
print(f"GPU Available: {'YES ✅' if torch.cuda.is_available() else 'NO ❌'}")
if torch.cuda.is_available():
    print(f"GPU Type: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.1f} GB")
print("="*60)

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/113.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m113.6/113.6 kB[0m [31m5.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m314.9/314.9 kB[0m [31m9.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m40.0/40.0 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m106.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.6/474.6 kB[0m [31m43.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Install rouge-score
!pip install rouge-score

print("✅ rouge-score installed!")

Collecting rouge-score
  Using cached rouge_score-0.1.2-py3-none-any.whl
Installing collected packages: rouge-score
Successfully installed rouge-score-0.1.2
✅ rouge-score installed!


In [None]:
# Cell 2: Authenticate to access your GCP bucket
from google.colab import auth
auth.authenticate_user()

# Set your project
PROJECT_ID = "regal-bonito-455919-u3"
BUCKET_NAME = "lab-lens-data-regal-bonito-455919-u3"

!gcloud config set project {PROJECT_ID}

print("="*60)
print("GCP AUTHENTICATION")
print("="*60)
print(f"✅ Authenticated with project: {PROJECT_ID}")
print(f"✅ Bucket: gs://{BUCKET_NAME}")
print("="*60)

INFORMATION: Project 'regal-bonito-455919-u3' has no 'environment' tag set. Use either 'Production', 'Development', 'Test', or 'Staging'. Add an 'environment' tag using `gcloud resource-manager tags bindings create`.
Updated property [core/project].
GCP AUTHENTICATION
✅ Authenticated with project: regal-bonito-455919-u3
✅ Bucket: gs://lab-lens-data-regal-bonito-455919-u3


In [None]:
# Cell 3: Download training data from your GCP bucket
print("="*60)
print("DOWNLOADING DATA FROM GCP")
print("="*60)

!gsutil -m cp gs://{BUCKET_NAME}/data/model_ready/train.csv .
!gsutil -m cp gs://{BUCKET_NAME}/data/model_ready/validation.csv .
!gsutil -m cp gs://{BUCKET_NAME}/data/model_ready/test.csv .

# Verify files downloaded
import os
print("\n✅ Downloaded files:")
for file in ['train.csv', 'validation.csv', 'test.csv']:
    size_mb = os.path.getsize(file) / 1024 / 1024
    print(f"  {file}: {size_mb:.2f} MB")
print("="*60)

DOWNLOADING DATA FROM GCP
Copying gs://lab-lens-data-regal-bonito-455919-u3/data/model_ready/train.csv...
\ [1/1 files][ 49.7 MiB/ 49.7 MiB] 100% Done                                    
Operation completed over 1 objects/49.7 MiB.                                     
Copying gs://lab-lens-data-regal-bonito-455919-u3/data/model_ready/validation.csv...
- [1/1 files][ 11.0 MiB/ 11.0 MiB] 100% Done                                    
Operation completed over 1 objects/11.0 MiB.                                     
Copying gs://lab-lens-data-regal-bonito-455919-u3/data/model_ready/test.csv...
- [1/1 files][ 10.8 MiB/ 10.8 MiB] 100% Done                                    
Operation completed over 1 objects/10.8 MiB.                                     

✅ Downloaded files:
  train.csv: 49.73 MB
  validation.csv: 10.97 MB
  test.csv: 10.76 MB


In [None]:
# Cell 4: Verify data loaded correctly and template exists
import pandas as pd

print("="*60)
print("DATA VERIFICATION")
print("="*60)

train_df = pd.read_csv('train.csv')
val_df = pd.read_csv('validation.csv')
test_df = pd.read_csv('test.csv')

print(f"Train: {len(train_df)} records")
print(f"Validation: {len(val_df)} records")
print(f"Test: {len(test_df)} records")
print(f"Total columns: {len(train_df.columns)}")

# Check required columns exist in every split, not just train: the validation
# and test frames are read with the same column names in later cells.
required_cols = ['input_text', 'target_summary', 'age_group', 'gender', 'ethnicity_clean']
missing_by_split = {
    split_name: [col for col in required_cols if col not in split_df.columns]
    for split_name, split_df in (('train', train_df), ('validation', val_df), ('test', test_df))
}
# Kept for backward compatibility: columns missing from the training split.
missing = missing_by_split['train']

splits_with_gaps = {name: cols for name, cols in missing_by_split.items() if cols}
if splits_with_gaps:
    # Stop here with a clear message instead of hitting a KeyError further down.
    raise ValueError(f"❌ ERROR: Missing columns: {splits_with_gaps}")
print(f"\n✅ All required columns present!")

if train_df.empty:
    raise ValueError("train.csv contains no rows; nothing to train on.")

# Show sample template
first_example = train_df.iloc[0]
# str() guards against a NaN (float) cell, matching how the dataset class reads these columns.
sample_input = str(first_example['input_text'])
sample_summary = str(first_example['target_summary'])

print(f"\n{'='*60}")
print("SAMPLE TEMPLATE (First Training Example):")
print(f"{'='*60}")
print(f"\nINPUT LENGTH: {len(sample_input)} chars")
print(f"TEMPLATE SUMMARY LENGTH: {len(sample_summary)} chars")
print(f"\nTEMPLATE FORMAT:")
print(sample_summary[:500])
print("\n[... rest of template ...]")
print("="*60)

DATA VERIFICATION
Train: 1638 records
Validation: 351 records
Test: 351 records
Total columns: 51

✅ All required columns present!

SAMPLE TEMPLATE (First Training Example):

INPUT LENGTH: 11707 chars
TEMPLATE SUMMARY LENGTH: 540 chars

TEMPLATE FORMAT:

PATIENT: 86.0-year-old M

DATES: Admitted 2199-08-23, Discharged 2199-09-01

ADMISSION: anemia work-up

HISTORY: 1. Coronary artery disease
- s/p cath (): Mild epicardial disease, collalateral...

DIAGNOSIS: Not documented

HOSPITAL COURSE: REASON FOR HOSPITALIZATION:
66 F with complicated PMH including multiple CVAs, CAD, ESRD on
HD (MWF)...

LABS: Magnesium: 1.9 mg/dL;  MCV: 96 fL;  Lactate: 5.5 mmol/L (!)

MEDICATIONS: Not documented

FOLLOW-UP: Extended Care

SUMMARY: Clinical management pr

[... rest of template ...]


In [None]:
# Cell 5: Initialize BioBART Model
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
import torch

print("="*60, "LOADING BIOBART MODEL", "="*60, sep="\n")

MODEL_NAME = "GanjinZero/biobart-v2-base"

# Default to CPU and switch to CUDA only when torch reports an available GPU.
device = 'cpu'
if torch.cuda.is_available():
    device = 'cuda'

print(
    f"Model: {MODEL_NAME}",
    f"Device: {device.upper()}",
    "Pre-training: PubMed biomedical literature",
    "\nDownloading model (this takes 2-3 minutes)...",
    sep="\n",
)

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"✅ Tokenizer loaded | Vocab size: {len(tokenizer)}")

# Load model and place it on the selected device in one step
model = AutoModelForSeq2SeqLM.from_pretrained(MODEL_NAME).to(device)
print(f"✅ Model loaded and moved to {device.upper()}")

# Show model size: total number of elements across all parameter tensors
param_count = sum(weights.numel() for weights in model.parameters())
print(
    f"✅ Model parameters: {param_count:,} ({param_count/1e6:.1f}M)",
    "="*60,
    sep="\n",
)

LOADING BIOBART MODEL
Model: GanjinZero/biobart-v2-base
Device: CUDA
Pre-training: PubMed biomedical literature

Downloading model (this takes 2-3 minutes)...


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/772 [00:00<?, ?B/s]

✅ Tokenizer loaded | Vocab size: 85401


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/666M [00:00<?, ?B/s]

✅ Model loaded and moved to CUDA
✅ Model parameters: 166,404,864 (166.4M)


In [None]:
# Cell 6: Create PyTorch Dataset for Training
from torch.utils.data import Dataset

class MIMICSummarizationDataset(Dataset):
    """Dataset for BioBART fine-tuning.

    Each item is one row of the wrapped DataFrame: the ``input_text`` column is
    tokenized as the encoder input and the ``target_summary`` column as the
    decoder labels. Both are truncated and padded to fixed lengths.

    Padding positions in the labels are replaced with ``LABEL_IGNORE_INDEX``
    so the loss is computed only over real summary tokens.
    """

    # Label value skipped by the cross-entropy loss (PyTorch's default ``ignore_index``).
    LABEL_IGNORE_INDEX = -100

    def __init__(self, dataframe, tokenizer, max_input_length=512, max_target_length=256):
        """Store the data and tokenization settings.

        Args:
            dataframe: DataFrame with ``input_text`` and ``target_summary`` columns.
            tokenizer: Callable tokenizer returning ``input_ids`` and
                ``attention_mask`` tensors when called with ``return_tensors='pt'``.
            max_input_length: Fixed token length of the encoder input.
            max_target_length: Fixed token length of the labels.
        """
        # Reset the index so positional access lines up with 0..len-1.
        self.data = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_input_length = max_input_length
        self.max_target_length = max_target_length

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

    def _encode(self, text, max_length):
        """Tokenize ``text`` to exactly ``max_length`` tokens (truncate, then pad)."""
        return self.tokenizer(
            text,
            max_length=max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` as 1-D tensors for row ``idx``."""
        row = self.data.iloc[idx]

        # str() keeps tokenization from failing on non-string cells (e.g. NaN).
        input_encoding = self._encode(str(row['input_text']), self.max_input_length)
        target_encoding = self._encode(str(row['target_summary']), self.max_target_length)

        labels = target_encoding['input_ids'].flatten()
        target_mask = target_encoding['attention_mask'].flatten()
        # Mask padded label positions; otherwise the model is also trained (and
        # evaluated) on predicting pad tokens, which distorts the reported loss.
        labels = labels.masked_fill(target_mask == 0, self.LABEL_IGNORE_INDEX)

        return {
            'input_ids': input_encoding['input_ids'].flatten(),
            'attention_mask': input_encoding['attention_mask'].flatten(),
            'labels': labels
        }

# Create datasets
print("="*60, "CREATING PYTORCH DATASETS", "="*60, sep="\n")

# Build one dataset per split with identical limits: 512 input tokens, 256 target tokens.
train_dataset, val_dataset, test_dataset = (
    MIMICSummarizationDataset(split_frame, tokenizer, 512, 256)
    for split_frame in (train_df, val_df, test_df)
)

print(
    f"✅ Train dataset: {len(train_dataset)} samples",
    f"✅ Validation dataset: {len(val_dataset)} samples",
    f"✅ Test dataset: {len(test_dataset)} samples",
    "="*60,
    sep="\n",
)

CREATING PYTORCH DATASETS
✅ Train dataset: 1638 samples
✅ Validation dataset: 351 samples
✅ Test dataset: 351 samples


In [None]:
# Cell 7: Configure Training Arguments (FIXED)
from transformers import Seq2SeqTrainingArguments, DataCollatorForSeq2Seq

import torch

print("="*60)
print("CONFIGURING TRAINING")
print("="*60)

OUTPUT_DIR = "/content/biobart_model"

# fp16 mixed precision needs a GPU; enabling it unconditionally makes the
# arguments fail on a CPU-only runtime, which this notebook otherwise
# supports (the model-loading cell falls back to 'cpu').
USE_FP16 = torch.cuda.is_available()

training_args = Seq2SeqTrainingArguments(
    output_dir=OUTPUT_DIR,

    # Training params
    num_train_epochs=3,
    per_device_train_batch_size=4,  # GPU can handle bigger batch
    per_device_eval_batch_size=4,
    learning_rate=2e-5,
    warmup_steps=100,
    weight_decay=0.01,

    # Evaluation - FIXED PARAMETER NAMES
    eval_strategy="steps",  # Changed from evaluation_strategy
    eval_steps=200,
    # Saving uses the same cadence as evaluation so each checkpoint has an eval
    # score; NOTE(review): believed to be required by load_best_model_at_end.
    save_strategy="steps",
    save_steps=200,
    save_total_limit=2,

    # Model selection
    load_best_model_at_end=True,
    metric_for_best_model='eval_loss',

    # Generation
    predict_with_generate=True,
    generation_max_length=256,

    # Optimization for GPU
    fp16=USE_FP16,  # Mixed precision for faster training (GPU only)

    # Logging
    logging_steps=50,
    logging_dir=f"{OUTPUT_DIR}/logs",
    report_to="none",  # Disable external logging for Colab
)

# Data collator
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    padding=True
)

# Print the values actually held by training_args so the summary cannot drift
# from the configuration above.
print("✅ Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs:g}")
print(f"  Batch size: {training_args.per_device_train_batch_size}")
print(f"  Learning rate: {training_args.learning_rate:g}")
print(f"  FP16: {training_args.fp16}" + (" (GPU acceleration)" if training_args.fp16 else " (no GPU detected)"))
print(f"  Eval every: {training_args.eval_steps} steps")
print("="*60)

CONFIGURING TRAINING
✅ Training configuration:
  Epochs: 3
  Batch size: 4
  Learning rate: 2e-5
  FP16: True (GPU acceleration)
  Eval every: 200 steps


In [None]:
# Cell 8: Train BioBART Model
from transformers import Seq2SeqTrainer
import time

print("="*60)
print("STARTING MODEL TRAINING")
print("="*60)
print("Estimated time: 20-30 minutes on T4 GPU")
print("="*60)

# Trainer wired to the model, arguments, datasets and collator defined in earlier cells.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer=tokenizer,
    data_collator=data_collator,
)

# Start training and measure wall-clock duration
start_time = time.time()
train_result = trainer.train()
training_duration = time.time() - start_time

print("\n" + "="*60)
print("TRAINING COMPLETE!")
print("="*60)
print(f"✅ Duration: {training_duration/60:.1f} minutes")
# `training_loss` is the loss averaged over the whole run, not the loss at the
# last step, so label it as an average to avoid misreading it against the
# per-step table printed by the trainer.
print(f"✅ Average training loss: {train_result.training_loss:.4f}")
print("="*60)

STARTING MODEL TRAINING
Estimated time: 20-30 minutes on T4 GPU


  trainer = Seq2SeqTrainer(
  batch["labels"] = torch.tensor(batch["labels"], dtype=torch.int64)


Step,Training Loss,Validation Loss
200,1.4426,1.292787
400,1.2909,1.121626
600,1.1194,1.066876
800,1.1085,1.038848
1000,1.0107,1.024305
1200,1.0509,1.020286


There were missing keys in the checkpoint model loaded: ['model.encoder.embed_tokens.weight', 'model.decoder.embed_tokens.weight', 'lm_head.weight'].



TRAINING COMPLETE!
✅ Duration: 9.5 minutes
✅ Final loss: 1.4024


In [None]:
# Cell 9: Evaluate Model on Test Set
from rouge_score import rouge_scorer
import numpy as np

print("="*60)
print("EVALUATING ON TEST SET")
print("="*60)

# Evaluate
eval_results = trainer.evaluate(eval_dataset=test_dataset)

print("Test Set Metrics:")
print(f"  Loss: {eval_results['eval_loss']:.4f}")
print(f"  Runtime: {eval_results['eval_runtime']:.1f} seconds")
print(f"  Samples/second: {eval_results['eval_samples_per_second']:.2f}")

# Calculate ROUGE scores manually on a sample
MAX_ROUGE_SAMPLES = 50
# The test set may hold fewer rows than the cap; report the number actually scored.
n_rouge_samples = min(MAX_ROUGE_SAMPLES, len(test_df))
print(f"\nCalculating ROUGE scores on {n_rouge_samples} test samples...")

scorer = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
rouge_scores = {'rouge1': [], 'rouge2': [], 'rougeL': []}

# Make sure dropout is disabled for generation regardless of which cells ran before.
model.eval()

# Generate summaries for the first n_rouge_samples test rows
for idx in range(n_rouge_samples):
    row = test_df.iloc[idx]

    # Tokenize input (truncated to the same 512-token limit used for training)
    inputs = tokenizer(
        str(row['input_text']),
        max_length=512,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    # Generate summary with beam search; no gradients are needed for inference
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            early_stopping=True,
            no_repeat_ngram_size=3
        )

    # Decode
    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)
    reference = str(row['target_summary'])

    # Calculate ROUGE (reference first, prediction second) and keep the F-measure
    scores = scorer.score(reference, generated)
    for metric_name in rouge_scores:
        rouge_scores[metric_name].append(scores[metric_name].fmeasure)

# Average ROUGE scores
if n_rouge_samples == 0:
    # np.mean of an empty list would print nan with a warning; say so explicitly.
    print("\nROUGE Scores: no test samples available.")
else:
    print(f"\nROUGE Scores (average of {n_rouge_samples} samples):")
    print(f"  ROUGE-1: {np.mean(rouge_scores['rouge1']):.4f}")
    print(f"  ROUGE-2: {np.mean(rouge_scores['rouge2']):.4f}")
    print(f"  ROUGE-L: {np.mean(rouge_scores['rougeL']):.4f}")
print("="*60)

EVALUATING ON TEST SET


Test Set Metrics:
  Loss: 1.0257
  Runtime: 20.7 seconds
  Samples/second: 16.98

Calculating ROUGE scores on 50 test samples...

ROUGE Scores (average of 50 samples):
  ROUGE-1: 0.3630
  ROUGE-2: 0.1712
  ROUGE-L: 0.3120


In [None]:
# Cell 10: Generate and Display Sample Summaries
print("="*60)
print("SAMPLE GENERATED SUMMARIES")
print("="*60)

# Generate 3 diverse samples: the first, middle and last test rows.
# The set removes duplicates and the guard handles an empty frame, so a very
# small test set neither repeats a row nor indexes out of range.
sample_indices = sorted({0, len(test_df)//2, len(test_df)-1}) if len(test_df) > 0 else []

# Make sure dropout is disabled for generation regardless of which cells ran before.
model.eval()

for i, idx in enumerate(sample_indices, 1):
    row = test_df.iloc[idx]

    # str() mirrors the dataset class and the ROUGE cell, so a NaN cell does not
    # break len() or slicing below.
    input_text = str(row['input_text'])
    reference = str(row['target_summary'])

    # Generate summary
    inputs = tokenizer(
        input_text,
        max_length=512,
        truncation=True,
        return_tensors='pt'
    ).to(device)

    # NOTE(review): the ROUGE cell also passes no_repeat_ngram_size=3; these
    # samples are decoded without it, so they can differ from the scored
    # outputs. Confirm which setting is intended.
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_length=256,
            num_beams=4,
            early_stopping=True
        )

    generated = tokenizer.decode(outputs[0], skip_special_tokens=True)

    print(f"\n--- SAMPLE {i} ---")
    print(f"Demographics: {row.get('age_group', 'N/A')}, {row.get('gender', 'N/A')}, {row.get('ethnicity_clean', 'N/A')}")
    print(f"Input length: {len(input_text)} chars")
    print(f"\nREFERENCE SUMMARY:")
    print(reference[:300])
    print("\n\nGENERATED SUMMARY:")
    print(generated[:300])
    print("-"*60)

print("="*60)

SAMPLE GENERATED SUMMARIES

--- SAMPLE 1 ---
Demographics: <18, F, WHITE
Input length: 7036 chars

REFERENCE SUMMARY:

PATIENT: 0.0-year-old F

DATES: Admitted 2198-04-12, Discharged 2198-05-23

ADMISSION: confusion

HISTORY: Past Oncologic History: Primarily taken from Dr. notes
form .

DIAGNOSIS: seizures due to metastatic melanoma

HOSPITAL COURSE: BRIEF HOSPITAL COURSE:
Ms.  is a 46y/o lady who has metastatic m


GENERATED SUMMARY:

PATIENT: 0.0-year-old F

DATES: Admitted 2167-01-19, Discharged 216701-02-25

ADMISSION: Not documented

HISTORY: Not known

DIAGNOSIS: Not reported

SUMMARY: Clinical management provided. Patient discharged.

------------------------------------------------------------

--- SAMPLE 2 ---
Demographics: 35-50, F, WHITE
Input length: 13413 chars

REFERENCE SUMMARY:

PATIENT: 39.0-year-old F

DATES: Admitted 2125-03-28, Discharged 2125-04-06

ADMISSION: sudden right sided weakness

HISTORY: -Hypertension
- Renal cancer, s/p nephrectomy by Dr.   at
 1.

DIAG

In [None]:
# Cell 11: Save Trained Model to GCP Bucket
print("="*60)
print("SAVING MODEL TO GCP")
print("="*60)

# Save model locally in Colab first
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print(f"✅ Model saved locally to: {OUTPUT_DIR}")

# Upload to GCP bucket
print(f"\nUploading to GCP bucket...")
!gsutil -m cp -r {OUTPUT_DIR}/* gs://{BUCKET_NAME}/models/biobart/

print(f"✅ Model uploaded to: gs://{BUCKET_NAME}/models/biobart/")
print("="*60)

# Verify upload
!gsutil ls gs://{BUCKET_NAME}/models/biobart/

SAVING MODEL TO GCP
✅ Model saved locally to: /content/biobart_model

Uploading to GCP bucket...
Copying file:///content/biobart_model/checkpoint-1200/model.safetensors [Content-Type=application/octet-stream]...
Copying file:///content/biobart_model/checkpoint-1200/training_args.bin [Content-Type=application/octet-stream]...
Copying file:///content/biobart_model/checkpoint-1200/tokenizer_config.json [Content-Type=application/json]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled 