In [1]:
# Cell 1: Install compatible libraries
# Only transformers is version-pinned (4.40.1); datasets, accelerate and bitsandbytes are
# upgraded to whatever pip resolves at run time.
# NOTE(review): consider pinning the remaining packages so re-runs install the same versions.
!pip install -q --upgrade "transformers==4.40.1" "datasets" "accelerate" "bitsandbytes"
# Metric packages imported in Cell 3 (rouge_score, bert_score); installed unpinned.
!pip install -q "rouge_score" "bert_score"

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m138.0/138.0 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.0/9.0 MB[0m [31m65.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m494.8/494.8 kB[0m [31m26.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m367.1/367.1 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m72.9/72.9 MB[0m [31m23.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.6/3.6 MB[0m [31m71.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m363.4/363.4 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━

In [None]:
# Cell 2: Log in to Hugging Face
import os
from huggingface_hub import HfFolder, HfApi

# The access token is read from the environment; it is never written into the notebook.
hf_token = os.getenv("HF_TOKEN")

if not hf_token:
    # No token: warn and carry on so the rest of the notebook can still be attempted.
    print("!!! WARNING: HF_TOKEN environment variable not set. !!!")
    print("Downloads for gated models may fail. Please set the HF_TOKEN environment variable.")
else:
    print("Hugging Face token found in environment variables.")
    HfFolder.save_token(hf_token)
    print("Hugging Face token loaded and saved for the session.")

    # Optional sanity check: ask the Hub which account this token belongs to.
    # Any failure here (including a missing 'name' field) is reported, not raised.
    try:
        user_info = HfApi().whoami(token=hf_token)
        print(f"Successfully logged in to Hugging Face as: {user_info['name']}")
    except Exception as err:
        print(f"Hugging Face login verification failed: {err}")

Hugging Face token retrieved from Kaggle Secrets.
Hugging Face token loaded and saved for the session.
Successfully logged in to Hugging Face as: saimaanas49


In [3]:
# Cell 3: Define Functions
import pandas as pd
import torch
import gc  # Garbage Collector interface
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer
from rouge_score import rouge_scorer
from bert_score import score
import json

# --- 1. Data Loading and Preparation ---
def load_and_prepare_data(csv_path, num_samples=5):
    """Read a CSV into a DataFrame and keep only its leading rows.

    Args:
        csv_path: Location passed straight to ``pd.read_csv``.
        num_samples: How many rows, counted from the top, to keep.

    Returns:
        A DataFrame holding the first ``num_samples`` rows of the file.
    """
    print(f"Loading data from {csv_path}...")
    full_frame = pd.read_csv(csv_path)
    print("Data loaded successfully.")
    leading_rows = full_frame.head(n=num_samples)
    return leading_rows

# --- 2. Model Generation Functions (Optimized for Memory) ---
def create_generator(model_name="mistralai/Mistral-7B-Instruct-v0.2"):
    """Creates a 4-bit quantized text-generation pipeline.

    Args:
        model_name: Model identifier passed to ``from_pretrained`` for both the
            model and the tokenizer. Defaults to the checkpoint this notebook
            benchmarks.

    Returns:
        A transformers ``"text-generation"`` pipeline built from the loaded
        model and tokenizer.
    """
    # Local import keeps this function self-contained; transformers is already a
    # dependency of the notebook's import cell.
    from transformers import BitsAndBytesConfig

    print(f"Loading model '{model_name}' in 4-bit quantization...")
    model = AutoModelForCausalLM.from_pretrained(
        model_name,
        torch_dtype=torch.float16,
        device_map="auto",
        # Passing load_in_4bit=True directly is deprecated (transformers warns about it);
        # the supported route is a BitsAndBytesConfig supplied as quantization_config.
        quantization_config=BitsAndBytesConfig(load_in_4bit=True),
    )
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    return pipeline(
        "text-generation",
        model=model,
        tokenizer=tokenizer,
    )

def generate_notes_zero_shot(generator, dialogues):
    """Generates one SOAP note per dialogue using a pre-loaded generator.

    Args:
        generator: Callable text-generation pipeline. It is called with the prompt
            plus generation kwargs, must expose ``tokenizer.eos_token_id``, and must
            return a list whose first item has a ``'generated_text'`` entry.
        dialogues: Sequence of doctor-patient conversation strings.

    Returns:
        A list of generated note strings, one per dialogue, in input order.
    """
    instruction = "Given the following doctor-patient conversation, generate a clinical note in SOAP format."
    generated_notes = []
    for i, dialogue in enumerate(dialogues):
        print(f"  - Generating zero-shot note for sample {i+1}/{len(dialogues)}...")
        prompt = f"{instruction}\n\n{dialogue}"
        outputs = generator(prompt, max_new_tokens=512, num_return_sequences=1, eos_token_id=generator.tokenizer.eos_token_id)
        text = outputs[0]['generated_text']
        # The returned text echoes the prompt. Remove exactly that prefix instead of
        # splitting on "SOAP format.": the dialogue follows that phrase in the prompt,
        # so splitting left the whole conversation inside the "generated" note, and a
        # completion that repeated the phrase lost everything before it.
        if text.startswith(prompt):
            text = text[len(prompt):]
        generated_notes.append(text.strip())
    return generated_notes

def generate_notes_few_shot(generator, dialogues, examples):
    """Generates one SOAP note per dialogue, prefixing each prompt with worked examples.

    Args:
        generator: Callable text-generation pipeline. It is called with the prompt
            plus generation kwargs, must expose ``tokenizer.eos_token_id``, and must
            return a list whose first item has a ``'generated_text'`` entry.
        dialogues: Sequence of doctor-patient conversation strings to summarise.
        examples: Iterable of mappings with ``'dialogue'`` and ``'note'`` keys used
            as the few-shot demonstrations.

    Returns:
        A list of generated note strings, one per dialogue, in input order.
    """
    example_prompt = "".join(
        f"Conversation:\n{ex['dialogue']}\nSOAP Note:\n{ex['note']}\n\n" for ex in examples
    )
    generated_notes = []
    for i, dialogue in enumerate(dialogues):
        print(f"  - Generating few-shot note for sample {i+1}/{len(dialogues)}...")
        prompt = f"{example_prompt}Conversation:\n{dialogue}\nSOAP Note:"
        outputs = generator(prompt, max_new_tokens=512, num_return_sequences=1, eos_token_id=generator.tokenizer.eos_token_id)
        text = outputs[0]['generated_text']
        # Remove exactly the echoed prompt. Splitting on "SOAP Note:" and keeping the
        # last piece discarded the real note whenever the completion repeated that
        # marker (e.g. when the model continued the few-shot pattern).
        if text.startswith(prompt):
            text = text[len(prompt):]
        generated_notes.append(text.strip())
    return generated_notes

# --- 3. Evaluation ---
def evaluate_generation(generated_notes, reference_notes):
    """Scores generated notes against references with ROUGE and BERTScore.

    Args:
        generated_notes: List of model-produced note strings.
        reference_notes: List of reference note strings, aligned by position.

    Returns:
        A dict with the per-sample ROUGE results under ``"rouge"`` and the mean
        BERTScore precision, recall and F1 as Python floats.
    """
    rouge = rouge_scorer.RougeScorer(['rouge1', 'rouge2', 'rougeL'], use_stemmer=True)
    rouge_scores = []
    # The scorer is called with the reference first and the generated note second.
    for reference, candidate in zip(reference_notes, generated_notes):
        rouge_scores.append(rouge.score(reference, candidate))

    print("Calculating BERTScore...")
    if torch.cuda.is_available():
        bert_device = "cuda"
    else:
        bert_device = "cpu"
    precision, recall, f1 = score(
        generated_notes,
        reference_notes,
        lang="en",
        verbose=True,
        model_type='roberta-large',
        device=bert_device,
    )

    metrics = {"rouge": rouge_scores}
    metrics["bert_score_precision"] = precision.mean().item()
    metrics["bert_score_recall"] = recall.mean().item()
    metrics["bert_score_f1"] = f1.mean().item()
    return metrics

2025-07-18 10:42:15.136827: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1752835335.498621      19 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1752835335.601119      19 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [4]:
# Cell 4: Run Benchmark and Display/Save Results
def _mean_rouge_f1(rouge_scores):
    """Averages ROUGE-1/2/L F1 over per-sample ROUGE results; all zeros when the list is empty."""
    metric_names = ('rouge1', 'rouge2', 'rougeL')
    if not rouge_scores:
        return {f"{name}_f1": 0 for name in metric_names}
    return {
        f"{name}_f1": sum([s[name].fmeasure for s in rouge_scores]) / len(rouge_scores)
        for name in metric_names
    }


def _print_metrics(label, results, rouge_means):
    """Prints the averaged ROUGE F1 values and BERTScore F1 for one prompting strategy."""
    print(f"\n*{label}:*")
    if results['rouge']:
        print(f"  - Average ROUGE-1 F1: {rouge_means['rouge1_f1']}")
        print(f"  - Average ROUGE-2 F1: {rouge_means['rouge2_f1']}")
        print(f"  - Average ROUGE-L F1: {rouge_means['rougeL_f1']}")
        print(f"  - BERTScore F1: {results['bert_score_f1']:.4f}")
    else:
        print(f"  - ROUGE scores not available for {label}.")


def _release_generator_memory():
    """Runs the garbage collector and empties the CUDA cache after a generator is dropped."""
    gc.collect()
    torch.cuda.empty_cache()


def run_benchmark_and_save_results(
    dataset_path="https://raw.githubusercontent.com/wyim/aci-bench/main/data/challenge_data/train.csv",
    num_samples_for_demo=3,
    num_few_shot_examples=2,
    output_dir='/kaggle/working',
):
    """Benchmarks zero-shot vs. few-shot SOAP-note generation and saves the results.

    Loads the dataset, generates notes with both prompting strategies, scores them
    with ROUGE and BERTScore, prints a side-by-side comparison, and writes the
    generated notes (CSV) and averaged metrics (JSON) into ``output_dir``.

    Args:
        dataset_path: CSV location with ``dialogue`` and ``note`` columns.
        num_samples_for_demo: Number of dialogues to generate notes for and evaluate.
        num_few_shot_examples: Number of leading rows used as few-shot demonstrations;
            these rows are excluded from evaluation.
        output_dir: Directory that receives ``generated_clinical_notes.csv`` and
            ``quantitative_results.json``. Created if it does not exist.

    Returns:
        None. Results are printed and written to disk.
    """
    # Local import keeps this function self-contained regardless of which cells ran.
    import os

    # Fail fast on an unusable output location rather than after the GPU work.
    os.makedirs(output_dir, exist_ok=True)

    # --- Load Data ---
    data_sample = load_and_prepare_data(dataset_path, num_samples=num_samples_for_demo + num_few_shot_examples)
    # The first rows become few-shot demonstrations; the remainder is evaluated.
    evaluation_dialogues = data_sample['dialogue'].tolist()[num_few_shot_examples:]
    reference_notes = data_sample['note'].tolist()[num_few_shot_examples:]
    few_shot_examples = data_sample.head(num_few_shot_examples).to_dict('records')

    # --- Run and Evaluate Zero-Shot Task ---
    print("\n--- Loading Model (with 4-bit Quantization) for Zero-Shot Task ---")
    zero_shot_generator = create_generator()

    print("\n--- Benchmarking Zero-Shot ---")
    zero_shot_notes = generate_notes_zero_shot(zero_shot_generator, evaluation_dialogues)
    zero_shot_results = evaluate_generation(zero_shot_notes, reference_notes)

    print("\n--- Clearing GPU Memory After Zero-Shot ---")
    del zero_shot_generator
    _release_generator_memory()
    print("✅ GPU Memory Cleared Successfully!")

    # --- Run and Evaluate Few-Shot Task ---
    # NOTE(review): create_generator() is called with the same arguments as above, so the
    # same model is loaded a second time; reusing one generator would avoid the reload.
    print("\n--- Reloading Model for Few-Shot Task ---")
    few_shot_generator = create_generator()

    print("\n--- Benchmarking Few-Shot ---")
    few_shot_notes = generate_notes_few_shot(few_shot_generator, evaluation_dialogues, few_shot_examples)
    few_shot_results = evaluate_generation(few_shot_notes, reference_notes)

    print("\n--- Final GPU Memory Cleanup ---")
    del few_shot_generator
    _release_generator_memory()
    print("✅ Final GPU Memory Cleared Successfully!")

    # Average each ROUGE metric once; the same numbers feed the printout and the JSON file.
    zero_shot_rouge = _mean_rouge_f1(zero_shot_results['rouge'])
    few_shot_rouge = _mean_rouge_f1(few_shot_results['rouge'])

    # --- 5. Analysis and Findings ---
    print("\n--- Findings ---")
    print("\n**Side-by-Side Comparison:**")
    for i in range(len(evaluation_dialogues)):
        print(f"\n--- Sample {i+1} ---")
        print(f"**Dialogue:**\n{evaluation_dialogues[i]}\n")
        print(f"**Reference Note:**\n{reference_notes[i]}\n")
        print(f"**Generated Note (Zero-Shot):**\n{zero_shot_notes[i]}\n")
        print(f"**Generated Note (Few-Shot):**\n{few_shot_notes[i]}\n")


    print("\n**Quantitative Results:**")
    _print_metrics("Zero-Shot", zero_shot_results, zero_shot_rouge)
    _print_metrics("Few-Shot", few_shot_results, few_shot_rouge)

    # --- Save Generated Notes to CSV ---
    print("\n--- Saving Generated Notes to CSV ---")

    results_df = pd.DataFrame({
        'Dialogue': evaluation_dialogues,
        'Reference_Note': reference_notes,
        'Generated_Note_Zero_Shot': zero_shot_notes,
        'Generated_Note_Few_Shot': few_shot_notes
    })

    output_path_csv = os.path.join(output_dir, 'generated_clinical_notes.csv')
    results_df.to_csv(output_path_csv, index=False)
    print(f"✅ Generated notes saved to {output_path_csv}")

    # Save the quantitative results to JSON
    quantitative_results_data = {
        "zero_shot": {**zero_shot_rouge, "bert_score_f1": zero_shot_results['bert_score_f1']},
        "few_shot": {**few_shot_rouge, "bert_score_f1": few_shot_results['bert_score_f1']},
    }
    output_path_json = os.path.join(output_dir, 'quantitative_results.json')
    with open(output_path_json, 'w') as f:
        json.dump(quantitative_results_data, f, indent=4)
    print(f"✅ Quantitative results saved to {output_path_json}")

    # --- 6. Suggestions for Improvement ---
    print("\n--- Suggestions for Improvement ---")
    print("""
1.  **Fine-Tuning:** The most significant improvements are likely to come from fine-tuning a model like MedAlpaca on the ACI-Bench dataset. This will allow the model to learn the specific format and nuances of clinical note generation.

2.  **Chain-of-Thought (CoT) Prompting:** Instead of asking for the SOAP note directly, prompt the model to first identify the subjective, objective, assessment, and plan sections from the dialogue and then construct the note. This can improve factuality.
    * *Example CoT Prompt:* "First, extract the subjective complaints from the patient. Second, list the objective findings from the doctor's examination. Third, state the doctor's assessment. Fourth, outline the proposed plan. Finally, combine these into a formal SOAP note."

3.  **Domain-Specific Conditioning:** For models that are not medically pre-trained, providing a brief preamble in the prompt that sets the context (e.g., "You are a medical scribe summarizing a doctor-patient conversation.") can improve the tone and terminology of the generated text.

4.  **Output Reranking and Post-processing:**
    * **Reranking:** Generate multiple candidate notes and use a scoring function (e.g., based on factuality checks or a trained classifier) to select the best one.
    * **Post-processing:** Implement rules to clean up the output, such as ensuring the standard SOAP headings are present and correctly formatted.

5.  **Novel Evaluation Metrics:**
    * **Section-wise Evaluation:** Evaluate the performance on each section of the SOAP note independently to identify specific weaknesses (e.g., the model might be good at capturing subjective information but poor at formulating a plan).
    * **Medical Entity Extraction:** Use a Named Entity Recognition (NER) model to extract medical entities (symptoms, diagnoses, medications) from both the generated and reference notes and compare the recall and precision. This provides a more granular assessment of factual accuracy.
    """)

# Execute the combined pipeline: zero-shot and few-shot generation, evaluation,
# the printed comparison, and saving the CSV/JSON result files.
run_benchmark_and_save_results()

Loading data from https://raw.githubusercontent.com/wyim/aci-bench/main/data/challenge_data/train.csv...
Data loaded successfully.

--- Loading Model (with 4-bit Quantization) for Zero-Shot Task ---
Loading model 'mistralai/Mistral-7B-Instruct-v0.2' in 4-bit quantization...




config.json:   0%|          | 0.00/596 [00:00<?, ?B/s]

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


model.safetensors.index.json:   0%|          | 0.00/25.1k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/3 [00:00<?, ?it/s]

model-00001-of-00003.safetensors:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

model-00002-of-00003.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00003.safetensors:   0%|          | 0.00/4.54G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/111 [00:00<?, ?B/s]



tokenizer_config.json:   0%|          | 0.00/2.10k [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/493k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.80M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



--- Benchmarking Zero-Shot ---
  - Generating zero-shot note for sample 1/3...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


  - Generating zero-shot note for sample 2/3...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


  - Generating zero-shot note for sample 3/3...
Calculating BERTScore...




tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.78 seconds, 3.86 sentences/sec

--- Clearing GPU Memory After Zero-Shot ---


The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


✅ GPU Memory Cleared Successfully!

--- Reloading Model for Few-Shot Task ---
Loading model 'mistralai/Mistral-7B-Instruct-v0.2' in 4-bit quantization...


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.



--- Benchmarking Few-Shot ---
  - Generating few-shot note for sample 1/3...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


  - Generating few-shot note for sample 2/3...


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


  - Generating few-shot note for sample 3/3...
Calculating BERTScore...


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.53 seconds, 5.65 sentences/sec

--- Final GPU Memory Cleanup ---
✅ Final GPU Memory Cleared Successfully!

--- Findings ---

**Side-by-Side Comparison:**

--- Sample 1 ---
**Dialogue:**
[doctor] hi , john . how are you ?
[patient] hey . well , relatively speaking , okay . good to see you .
[doctor] good to see you as well . so i know the nurse told you about dax . i'm gon na tell dax a little bit about you .
[patient] okay .
[doctor] so john is a 61-year-old male with a past medical history significant for kidney stones , migraines and reflux , who presents with some back pain . so john , what's going on with your back ?
[patient] uh , i'm feeling a lot of the same pain that i had when i had kidney stones about two years ago , so i'm a little concerned .
[doctor] yeah . and so wh- what side of your back is it on ?
[patient] uh , honestly , it shifts . it started from the right side and it kinda moved over , and now i feel it in the left side of my back .
[doctor] okay . and ,