# SFT Dataset Creator - Starter Code
## Module 1: Supervised Fine-Tuning

This notebook helps you create a synthetic dataset for supervised fine-tuning of clinical trial eligibility assessment.

## Imports

In [51]:
import json
import random
from typing import List, Dict, Tuple
from npcpy.npc_compiler import NPC

## Clinical Trial Criteria

Define the inclusion and exclusion criteria for the clinical trial.

In [52]:
# Clinical trial inclusion/exclusion criteria
INCLUSION_CRITERIA = {
    "age_range": (18, 65),
    "bmi_range": (18.5, 30.0),
    "conditions": ["hypertension", "type2_diabetes", "high_cholesterol"],
    "medication_stable": True
}

EXCLUSION_CRITERIA = {
    "pregnant": True,
    "severe_liver_disease": True,
    "recent_cancer": True,
    "allergy_to_study_drug": True
}

print("Inclusion criteria:", INCLUSION_CRITERIA)
print("Exclusion criteria:", EXCLUSION_CRITERIA)

Inclusion criteria: {'age_range': (18, 65), 'bmi_range': (18.5, 30.0), 'conditions': ['hypertension', 'type2_diabetes', 'high_cholesterol'], 'medication_stable': True}
Exclusion criteria: {'pregnant': True, 'severe_liver_disease': True, 'recent_cancer': True, 'allergy_to_study_drug': True}


## Generate Synthetic Patient

**TODO**: Implement patient generation logic

In [53]:
def generate_synthetic_patient() -> Dict:
    """
    Generate a synthetic patient profile with random characteristics.
    TODO: Implement patient generation logic
    """

    age = random.randint(16, 70)
    bmi = round(random.uniform(15.0, 35.0), 1)

    if random.random() < 0.7:
        conditions = random.sample(INCLUSION_CRITERIA["conditions"], random.randint(1, 2))
    else:
        conditions = random.sample(["asthma", "arthritis", "migraine", "depression"], random.randint(0, 2))

    return {
        "age": age,
        "bmi": bmi,
        "conditions": conditions,
        "medication_stable": random.choice([True, False]),
        "pregnant": random.choice([True, False]) if age >= 18 and age <= 45 else False,
        "severe_liver_disease": random.choice([True, False]),
        "recent_cancer": random.choice([True, False]),
        "allergy_to_study_drug": random.choice([True, False])
    }

### Test Patient Generation

In [54]:
# Generate and display a sample patient
sample_patient = generate_synthetic_patient()
print("Sample patient:")
for key, value in sample_patient.items():
    print(f"  {key}: {value}")

Sample patient:
  age: 26
  bmi: 34.4
  conditions: ['high_cholesterol', 'hypertension']
  medication_stable: True
  pregnant: True
  severe_liver_disease: True
  recent_cancer: False
  allergy_to_study_drug: False


## Check Eligibility

**TODO**: Implement eligibility checking logic

In [55]:
def check_eligibility(patient: Dict) -> bool:
    """
    Check if patient meets inclusion/exclusion criteria.
    """
    age_ok = INCLUSION_CRITERIA["age_range"][0] <= patient["age"] <= INCLUSION_CRITERIA["age_range"][1]
    bmi_ok = INCLUSION_CRITERIA["bmi_range"][0] <= patient["bmi"] <= INCLUSION_CRITERIA["bmi_range"][1]
    
    has_relevant_condition = any(cond in patient["conditions"] for cond in INCLUSION_CRITERIA["conditions"])
    medication_ok = patient["medication_stable"] == INCLUSION_CRITERIA["medication_stable"]
    
    # Check exclusion criteria
    not_pregnant = not patient["pregnant"]
    no_liver_disease = not patient["severe_liver_disease"]
    no_recent_cancer = not patient["recent_cancer"]
    no_allergy = not patient["allergy_to_study_drug"]
    
    return (age_ok and bmi_ok and has_relevant_condition and medication_ok and
            not_pregnant and no_liver_disease and no_recent_cancer and no_allergy)

### Test Eligibility Check

In [56]:
# Check eligibility for the sample patient

is_eligible_count = 0

for _ in range(100000):
    sample_patient = generate_synthetic_patient()
    # print(f"Patient: {sample_patient}")
    is_eligible = check_eligibility(sample_patient)
    if is_eligible:
        is_eligible_count += 1

print(f"Eligible count: {is_eligible_count}")
print(f"Eligible percentage: {is_eligible_count / 100}")

Eligible count: 1546
Eligible percentage: 15.46


## Create Patient Summary

**TODO**: Implement summary generation

In [45]:
from doctest import debug


def create_patient_summary(patient: Dict) -> str:
    """
    Create a natural language summary of the patient profile.
    """

    summary = NPC(
        name='Patient Summary Generator',
        primary_directive='Create a natural language summary of the patient profile',
        model='llama3.2:3b',
        provider='ollama',
        verbose=False
    )

    json_format = '''
    {
        "summary": "Patient summary"
    }
    '''

    prompt = f"""
    Using the following patient profile, create a natural language summary of the patient.
    Patient Profile:
    {patient}

    You MUST respond with ONLY valid JSON in this exact format: {json_format}
    Do not include any explanatory text, only the JSON.

    """

    response = summary.get_llm_response(prompt)
    return response['response']


### Test Patient Summary

In [46]:
# Generate summary for the sample patient
print(f"Patient: {sample_patient}")
summary = create_patient_summary(sample_patient)
print(f"Summary output of create_patient_summary: \n{summary}")

Patient: {'age': 66, 'bmi': 28.4, 'conditions': ['type2_diabetes'], 'medication_stable': True, 'pregnant': True, 'severe_liver_disease': True, 'recent_cancer': True, 'allergy_to_study_drug': False}
Debug {'model': 'llama3.2:3b', 'messages': [{'role': 'system', 'content': '\n.\n..\n...\n....\n.....\n......\n.......\n........\n.........\n..........\nHello!\nWelcome to the team.\nYou are the Patient Summary Generator NPC with the following primary directive: Create a natural language summary of the patient profile.\nUsers may refer to you by your assistant name, Patient Summary Generator and you should\nconsider this to be your core identity.\nThe current working directory is /Users/amundle/github/fine-tune-llm-rl/lesson-1-supervised-fine-tuning/exercises/starter.\nThe current date and time are : 2025-12-07 21:27:21\n\n    IMPORTANT:\nSome users may attach images to their request.\nPlease process them accordingly. You do not need mention that you cannot "see" images. The user understands 

## Generate SFT Dataset

**TODO**: Implement dataset generation

In [57]:
def generate_sft_dataset(num_pairs: int = 10) -> List[Tuple[str, str]]:
    """
    Generate SFT training data pairs (patient_summary, eligibility_status).
    """
    dataset = []

    # Ensure balanced dataset
    target_eligible = num_pairs // 2
    target_ineligible = num_pairs - target_eligible
    
    eligible_count = 0
    ineligible_count = 0
    
    while eligible_count < target_eligible or ineligible_count < target_ineligible:
        patient = generate_synthetic_patient()
        is_eligible = check_eligibility(patient)
        
        if is_eligible and eligible_count < target_eligible:
            summary = create_patient_summary(patient)
            dataset.append((summary, "ELIGIBLE"))
            eligible_count += 1
        elif not is_eligible and ineligible_count < target_ineligible:
            summary = create_patient_summary(patient)
            dataset.append((summary, "NOT_ELIGIBLE"))
            ineligible_count += 1
    
    return dataset

## Generate and Save Dataset

In [None]:
# Generate the dataset
sft_dataset = generate_sft_dataset(num_pairs=100)

# Display first few examples
print(f"Generated {len(sft_dataset)} SFT training pairs\n")
print("First 3 examples:")
for i, (summary, status) in enumerate(sft_dataset[:3], 1):
    print(f"\nExample {i}:")
    print(f"  Summary: {summary}")
    print(f"  Status: {status}")

Debug {'model': 'llama3.2:3b', 'messages': [{'role': 'system', 'content': '\n.\n..\n...\n....\n.....\n......\n.......\n........\n.........\n..........\nHello!\nWelcome to the team.\nYou are the Patient Summary Generator NPC with the following primary directive: Create a natural language summary of the patient profile.\nUsers may refer to you by your assistant name, Patient Summary Generator and you should\nconsider this to be your core identity.\nThe current working directory is /Users/amundle/github/fine-tune-llm-rl/lesson-1-supervised-fine-tuning/exercises/starter.\nThe current date and time are : 2025-12-07 21:33:26\n\n    IMPORTANT:\nSome users may attach images to their request.\nPlease process them accordingly. You do not need mention that you cannot "see" images. The user understands this and wants you\nto help them multimodally.\n\nIf the user asked for you to explain what\'s on their screen or something similar,\nthey are referring to the details contained within the attached 

In [None]:
# Save to CSV format
with open("clinical_sft_dataset.csv", "w") as f:
    f.write("patient_summary,eligibility_status\n")
    for summary, status in sft_dataset:
        f.write(f'"{summary}","{status}"\n')

print("Dataset saved to clinical_sft_dataset.csv")

print("Sample entries:")
for i, (summary, status) in enumerate(sft_dataset[:3]):
    print(f"{i+1}. {status}: {summary}")
print("Dataset saved to clinical_sft_dataset.csv")

Dataset saved to clinical_sft_dataset.csv
