In [5]:
!pip install faker

Collecting faker
  Downloading faker-37.5.3-py3-none-any.whl.metadata (15 kB)
Downloading faker-37.5.3-py3-none-any.whl (1.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m1.9/1.9 MB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m-:--:--[0m
[?25hInstalling collected packages: faker
Successfully installed faker-37.5.3


In [3]:
import requests
import json
import time
import os
from faker import Faker
import random

OLLAMA_API = "http://localhost:11434/api/generate"
OLLAMA_MODEL = "gpt-oss:20b"
OUTPUT_FILE = "deid_dataset.jsonl"
MAX_LEN = 600  # max chars per section
DELAY = 3      # seconds between calls

fake = Faker()

# concise mini-note generation prompt with inline Faker injection
def build_prompt():
    phi = {
        "name": fake.name(),
        "dob": fake.date_of_birth(minimum_age=20, maximum_age=85).strftime("%m/%d/%Y"),
        "ssn": fake.ssn(),
        "address": fake.address().replace("\n", ", "),
        "phone": fake.phone_number(),
        "email": fake.email(),
        "passport": fake.bothify(text='??########'),
        "provider": fake.name(),
        "organization": fake.company(),
        "hospital": f"{fake.last_name()} Medical Center",
        "visit_date": fake.date_between(start_date='-2y', end_date='today').strftime("%m/%d/%Y")
    }

    return f"""
You are generating concise training data to help a medical AI model learn to detect and redact Protected Health Information (PHI) from clinical notes.

Each example should have two sections:
1. <data_with_phi>: A short, realistic clinical note (3 to 5 sentences) that includes PHI like patient name, SSN, birthdate, address, phone number, passport number, provider name, organization name, etc.
2. <data_hipaa_compliant>: A redacted version with [REDACTED] or placeholders like [DOB], [NAME], [HOSPITAL], while preserving medically relevant content (conditions, meds, vitals, symptoms).

Please be creative with the examples. Below I have an example of how to use the phi info, but feel free to restructure and scatter PHI in creative ways throughout the text. Also use diverse kinds of visit reasons and findings.
<example>
  <data_with_phi>
    Patient: {phi['name']}, born on {phi['dob']} (SSN: {phi['ssn']}) came to {phi['hospital']} on {phi['visit_date']} complaining of abdominal pain and nausea. 
    She was evaluated by Dr. {phi['provider']} and prescribed ondansetron and fluids. 
    Her address is {phi['address']}, phone: {phi['phone']}, email: {phi['email']}, passport: {phi['passport']}. 
    Follow-up with {phi['organization']} was recommended.
  </data_with_phi>

  <data_hipaa_compliant>
    Patient: [NAME], born on [DOB] (SSN: [REDACTED]) came to [HOSPITAL] on [REDACTED] complaining of abdominal pain and nausea. 
    She was evaluated by Dr. [REDACTED] and prescribed ondansetron and fluids. 
    Her address is [ADDRESS], phone: [PHONE], email: [EMAIL], passport: [REDACTED]. 
    Follow-up with [ORGANIZATION] was recommended.
  </data_hipaa_compliant>
</example>
"""

def call_ollama(prompt, model=OLLAMA_MODEL):
    payload = {
        "model": model,
        "prompt": prompt,
        "stream": False
    }
    response = requests.post(OLLAMA_API, json=payload)
    if response.status_code == 200:
        return response.json()["response"]
    else:
        print(f"‚ùå Request failed: {response.status_code} - {response.text}")
        return None

def parse_response(text):
    try:
        data_with_phi = text.split("<data_with_phi>")[1].split("</data_with_phi>")[0].strip()
        data_hipaa_compliant = text.split("<data_hipaa_compliant>")[1].split("</data_hipaa_compliant>")[0].strip()

        if len(data_with_phi) > MAX_LEN or len(data_hipaa_compliant) > MAX_LEN:
            print("‚ö†Ô∏è Skipping too-long example")
            return None

        return {
            "data_with_phi": data_with_phi,
            "data_hipaa_compliant": data_hipaa_compliant
        }
    except Exception as e:
        print(f"‚ö†Ô∏è Failed to parse response: {e}")
        return None

def load_existing_examples(filepath):
    if not os.path.exists(filepath):
        return set()
    with open(filepath, "r", encoding="utf-8") as f:
        return set(json.loads(line)["data_with_phi"] for line in f if line.strip())

def save_to_jsonl(data, filename=OUTPUT_FILE):
    with open(filename, "a", encoding="utf-8") as f:
        f.write(json.dumps(data, ensure_ascii=False) + "\n")

def generate_dataset(n_total=5000, delay=DELAY):
    seen = load_existing_examples(OUTPUT_FILE)
    count_existing = len(seen)
    print(f"üìÇ Found {count_existing} existing examples. Target: {n_total}")

    for i in range(count_existing, n_total):
        print(f"\nüëâ Generating example {i+1}/{n_total}")
        prompt = build_prompt()
        raw_output = call_ollama(prompt)
        if raw_output:
            parsed = parse_response(raw_output)
            if parsed and parsed["data_with_phi"] not in seen:
                save_to_jsonl(parsed)
                seen.add(parsed["data_with_phi"])
                print(f"‚úÖ Saved example {i+1}")
            else:
                print("‚ö†Ô∏è Duplicate or parse issue, skipping")
        else:
            print("‚ö†Ô∏è No response from model")
        time.sleep(delay)

if __name__ == "__main__":
    generate_dataset(n_total=5000)

üìÇ Found 154 existing examples. Target: 5000

üëâ Generating example 155/5000
‚ö†Ô∏è Skipping too-long example
‚ö†Ô∏è Duplicate or parse issue, skipping

üëâ Generating example 156/5000
‚úÖ Saved example 156

üëâ Generating example 157/5000
‚úÖ Saved example 157

üëâ Generating example 158/5000
‚úÖ Saved example 158

üëâ Generating example 159/5000
‚úÖ Saved example 159

üëâ Generating example 160/5000
‚úÖ Saved example 160

üëâ Generating example 161/5000
‚úÖ Saved example 161

üëâ Generating example 162/5000
‚ö†Ô∏è Skipping too-long example
‚ö†Ô∏è Duplicate or parse issue, skipping

üëâ Generating example 163/5000
‚úÖ Saved example 163

üëâ Generating example 164/5000
‚úÖ Saved example 164

üëâ Generating example 165/5000
‚úÖ Saved example 165

üëâ Generating example 166/5000
‚úÖ Saved example 166

üëâ Generating example 167/5000
‚úÖ Saved example 167

üëâ Generating example 168/5000
‚úÖ Saved example 168

üëâ Generating example 169/5000
‚úÖ Saved example 169

üëâ G

KeyboardInterrupt: 