In [1]:
# Install required packages
!pip install sdv pandas numpy networkx faker matplotlib seaborn

import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta
import random
from faker import Faker

# SDV imports for multi-table synthesis
from sdv.metadata import Metadata
from sdv.multi_table import HMASynthesizer
from sdv.evaluation.multi_table import run_diagnostic, evaluate_quality

# NetworkX for relationship analysis
import networkx as nx
from typing import Dict, List, Optional

# Initialize Faker for realistic sensitive data generation
fake = Faker(['en_US'])
Faker.seed(42)
np.random.seed(42)

print("✅ All dependencies installed and configured")
print("🔒 Ready to generate sensitive healthcare data for synthesis")

✅ All dependencies installed and configured
🔒 Ready to generate sensitive healthcare data for synthesis


In [6]:
def create_sensitive_healthcare_system():
    """
    Create a realistic healthcare system with 5 tables containing sensitive information:

    1. PATIENTS - Personal and demographic information (1000 records)
    2. DOCTORS - Healthcare provider information (1000 records)
    3. MEDICAL_RECORDS - Patient medical history and diagnoses (1000 records)
    4. PRESCRIPTIONS - Medication prescriptions (1000 records)
    5. APPOINTMENTS - Patient-doctor appointments (1000 records)

    Contains sensitive information: SSN, medical conditions, medications, insurance, etc.
    """
    print("🏥 CREATING SENSITIVE HEALTHCARE SYSTEM")
    print("=" * 60)
    print("⚠️  Generating synthetic SENSITIVE data for demonstration purposes")
    print("🔒 This data contains: SSN, medical conditions, medications, insurance info")

    # Set seeds for reproducibility
    np.random.seed(42)
    fake.seed_instance(42)

    # 1. PATIENTS TABLE - Core patient demographic and sensitive information
    print("\n1️⃣ Creating PATIENTS table (1000 records)...")
    patients = pd.DataFrame({
        'patient_id': range(1, 1001),  # Primary Key
        'social_security_number': [fake.ssn() for _ in range(1000)],  # SENSITIVE
        'first_name': [fake.first_name() for _ in range(1000)],
        'last_name': [fake.last_name() for _ in range(1000)],
        'date_of_birth': [fake.date_of_birth(minimum_age=1, maximum_age=95) for _ in range(1000)],
        'gender': np.random.choice(['Male', 'Female', 'Non-binary'], 1000, p=[0.48, 0.48, 0.04]),
        'phone_number': [fake.phone_number() for _ in range(1000)],  # SENSITIVE
        'email': [fake.email() for _ in range(1000)],  # SENSITIVE
        'insurance_number': [fake.bothify(text='INS-#########') for _ in range(1000)],  # SENSITIVE
        'emergency_contact': [fake.name() + ' - ' + fake.phone_number() for _ in range(1000)]  # SENSITIVE
    })

    # 2. DOCTORS TABLE - Healthcare provider information with credentials
    print("2️⃣ Creating DOCTORS table (1000 records)...")

    specialties = [
        'Cardiology', 'Dermatology', 'Emergency Medicine', 'Endocrinology',
        'Family Medicine', 'Gastroenterology', 'Internal Medicine', 'Neurology',
        'Oncology', 'Orthopedics', 'Pediatrics', 'Psychiatry', 'Radiology',
        'Surgery', 'Urology'
    ]

    doctors = pd.DataFrame({
        'doctor_id': range(1, 1001),  # Primary Key
        'medical_license_number': [fake.bothify(text='MD-######-??') for _ in range(1000)],  # SENSITIVE
        'first_name': [fake.first_name() for _ in range(1000)],
        'last_name': [fake.last_name() for _ in range(1000)],
        'specialty': np.random.choice(specialties, 1000),
        'years_experience': np.random.randint(1, 40, 1000),
        'hospital_affiliation': [fake.company() + ' Medical Center' for _ in range(1000)],
        'phone_number': [fake.phone_number() for _ in range(1000)],  # SENSITIVE
        'email': [fake.email() for _ in range(1000)],  # SENSITIVE
        'referring_doctor_id': [None] * 800 + list(np.random.choice(range(1, 801), 200))  # Self-referencing
    })

    # 3. MEDICAL_RECORDS TABLE - Sensitive medical history and diagnoses
    print("3️⃣ Creating MEDICAL_RECORDS table (1000 records)...")

    medical_conditions = [
        'Hypertension', 'Diabetes Type 2', 'Hyperlipidemia', 'Anxiety Disorder',
        'Depression', 'Asthma', 'Arthritis', 'COPD', 'Heart Disease', 'Obesity',
        'Chronic Kidney Disease', 'Thyroid Disorder', 'Migraine', 'PTSD',
        'Bipolar Disorder', 'Fibromyalgia', 'Sleep Apnea', 'Gastroesophageal Reflux'
    ]

    medical_records = pd.DataFrame({
        'record_id': range(1, 1001),  # Primary Key
        'patient_id': np.random.choice(patients['patient_id'], 1000),  # Foreign Key
        'doctor_id': np.random.choice(doctors['doctor_id'], 1000),  # Foreign Key
        'visit_date': [fake.date_between(start_date='-2y', end_date='today') for _ in range(1000)],
        'primary_diagnosis': np.random.choice(medical_conditions, 1000),  # SENSITIVE
        'secondary_diagnosis': [np.random.choice(medical_conditions) if np.random.random() < 0.4 else None for _ in range(1000)],  # SENSITIVE
        'symptoms': [fake.text(max_nb_chars=150) for _ in range(1000)],  # SENSITIVE
        'treatment_plan': [fake.text(max_nb_chars=200) for _ in range(1000)],  # SENSITIVE
        'blood_pressure': [f"{np.random.randint(90, 180)}/{np.random.randint(60, 120)}" for _ in range(1000)],  # SENSITIVE
        'confidential_notes': [fake.text(max_nb_chars=100) for _ in range(1000)]  # HIGHLY SENSITIVE
    })

    # 4. PRESCRIPTIONS TABLE - Medication prescriptions and dosages
    print("4️⃣ Creating PRESCRIPTIONS table (1000 records)...")

    medications = [
        'Lisinopril', 'Metformin', 'Amlodipine', 'Metoprolol', 'Omeprazole',
        'Simvastatin', 'Losartan', 'Albuterol', 'Gabapentin', 'Sertraline',
        'Levothyroxine', 'Trazodone', 'Fluoxetine', 'Prednisone', 'Tramadol',
        'Hydrochlorothiazide', 'Atorvastatin', 'Citalopram', 'Montelukast', 'Escitalopram'
    ]

    prescriptions = pd.DataFrame({
        'prescription_id': range(1, 1001),  # Primary Key
        'patient_id': np.random.choice(patients['patient_id'], 1000),  # Foreign Key
        'doctor_id': np.random.choice(doctors['doctor_id'], 1000),  # Foreign Key
        'record_id': np.random.choice(medical_records['record_id'], 1000),  # Foreign Key
        'medication_name': np.random.choice(medications, 1000),  # SENSITIVE
        'dosage': [f"{np.random.choice([5, 10, 20, 25, 50, 100, 200])}mg" for _ in range(1000)],  # SENSITIVE
        'frequency': np.random.choice(['Once daily', 'Twice daily', 'Three times daily', 'As needed'], 1000),
        'quantity_prescribed': np.random.randint(30, 90, 1000),
        'refills_remaining': np.random.randint(0, 6, 1000),
        'special_instructions': [fake.text(max_nb_chars=100) if np.random.random() < 0.3 else None for _ in range(1000)]  # SENSITIVE
    })

    # 5. APPOINTMENTS TABLE - Scheduled appointments with sensitive information
    print("5️⃣ Creating APPOINTMENTS table (1000 records)...")

    appointment_types = [
        'Annual Physical', 'Follow-up', 'Consultation', 'Emergency Visit',
        'Specialist Referral', 'Lab Results Review', 'Medication Management',
        'Mental Health Check', 'Chronic Care Visit', 'Preventive Care'
    ]

    appointments = pd.DataFrame({
        'appointment_id': range(1, 1001),  # Primary Key
        'patient_id': np.random.choice(patients['patient_id'], 1000),  # Foreign Key
        'doctor_id': np.random.choice(doctors['doctor_id'], 1000),  # Foreign Key
        'appointment_datetime': [
            fake.date_time_between(start_date='-6m', end_date='+6m') for _ in range(1000)
        ],
        'appointment_type': np.random.choice(appointment_types, 1000),
        'reason_for_visit': [fake.text(max_nb_chars=100) for _ in range(1000)],  # SENSITIVE
        'insurance_copay': np.round(np.random.uniform(10, 75, 1000), 2),  # SENSITIVE
        'appointment_status': np.random.choice(['Scheduled', 'Completed', 'Cancelled', 'No-show'], 1000, p=[0.3, 0.5, 0.15, 0.05]),
        'patient_arrival_time': [
            fake.date_time_between(start_date='-6m', end_date='+6m') for _ in range(1000)
        ],
        'confidential_notes': [fake.text(max_nb_chars=150) if np.random.random() < 0.3 else None for _ in range(1000)]  # HIGHLY SENSITIVE
    })

    # Compile all tables
    healthcare_data = {
        'patients': patients,
        'doctors': doctors,
        'medical_records': medical_records
    }

    # Display summary with sensitivity warnings
    print(f"\n✅ SENSITIVE HEALTHCARE DATASET CREATED!")
    print(f"⚠️  WARNING: This dataset contains HIGHLY SENSITIVE information")
    print(f"🔒 Includes: SSN, medical conditions, medications, insurance, personal details")

    total_records = sum(len(df) for df in healthcare_data.values())
    total_columns = sum(len(df.columns) for df in healthcare_data.values())

    print(f"\n📊 DATASET SUMMARY:")
    print(f"   📋 Total tables: {len(healthcare_data)}")
    print(f"   📈 Total records: {total_records:,}")
    print(f"   📝 Total columns: {total_columns}")

    print(f"\n📋 TABLE BREAKDOWN:")
    for table_name, df in healthcare_data.items():
        sensitive_cols = []
        if 'social_security_number' in df.columns:
            sensitive_cols.append('SSN')
        if 'medical_license_number' in df.columns:
            sensitive_cols.append('License')
        if 'primary_diagnosis' in df.columns:
            sensitive_cols.append('Medical Conditions')
        if 'medication_name' in df.columns:
            sensitive_cols.append('Medications')
        if 'confidential_notes' in df.columns:
            sensitive_cols.append('Confidential Notes')
        if 'phone_number' in df.columns:
            sensitive_cols.append('Phone')
        if 'email' in df.columns:
            sensitive_cols.append('Email')

        sensitive_info = ', '.join(sensitive_cols) if sensitive_cols else 'Basic Info'
        print(f"   {table_name:15} | {len(df):4,} rows | {len(df.columns):2} cols | Sensitive: {sensitive_info}")

    return healthcare_data

# Create the sensitive healthcare dataset
healthcare_data = create_sensitive_healthcare_system()

🏥 CREATING SENSITIVE HEALTHCARE SYSTEM
⚠️  Generating synthetic SENSITIVE data for demonstration purposes
🔒 This data contains: SSN, medical conditions, medications, insurance info

1️⃣ Creating PATIENTS table (1000 records)...
2️⃣ Creating DOCTORS table (1000 records)...
3️⃣ Creating MEDICAL_RECORDS table (1000 records)...
4️⃣ Creating PRESCRIPTIONS table (1000 records)...
5️⃣ Creating APPOINTMENTS table (1000 records)...

✅ SENSITIVE HEALTHCARE DATASET CREATED!
🔒 Includes: SSN, medical conditions, medications, insurance, personal details

📊 DATASET SUMMARY:
   📋 Total tables: 3
   📈 Total records: 3,000
   📝 Total columns: 30

📋 TABLE BREAKDOWN:
   patients        | 1,000 rows | 10 cols | Sensitive: SSN, Phone, Email
   doctors         | 1,000 rows | 10 cols | Sensitive: License, Phone, Email
   medical_records | 1,000 rows | 10 cols | Sensitive: Medical Conditions, Confidential Notes


In [7]:
from sdv.metadata import Metadata

In [8]:
metadata = Metadata.detect_from_dataframes(healthcare_data)

In [9]:
synthesizer = HMASynthesizer(metadata)
synthesizer.fit(healthcare_data)

Preprocess Tables: 100%|██████████| 3/3 [00:00<00:00,  6.61it/s]



Learning relationships:


(1/2) Tables 'patients' and 'medical_records' ('patient_id'): 100%|██████████| 634/634 [01:31<00:00,  6.96it/s]
(2/2) Tables 'doctors' and 'medical_records' ('doctor_id'): 100%|██████████| 632/632 [01:31<00:00,  6.91it/s]





Modeling Tables: 100%|██████████| 3/3 [00:02<00:00,  1.18it/s]


In [10]:
synthetic_data = synthesizer.sample()

In [11]:
synthetic_data['patients'].head()

Unnamed: 0,patient_id,social_security_number,first_name,last_name,date_of_birth,gender,phone_number,email,insurance_number,emergency_contact
0,13757301,064-27-6884,James,White,1986-04-16,Female,(855)700-6579,okelly@example.net,INS-702377184,Eric Mcpherson - (778)496-3558x774
1,3578873,816-08-4807,Tonya,Munoz,1946-10-01,Male,001-791-738-8190x481,audrey57@example.com,INS-282076238,Cassandra Reynolds - 001-792-803-2735x712
2,7196201,559-05-6050,Carrie,Miller,1989-11-19,Male,524.893.2303,fwolfe@example.com,INS-799623569,Kyle Smith - 001-772-628-5825
3,2219795,483-04-9199,William,Bray,1971-01-14,Female,352-916-6800,hwise@example.org,INS-578061902,Sarah Clark - 001-625-972-7346x89270
4,6480141,282-02-9273,Anna,Barnes,1938-05-16,Male,530-656-9118x84968,meredithdominguez@example.com,INS-106567337,Christopher Wolf - 001-900-272-2768x352


In [12]:
healthcare_data['patients'].head()

Unnamed: 0,patient_id,social_security_number,first_name,last_name,date_of_birth,gender,phone_number,email,insurance_number,emergency_contact
0,1,655-15-0410,Suzanne,Davis,1953-10-12,Male,+1-203-795-5124,davisbrian@example.org,INS-855780024,Laurie Pierce - 497.667.3887
1,2,760-36-4013,David,Stephens,1958-10-21,Female,(882)399-6291x4504,mccoydonna@example.com,INS-432362216,James Smith - 001-643-583-0009x14352
2,3,229-18-1680,Yvonne,Lopez,1962-05-21,Female,335.974.3842x1503,atkinsonrachel@example.net,INS-906968574,Mrs. Valerie Ramirez - 599.765.4662x897
3,4,693-95-8936,Timothy,Mcclure,1996-09-10,Female,407.582.3578x45218,younggabrielle@example.net,INS-670675169,Larry Gray - (452)575-6002
4,5,090-76-6913,John,Wood,1999-07-28,Male,001-207-368-1040x9157,hlester@example.org,INS-139521501,Danny Finley - 7562771251
