In [None]:
import pandas as pd
import numpy as np

# --- Configuration -----------------------------------------------------------
n_patients = 100  # Number of patients
n_sites = 5       # Number of sites

# Seed NumPy's *global* generator so that Restart & Run All reproduces the
# same synthetic CSVs. The legacy global seed is used on purpose: every
# np.random.* call in this notebook draws from that shared global state.
RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)

# --- Site metadata -----------------------------------------------------------
# One row per site: a zero-padded id ("SITE01"), a display name, a randomly
# chosen country, a random target enrollment in [50, 150), and a PI e-mail.
site_numbers = range(1, n_sites + 1)
site_ids = [f"SITE{str(i).zfill(2)}" for i in site_numbers]
site_metadata = pd.DataFrame({
    'site_id': site_ids,
    'site_name': [f"Site {i}" for i in site_numbers],
    'country': np.random.choice(['USA', 'CAN', 'UK'], n_sites),
    'target_enrollment': np.random.randint(50, 150, n_sites),
    'pi_email': [f"pi{i}@site.org" for i in site_numbers]
})
site_metadata.to_csv('site_metadata.csv', index=False)

# --- Patient list ------------------------------------------------------------
# Zero-padded patient ids ("PAT0001"), each assigned to a random site.
patient_ids = [f"PAT{str(i).zfill(4)}" for i in range(1, n_patients + 1)]
patient_sites = np.random.choice(site_ids, n_patients)

# --- Screening data ----------------------------------------------------------
screen_status_choices = ['Passed', 'Failed']
screen_status_probs = [0.8, 0.2]
failure_reasons = ['Low Hb', 'Non-Compliance', 'Medical History', 'Other']

# One screening row per patient. Screening dates are 2025-09-01 plus a random
# offset of 0..29 days; failure_reason starts empty for everyone.
screening_data = pd.DataFrame({
    'patient_id': patient_ids,
    'site_id': patient_sites,
    'date_screened': pd.to_datetime('2025-09-01') + pd.to_timedelta(np.random.randint(0, 30, n_patients), unit='d'),
    'screen_status': np.random.choice(screen_status_choices, n_patients, p=screen_status_probs),
    'failure_reason': [''] * n_patients
})

# Assign failure reasons only for failed screenings.
# Count failures from the boolean mask rather than value_counts()['Failed']:
# value_counts() omits labels that never occur, so that lookup raises KeyError
# whenever the random draw happens to produce no 'Failed' rows.
failed_mask = screening_data['screen_status'] == 'Failed'
n_failed = int(failed_mask.sum())
if n_failed:
    screening_data.loc[failed_mask, 'failure_reason'] = np.random.choice(failure_reasons, n_failed)
screening_data.to_csv('screening.csv', index=False)

# --- Enrollment data (only patients who passed screening) --------------------
passed_mask = screening_data['screen_status'] == 'Passed'
passed_patients = screening_data.loc[passed_mask, 'patient_id']
n_passed = passed_patients.shape[0]

# 1 = randomized (80%), 0 = not randomized (20%).
enrolled_flags = np.random.choice([0, 1], size=n_passed, p=[0.2, 0.8])

# Enrollment date = the patient's own screening date + a fixed minimum gap +
# a random 0..19 day offset. Drawing enrollment dates from a fixed calendar
# window independent of screening (as was done before) lets a patient be
# enrolled *before* being screened. The 4-day gap mirrors the distance between
# the original base dates (2025-09-01 for screening, 2025-09-05 for enrollment).
min_days_screen_to_enroll = 4
enrollment_offsets = pd.to_timedelta(np.random.randint(0, 20, n_passed), unit='d')
enrollment_dates = (
    pd.DatetimeIndex(screening_data.loc[passed_mask, 'date_screened'])
    + pd.Timedelta(days=min_days_screen_to_enroll)
    + enrollment_offsets
)

# Randomized patients are randomized on their enrollment date; others get NaT.
randomization_dates = [date if flag == 1 else pd.NaT for date, flag in zip(enrollment_dates, enrolled_flags)]

# .values / DatetimeIndex / ndarray inputs are positional, so the resulting
# frame gets a fresh 0..n-1 index instead of the screening frame's row labels.
enrollment_data = pd.DataFrame({
    'patient_id': passed_patients.values,
    'site_id': screening_data.loc[passed_mask, 'site_id'].values,
    'date_enrolled': enrollment_dates,
    'randomized': enrolled_flags,
    'date_randomized': randomization_dates
})
enrollment_data.to_csv('enrollment.csv', index=False)

# --- Visits data -------------------------------------------------------------
# Four weekly visits per row of enrollment_data, scheduled at enrollment date
# + 0, 7, 14 and 21 days.
visit_names = ['Visit1', 'Visit2', 'Visit3', 'Visit4']

# Explicit column order so visits.csv still gets a header row when
# enrollment_data is empty and no visit records are produced.
visit_columns = [
    'patient_id', 'site_id', 'visit_name', 'scheduled_date', 'actual_date',
    'visit_status', 'medication_adherence_pct', 'diary_submitted',
]

visits_list = []
# Walk the three needed columns once in lockstep instead of re-filtering the
# whole frame by patient_id (twice) for every patient.
enrolled_rows = zip(
    enrollment_data['patient_id'],
    enrollment_data['site_id'],
    enrollment_data['date_enrolled'],
)
for patient, site, enroll_date in enrolled_rows:
    enroll_date = pd.to_datetime(enroll_date)
    for i, visit in enumerate(visit_names):
        scheduled = enroll_date + pd.Timedelta(days=7 * i)
        # Randomize actual visit date +/- 2 days (randint's upper bound is exclusive)
        delta = np.random.randint(-2, 3)
        actual = scheduled + pd.Timedelta(days=delta)
        visit_status = np.random.choice(['Completed', 'Missed', 'Rescheduled'], p=[0.8, 0.15, 0.05])
        # Adherence (70..100) and diary submission are only drawn for completed
        # visits; every other status gets 0 / 'N'.
        adherence = np.random.randint(70, 101) if visit_status == 'Completed' else 0
        diary = np.random.choice(['Y', 'N']) if visit_status == 'Completed' else 'N'

        visits_list.append({
            'patient_id': patient,
            'site_id': site,
            'visit_name': visit,
            'scheduled_date': scheduled.date(),
            # Missed visits have no actual date.
            'actual_date': actual.date() if visit_status != 'Missed' else pd.NaT,
            'visit_status': visit_status,
            'medication_adherence_pct': adherence,
            'diary_submitted': diary
        })

visits_data = pd.DataFrame(visits_list, columns=visit_columns)
visits_data.to_csv('visits.csv', index=False)

print("All four CSV files generated: site_metadata.csv, screening.csv, enrollment.csv, visits.csv")


All four CSV files generated: site_metadata.csv, screening.csv, enrollment.csv, visits.csv


In [None]:
# Shell escape: list the current working directory so the CSV files written
# by the cell above can be checked by eye.
!ls


enrollment.csv	sample_data  screening.csv  site_metadata.csv  visits.csv


In [None]:
# Hand each generated CSV to google.colab's files.download, one call per file.
# The import stays in this cell because google.colab is only needed here.
from google.colab import files

for csv_name in ('screening.csv', 'enrollment.csv', 'visits.csv', 'site_metadata.csv'):
    files.download(csv_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>