## Python Synthetic Data Generator
This Python script:

Creates realistic data for 20 countries with mixed income levels.

Simulates contracts with varying doses and prices.

Generates shipments per contract with batches.

Simulates weekly vaccinations by age group & dose number over a year.

Creates adverse events tied to vaccinations with realistic severity rates.

Simulates clinical trials per country and phase.

In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random

# Seed both random number generators (stdlib and NumPy) so that repeated
# runs of the script produce the same synthetic data.
random.seed(42)
np.random.seed(42)

# --- Constants and helpers ---
# Categorical value sets used by the generators below.
income_levels = [
    'Low',
    'Middle',
    'High',
]
age_groups = [
    '0-17',
    '18-45',
    '46-65',
    '65+',
]
dose_numbers = [1, 2]
severity_levels = [
    'Mild',
    'Moderate',
    'Severe',
]
event_types = [
    'Fever',
    'Headache',
    'Fatigue',
    'Injection site pain',
    'Anaphylaxis',
    'Seizure',
    'Blood clot',
    'Nausea',
    'Chills',
]

# Calendar settings for the simulated year.
start_year = 2023
days_in_year = 365

def random_date(start, end):
    """Return a random date between ``start`` and ``end``, both inclusive.

    The result is ``start`` shifted forward by a whole number of days, so
    any time-of-day component of ``start`` is carried over unchanged.

    Raises:
        ValueError: if ``end`` is earlier than ``start`` (the day span is
            negative, which ``random.randint`` rejects).
    """
    span_days = (end - start).days
    offset = random.randint(0, span_days)
    return start + timedelta(days=offset)

# --- 1. Countries ---
# Inclusive (min, max) population bounds for each income level.
populations = {
    'Low': (5_000_000, 30_000_000),
    'Middle': (20_000_000, 80_000_000),
    'High': (30_000_000, 150_000_000),
}

# Build 20 countries with ids 1..20. For each one the income level is drawn
# first and the population second; that order fixes the seeded output.
countries = []
for idx in range(1, 21):
    (income,) = random.choices(income_levels, weights=[0.3, 0.4, 0.3])
    pop_min, pop_max = populations[income]
    pop = random.randint(pop_min, pop_max)
    countries.append([idx, f'Country_{idx}', income, pop])

df_countries = pd.DataFrame(
    countries,
    columns=['country_id', 'country_name', 'income_level', 'population'],
)

# --- 2. Contracts ---
# Mean price per dose for each income level. Any level not listed here
# falls back to the lowest tier (mean 9); prices are drawn with std-dev 1.
mean_price_by_income = {'High': 16, 'Middle': 13}
fallback_mean_price = 9

# Contracts are signed in the first half of the start year.
signing_window = (datetime(start_year, 1, 1), datetime(start_year, 6, 30))

contracts = []
contract_id = 1000
for country in df_countries.itertuples():
    # 2-5 contracts per country
    for _ in range(random.randint(2, 5)):
        contract_date = random_date(*signing_window)
        # Between 500k doses and 30% of the country's population.
        total_doses = random.randint(500_000, int(country.population * 0.3))
        mean_price = mean_price_by_income.get(country.income_level, fallback_mean_price)
        price = round(np.random.normal(mean_price, 1), 2)
        # Deliveries begin 15 days after signing and last 60-180 days.
        delivery_start = contract_date + timedelta(days=15)
        delivery_end = delivery_start + timedelta(days=random.randint(60, 180))
        contracts.append([
            contract_id,
            country.country_id,
            contract_date.strftime('%Y-%m-%d'),
            total_doses,
            price,
            delivery_start.strftime('%Y-%m-%d'),
            delivery_end.strftime('%Y-%m-%d'),
        ])
        contract_id += 1

df_contracts = pd.DataFrame(
    contracts,
    columns=['contract_id', 'country_id', 'contract_date', 'total_doses',
             'price_per_dose', 'delivery_start', 'delivery_end'],
)

# --- 3. Shipments ---
# Split every contract's total_doses into 2-5 shipments spread evenly over
# the contract's delivery window. Each shipment gets a batch id of the form
# BATCH-<contract_id>-<sequence number>.
shipments = []
shipment_id = 2000
for contract in df_contracts.itertuples():
    doses_remaining = contract.total_doses
    delivery_start = datetime.strptime(contract.delivery_start, '%Y-%m-%d')
    delivery_end = datetime.strptime(contract.delivery_end, '%Y-%m-%d')
    # 2-5 shipments per contract, spaced evenly
    shipment_dates = pd.date_range(delivery_start, delivery_end, periods=random.randint(2, 5)).to_pydatetime().tolist()
    last_index = len(shipment_dates) - 1
    for i, ship_date in enumerate(shipment_dates):
        # The jitter is drawn for every shipment, including the last one
        # where it is not used, so the seeded random stream consumed by the
        # later sections stays the same.
        jitter = random.uniform(0.8, 1.2)
        if i == last_index:
            # The final shipment delivers whatever is left, so the shipments
            # of a contract always add up to its total_doses.
            doses_shipped = doses_remaining
        else:
            # Ship doses roughly evenly but randomize by +-20%
            doses_shipped = int(doses_remaining / (len(shipment_dates) - i) * jitter)
            doses_shipped = min(doses_shipped, doses_remaining)
        batch_id = f"BATCH-{contract.contract_id}-{i+1}"
        shipments.append([shipment_id, contract.contract_id, ship_date.strftime('%Y-%m-%d'), doses_shipped, batch_id])
        doses_remaining -= doses_shipped
        shipment_id += 1
        if doses_remaining <= 0:
            break

df_shipments = pd.DataFrame(shipments, columns=['shipment_id', 'contract_id', 'shipment_date', 'doses_shipped', 'batch_id'])

# --- 4. Vaccinations ---
# For every country, week and age group, emit one dose-1 record dated at the
# week start and one dose-2 record dated seven days later.
vaccinations = []
vaccination_id = 3000

# Weekly aggregation: 52 week-start dates beginning Jan 1 of start_year.
# These do not depend on the country, so they are built once up front,
# together with the formatted (dose-1 date, dose-2 date) strings.
vacc_start = datetime(start_year, 1, 1)
weeks = [vacc_start + timedelta(days=7 * week_no) for week_no in range(52)]
week_date_strings = [
    (week_start.strftime('%Y-%m-%d'), (week_start + timedelta(days=7)).strftime('%Y-%m-%d'))
    for week_start in weeks
]

# Distribution of population per age group (approx)
age_group_pop_dist = {
    '0-17': 0.25,
    '18-45': 0.4,
    '46-65': 0.2,
    '65+': 0.15
}

for c in df_countries.itertuples():
    pop = c.population
    # Number of people in each age group; constant for a given country.
    age_pops = {group: int(pop * age_group_pop_dist[group]) for group in age_groups}

    for dose1_date, dose2_date in week_date_strings:
        for age_group in age_groups:
            age_pop = age_pops[age_group]

            # Weekly doses given: Random percent of age group, lower for dose 2
            dose1_rate = random.uniform(0.005, 0.02)  # 0.5% - 2% weekly get dose 1
            dose2_rate = dose1_rate * 0.8  # slightly fewer dose 2
            doses_given_1 = int(age_pop * dose1_rate)
            doses_given_2 = int(age_pop * dose2_rate)

            # Add dose 1 record
            vaccinations.append([vaccination_id, c.country_id, dose1_date, age_group, doses_given_1, 1])
            vaccination_id += 1
            # Add dose 2 record
            vaccinations.append([vaccination_id, c.country_id, dose2_date, age_group, doses_given_2, 2])
            vaccination_id += 1

df_vaccinations = pd.DataFrame(vaccinations, columns=['vaccination_id', 'country_id', 'date_administered',
                                                      'age_group', 'doses_given', 'dose_number'])

# --- 5. Adverse Events ---
# For each vaccination record, sample how many mild / moderate / severe
# adverse events occurred (Poisson, scaled by doses_given) and emit one row
# per event with a random onset date after the administration date.
events = []
event_id = 4000

# Expected events per dose, by severity.
# Severe event rate ~ 5 per million doses; Mild+Moderate more frequent
mild_rate = 1500 / 1_000_000  # 1500 mild events per million doses
moderate_rate = 200 / 1_000_000
severe_rate = 5 / 1_000_000

for vac in df_vaccinations.itertuples():
    doses = vac.doses_given
    # Parse the administration date once per record rather than once (or
    # twice) per generated event.
    administered = datetime.strptime(vac.date_administered, '%Y-%m-%d')

    # All three counts are drawn before any event row is built, which keeps
    # the NumPy and stdlib random streams in their established order.
    n_mild = np.random.poisson(doses * mild_rate)
    n_moderate = np.random.poisson(doses * moderate_rate)
    n_severe = np.random.poisson(doses * severe_rate)

    # (severity label, latest onset in days after administration, count)
    severity_plan = (
        ('Mild', 14, n_mild),
        ('Moderate', 21, n_moderate),
        ('Severe', 30, n_severe),
    )
    for severity, onset_window_days, n_events in severity_plan:
        latest_onset = administered + timedelta(days=onset_window_days)
        is_severe = severity == 'Severe'
        for _ in range(n_events):
            event_date = random_date(administered, latest_onset)
            # Mild and moderate events are always resolved; severe events are
            # resolved with a coin flip, drawn before the event type.
            resolved = random.choice([True, False]) if is_severe else True
            # NOTE(review): event_type is drawn independently of severity, so
            # combinations such as a 'Mild' 'Anaphylaxis' row are possible.
            events.append([event_id, vac.vaccination_id, event_date.strftime('%Y-%m-%d'), severity, random.choice(event_types), resolved])
            event_id += 1

df_adverse_events = pd.DataFrame(events, columns=['event_id', 'vaccination_id', 'event_date', 'severity', 'event_type', 'resolved'])

# --- 6. Clinical Trials ---
# Each country runs 1-3 trials starting in the first half of 2022 and
# lasting 90-270 days. Phase III/IV trials get larger cohorts and a higher
# efficacy range than phase I/II trials.
phases = ['I', 'II', 'III', 'IV']
phase_weights = [0.1, 0.2, 0.5, 0.2]
late_phases = ('III', 'IV')

trials = []
trial_id = 5000
for country in df_countries.itertuples():
    for _ in range(random.randint(1, 3)):
        (phase,) = random.choices(phases, weights=phase_weights)
        start_date = random_date(datetime(2022, 1, 1), datetime(2022, 6, 30))
        end_date = start_date + timedelta(days=random.randint(90, 270))
        # Participants are drawn before efficacy in both branches.
        if phase in late_phases:
            participants = random.randint(5000, 25000)
            efficacy = round(random.uniform(75, 95), 2)
        else:
            participants = random.randint(1000, 5000)
            efficacy = round(random.uniform(50, 75), 2)
        severe_events = random.randint(0, 15)
        trials.append([
            trial_id,
            country.country_id,
            phase,
            start_date.strftime('%Y-%m-%d'),
            end_date.strftime('%Y-%m-%d'),
            participants,
            efficacy,
            severe_events,
        ])
        trial_id += 1

df_clinical_trials = pd.DataFrame(
    trials,
    columns=['trial_id', 'country_id', 'phase', 'start_date', 'end_date',
             'participants', 'efficacy_rate', 'severe_events'],
)

# --- Export CSVs ---
# Output path -> DataFrame. Each frame is written to the current working
# directory without the index column, in the order listed.
csv_outputs = {
    'synthetic_countries.csv': df_countries,
    'synthetic_contracts.csv': df_contracts,
    'synthetic_shipments.csv': df_shipments,
    'synthetic_vaccinations.csv': df_vaccinations,
    'synthetic_adverse_events.csv': df_adverse_events,
    'synthetic_clinical_trials.csv': df_clinical_trials,
}
for csv_path, frame in csv_outputs.items():
    frame.to_csv(csv_path, index=False)

print("Synthetic datasets generated and saved as CSV files.")


Synthetic datasets generated and saved as CSV files.
