<a href="https://colab.research.google.com/github/cbonnin88/Hospital_Admissions/blob/main/data_generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import polars as pl
import numpy as np
from datetime import datetime, timedelta
import random

In [None]:
# Number of synthetic rows generated for each large table
num_rows = 120_000

# Pin NumPy's global random state so the np.random draws below repeat run to run
np.random.seed(42)

# **Generate dim_patients**

In [None]:
# Pool of values sampled for patients' dept_code. Formats are mixed:
# two-digit codes, city names and a five-digit postal-style code.
depts = [
    '75',
    '92',
    '93',
    '94',
    'Paris',
    'Nanterre',
    '75001',
]

In [None]:
# Draw every attribute column first (in the same order the columns appear in
# the frame, so the seeded random stream is consumed identically), then
# assemble the patient dimension.
patient_ids = range(1, num_rows + 1)
ages = np.random.randint(18, 95, num_rows)  # upper bound is exclusive: 18-94
genders = np.random.choice(['M', 'F', None], num_rows, p=[0.48, 0.48, 0.04])  # None 4% of the time
dept_codes = np.random.choice(depts, num_rows)
cmu_flags = np.random.choice([0, 1], num_rows, p=[0.8, 0.2])  # 1 with probability 0.2

patients = pl.DataFrame({
    'patient_id': patient_ids,
    'age': ages,
    'gender': genders,
    'dept_code': dept_codes,
    'is_cmu': cmu_flags,
})

In [None]:
# Append 2000 randomly chosen patient rows as exact duplicates so the raw
# file can be used to practice de-duplication.
#
# * np.random.seed() does not control polars' sampling, so an explicit seed is
#   passed to keep the duplicated rows the same on every run.
# * The cell always starts from the first num_rows rows (the originally
#   generated patients), so re-running it does not stack extra duplicates
#   on top of the ones added by a previous run.
patients_base = patients.head(num_rows)
duplicates = patients_base.sample(n=2000, seed=42)
patients = pl.concat([patients_base, duplicates])

# **Generate dim_hospitals**

In [None]:
# Hospital dimension: 50 hospitals with ids 1-50, a generated name, a randomly
# assigned category, and a single constant region value for every row.
hospital_ids = range(1, 51)
hospital_categories = ['Public (AP-HP)', 'Privé (ESPIC)', 'Clinique']

hospitals = pl.DataFrame({
    'hospital_id': hospital_ids,
    'hospital_name': ['Hôpital ' + str(i) for i in hospital_ids],
    'category': np.random.choice(hospital_categories, len(hospital_ids)),
    'region': 'ile-de-France',
})

# **Generate fact_hospital_stays**

In [None]:
# Linking patients to hospital stays.
# patient_id is drawn from 1..num_rows and hospital_id from 1..50, i.e. the
# same id ranges used when the patient and hospital tables are generated.
start_date = datetime(2023,1,1)
stay_data = {
    'stay_id': range(1,num_rows + 1),
    'patient_id': np.random.randint(1,num_rows + 1, num_rows),
    'hospital_id': np.random.randint(1,51,num_rows),
    # Admission falls on a random day offset (0-364) from start_date
    'admission_date': [start_date + timedelta(days=np.random.randint(0,365)) for _ in range(num_rows)],
    'diagnosis_code': np.random.choice(['J44.0','J45.9','I10','E11.9'], num_rows), # COPD, Asthma, HTN, Diabetes
    'stay_cost': np.random.uniform(500,15000, num_rows).round(2)
}

# One length of stay per row (1-19 days; randint's upper bound is exclusive).
# Drawing a single scalar here would give every stay the same duration.
length_of_stay_days = np.random.randint(1,20,num_rows)

# Adding length of stay to create discharge_date. The per-row day counts are
# attached as a temporary column so pl.duration can read them row by row,
# then dropped so the output schema is the stay columns plus discharge_date.
fact_stays = (
    pl.DataFrame(stay_data)
    .with_columns(pl.Series('length_of_stay_days', length_of_stay_days))
    .with_columns(
        (pl.col('admission_date') + pl.duration(days=pl.col('length_of_stay_days'))).alias('discharge_date')
    )
    .drop('length_of_stay_days')
)

# **Generate fact_pharmacy_claims**

In [None]:
# Pharmacy claims fact table: one claim per row, each attached to a random
# patient id in 1..num_rows. Columns are drawn in frame order so the seeded
# random stream is consumed the same way.
drug_types = ["Bronchodilator", "Insulin", "Beta-blocker", "Antibiotic"]

claim_ids = range(1, num_rows + 1)
claim_patient_ids = np.random.randint(1, num_rows + 1, num_rows)
claim_drug_types = np.random.choice(drug_types, num_rows)
claim_amounts = np.random.uniform(10, 200, num_rows).round(2)  # rounded to 2 decimals

pharmacy = pl.DataFrame({
    'claim_id': claim_ids,
    'patient_id': claim_patient_ids,
    'drug_type': claim_drug_types,
    'claim_amount': claim_amounts,
})

# **Export to CSV**

In [None]:
# Write each generated table to its own CSV file in the working directory.
csv_exports = (
    (patients, 'raw_patients.csv'),
    (hospitals, 'raw_hospitals.csv'),
    (fact_stays, 'raw_hospitals_stay.csv'),
    (pharmacy, 'raw_pharmacy_claims.csv'),
)
for frame, csv_path in csv_exports:
    frame.write_csv(csv_path)

In [None]:
# Report the actual size of every exported table. A blanket
# "120k+ rows per table" message is inaccurate because the hospital table
# only holds 50 rows.
generated_tables = {
    'raw_patients.csv': patients,
    'raw_hospitals.csv': hospitals,
    'raw_hospitals_stay.csv': fact_stays,
    'raw_pharmacy_claims.csv': pharmacy,
}
print('Files generated:')
for file_name, frame in generated_tables.items():
    print(f'  {file_name}: {frame.height:,} rows')

Files generated: 120k+ rows per table
