In [1]:
import pandas as pd
import numpy as np
from datetime import datetime, timedelta
import random
# imports
from pathlib import Path
# Parent of the current working directory, resolved to an absolute path.
# Stored as a plain string because the export cells below build their output
# paths by string concatenation (parent_dir + "/data/rawdata/...").
parent_dir = str(Path.cwd().resolve().parents[0])

In [2]:
# Params
RAND_SEED = 42
NUM_USERS = 500
# total records, not per user
# NOTE(review): not referenced by the generation cells shown in this notebook;
# record counts there come from the per-user MIN/MAX bounds below.
NUM_RECORDS = 3000

# Sizes of the categorical vocabularies
NUMBER_OF_OCCUPATIONS = 100  # total number of unique occupation categories
NUMBER_OF_CITIES = 25  # total number of unique cities
NUMBER_OF_INDUSTRIES = 47  # total number of unique industries

# Each category is represented by an integer code 0..N-1
OCCUPATIONS = list(range(NUMBER_OF_OCCUPATIONS))
INDUSTRIES = list(range(NUMBER_OF_INDUSTRIES))
CITIES = list(range(NUMBER_OF_CITIES))

# Vocabularies for the synthetic health dataset
DIAGNOSIS = [
    'A', 'B', 'C', 'D', 'E',
    'F', 'G', 'H', 'I', 'J',
]
PATIENT_TYPE = [
    'Inpatient',
    'Outpatient',
    'Emergency',
    'Urgent Care',
    'Ambulatory',
    'Telehealth',
]

## Number of records per user in the synthetic labor dataset (inclusive bounds)
MIN_RECORDS_PER_USER = 5
MAX_RECORDS_PER_USER = 30

## Number of records per user in the synthetic health dataset (inclusive bounds;
## a user may have no health records at all)
MIN_RECORDS_PER_USER_HEALTH = 0
MAX_RECORDS_PER_USER_HEALTH = 5

# Start date for all the "records" in the synthetic dataset
START_DATE = datetime(2020, 1, 1)

# Helper function to generate random dates
def random_dates(start, num_days, num_dates):
    """Return `num_dates` dates, each `start` plus a random whole-day offset.

    Args:
        start: Base date/datetime the offsets are added to.
        num_days: Largest offset in days; offsets are drawn with
            random.randint(0, num_days), so both 0 and num_days can occur.
        num_dates: How many dates to generate.

    Returns:
        A list of length `num_dates`. Values are drawn from the module-level
        `random` generator, so results depend on its current seed/state.
    """
    sampled = []
    for _ in range(num_dates):
        offset_days = random.randint(0, num_days)
        sampled.append(start + timedelta(days=offset_days))
    return sampled

def random_birthdates(start, end, n, seed=0, replace=True):
    """Sample `n` calendar dates from the daily range [start, end].

    Args:
        start: First day of the candidate range.
        end: Last day of the candidate range (pd.date_range includes it).
        n: Number of dates to draw.
        seed: Passed to Series.sample as `random_state`; defaults to 0, so the
            draw is independent of any globally seeded generator.
        replace: Whether the same date may be drawn more than once.

    Returns:
        A pandas Series of the sampled dates. Its index is the sampled dates
        themselves (the result of DatetimeIndex.to_series), not 0..n-1.
    """
    candidate_days = pd.date_range(start, end).to_series()
    return candidate_days.sample(n, replace=replace, random_state=seed)

In [3]:
# SEEDS
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)

# Generate random user base: one row per user with sequential ids starting at 1.
# Note: random_birthdates is called without a seed argument, so birthdays use its
# own default (seed=0) and do not change when RAND_SEED changes; only SEX is
# drawn from the NumPy generator seeded above.
user_ids = range(1, NUM_USERS + 1)
birthdays = random_birthdates(datetime(1950, 1, 1), datetime(1994, 1, 1), NUM_USERS)
sexes = np.random.choice(["Male", "Female"], NUM_USERS)

# The BIRTHDAY series is indexed by the sampled dates, so the frame inherits that
# index; reset it to a plain 0..N-1 range.
users = pd.DataFrame(
    {"USER_ID": user_ids, "BIRTHDAY": birthdays, "SEX": sexes}
).reset_index(drop=True)

# Visualise the records from the user base
users.sample(10)

Unnamed: 0,USER_ID,BIRTHDAY,SEX
86,87,1978-03-12,Female
431,432,1985-12-30,Female
308,309,1953-04-13,Female
495,496,1976-08-16,Male
458,459,1959-04-20,Male
282,283,1984-06-10,Female
200,201,1972-09-08,Male
273,274,1993-02-05,Female
192,193,1956-10-28,Male
21,22,1957-02-12,Male


In [4]:
# Persist the user base to the raw-data folder.
# to_csv is called with its default index handling here, so the frame's row index
# is written as a leading unnamed column — unlike the labor/health exports in this
# notebook, which pass index=False. NOTE(review): confirm whether downstream
# readers rely on that extra column before changing it.
users_csv_path = parent_dir + "/data/rawdata/users.csv"
users.to_csv(users_csv_path)

In [5]:
# Sanity check: display the (rows, columns) shape of the user base.
users.shape

(500, 3)

# Synthetic Labor Data

In [6]:
# SEEDS
random.seed(RAND_SEED)
np.random.seed(RAND_SEED)

# Expand users into labor records. Each user gets a random number of records,
# drawn from [MIN_RECORDS_PER_USER, MAX_RECORDS_PER_USER] (random.randint includes
# both bounds).
records = []
for _, user in users.iterrows():
    records_for_user = random.randint(MIN_RECORDS_PER_USER, MAX_RECORDS_PER_USER)
    for _ in range(records_for_user):
        # The draws below are kept in their original order. The random.choice over
        # the one-element list from random_dates looks redundant, but it is
        # retained so the sequence of calls on the `random` generator (and hence
        # the data produced for a given seed) stays the same.
        record_date = random.choice(random_dates(START_DATE, 365 * 10, 1))
        city = random.choice(CITIES)
        occupation = random.choice(OCCUPATIONS)
        industry = random.choice(INDUSTRIES)
        # Income based on log-normal distribution
        income = np.random.lognormal(mean=10, sigma=0.3)
        records.append({
            "USER_ID": user["USER_ID"],
            "RECORD_DATE": record_date,
            "CITY": city,
            "OCCUPATION": occupation,
            "INDUSTRY": industry,
            "INCOME": income,
        })

# Create the DataFrame, ordered by UserID and then by record date
df_records = pd.DataFrame(records).sort_values(by=["USER_ID", "RECORD_DATE"])

# Write to the data/rawdata folder (row index is not exported)
df_records.to_csv(parent_dir + "/data/rawdata/synth_labor.csv", index=False)

# Display the first 20 rows to check
df_records.head(20)

Unnamed: 0,USER_ID,RECORD_DATE,CITY,OCCUPATION,INDUSTRY,INCOME
3,1,2020-05-02,6,29,32,34783.866673
21,1,2020-09-28,18,91,20,20583.953807
13,1,2020-10-11,21,29,18,12406.997744
17,1,2020-10-19,17,93,15,24204.024495
14,1,2020-11-22,3,48,17,13128.289049
9,1,2021-01-31,11,77,16,25919.944599
0,1,2021-04-01,23,35,15,25565.888102
11,1,2021-05-26,2,70,18,19154.272446
23,1,2021-08-08,4,31,35,14365.367357
18,1,2021-10-31,12,34,40,16774.124628


In [8]:
# Check the number of unique users that appear in the labor records.
# `df_records` is reassigned by the health-data cell further down, so this only
# describes the labor data when run before that cell.
df_records["USER_ID"].nunique()

500

# Synthetic Health Data
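Synthetic dataset of fake diagnoses (`DIAGNOSIS`, `PATIENT_TYPE`) in which about 30% of the rows have one of those two fields deliberately set to missing.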

In [9]:
# SEEDS
random.seed(RAND_SEED)
np.random.seed(RAND_SEED + 1)

# Expand users into health records. Each user gets a random number of records,
# drawn from [MIN_RECORDS_PER_USER_HEALTH, MAX_RECORDS_PER_USER_HEALTH]
# (random.randint includes both bounds, so a user can end up with none).
records = []
for _, user in users.iterrows():
    visits_for_user = random.randint(MIN_RECORDS_PER_USER_HEALTH, MAX_RECORDS_PER_USER_HEALTH)
    for _ in range(visits_for_user):
        # Draw order is kept as-is; the random.choice over the one-element list
        # from random_dates is retained so the sequence of calls on the `random`
        # generator (and the data produced for a given seed) is unchanged.
        record_date = random.choice(random_dates(START_DATE, 365 * 10, 1))
        diagnosis = random.choice(DIAGNOSIS)
        patient_type = random.choice(PATIENT_TYPE)
        records.append({
            "USER_ID": user["USER_ID"],
            "RECORD_DATE": record_date,
            "DIAGNOSIS": diagnosis,
            "PATIENT_TYPE": patient_type,
        })

# Create the DataFrame, ordered by UserID and then by record date
df_records = pd.DataFrame(records).sort_values(by=["USER_ID", "RECORD_DATE"])

# Mask out some of the values
# MAX_ROWS_TO_MODIFY is the fraction of rows that get one field blanked; the row
# count is truncated to an integer and rows are picked without replacement.
MAX_ROWS_TO_MODIFY = 0.3
num_rows = len(df_records)
num_rows_to_modify = int(MAX_ROWS_TO_MODIFY * num_rows)
rows_to_modify = np.random.choice(df_records.index, num_rows_to_modify, replace=False)

# Make either DIAGNOSIS or PATIENT_TYPE NULL in these rows (coin flip per row)
for row_label in rows_to_modify:
    blanked_column = 'DIAGNOSIS' if random.choice([True, False]) else 'PATIENT_TYPE'
    df_records.at[row_label, blanked_column] = np.nan

# Write to the data/rawdata folder (row index is not exported)
df_records.to_csv(parent_dir + "/data/rawdata/synth_health.csv", index=False)

# Display the first 20 rows to check
df_records.head(20)

Unnamed: 0,USER_ID,RECORD_DATE,DIAGNOSIS,PATIENT_TYPE
3,1,2020-05-10,,Outpatient
0,1,2021-04-01,E,Outpatient
1,1,2022-07-03,B,Telehealth
4,1,2022-08-10,I,Outpatient
2,1,2028-04-21,J,Urgent Care
9,2,2021-01-14,B,Emergency
8,2,2021-09-28,F,Inpatient
6,2,2026-08-10,,Outpatient
5,2,2027-04-15,,Urgent Care
7,2,2027-10-30,F,Emergency
