In [0]:
#define functions needed to create example data

In [0]:
import random
import datetime
import pyspark.sql.functions as spark_funcs

In [0]:
#define function to create example pyspark dataframe with 250,000 rows and the following columns
#id_code, date_of_birth, gender, death_indicator, date_of_death
#date_of_death is blank for living members and populated for dead members
#dates of death will be from 2021-2023 if populated

def create_example_dataset(num_rows=250000, seed=1):
    """Build a synthetic member mortality dataset as a PySpark DataFrame.

    Columns: id_code, date_of_birth, gender, death_indicator, date_of_death.
    Dates of birth fall in 1920-1963 and dates of death in 2021-2023 (both
    ranges inclusive). date_of_death is None for members whose
    death_indicator is False.

    Args:
        num_rows: number of members to generate (defaults to 250,000).
        seed: seed passed to random.seed so repeated calls give the same data.

    Returns:
        DataFrame created through the notebook-provided ``spark`` session.

    Raises:
        ValueError: if num_rows is less than 1.
    """
    if num_rows < 1:
        raise ValueError("num_rows must be a positive integer")

    #set seed for reproducibility
    random.seed(seed)

    #consecutive ids starting at 100000; all six digits for up to 900,000 rows
    ids = range(100000, 100000 + num_rows)

    #dates of birth, randomly generated from 1920-1963
    dates_of_birth = _random_dates(
        datetime.date(1920, 1, 1), datetime.date(1963, 12, 31), num_rows
    )

    #genders evenly split, alternating M, F, M, F, ...
    genders = ['M' if i % 2 == 0 else 'F' for i in range(num_rows)]

    #death rate by gender, indexed by number of years born before 1963
    death_rates = {
        'F': _death_rates(0.010, 850, 31, 10000),
        'M': _death_rates(0.012, 700, 29, 8500),
    }

    #binary death indicators, coin flip weighted by year of birth and gender
    death_indicators = []
    for birth_date, gender in zip(dates_of_birth, genders):
        #index 0 is the youngest cohort (born 1963), 43 the oldest (born 1920)
        rate_index = 1963 - birth_date.year
        death_indicators.append(random.random() < death_rates[gender][rate_index])

    #draw a 2021-2023 death date for every member, then blank it for the living
    #(drawing for everyone keeps the random stream independent of the indicators)
    death_date_draws = _random_dates(
        datetime.date(2021, 1, 1), datetime.date(2023, 12, 31), num_rows
    )
    dates_of_death = [
        drawn if is_dead else None
        for drawn, is_dead in zip(death_date_draws, death_indicators)
    ]

    #create final output, one tuple per member
    data = list(zip(ids, dates_of_birth, genders, death_indicators, dates_of_death))

    #NOTE(review): column types are inferred by Spark from the data; confirm
    #inference still works if num_rows is small enough that no deaths occur
    return spark.createDataFrame(data, ['id_code', 'date_of_birth', 'gender', 'death_indicator', 'date_of_death'])


def _random_dates(first_day, last_day, count):
    """Return `count` random dates between first_day and last_day inclusive."""
    #randrange excludes its stop value, so add 1 to make last_day reachable
    span_days = (last_day - first_day).days + 1
    return [
        first_day + datetime.timedelta(days=random.randrange(span_days))
        for _ in range(count)
    ]


def _death_rates(base_rate, linear_divisor, quadratic_after, quadratic_divisor):
    """Return 44 death probabilities, one per birth-year cohort (index 0-43).

    Each rate is base_rate plus a linear term; a quadratic term is added for
    indexes above quadratic_after, and indexes above 40 get a further bump
    with the result capped at 1.
    """
    rates = []
    for i in range(44):
        rate = base_rate + (i / linear_divisor)
        if i > quadratic_after:
            rate = rate + (i * i / quadratic_divisor)
        if i > 40:
            #cap at 1 so the value stays a valid probability
            rate = min(rate + (i / 44), 1)
        rates.append(rate)
    return rates


