In [1]:
import pandas as pd
import numpy as np
import random
import math
from tqdm import tqdm

In [2]:
def ci_summary(my_list, percent=True, Label='Mean'):
  # Calculate mean
  mean_val = np.mean(my_list)
  # Calculate 2.5 percentile
  percentile_025 = np.percentile(my_list, 2.5)
  # Calculate 97.5 percentile
  percentile_975 = np.percentile(my_list, 97.5)
  if percent==True:
    print(f"\n{Label}: {mean_val*100:.2f}%, (95% SI: {percentile_025*100:.2f}%, {percentile_975*100:.2f}%)")
  else:
    print(f"\n{Label}: {mean_val:.2f}, (95% SI: {percentile_025:.2f}, {percentile_975:.2f})")

In [3]:
def beta_parameters(mean, stddev):
    # make sure mean and stddev are in valid range
    assert 0 < mean < 1, "Mean should be in (0, 1) range"
    assert 0 < stddev < np.sqrt(mean * (1 - mean)), "Standard deviation should be in (0, sqrt(mean * (1 - mean))) range"
    
    # convert stddev to variance
    variance = stddev ** 2
    
    # common part of both alpha and beta
    common = mean * (1 - mean) / variance - 1
    
    # calculate alpha and beta
    alpha = mean * common
    beta = (1 - mean) * common
    
    return alpha, beta

In [4]:
asd_prev=pd.read_csv('ASD Prevalence.csv')
asd_prev['Age_group']=18
pop = pd.read_csv('Population Data.csv')
mortality_rate=pd.read_csv('Mortality Rates for General Population.csv')
hr = pd.read_csv('ASD Mortality RR.csv')

In [5]:
# Create a boolean mask for rows that contain the text
import re

# Define the age groups to match
age_groups = ['0 to 4', '5 to 9', '10 to 14', '90 to 94', '95 to 99', '100 and over']

# Create a regex pattern that matches any of the age groups as a whole word (not part of another string)
# The \b denotes a word boundary in regex, ensuring we're matching the entire string and not a part of it.
pattern = r'\b(?:' + '|'.join(age_groups) + r')\b'

# Apply a function to each cell in the DataFrame. The function checks if the cell's value (converted to a string and stripped of leading/trailing whitespace)
# matches the regex pattern exactly. This returns a DataFrame of the same shape as 'pop' with True for matches and False for non-matches.
# Then, .any(axis=1) checks if any cells in each row are True (i.e., if any cells in the row match one of the age groups).
# This results in a Series where each element corresponds to a row of 'pop', and the element is True if any cell in that row matches one of the age groups.
mask = pop.apply(lambda row: row.astype(str).apply(lambda x: bool(re.fullmatch(pattern, x.strip()))).any(), axis=1)

# Filter the DataFrame to exclude the rows with the specified text
pop = pop[~mask].copy()
pop.reset_index(inplace=True, drop=True)

# Replace "15 to 19" with "18 to 19" in the "Age_group" column
pop['Age_group'] = np.where(pop["Age_group"]=="15 to 19", "18 to 19", pop["Age_group"])

# Convert "object_column" to integer
pop['Persons'] = pop['Persons'].astype(int)

# Conditionally divide "person" column values by 5 and multiply by 2
pop['Persons'] = np.where(pop["Age_group"]=="18 to 19", (pop["Persons"]/5)*2, pop["Persons"])


# Create a new DataFrame to store the new data
dfs = []

# Iterate through the original DataFrame
for index, row in pop.iterrows():
    # Get the start and end ages for each age group
    start_age, end_age = [int(age) for age in row['Age_group'].split(' to ')]
    
    # Create new rows for each individual age within the age group
    for age in range(start_age, end_age + 1):
        new_row = row.copy()
        new_row['Age_group'] = age
        new_row['Persons'] = new_row['Persons'] / 2 if new_row['Age_group'] == 18 or new_row['Age_group'] == 19 else new_row['Persons'] / 5

        new_row['Persons']=round(new_row['Persons'],0)
        dfs.append(pd.DataFrame(new_row).T)

        # Concatenate the list of DataFrames into a single DataFrame
new_data = pd.concat(dfs, ignore_index=True)
new_data['Persons']=new_data['Persons'].astype(int)
pop=new_data.copy()

In [6]:
# Create a boolean mask for rows that contain the text
mask = mortality_rate.apply(lambda row: any(row.astype(str).str.contains('0 to 1|1 to 4|5 to 9|10 to 14|90+')), axis=1)

# Filter the DataFrame to exclude the rows with the text
mortality_rate = mortality_rate[~mask].copy()
mortality_rate.reset_index(inplace=True, drop=True)


# Replace "15 to 19" with "18 to 19" in the "Age_group" column
mortality_rate['Age_group'] = np.where(mortality_rate["Age_group"]=="15 to 19", "18 to 19", mortality_rate["Age_group"])

# Convert "object_column" to integer
mortality_rate['Rate'] = mortality_rate['Rate'].astype(float)
mortality_rate = mortality_rate[mortality_rate['Year'] == 2019]
mortality_rate = mortality_rate.drop('Year', axis=1)
mortality_rate = mortality_rate.rename(columns={'Rate': 'Mortality Rate'})

# Create a new DataFrame to store the new data
dfs = []

# Iterate through the original DataFrame
for index, row in mortality_rate.iterrows():
    # Get the start and end ages for each age group
    start_age, end_age = [int(age) for age in row['Age_group'].split(' to ')]
    
    # Create new rows for each individual age within the age group
    for age in range(start_age, end_age + 1):
        new_row = row.copy()
        new_row['Age_group'] = age
        dfs.append(pd.DataFrame(new_row).T)
# Concatenate the list of DataFrames into a single DataFrame
new_data = pd.concat(dfs, ignore_index=True)
mortality_rate=new_data.copy()

In [7]:
data = pd.DataFrame()
data = pd.merge(left=pop,right=mortality_rate, how='left', on=['Geography','Sex','Age_group'])
data = pd.merge(left=data,right=asd_prev, how='left', on=['Geography','Sex', 'Age_group'])
data = pd.merge(left=data,right=hr, how='left', on=['Sex'])

In [8]:
data = data.rename(columns={'Mortality Rate':'mortality_rate', 'rate':'hazard_ratio'})
data['Persons']=data['Persons'].astype(int)
data['hazard_ratio']=np.exp(data['hazard_ratio'])


data["mortality_rate"]=data["mortality_rate"].astype(float)
data["Persons"]=data["Persons"].astype(float)
data["asd_prevalence"]=data["asd_prevalence"].astype(float)

data['pop_survival']=1-((data['Persons']*((data['mortality_rate']/1000)))/data['Persons'])
data = data.rename(columns={'Persons':'population', 'Age_group': 'age', 'Geography':'province', 'Sex':'sex'})


In [9]:
data[(data['province'] == 'Alberta') & (data['sex'] == 'Female')]

# province = The name of Canadain province
# sex = sex male/females
# Age = Age of group increasing by 1 year increments
# population = Population size for the respective province, sex, and age group. This data was obtained from Statistics canada
# mortality_rate = Mortality rate per 1000 individuals from the general population
# asd_prevalence = asd prevalence from 2019 CHSCY for those 1-17 years of age.
# asd_prevalence_se = standard error for asd prevalence from 2019 CHSCY for those 1-17 years of age.
# hazard_ratio = hazard ratio for death for the respective sex. We assumed constant hazard ratio for all ages and all provinces.
# se = standard error for the hazard ratio for death in the respective sex. We assumed constant hazard ratio for all ages and all provinces.
# pop_survival = probability of survival for the respective province, sex, and age group. This variable was derived from mortality_rate.


Unnamed: 0,province,sex,age,population,mortality_rate,asd_prevalence,asd_prevalence_se,hazard_ratio,se,pop_survival
1224,Alberta,Female,18,24840.0,0.2,0.009583,0.003102,4.87,0.236,0.9998
1225,Alberta,Female,19,24840.0,0.2,,,4.87,0.236,0.9998
1226,Alberta,Female,20,26493.0,0.5,,,4.87,0.236,0.9995
1227,Alberta,Female,21,26493.0,0.5,,,4.87,0.236,0.9995
1228,Alberta,Female,22,26493.0,0.5,,,4.87,0.236,0.9995
...,...,...,...,...,...,...,...,...,...,...
1291,Alberta,Female,85,5109.0,72.8,,,4.87,0.236,0.9272
1292,Alberta,Female,86,5109.0,72.8,,,4.87,0.236,0.9272
1293,Alberta,Female,87,5109.0,72.8,,,4.87,0.236,0.9272
1294,Alberta,Female,88,5109.0,72.8,,,4.87,0.236,0.9272


### Step 6: Conduct analysis to estimate prevalence
This code below is manipulating a dataset based on ASD (Autism Spectrum Disorder). It's iterating through groups of data by province and sex, calculating various statistics such as ASD prevalence, mortality rate, number of ASD cases, ASD survival rate, rho_adj, and gamma_adj, and appending these values to respective lists.

**Methods**

In our research, we employed a simulation-based methodology to estimate the prevalence of Autism Spectrum Disorder (ASD) across various Canadian provinces. This involved considering a range of factors such as age, sex, and overall population. The process hinged on a Monte Carlo simulation, a recognized technique in computational statistics that allows for the generation of numerous potential outcomes and their probabilities through random sampling.

Initially, we categorized our dataset into distinct groups based on the province and sex, with each group encapsulating data associated with age, population, and several ASD-related parameters.

However, we were unable to derive estimates for ASD prevalence and standard errors for Yukon, Northwest Territories, Nunavut, and the female population in Prince Edward Island using the Canadian Health Survey on Children and Youth (CHSCY). To manage this data limitation, we made two informed assumptions. Firstly, we assumed that the ASD prevalence in these regions would align with the national sex-specific average. Secondly, we proposed that the standard error in these regions would equate to 50% of the ASD prevalence, signifying a coefficient of variation of 50%.

Within each simulation run, the hazard ratio for death was recalculated by selecting a value from a normal distribution defined by a specific mean and standard error. This technique facilitated the simulation of a plausible level of random variation.

For every province-sex group, the initial calculation of ASD prevalence was based on a presumed normal distribution centered around the given prevalence value and its standard error. 

Following this, within each age category of a group, we calculated several variables. These included the ASD mortality rate, calculated as the product of the group's hazard ratio for death and the survival rate of the general population. The number of ASD cases was also calculated, using the group's population and the ASD prevalence. An adjusted ASD prevalence was computed to reflect the probable effect of the ASD survival rate on the prevalence. Lastly, the adjusted number of ASD cases in the population was estimated by multiplying the group's population by the adjusted ASD prevalence.

This procedure was repeated across all age groups within each province-sex grouping. The data from each simulation run, including the calculated values and the province, sex, age, and population data, was then aggregated. 

This approach was conducted over a pre-set number of simulations, each iteration generating a unique data scenario, thereby constructing a wide array of plausible outcomes. This extensive collection of scenarios enabled a comprehensive understanding of the variability and potential range of ASD prevalence across Canadian provinces.

This simulation-based approach provided a robust mechanism for estimating ASD prevalence, taking into account inherent uncertainties in population-based studies and potential random variation in key parameters such as the hazard ratio for death and ASD prevalence. Additionally, it addressed data limitations by making evidence-based assumptions where required.

In [10]:
def run_simulation(data, hr, beta_parameters, num_simulations=10000):
    # Group the data by 'province' and 'sex' columns
    grouped = {province: data for province, data in data.groupby(['province','sex'])}

    sim_data = []

    # Initialize the tqdm progress bar
    for num_sim in tqdm(range(num_simulations), desc="Running simulations"):
        hr['current_simulated_hr'] = np.exp(np.random.normal(loc=hr['rate'], scale=hr['se']))

        # Initialize empty lists to store data
        province, sex, ages, pops, ASD_prev_3_17, N_ASD, HR, asd_survival, rho_adj, gamma_adj = ([] for _ in range(10))

        # Loop through each key in the dictionary
        for key in grouped:
            # Get ASD prevalence for the current group
            asd_prevalence_prev = grouped[key]['asd_prevalence'].values[0]
            se_ASD_prev=grouped[key]['asd_prevalence_se'].values[0]

            alpha, beta = beta_parameters(asd_prevalence_prev, se_ASD_prev)
            asd_prevalence_prev = np.random.beta(alpha, beta, 1)[0]
            #asd_prevalence_prev = np.random.normal(loc=asd_prevalence_prev, scale=se_ASD_prev)

            # Store the first ASD prevalence for the current group
            first_asd_prevalence = asd_prevalence_prev

            # Get ASD mortality rate for the current group
            mortality_rate = grouped[key]['mortality_rate'].values
            exponentiated_HR = hr[hr['Sex'] == key[1]]['current_simulated_hr'].values[0]
            
            # Get population for the current group
            population = grouped[key]['population'].values
            # Get population survival rate for the current group
            pop_survival = grouped[key]['pop_survival'].values
            # Get age for the current group
            age = grouped[key]['age'].values

            # Loop through each record in the current group
            for i, (age_val, pop_val, pop_survival_val) in enumerate(zip(age, population, pop_survival)):
                asd_mortality_rate = exponentiated_HR * pop_survival_val
                asd_survival_temp = 1 - (asd_mortality_rate / 1000)
                N_ASD_temp = pop_val * asd_prevalence_prev
                rho_adj_temp = asd_prevalence_prev * asd_survival_temp
                gamma_adj_temp = pop_val * rho_adj_temp

                # Append province, sex and age to their respective lists
                province.append(key[0])
                sex.append(key[1])
                ages.append(age_val)

                # Append population and ASD prevalence for ages 3-17 to their respective lists
                pops.append(pop_val)
                ASD_prev_3_17.append(first_asd_prevalence)
                
                # Calculate number of ASD cases and append to the list
                N_ASD.append(N_ASD_temp)
                # Calculate ASD survival rate and append to the list
                asd_survival.append(asd_survival_temp)
                # Calculate rho_adj (adjusted ASD prevalence) and append to the list
                rho_adj.append(rho_adj_temp)
                # Calculate gamma_adj (adjusted ASD cases in the population) and append to the list
                gamma_adj.append(gamma_adj_temp)
                HR.append(exponentiated_HR)
                # Update the ASD prevalence (from current ASD province, sex, age group) for the next iteration
                asd_prevalence_prev = rho_adj_temp if age_val != 89 else 0

        mydata = {'province': province, 'age': ages, 'sex': sex, 'population': pops,
                  'ASD_prev_3_17': ASD_prev_3_17, 'N_ASD': N_ASD, 'Hazard_Ratio': HR,
                  'asd_survival': asd_survival, 'rho_adj': rho_adj, 
                  'gamma_adj': gamma_adj}

        data2 = pd.DataFrame(mydata)
        sim_data.append(data2)

    return sim_data

# Use the function
if __name__ == "__main__":
    sim_data = run_simulation(data, hr, beta_parameters)

Running simulations: 100%|██████████| 10000/10000 [02:36<00:00, 63.78it/s]


### Functions to aggregate simulated data

#### By a particular group

In [11]:
# Define bins and labels
bins = [18, 20, 25, 30,35,40,45,50,55,60,65,70,75,80,85, np.inf]
labels = ['18-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
          '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85+']

# Add "Simulation Number" to each dataframe in the list before concatenating
for i in tqdm(range(len(sim_data)), desc="Processing simulation data"):
    sim_data[i]['Simulation Number'] = i

aggregated_sim_data = pd.concat(sim_data)

# Create a new column 'age_group' based on 'age' column
aggregated_sim_data['age_group'] = pd.cut(aggregated_sim_data['age'], bins=bins, labels=labels, right=False)
aggregated_sim_data=aggregated_sim_data.reset_index(drop=True)
aggregated_sim_data = aggregated_sim_data[variables+['population','gamma_adj']+["Simulation Number"]].groupby(variables+["Simulation Number"]).sum().reset_index()
aggregated_sim_data['rho_adj']=aggregated_sim_data['gamma_adj']/aggregated_sim_data['population']

 # Apply the aggregation functions to each group
agg_funcs = ['mean', 'min', 'max', lambda x: np.percentile(x, 2.5), lambda x: np.percentile(x, 97.5)]
agg_func_names = ['Mean', 'Minimum', 'Maximum', '2.5th_quintile', '97.5th_quintile']

descriptive_stats = aggregated_sim_data[variables+['rho_adj']].groupby(variables).agg(agg_funcs).reset_index()
# Now the columns will be MultiIndex, so you'll need to flatten them and rename
descriptive_stats.columns = descriptive_stats.columns.map('{0[0]}_{0[1]}'.format)
for i, func_name in enumerate(agg_func_names):
    descriptive_stats.rename(columns={
        f'rho_adj_{func_name}': f'rho_adj_{i}'
    }, inplace=True)

descriptive_stats

Processing simulation data: 100%|██████████| 10000/10000 [00:03<00:00, 2977.91it/s]


NameError: name 'variables' is not defined

In [12]:
aggregated_sim_data

Unnamed: 0,province,age,sex,population,ASD_prev_3_17,N_ASD,Hazard_Ratio,asd_survival,rho_adj,gamma_adj,Simulation Number,age_group
0,Alberta,18,Female,24840.0,0.009742,242.000539,5.808047,0.994193,0.009686,240.595269,0,18-19
1,Alberta,19,Female,24840.0,0.009742,240.595269,5.808047,0.994193,0.009630,239.198160,0,18-19
2,Alberta,20,Female,26493.0,0.009742,255.115816,5.808047,0.994195,0.009574,253.634832,0,20-24
3,Alberta,21,Female,26493.0,0.009742,253.634832,5.808047,0.994195,0.009518,252.162445,0,20-24
4,Alberta,22,Female,26493.0,0.009742,252.162445,5.808047,0.994195,0.009463,250.698606,0,20-24
...,...,...,...,...,...,...,...,...,...,...,...,...
18719995,Yukon,85,Male,16.0,0.033919,0.472592,2.090569,0.998097,0.029481,0.471693,9999,85+
18719996,Yukon,86,Male,16.0,0.033919,0.471693,2.090569,0.998097,0.029425,0.470795,9999,85+
18719997,Yukon,87,Male,16.0,0.033919,0.470795,2.090569,0.998097,0.029369,0.469899,9999,85+
18719998,Yukon,88,Male,16.0,0.033919,0.469899,2.090569,0.998097,0.029313,0.469005,9999,85+


In [14]:
import numpy as np
import pandas as pd
from tqdm import tqdm

def get_stats_grouped(dfs, variables):
    # Define bins and labels
    bins = [18, 20, 25, 30, 35, 40, 45, 50, 55, 60, 65, 70, 75, 80, 85, np.inf]
    labels = ['18-19', '20-24', '25-29', '30-34', '35-39', '40-44', '45-49', '50-54',
              '55-59', '60-64', '65-69', '70-74', '75-79', '80-84', '85+']

    # Add "Simulation Number" to each dataframe in the list before concatenating
    for i in tqdm(range(len(dfs)), desc="Processing simulation data"):
        dfs[i]['Simulation Number'] = i

    aggregated_sim_data = pd.concat(dfs)

    # Create a new column 'age_group' based on 'age' column
    aggregated_sim_data['age_group'] = pd.cut(aggregated_sim_data['age'], bins=bins, labels=labels, right=False)
    aggregated_sim_data = aggregated_sim_data.reset_index(drop=True)
    aggregated_sim_data = aggregated_sim_data[variables+['population','gamma_adj']+["Simulation Number"]].groupby(variables+["Simulation Number"]).sum().reset_index()
    aggregated_sim_data['rho_adj'] = aggregated_sim_data['gamma_adj']/aggregated_sim_data['population']

    # Apply the aggregation functions to each group
    agg_funcs = ['mean', 'min', 'max', lambda x: np.percentile(x, 2.5), lambda x: np.percentile(x, 97.5)]
    agg_func_names = ['Mean', 'Minimum', 'Maximum', '2.5th_quintile', '97.5th_quintile']

    descriptive_stats = aggregated_sim_data[variables+['rho_adj']].groupby(variables).agg(agg_funcs).reset_index()

    # Now the columns will be MultiIndex, so you'll need to flatten them and rename
    descriptive_stats.columns = descriptive_stats.columns.map('{0[0]}_{0[1]}'.format)

    for i, func_name in enumerate(agg_func_names):
        descriptive_stats.rename(columns={
            f'rho_adj_{func_name}': f'rho_adj_{i}'
        }, inplace=True)

    return descriptive_stats


#### Nationally

In [15]:
def get_stats(df, column):
    """
    Given a dataframe and column name, this function will return
    the mean, 2.5th percentile, 97.5th percentile, min, and max of the column.
    These statistics will be returned as percentages.
    """
    mean = df[column].mean() * 100
    percentile_2_5 = np.percentile(df[column], 2.5) * 100
    percentile_97_5 = np.percentile(df[column], 97.5) * 100
    min_val = df[column].min() * 100
    max_val = df[column].max() * 100

    return {
        "mean": f'{mean:.2f}%',
        "2.5th SI": f'{percentile_2_5:.2f}%',
        "97.5th SI": f'{percentile_97_5:.2f}%',
        "min": f'{min_val:.2f}%',
        "max": f'{max_val:.2f}%',
    }

## Results

### 1. Results aggregated dataset by Province, Sex, Age Group, and Simulation Number

In [16]:
variables=['province', 'sex', 'age_group']
my_data = get_stats_grouped(sim_data, variables=variables)
display(my_data)

Processing simulation data: 100%|██████████| 10000/10000 [00:00<00:00, 12856.89it/s]


Unnamed: 0,province_,sex_,age_group_,rho_adj_mean,rho_adj_min,rho_adj_max,rho_adj_<lambda_0>,rho_adj_<lambda_1>
0,Alberta,Female,18-19,0.009514,0.002021,0.025639,0.004500,0.016469
1,Alberta,Female,20-24,0.009349,0.002001,0.025104,0.004426,0.016148
2,Alberta,Female,25-29,0.009119,0.001973,0.024358,0.004317,0.015729
3,Alberta,Female,30-34,0.008894,0.001946,0.023634,0.004219,0.015368
4,Alberta,Female,35-39,0.008675,0.001919,0.022931,0.004114,0.015007
...,...,...,...,...,...,...,...,...
385,Yukon,Male,65-69,0.028081,0.000540,0.128943,0.007556,0.061636
386,Yukon,Male,70-74,0.027792,0.000535,0.127446,0.007490,0.061140
387,Yukon,Male,75-79,0.027510,0.000530,0.125991,0.007426,0.060548
388,Yukon,Male,80-84,0.027236,0.000525,0.124574,0.007347,0.059942


### 2. Results by Province and Sex

In [17]:
variables=['province', 'sex']
my_data = get_stats_grouped(sim_data, variables=variables)
display(my_data)

Processing simulation data: 100%|██████████| 10000/10000 [00:00<00:00, 16070.81it/s]


Unnamed: 0,province_,sex_,rho_adj_mean,rho_adj_min,rho_adj_max,rho_adj_<lambda_0>,rho_adj_<lambda_1>
0,Alberta,Female,0.008317,0.001873,0.021802,0.003922,0.014446
1,Alberta,Male,0.028616,0.013984,0.053792,0.01999,0.03861
2,British Columbia,Female,0.006941,0.00163,0.017774,0.003531,0.011429
3,British Columbia,Male,0.032853,0.016673,0.056847,0.023262,0.043726
4,Manitoba,Female,0.003691,0.000138,0.01719,0.000897,0.008373
5,Manitoba,Male,0.02725,0.008159,0.061204,0.015098,0.042466
6,New Brunswick,Female,0.009201,0.000656,0.03482,0.002603,0.019735
7,New Brunswick,Male,0.064343,0.022833,0.118378,0.040929,0.091651
8,Newfoundland and Labrador,Female,0.003089,0.000178,0.012943,0.000853,0.006843
9,Newfoundland and Labrador,Male,0.036999,0.011164,0.079626,0.020485,0.057686


### 3. Results by Province

In [18]:
variables=['province']
my_data = get_stats_grouped(sim_data, variables=variables)
display(my_data)

Processing simulation data: 100%|██████████| 10000/10000 [00:00<00:00, 15940.80it/s]


Unnamed: 0,province_,rho_adj_mean,rho_adj_min,rho_adj_max,rho_adj_<lambda_0>,rho_adj_<lambda_1>
0,Alberta,0.018513,0.010492,0.031948,0.013432,0.024313
1,British Columbia,0.019714,0.011028,0.034715,0.014558,0.025484
2,Manitoba,0.015439,0.004711,0.031683,0.009157,0.023268
3,New Brunswick,0.036452,0.012324,0.066725,0.024025,0.050734
4,Newfoundland and Labrador,0.019772,0.006526,0.042339,0.011521,0.030103
5,Northwest Territories,0.018596,0.002659,0.062637,0.006923,0.036763
6,Nova Scotia,0.013285,0.003765,0.030803,0.007323,0.02088
7,Nunavut,0.018791,0.001839,0.063833,0.006895,0.037147
8,Ontario,0.019149,0.015526,0.024009,0.016981,0.021418
9,Prince Edward Island,0.032354,0.016704,0.057006,0.022726,0.043672


### 4. Results aggregated dataset Nationally

In [19]:
variables=["Simulation Number"]

# Add "Simulation Number" to each dataframe in the list before concatenating
for i in tqdm(range(len(sim_data)), desc="Processing simulation data"):
    sim_data[i]['Simulation Number'] = i

aggregated_sim_data = pd.concat(sim_data)
aggregated_sim_data=aggregated_sim_data.reset_index(drop=True)
aggregated_sim_data = aggregated_sim_data[variables+['population','gamma_adj']].groupby(variables).sum().reset_index()
aggregated_sim_data['rho_adj']=aggregated_sim_data['gamma_adj']/aggregated_sim_data['population']

rho_adj_stats = get_stats(aggregated_sim_data, 'rho_adj')
rho_adj_stats

Processing simulation data: 100%|██████████| 10000/10000 [00:00<00:00, 16243.37it/s]


{'mean': '1.81%',
 '2.5th SI': '1.64%',
 '97.5th SI': '1.99%',
 'min': '1.45%',
 'max': '2.15%'}

### 5. Results aggregated dataset nationally by sex

In [20]:
variables=['sex']
my_data = get_stats_grouped(sim_data, variables=variables)
display(my_data)

Processing simulation data: 100%|██████████| 10000/10000 [00:00<00:00, 16554.10it/s]


Unnamed: 0,sex_,rho_adj_mean,rho_adj_min,rho_adj_max,rho_adj_<lambda_0>,rho_adj_<lambda_1>
0,Female,0.007237,0.004183,0.010588,0.005709,0.008991
1,Male,0.029088,0.023538,0.035443,0.026075,0.032284


### 6. Results aggregated dataset nationally by age group

In [53]:
variables=['age_group']
my_data = get_stats_grouped(sim_data, variables=variables)
display(my_data)

Processing simulation data: 100%|██████████| 10000/10000 [00:00<00:00, 15310.91it/s]


Unnamed: 0,age_group_,rho_adj_mean,rho_adj_min,rho_adj_max,rho_adj_<lambda_0>,rho_adj_<lambda_1>
0,18-19,0.019902,0.016673,0.023984,0.018113,0.0218
1,20-24,0.019977,0.016771,0.024116,0.018169,0.021873
2,25-29,0.019457,0.016274,0.023563,0.017674,0.021345
3,30-34,0.018984,0.015859,0.02304,0.017223,0.02085
4,35-39,0.018515,0.015413,0.022524,0.016765,0.020376
5,40-44,0.018187,0.015072,0.022127,0.016463,0.020028
6,45-49,0.018131,0.014955,0.022068,0.016438,0.019946
7,50-54,0.017923,0.014687,0.021865,0.016233,0.019725
8,55-59,0.017636,0.014316,0.021572,0.015945,0.019454
9,60-64,0.017265,0.013876,0.021186,0.015574,0.019096


In [54]:
### 5. Results aggregated dataset nationally by sex and age group

In [55]:
variables=['sex', 'age_group']
my_data = get_stats_grouped(sim_data, variables=variables)
display(my_data)

Processing simulation data: 100%|██████████| 10000/10000 [00:00<00:00, 13894.83it/s]


Unnamed: 0,sex_,age_group_,rho_adj_mean,rho_adj_min,rho_adj_max,rho_adj_<lambda_0>,rho_adj_<lambda_1>
0,Female,18-19,0.008322,0.00497,0.011955,0.00667,0.010124
1,Female,20-24,0.008191,0.004856,0.011867,0.006559,0.009958
2,Female,25-29,0.00801,0.004642,0.011707,0.006393,0.009795
3,Female,30-34,0.00781,0.004429,0.011442,0.006218,0.009577
4,Female,35-39,0.007635,0.004229,0.011295,0.006033,0.00941
5,Female,40-44,0.007452,0.00408,0.011095,0.005868,0.009207
6,Female,45-49,0.007244,0.003957,0.010797,0.005702,0.008942
7,Female,50-54,0.007065,0.003798,0.010622,0.005525,0.008771
8,Female,55-59,0.006894,0.003633,0.010473,0.005357,0.00861
9,Female,60-64,0.006725,0.003473,0.010341,0.005184,0.008459
