<a href="https://colab.research.google.com/github/aaljaish/ASD_Prevalence/blob/main/03_Simulation_for_ASD_Prevalence.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
import random
import math
from tqdm import tqdm

In [2]:
def ci_summary(my_list, percent=True, Label='Mean'):
  # Calculate mean
  mean_val = np.mean(my_list)
  # Calculate 2.5 percentile
  percentile_025 = np.percentile(my_list, 2.5)
  # Calculate 97.5 percentile
  percentile_975 = np.percentile(my_list, 97.5)
  if percent==True:
    print(f"\n{Label}: {mean_val*100:.2f}%, (95% SI: {percentile_025*100:.2f}%, {percentile_975*100:.2f}%)")
  else:
    print(f"\n{Label}: {mean_val:.2f}, (95% SI: {percentile_025:.2f}, {percentile_975:.2f})")

In [9]:
def beta_parameters(mean, stddev):
    # make sure mean and stddev are in valid range
    assert 0 < mean < 1, "Mean should be in (0, 1) range"
    assert 0 < stddev < np.sqrt(mean * (1 - mean)), "Standard deviation should be in (0, sqrt(mean * (1 - mean))) range"
    
    # convert stddev to variance
    variance = stddev ** 2
    
    # common part of both alpha and beta
    common = mean * (1 - mean) / variance - 1
    
    # calculate alpha and beta
    alpha = mean * common
    beta = (1 - mean) * common
    
    return alpha, beta

In [3]:
asd_prev=pd.read_csv('https://raw.githubusercontent.com/aaljaish/ASD_Data/main/ASD%20Prevalence.csv')
asd_prev['Age_group']=18
pop = pd.read_csv('https://raw.githubusercontent.com/aaljaish/ASD_Data/main/Population%20Data.csv')
mortality_rate=pd.read_csv('https://raw.githubusercontent.com/aaljaish/ASD_Data/main/Mortality%20Rates%20for%20General%20Population.csv')
hr = pd.read_csv('https://raw.githubusercontent.com/aaljaish/ASD_Data/main/ASD%20Mortality%20HR.csv')

In [4]:
# Create a boolean mask for rows that contain the text
import re

# Define the age groups to match
age_groups = ['0 to 4', '5 to 9', '10 to 14', '90 to 94', '95 to 99', '100 and over']

# Create a regex pattern that matches any of the age groups as a whole word (not part of another string)
# The \b denotes a word boundary in regex, ensuring we're matching the entire string and not a part of it.
pattern = r'\b(?:' + '|'.join(age_groups) + r')\b'

# Apply a function to each cell in the DataFrame. The function checks if the cell's value (converted to a string and stripped of leading/trailing whitespace)
# matches the regex pattern exactly. This returns a DataFrame of the same shape as 'pop' with True for matches and False for non-matches.
# Then, .any(axis=1) checks if any cells in each row are True (i.e., if any cells in the row match one of the age groups).
# This results in a Series where each element corresponds to a row of 'pop', and the element is True if any cell in that row matches one of the age groups.
mask = pop.apply(lambda row: row.astype(str).apply(lambda x: bool(re.fullmatch(pattern, x.strip()))).any(), axis=1)

# Filter the DataFrame to exclude the rows with the specified text
pop = pop[~mask].copy()
pop.reset_index(inplace=True, drop=True)

# Replace "15 to 19" with "18 to 19" in the "Age_group" column
pop['Age_group'] = np.where(pop["Age_group"]=="15 to 19", "18 to 19", pop["Age_group"])

# Convert "object_column" to integer
pop['Persons'] = pop['Persons'].astype(int)

# Conditionally divide "person" column values by 5 and multiply by 2
pop['Persons'] = np.where(pop["Age_group"]=="18 to 19", (pop["Persons"]/5)*2, pop["Persons"])


# Create a new DataFrame to store the new data
dfs = []

# Iterate through the original DataFrame
for index, row in pop.iterrows():
    # Get the start and end ages for each age group
    start_age, end_age = [int(age) for age in row['Age_group'].split(' to ')]
    
    # Create new rows for each individual age within the age group
    for age in range(start_age, end_age + 1):
        new_row = row.copy()
        new_row['Age_group'] = age
        new_row['Persons'] = new_row['Persons'] / 2 if new_row['Age_group'] == 18 or new_row['Age_group'] == 19 else new_row['Persons'] / 5

        new_row['Persons']=round(new_row['Persons'],0)
        dfs.append(pd.DataFrame(new_row).T)

        # Concatenate the list of DataFrames into a single DataFrame
new_data = pd.concat(dfs, ignore_index=True)
new_data['Persons']=new_data['Persons'].astype(int)
pop=new_data.copy()

In [5]:
# Create a boolean mask for rows that contain the text
mask = mortality_rate.apply(lambda row: any(row.astype(str).str.contains('0 to 1|1 to 4|5 to 9|10 to 14|90+')), axis=1)

# Filter the DataFrame to exclude the rows with the text
mortality_rate = mortality_rate[~mask].copy()
mortality_rate.reset_index(inplace=True, drop=True)


# Replace "15 to 19" with "18 to 19" in the "Age_group" column
mortality_rate['Age_group'] = np.where(mortality_rate["Age_group"]=="15 to 19", "18 to 19", mortality_rate["Age_group"])

# Convert "object_column" to integer
mortality_rate['Rate'] = mortality_rate['Rate'].astype(float)
mortality_rate = mortality_rate[mortality_rate['Year'] == 2019]
mortality_rate = mortality_rate.drop('Year', axis=1)
mortality_rate = mortality_rate.rename(columns={'Rate': 'Mortality Rate'})

# Create a new DataFrame to store the new data
dfs = []

# Iterate through the original DataFrame
for index, row in mortality_rate.iterrows():
    # Get the start and end ages for each age group
    start_age, end_age = [int(age) for age in row['Age_group'].split(' to ')]
    
    # Create new rows for each individual age within the age group
    for age in range(start_age, end_age + 1):
        new_row = row.copy()
        new_row['Age_group'] = age
        dfs.append(pd.DataFrame(new_row).T)
# Concatenate the list of DataFrames into a single DataFrame
new_data = pd.concat(dfs, ignore_index=True)
mortality_rate=new_data.copy()

In [6]:
data = pd.DataFrame()
data = pd.merge(left=pop,right=mortality_rate, how='left', on=['Geography','Sex','Age_group'])
data = pd.merge(left=data,right=asd_prev, how='left', on=['Geography','Sex', 'Age_group'])
data = pd.merge(left=data,right=hr, how='left', on=['Sex'])

In [7]:
data = data.rename(columns={'Mortality Rate':'mortality_rate', 'rate':'hazard_ratio'})
data['Persons']=data['Persons'].astype(int)
data['hazard_ratio']=np.exp(data['hazard_ratio'])


data["mortality_rate"]=data["mortality_rate"].astype(float)
data["Persons"]=data["Persons"].astype(float)
data["asd_prevalence"]=data["asd_prevalence"].astype(float)

data['pop_survival']=1-((data['Persons']*((data['mortality_rate']/1000)))/data['Persons'])
data = data.rename(columns={'Persons':'population', 'Age_group': 'age', 'Geography':'province', 'Sex':'sex'})


In [8]:
data[(data['province'] == 'Alberta') & (data['sex'] == 'Female')]

Unnamed: 0,province,sex,age,population,mortality_rate,asd_prevalence,asd_prevalence_se,hazard_ratio,se,pop_survival
1224,Alberta,Female,18,24840.0,0.2,0.009583,0.003102,3.119897,0.145,0.9998
1225,Alberta,Female,19,24840.0,0.2,,,3.119897,0.145,0.9998
1226,Alberta,Female,20,26493.0,0.5,,,3.119897,0.145,0.9995
1227,Alberta,Female,21,26493.0,0.5,,,3.119897,0.145,0.9995
1228,Alberta,Female,22,26493.0,0.5,,,3.119897,0.145,0.9995
...,...,...,...,...,...,...,...,...,...,...
1291,Alberta,Female,85,5109.0,72.8,,,3.119897,0.145,0.9272
1292,Alberta,Female,86,5109.0,72.8,,,3.119897,0.145,0.9272
1293,Alberta,Female,87,5109.0,72.8,,,3.119897,0.145,0.9272
1294,Alberta,Female,88,5109.0,72.8,,,3.119897,0.145,0.9272


### Step 6: Conduct analysis to estimate prevalence
This code below is manipulating a dataset based on ASD (Autism Spectrum Disorder). It's iterating through groups of data by province and sex, calculating various statistics such as ASD prevalence, mortality rate, number of ASD cases, ASD survival rate, rho_adj, and gamma_adj, and appending these values to respective lists.

In [13]:
def run_simulation(data, hr, beta_parameters, num_simulations=1000):
    # Group the data by 'province' and 'sex' columns
    grouped = {province: data for province, data in data.groupby(['province','sex'])}

    sim_data = []

    # Initialize the tqdm progress bar
    for num_sim in tqdm(range(num_simulations), desc="Running simulations"):
        hr['current_simulated_hr'] = np.exp(np.random.normal(loc=hr['rate'], scale=hr['se']))

        # Initialize empty lists to store data
        province, sex, ages, pops, ASD_prev_3_17, N_ASD, HR, asd_survival, rho_adj, gamma_adj = ([] for _ in range(10))

        # Loop through each key in the dictionary
        for key in grouped:
            # Get ASD prevalence for the current group
            asd_prevalence_prev = grouped[key]['asd_prevalence'].values[0]
            se_ASD_prev=grouped[key]['asd_prevalence_se'].values[0]

            alpha, beta = beta_parameters(asd_prevalence_prev, se_ASD_prev)
            # asd_prevalence_prev = np.random.beta(alpha, beta, 1)[0]
            asd_prevalence_prev = np.random.normal(loc=asd_prevalence_prev, scale=se_ASD_prev)

            # Store the first ASD prevalence for the current group
            first_asd_prevalence = asd_prevalence_prev

            # Get ASD mortality rate for the current group
            mortality_rate = grouped[key]['mortality_rate'].values
            exponentiated_HR = hr[hr['Sex'] == key[1]]['current_simulated_hr'].values[0]
            
            # Get population for the current group
            population = grouped[key]['population'].values
            # Get population survival rate for the current group
            pop_survival = grouped[key]['pop_survival'].values
            # Get age for the current group
            age = grouped[key]['age'].values

            # Loop through each record in the current group
            for i, (age_val, pop_val, pop_survival_val) in enumerate(zip(age, population, pop_survival)):
                asd_mortality_rate = exponentiated_HR * pop_survival_val
                asd_survival_temp = 1 - (asd_mortality_rate / 1000)
                N_ASD_temp = pop_val * asd_prevalence_prev
                rho_adj_temp = asd_prevalence_prev * asd_survival_temp
                gamma_adj_temp = pop_val * rho_adj_temp

                # Append province, sex and age to their respective lists
                province.append(key[0])
                sex.append(key[1])
                ages.append(age_val)

                # Append population and ASD prevalence for ages 3-17 to their respective lists
                pops.append(pop_val)
                ASD_prev_3_17.append(first_asd_prevalence)
                
                # Calculate number of ASD cases and append to the list
                N_ASD.append(N_ASD_temp)
                # Calculate ASD survival rate and append to the list
                asd_survival.append(asd_survival_temp)
                # Calculate rho_adj (adjusted ASD prevalence) and append to the list
                rho_adj.append(rho_adj_temp)
                # Calculate gamma_adj (adjusted ASD cases in the population) and append to the list
                gamma_adj.append(gamma_adj_temp)
                HR.append(exponentiated_HR)
                # Update the ASD prevalence (from current ASD province, sex, age group) for the next iteration
                asd_prevalence_prev = rho_adj_temp if age_val != 89 else 0

        mydata = {'province': province, 'age': ages, 'sex': sex, 'population': pops,
                  'ASD_prev_3_17': ASD_prev_3_17, 'N_ASD': N_ASD, 'Hazard_Ratio': HR,
                  'asd_survival': asd_survival, 'rho_adj': rho_adj, 
                  'gamma_adj': gamma_adj}

        data2 = pd.DataFrame(mydata)
        sim_data.append(data2)

    return sim_data

# Use the function
if __name__ == "__main__":
    sim_data = run_simulation(data, hr, beta_parameters)

Running simulations: 100%|██████████| 1000/1000 [00:15<00:00, 64.13it/s]


In [14]:
national_prevalence=[]
national_prevalence_male=[]
national_prevalence_female=[]

for sim in tqdm(sim_data, desc="Processing groups"):
  males=sim[sim['sex']=='Male']['gamma_adj'].sum()
  females=sim[sim['sex']=='Female']['gamma_adj'].sum()
  pop_males=sim[sim['sex']=='Male']['population'].sum()
  pop_females=sim[sim['sex']=='Female']['population'].sum()

  national_prevalence.append(sim['gamma_adj'].sum()/sim['population'].sum())
  national_prevalence_male.append(males/pop_males)
  national_prevalence_female.append(females/pop_females)

ci_summary(national_prevalence, True, Label='National Estimate: ')
ci_summary(national_prevalence_male, True, Label='National Estimate Males: ')
ci_summary(national_prevalence_female, True, Label='National Estimate Females: ')

Processing groups: 100%|██████████| 1000/1000 [00:02<00:00, 409.48it/s]


National Estimate: : 1.78%, (95% SI: 1.61%, 1.95%)

National Estimate Males: : 2.82%, (95% SI: 2.52%, 3.11%)

National Estimate Females: : 0.76%, (95% SI: 0.60%, 0.93%)





In [15]:
HR_simulated_male=[]
HR_simulated_female=[]


for sim in tqdm(sim_data, desc="Processing groups"):
  HR_simulated_female.append(sim[sim['sex']=='Female']['Hazard_Ratio'].values[0])
  HR_simulated_male.append(sim[sim['sex']=='Male']['Hazard_Ratio'].values[0])

ci_summary(HR_simulated_male, False, Label='HR Males')
ci_summary(HR_simulated_female, False, Label='HR Females')

Processing groups: 100%|██████████| 1000/1000 [00:00<00:00, 1141.61it/s]


HR Males: 3.15, (95% SI: 2.59, 3.79)

HR Females: 3.15, (95% SI: 2.31, 4.12)



