#### Step 1: Import required libraries

In [1]:
import multiprocessing
import re

import arviz as az
import numpy as np
import pandas as pd
import pymc as pm
from scipy.special import expit
from scipy.special import logit
from sklearn.preprocessing import LabelEncoder

#### Step 2: Load in the necessary data

In [2]:
# Read the ASD prevalence table and tag every row with age 18, the value of
# the `Age_group` key it is later merged on.
asd_prev = pd.read_csv('ASD Prevalence.csv').assign(Age_group=18)

In [3]:
# Load the three remaining inputs: population counts, general-population
# mortality rates, and the ASD mortality hazard-ratio table.
pop, mortality_rate, hr = (
    pd.read_csv(csv_name)
    for csv_name in (
        'Population Data.csv',
        'Mortality Rates for General Population.csv',
        'ASD Mortality HR.csv',
    )
)

#### Step 3: Re-format 2019 population estimates

In [4]:
# Age bands removed from the population table before it is expanded to
# single years of age.
age_groups = ['0 to 4', '5 to 9', '10 to 14', '90 to 94', '95 to 99', '100 and over']

# Flag a row when ANY of its cells, rendered as text and stripped of
# surrounding whitespace, is exactly one of the labels above. An exact
# membership test selects the same rows as a whole-cell regex match on these
# labels, needs no pattern escaping, and runs column-wise instead of
# cell-by-cell.
mask = (
    pop.astype(str)
    .apply(lambda column: column.str.strip().isin(age_groups))
    .any(axis=1)
)
pop = pop[~mask].reset_index(drop=True)

# Only ages 18 and 19 of the "15 to 19" band are kept, so relabel that band.
pop['Age_group'] = pop['Age_group'].mask(pop['Age_group'] == '15 to 19', '18 to 19')

# Scale the relabelled band to the two retained years (2/5 of the five-year
# total); all other bands keep their full count. The arithmetic treats the
# count as spread evenly over the single years of a band.
pop['Persons'] = pop['Persons'].astype(int)
pop['Persons'] = np.where(pop['Age_group'] == '18 to 19', (pop['Persons'] / 5) * 2, pop['Persons'])


def _ages_in_band(band):
    """Return the individual ages covered by a 'start to end' band label."""
    # Unpacking into exactly two names rejects labels that are not 'A to B'.
    start_age, end_age = (int(bound) for bound in band.split(' to '))
    return list(range(start_age, end_age + 1))


# Expand every band into one row per single year of age. `explode` does this
# in a single pass and replaces building a one-row DataFrame per age followed
# by a large concat.
pop['Age_group'] = pop['Age_group'].map(_ages_in_band)
pop = pop.explode('Age_group', ignore_index=True)

# Split each band's count over its single years: ages 18 and 19 share the
# two-year band, every other age belongs to a five-year band.
per_age = np.where(pop['Age_group'].isin([18, 19]), pop['Persons'] / 2, pop['Persons'] / 5)
pop['Persons'] = np.round(per_age, 0).astype(int)

#### Step 4: Re-format 2019 mortality rates

In [5]:
# Age bands to drop, matched as substrings of the `Age_group` label.
#
# The previous pattern ended in '90+'. The unescaped '+' is a regex
# quantifier, so it matched any text containing "90"; and because every cell
# of the row was searched, a row whose Rate (or Year) merely rendered with
# "90" in it (e.g. a rate of 90.4) was silently dropped, leaving a missing
# mortality rate after the later left merge. Searching only the age label
# excludes the same age bands without touching unrelated rows.
excluded_bands = r'0 to 1|1 to 4|5 to 9|10 to 14|90'
mask = mortality_rate['Age_group'].astype(str).str.contains(excluded_bands, regex=True)
mortality_rate = mortality_rate[~mask].reset_index(drop=True)

# Only ages 18 and 19 of the "15 to 19" band are kept, so relabel that band.
mortality_rate['Age_group'] = mortality_rate['Age_group'].mask(
    mortality_rate['Age_group'] == '15 to 19', '18 to 19'
)

# Make the rate numeric, keep the 2019 rows only, and give the rate column
# the name used by the downstream merge.
mortality_rate['Rate'] = mortality_rate['Rate'].astype(float)
mortality_rate = (
    mortality_rate[mortality_rate['Year'] == 2019]
    .drop(columns='Year')
    .rename(columns={'Rate': 'Mortality Rate'})
)


def _band_to_single_ages(band):
    """Return the individual ages covered by a 'start to end' band label."""
    # Unpacking into exactly two names rejects labels that are not 'A to B'.
    start_age, end_age = (int(bound) for bound in band.split(' to '))
    return list(range(start_age, end_age + 1))


# Expand every band into one row per single year of age; each age inherits
# the band's rate unchanged. `explode` replaces building a one-row DataFrame
# per age followed by a large concat.
mortality_rate['Age_group'] = mortality_rate['Age_group'].map(_band_to_single_ages)
mortality_rate = mortality_rate.explode('Age_group', ignore_index=True)


In [6]:
# Left-join everything onto the single-year population table so that every
# (Geography, Sex, Age_group) population row is kept:
#   1. general-population mortality rate, by geography / sex / age
#   2. ASD prevalence, by geography / sex / age
#   3. ASD mortality hazard-ratio table, by sex
data = (
    pop
    .merge(mortality_rate, how='left', on=['Geography', 'Sex', 'Age_group'])
    .merge(asd_prev, how='left', on=['Geography', 'Sex', 'Age_group'])
    .merge(hr, how='left', on=['Sex'])
)

#### Step 5: Combine all data

In [7]:
# Give the two model inputs analysis-friendly names.
data = data.rename(columns={'Mortality Rate':'mortality_rate', 'rate':'hazard_ratio'})

data = data.assign(
    # Whole-number head counts, stored as float for the arithmetic below.
    Persons=lambda frame: frame['Persons'].astype(int).astype(float),
    # Exponentiate the stored value (presumably a log hazard ratio - TODO confirm).
    hazard_ratio=lambda frame: np.exp(frame['hazard_ratio']),
    mortality_rate=lambda frame: frame['mortality_rate'].astype(float),
    asd_prevalence=lambda frame: frame['asd_prevalence'].astype(float),
    # Scale the general-population mortality rate by the hazard ratio.
    asd_mortality_rate=lambda frame: frame['mortality_rate'] * frame['hazard_ratio'],
    # Head count with ASD; NaN wherever no prevalence was merged in.
    N_ASD=lambda frame: frame['Persons'] * frame['asd_prevalence'],
    # Rates are divided by 1000 before use. N_ASD / Persons cancel
    # algebraically, but the result is still NaN wherever N_ASD is NaN.
    asd_survival=lambda frame: 1-((frame['N_ASD']*((frame['asd_mortality_rate']/1000)))/frame['N_ASD']),
    pop_survival=lambda frame: 1-((frame['Persons']*((frame['mortality_rate']/1000)))/frame['Persons']),
)

# Final column names used by the rest of the notebook.
data = data.rename(columns={'Persons':'population', 'Age_group': 'age', 'Geography':'province', 'Sex':'sex'})

data.head(5)

Unnamed: 0,province,sex,age,population,mortality_rate,asd_prevalence,hazard_ratio,se,asd_mortality_rate,N_ASD,asd_survival,pop_survival
0,Newfoundland and Labrador,Male,18,2835.0,0.4,0.044297,3.129897,0.099,1.251959,125.580784,0.998748,0.9996
1,Newfoundland and Labrador,Male,19,2835.0,0.4,,3.129897,0.099,1.251959,,,0.9996
2,Newfoundland and Labrador,Male,20,3013.0,0.4,,3.129897,0.099,1.251959,,,0.9996
3,Newfoundland and Labrador,Male,21,3013.0,0.4,,3.129897,0.099,1.251959,,,0.9996
4,Newfoundland and Labrador,Male,22,3013.0,0.4,,3.129897,0.099,1.251959,,,0.9996


#### Step 6: Conduct analysis to estimate prevalence

The code below works on the combined ASD (Autism Spectrum Disorder) dataset. It iterates through the data grouped by province and sex and, for each row, calculates statistics such as the ASD prevalence, the ASD mortality rate, the number of ASD cases, the ASD survival rate, `rho_adj` (the adjusted ASD prevalence), and `gamma_adj` (the adjusted number of ASD cases), appending each value to its respective list.

In [8]:
# Output columns, one entry per (province, sex, age) row. They stay as
# module-level lists because the next cell assembles them into a DataFrame.
province, sex, ages, pops = [], [], [], []
# ASD_prev_3_17 holds the starting ASD prevalence (ages 3-17) of each group.
ASD_prev_3_17, N_ASD, asd_survival, rho_adj, gamma_adj = [], [], [], [], []

# One sub-frame per (province, sex) pair, keyed by that pair. Distinct
# comprehension names avoid reusing `province` / `data` for something else.
grouped = {
    group_key: group_rows
    for group_key, group_rows in data.groupby(['province', 'sex'])
}
my_keys = list(grouped.keys())

for key in my_keys:
    group = grouped[key]

    # Starting prevalence for this group: the value on the group's first row.
    first_asd_prevalence = group['asd_prevalence'].values[0]
    # Running prevalence, updated after every row of the group.
    asd_prevalence_prev = first_asd_prevalence

    rows = zip(
        group['age'].values,
        group['population'].values,
        group['asd_mortality_rate'].values,
    )
    for age_i, population_i, asd_mortality_i in rows:
        province.append(key[0])
        sex.append(key[1])
        ages.append(age_i)
        pops.append(population_i)
        ASD_prev_3_17.append(first_asd_prevalence)

        # ASD head count at the prevalence carried into this row.
        N_ASD.append(population_i * asd_prevalence_prev)

        # Survival for this row: one minus the ASD mortality rate / 1000.
        # Written without multiplying and dividing by the head count: that
        # factor cancels algebraically, and with a head count of zero it
        # turned the result into 0/0 = NaN, which then propagated to every
        # later row of the group through the running prevalence.
        survival_i = 1 - asd_mortality_i / 1000
        asd_survival.append(survival_i)

        # Adjusted prevalence: running prevalence times this row's survival
        # (same cancellation applied to the population factor).
        adjusted_prevalence = asd_prevalence_prev * survival_i
        rho_adj.append(adjusted_prevalence)

        # Adjusted ASD head count for this row.
        gamma_adj.append(population_i * adjusted_prevalence)

        # Carry the adjusted prevalence forward to the next row of the group.
        asd_prevalence_prev = adjusted_prevalence

In [9]:
# Pair each output column name with the list collected for it in the loop.
column_names = ['province', 'age', 'sex', 'population',
                'ASD_prev_3_17', 'N_ASD',
                'asd_survival', 'rho_adj',
                'gamma_adj']
column_values = [province, ages, sex, pops,
                 ASD_prev_3_17, N_ASD,
                 asd_survival, rho_adj,
                 gamma_adj]
mydata = dict(zip(column_names, column_values))

# One row per (province, sex, age).
data2 = pd.DataFrame(mydata)

# Preview the adjusted prevalence and case counts for the first 20 rows.
data2[['province', 'age', 'sex', 'rho_adj', 'gamma_adj']].head(20)

Unnamed: 0,province,age,sex,rho_adj,gamma_adj
0,Alberta,18,Female,0.011167,277.387676
1,Alberta,19,Female,0.01116,277.214592
2,Alberta,20,Female,0.011143,295.200867
3,Alberta,21,Female,0.011125,294.740369
4,Alberta,22,Female,0.011108,294.280589
5,Alberta,23,Female,0.011091,293.821526
6,Alberta,24,Female,0.011073,293.36318
7,Alberta,25,Female,0.011056,344.271539
8,Alberta,26,Female,0.011039,343.734493
9,Alberta,27,Female,0.011021,343.198285


In [10]:
# Adjusted ASD case counts within each province / sex stratum.
cases_by_stratum = data2.groupby(["province", "sex"])["gamma_adj"].sum().reset_index()

# Adjusted ASD case counts within each province (sexes combined).
asd_cases_by_province = data2.groupby("province")["gamma_adj"].sum().reset_index()

# Population totals per stratum, renamed to match the case table's keys.
pop_prov_sex = (
    pop[['Geography', 'Sex', 'Persons']]
    .groupby(['Geography', 'Sex'])
    .sum()
    .reset_index()
    .rename(columns={'Geography': 'province', 'Sex': 'sex'})
)

# Attach the population to the cases, derive the prevalence as a proportion
# and as a formatted percentage string, then apply presentation names.
asd_cases_by_province_sex = (
    cases_by_stratum
    .merge(pop_prov_sex, on=['province', 'sex'])
    .assign(
        Prevalence=lambda frame: frame['gamma_adj'] / frame['Persons'],
        Percentage=lambda frame: frame['Prevalence'].map('{:.2%}'.format),
    )
    .rename(columns={'Persons':'Population', 'province':'Province', 'sex':'Sex','gamma_adj': 'N with ASD '})
)

# National totals and the resulting national prevalence.
national_asd_cases = data2["gamma_adj"].sum()
national_population = data2["population"].sum()
national_prevalence = national_asd_cases / national_population

In [11]:
# Write the per-(province, sex, age) results to disk without the row index.
data2.to_csv(path_or_buf='ASD Dataset.csv', index=False)