In [7]:
# Perform imports
import pandas as pd
import matplotlib.pyplot as plt
import plotly as p
import plotly.express as px
import plotly.graph_objects as go
import os

In [8]:
# Load in the data
data = pd.read_csv("./Asthma.csv").drop(columns=["Unnamed: 0"])
data.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,StratificationCategory1,Stratification1,Stratification2,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
0,2014,2014,AR,Arkansas,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,916.0,...,Gender,Male,,POINT (-92.27449074299966 34.74865012400045),5,AST,AST3_1,NMBR,GENDER,GENM
1,2018,2018,CO,Colorado,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,2227.0,...,Overall,Overall,,POINT (-106.13361092099967 38.843840757000464),8,AST,AST3_1,NMBR,OVERALL,OVR
2,2018,2018,DC,District of Columbia,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,708.0,...,Overall,Overall,,POINT (-77.036871 38.907192),11,AST,AST3_1,NMBR,OVERALL,OVR
3,2017,2017,GA,Georgia,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,3520.0,...,Gender,Female,,POINT (-83.62758034599966 32.83968109300048),13,AST,AST3_1,NMBR,GENDER,GENF
4,2010,2010,MI,Michigan,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,123.0,...,Race/Ethnicity,Hispanic,,POINT (-84.71439026999968 44.6613195430005),26,AST,AST3_1,NMBR,RACE,HIS


## Quick analysis of Questions

I noticed the structure of the data varies a little between each question
so I might either need to:

    1) Implement a cleaning process to standardize
    2) Tackle each question individually

In [22]:
# First what are the questions?
questions = list(data['Question'].unique())
questions

['Hospitalizations for asthma',
 'Asthma mortality rate',
 'Emergency department visit rate for asthma',
 'Asthma prevalence among women aged 18-44 years',
 'Current asthma prevalence among adults aged >= 18 years',
 'Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma',
 'Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma',
 'Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma',
 'Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma']

In [13]:
# Understanding "Hospitalizations for asthma" looking at Overall for now for simplicity
hosp_overall = data[
    (data["Question"] == "Hospitalizations for asthma")
    & (data["StratificationCategory1"] == "Overall")
]
hosp_overall.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,StratificationCategory1,Stratification1,Stratification2,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
1,2018,2018,CO,Colorado,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,2227.0,...,Overall,Overall,,POINT (-106.13361092099967 38.843840757000464),8,AST,AST3_1,NMBR,OVERALL,OVR
2,2018,2018,DC,District of Columbia,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,708.0,...,Overall,Overall,,POINT (-77.036871 38.907192),11,AST,AST3_1,NMBR,OVERALL,OVR
7,2013,2013,PR,Puerto Rico,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,,...,Overall,Overall,,POINT (-66.590149 18.220833),72,AST,AST3_1,NMBR,OVERALL,OVR
8,2017,2017,PR,Puerto Rico,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,,...,Overall,Overall,,POINT (-66.590149 18.220833),72,AST,AST3_1,NMBR,OVERALL,OVR
44,2013,2013,GU,Guam,SEDD; SID,Asthma,Hospitalizations for asthma,"cases per 10,000",Crude Rate,,...,Overall,Overall,,POINT (144.793731 13.444304),66,AST,AST3_1,CRDRATE,OVERALL,OVR


In [17]:
# Create adictionary that generates the pairings of DataValueType and DataValueUnit
hosp_overall_pairing = {
    data_type: list(data[data["DataValueType"] == data_type]["DataValueUnit"].unique())
    for data_type in data["DataValueType"].unique()
}
hosp_overall_pairing

{'Number': [nan, 'Number'],
 'Crude Rate': ['cases per 10,000', 'cases per 1,000,000'],
 'Age-adjusted Rate': ['cases per 10,000', 'cases per 1,000,000'],
 'Crude Prevalence': ['%'],
 'Age-adjusted Prevalence': ['%']}

In [19]:
# I think it would be interesting to perform this
# pairing but with the Stratification1 and StratificationCategory1 columns

strat_pairings = {
    strat_category: list(data[data["StratificationCategory1"] == strat_category]['Stratification1'].unique())
    for strat_category in data["StratificationCategory1"].unique()
}
strat_pairings

{'Gender': ['Male', 'Female'],
 'Overall': ['Overall'],
 'Race/Ethnicity': ['Hispanic',
  'White, non-Hispanic',
  'Asian or Pacific Islander',
  'American Indian or Alaska Native',
  'Black, non-Hispanic',
  'Other, non-Hispanic',
  'Multiracial, non-Hispanic']}

### Now can we abstract these pairings into each question?

In [20]:
question_dict = {}              # Key:Value => Question:DataFrame
question_pairings = {}          # Key:Value => Question:Pairing Dictionary

for question in data["Question"].unique():
    question_dict[question] = data[data["Question"] == question]
    question_pairings[question] = {
        data_type: list(data[data["DataValueType"] == data_type]["DataValueUnit"].unique())
        for data_type in data["DataValueType"].unique()
    }

In [25]:
question_dict[questions[-1]].head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,...,StratificationCategory1,Stratification1,Stratification2,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
28938,2020,2020,AK,Alaska,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,%,Age-adjusted Prevalence,68.3,...,Overall,Overall,,POINT (-147.72205903599973 64.84507995700051),2,AST,AST6_2,AGEADJPREV,OVERALL,OVR
28944,2015,2015,AK,Alaska,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,%,Crude Prevalence,,...,Race/Ethnicity,"Multiracial, non-Hispanic",,POINT (-147.72205903599973 64.84507995700051),2,AST,AST6_2,CRDPREV,RACE,MRC
28945,2021,2021,AL,Alabama,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,%,Crude Prevalence,,...,Race/Ethnicity,"Other, non-Hispanic",,POINT (-86.63186076199969 32.84057112200048),1,AST,AST6_2,CRDPREV,RACE,OTH
28949,2014,2014,AK,Alaska,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,%,Age-adjusted Prevalence,,...,Race/Ethnicity,"Other, non-Hispanic",,POINT (-147.72205903599973 64.84507995700051),2,AST,AST6_2,AGEADJPREV,RACE,OTH
28951,2012,2012,AK,Alaska,BRFSS,Asthma,Pneumococcal vaccination among noninstitutiona...,%,Age-adjusted Prevalence,,...,Race/Ethnicity,"Other, non-Hispanic",,POINT (-147.72205903599973 64.84507995700051),2,AST,AST6_2,AGEADJPREV,RACE,OTH


In [29]:
# Print out the question and the question_pairing dictionary corresponding to the said question
for question in questions:
    print(f"Question: {question}")
    print(question_pairings[question])
    print()

Question: Hospitalizations for asthma
{'Number': [nan, 'Number'], 'Crude Rate': ['cases per 10,000', 'cases per 1,000,000'], 'Age-adjusted Rate': ['cases per 10,000', 'cases per 1,000,000'], 'Crude Prevalence': ['%'], 'Age-adjusted Prevalence': ['%']}

Question: Asthma mortality rate
{'Number': [nan, 'Number'], 'Crude Rate': ['cases per 10,000', 'cases per 1,000,000'], 'Age-adjusted Rate': ['cases per 10,000', 'cases per 1,000,000'], 'Crude Prevalence': ['%'], 'Age-adjusted Prevalence': ['%']}

Question: Emergency department visit rate for asthma
{'Number': [nan, 'Number'], 'Crude Rate': ['cases per 10,000', 'cases per 1,000,000'], 'Age-adjusted Rate': ['cases per 10,000', 'cases per 1,000,000'], 'Crude Prevalence': ['%'], 'Age-adjusted Prevalence': ['%']}

Question: Asthma prevalence among women aged 18-44 years
{'Number': [nan, 'Number'], 'Crude Rate': ['cases per 10,000', 'cases per 1,000,000'], 'Age-adjusted Rate': ['cases per 10,000', 'cases per 1,000,000'], 'Crude Prevalence': ['