In [127]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import os, sys
import sys

sys.path.append(os.path.realpath('../src/data/'))
from sb_utils import save_file

from pandas_profiling import ProfileReport

Data for this notebook is sourced from https://healthdata.gov/dataset/U-S-Chronic-Disease-Indicators-CDI-/67sp-6x8j.

A helpful UI that contains definitions for the data (as well as a graphical drill down display) is [here](https://nccd.cdc.gov/cdi/rdPage.aspx?rdReport=DPH_CDI.ExploreByTopic&islTopic=AST&islYear=9999&go=GO)

The data dictionary is available [here](https://www.cdc.gov/cdi/definitions/index.html)

### Import Data
Import the raw data from `data > raw` and write it out to the `data > interim directory`. We'll work from the interim folder.

In [2]:
# the supplied CSV data file is the raw_data directory
cdi_df = pd.read_csv('../data/raw/U.S._Chronic_Disease_Indicators__CDI_.csv')
cdi_df.shape

  cdi_df = pd.read_csv('../data/raw/U.S._Chronic_Disease_Indicators__CDI_.csv')


(1082328, 34)

### Audit Data

As a first step, lets look at a sample of data

In [4]:
cdi_df.sample(10)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,Response,DataValueUnit,DataValueType,...,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1,StratificationCategoryID2,StratificationID2,StratificationCategoryID3,StratificationID3
604008,2011,2011,GU,Guam,BRFSS,Cardiovascular Disease,Pneumococcal vaccination among noninstitutiona...,,%,Crude Prevalence,...,66,CVD,CVD10_1,CRDPREV,GENDER,GENM,,,,
71887,2010,2010,ID,Idaho,NVSS,Cardiovascular Disease,Mortality from total cardiovascular diseases,,,Number,...,16,CVD,CVD1_1,NMBR,RACE,HIS,,,,
503798,2015,2015,AR,Arkansas,BRFSS,"Nutrition, Physical Activity, and Weight Status",Overweight or obesity among adults aged >= 18 ...,,%,Age-adjusted Prevalence,...,5,NPAW,NPAW2_1,AGEADJPREV,RACE,BLK,,,,
152756,2014,2014,MT,Montana,NVSS,Chronic Obstructive Pulmonary Disease,Mortality with chronic obstructive pulmonary d...,,"cases per 100,000",Age-adjusted Rate,...,30,COPD,COPD1_2,AGEADJRATE,RACE,AIAN,,,,
315722,2009,2013,CO,Colorado,Statewide central cancer registries,Cancer,"Cancer of the colon and rectum (colorectal), i...",,"per 100,000",Average Annual Age-adjusted Rate,...,8,CAN,CAN7_1,AVGANNAGEADJRATE,GENDER,GENF,,,,
748814,2016,2016,MS,Mississippi,BRFSS,Diabetes,Glycosylated hemoglobin measurement among adul...,,%,Age-adjusted Prevalence,...,28,DIA,DIA6_0,AGEADJPREV,RACE,OTH,,,,
312590,2011,2015,CA,California,Death Certificate,Cancer,"Cancer of the female cervix, mortality",,,Average Annual Number,...,6,CAN,CAN6_2,AVGANNNMBR,GENDER,GENF,,,,
821976,2016,2016,NJ,New Jersey,BRFSS,Tobacco,Current smoking among adults aged >= 18 years,,%,Age-adjusted Prevalence,...,34,TOB,TOB1_2,AGEADJPREV,RACE,WHT,,,,
996322,2011,2011,WA,Washington,BRFSS,Arthritis,Severe joint pain due to arthritis among adult...,,%,Age-adjusted Prevalence,...,53,ART,ART2_2,AGEADJPREV,GENDER,GENF,,,,
889659,2018,2018,PR,Puerto Rico,BRFSS,Alcohol,Heavy drinking among adults aged >= 18 years,,%,Age-adjusted Prevalence,...,72,ALC,ALC5_1,AGEADJPREV,RACE,HIS,,,,


In [5]:
cdi_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1082328 entries, 0 to 1082327
Data columns (total 34 columns):
 #   Column                     Non-Null Count    Dtype  
---  ------                     --------------    -----  
 0   YearStart                  1082328 non-null  int64  
 1   YearEnd                    1082328 non-null  int64  
 2   LocationAbbr               1082328 non-null  object 
 3   LocationDesc               1082328 non-null  object 
 4   DataSource                 1082328 non-null  object 
 5   Topic                      1082328 non-null  object 
 6   Question                   1082328 non-null  object 
 7   Response                   0 non-null        float64
 8   DataValueUnit              942938 non-null   object 
 9   DataValueType              1082328 non-null  object 
 10  DataValue                  729235 non-null   object 
 11  DataValueAlt               727192 non-null   float64
 12  DataValueFootnoteSymbol    366753 non-null   object 
 13  DatavalueFoo

Get rid of columns that have no data (Nan)

In [7]:
# 34 columns to start, removing 10
cdi_df.drop(inplace=True, columns=['Response', 'StratificationCategory2', 'Stratification2', 'StratificationCategory3', 'Stratification3', 'ResponseID', 'StratificationCategoryID2', 'StratificationID2', 'StratificationCategoryID3', 'StratificationID3'])
assert cdi_df.shape[1] == 24

# Also drop the following columns: 'DataValueFootnoteSymbol','DatavalueFootnote','LowConfidenceLimit','HighConfidenceLimit'.
# We don't need references for this analysis, and CDC documentation says that all data are 
# within the 95% confidence interval
cdi_df.drop(inplace=True, columns=['DataValueFootnoteSymbol','DatavalueFootnote','LowConfidenceLimit','HighConfidenceLimit'])
assert cdi_df.shape[1] == 20

This dataset is being considered for prediction in conjunction with a Kaggle derived dataset from the EPA on air pollution measures. Therefore, we are only really concerned about outcomes that might be associated with pollution. The topics covered in this dataset are:

We are only concerned with US states, excluding commonwealths and such. Lets see what we have:

In [8]:
cdi_df.loc[:,('LocationAbbr','LocationDesc')].value_counts()

LocationAbbr  LocationDesc        
WI            Wisconsin               20608
NY            New York                20608
NJ            New Jersey              20602
NE            Nebraska                20570
IA            Iowa                    20562
VT            Vermont                 20542
WA            Washington              20509
NM            New Mexico              20509
MI            Michigan                20474
HI            Hawaii                  20472
CO            Colorado                20469
WV            West Virginia           20459
AR            Arkansas                20436
NC            North Carolina          20433
KY            Kentucky                20427
UT            Utah                    20404
OR            Oregon                  20404
MA            Massachusetts           20401
MD            Maryland                20398
FL            Florida                 20363
RI            Rhode Island            20360
NV            Nevada                  203

Lets remove the following locations from the dataframe:
PR (Puerto Rico), VI (Virgin Islands), GU (Guam)
Lets also remove the 'United States' (US) data - its a compilation of all the individual locations. But since we are throwing out non-states, this data is meaningless.

In [9]:
remove_rows = ['PR','VI','GU','US']
cdi_df = cdi_df.loc[~cdi_df['LocationAbbr'].isin(remove_rows), :]
cdi_df.loc[:,('LocationAbbr','LocationDesc')].value_counts()

LocationAbbr  LocationDesc        
NY            New York                20608
WI            Wisconsin               20608
NJ            New Jersey              20602
NE            Nebraska                20570
IA            Iowa                    20562
VT            Vermont                 20542
NM            New Mexico              20509
WA            Washington              20509
MI            Michigan                20474
HI            Hawaii                  20472
CO            Colorado                20469
WV            West Virginia           20459
AR            Arkansas                20436
NC            North Carolina          20433
KY            Kentucky                20427
OR            Oregon                  20404
UT            Utah                    20404
MA            Massachusetts           20401
MD            Maryland                20398
FL            Florida                 20363
RI            Rhode Island            20360
NV            Nevada                  203

In [10]:
cdi_df.Topic.unique()

array(['Cardiovascular Disease', 'Asthma', 'Alcohol', 'Tobacco',
       'Overarching Conditions', 'Chronic Kidney Disease', 'Diabetes',
       'Reproductive Health', 'Cancer', 'Arthritis', 'Oral Health',
       'Chronic Obstructive Pulmonary Disease',
       'Nutrition, Physical Activity, and Weight Status', 'Older Adults',
       'Disability', 'Mental Health', 'Immunization'], dtype=object)

Of these, we are definitely interested in 'Asthma' and 'Chronic Obstructive Pulmonary Disease' (aka COPD) topics. Other interesting columns might be 'Cardiovascular Disease', 'Overarching Conditions', 'Cancer', 'Tobacco' (which must be examined for correlation to cardiopulmonary disease). 'Older Adults' might be interesting too. But lets remove rows associated with the other topics before we explore these.

First though, define a function for pruning rows from the dataframe. It looks as if we will be doing a bunch of that:

In [11]:
# pass in dataframe, the column to target, and the rows to keep.
# returns the pruned dataframe
def prune_rows(df=None, col=None, keep_rows=[]):
    '''Remove all except keep_rows from column col'''
    assert df.shape[0] > 0
    assert col != None
    assert len(keep_rows) > 0
    return df.loc[df[col].isin(keep_rows), :]

In [12]:
nrows = cdi_df.shape[0]
keep_topics = ['Asthma','Chronic Obstructive Pulmonary Disease','Cardiovascular Disease','Overarching Conditions','Cancer','Tobacco','Older Adults']
cdi_df = prune_rows(cdi_df, 'Topic', keep_topics)
assert cdi_df.shape[0] < nrows

There are also YearStart and YearEnd columns. They could be collapsed if the start year is always equal to the end year. Let's define some functions to check this.

In [20]:
def is_year(start_date: int, end_date: int) -> bool:
    '''Returns true when start and end dates are the same. False otherwise.'''
    return True if (start_date == end_date) else False

In [21]:
# vector implementation of function for speed (see https://towardsdatascience.com/apply-function-to-pandas-dataframe-rows-76df74165ee4)
def check_year(df):
    return np.vectorize(is_year)(
        df['YearStart'],
        df['YearEnd']
      )

In [73]:
check_year(cdi_df).all()

False

So at least some topics have multi-year ranges. Lets find out which ones:

In [23]:
def multi_year_topics(df=None, col='Topic'):
    '''Returns a set of topics (or specified column) for the given dataframe that have multi-year ranges'''
    assert df.shape[0] > 0 
    topics = set()
    check = check_year(df)
    for i in range(len(check)):
        if check[i] == False:
            topics.add(df.iloc[i,:][col])
    return topics

In [74]:
multi_year_topics(cdi_df)

{'Cancer', 'Overarching Conditions'}

We'll explore these as we partition the dataset below.

## Summary

## Dataset partitioning
We are going to break our dataset into seperate datasets based on topics that we want to explore. This allows us to confine our wrangling efforts to the topic at hand. At the end, we may decide to throw out some topics.

### Asthma

#### Question

Lets look into the Questions for the asthma topic to prune rows where the question is not pertainant

In [13]:
cdi_df[cdi_df['Topic'] == 'Asthma']['Question'].unique()

array(['Asthma mortality rate', 'Hospitalizations for asthma',
       'Emergency department visit rate for asthma',
       'Asthma prevalence among women aged 18-44 years',
       'Influenza vaccination among noninstitutionalized adults aged 18-64 years with asthma',
       'Influenza vaccination among noninstitutionalized adults aged >= 65 years with asthma',
       'Pneumococcal vaccination among noninstitutionalized adults aged >= 65 years with asthma',
       'Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years with asthma',
       'Current asthma prevalence among adults aged >= 18 years'],
      dtype=object)

It looks like only 'Asthma mortality rate', 'Hospitalizations for asthma', 'Emergency department visit rate for asthma' and 'Current asthma prevalence among adults aged >= 18 years' rows are general to keep as indicators for Asthma. We'll prune the others.

In [14]:
keep_questions = ['Asthma mortality rate', 'Hospitalizations for asthma', 'Emergency department visit rate for asthma', 'Current asthma prevalence among adults aged >= 18 years']
# can't use prune_rows because of equivalency check
asthma_df = cdi_df[cdi_df.Topic == 'Asthma'].loc[cdi_df.Question.isin(keep_questions), :]
assert len(asthma_df['Question'].unique()) == 4
asthma_df.sample(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
175328,2016,2016,NH,New Hampshire,NVSS,Asthma,Asthma mortality rate,,Number,,,Gender,Male,POINT (-71.50036091999965 43.65595011300047),33,AST,AST4_1,NMBR,GENDER,GENM
953768,2013,2013,UT,Utah,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Age-adjusted Prevalence,,,Race/Ethnicity,"Multiracial, non-Hispanic",POINT (-111.58713063499971 39.360700171000474),49,AST,AST1_1,AGEADJPREV,RACE,MRC
122202,2016,2016,MI,Michigan,NVSS,Asthma,Asthma mortality rate,,Number,,,Race/Ethnicity,Hispanic,POINT (-84.71439026999968 44.6613195430005),26,AST,AST4_1,NMBR,RACE,HIS
111725,2014,2014,MA,Massachusetts,SEDD; SID,Asthma,Hospitalizations for asthma,"cases per 10,000",Age-adjusted Rate,,,Race/Ethnicity,"White, non-Hispanic",POINT (-72.08269067499964 42.27687047000046),25,AST,AST3_1,AGEADJRATE,RACE,WHT
119471,2011,2011,ME,Maine,NVSS,Asthma,Asthma mortality rate,,Number,,,Overall,Overall,POINT (-68.98503133599962 45.254228894000505),23,AST,AST4_1,NMBR,OVERALL,OVR


#### StratificationCategory1

The stratification category (StratificationCategory1) column shows partitioning of the data by different factors:

In [15]:
asthma_df['StratificationCategory1'].unique()

array(['Race/Ethnicity', 'Overall', 'Gender'], dtype=object)

We are only interested in the 'Overall' category, so prune other rows.

In [16]:
keep_categories = ['Overall']
asthma_df = prune_rows(asthma_df, 'StratificationCategory1', keep_categories)
assert len(asthma_df['StratificationCategory1'].unique()) == 1
asthma_df.sample(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
730466,2019,2019,MN,Minnesota,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Crude Prevalence,8.3,8.3,Overall,Overall,POINT (-94.79420050299967 46.35564873600049),27,AST,AST1_1,CRDPREV,OVERALL,OVR
286053,2015,2015,WV,West Virginia,SEDD; SID,Asthma,Hospitalizations for asthma,,Number,1341.0,1341.0,Overall,Overall,POINT (-80.71264013499967 38.66551020200046),54,AST,AST3_1,NMBR,OVERALL,OVR
904924,2012,2012,SD,South Dakota,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Crude Prevalence,7.5,7.5,Overall,Overall,POINT (-100.3735306369997 44.353130053000484),46,AST,AST1_1,CRDPREV,OVERALL,OVR
131274,2014,2014,MO,Missouri,NVSS,Asthma,Asthma mortality rate,"cases per 1,000,000",Crude Rate,14.7,14.7,Overall,Overall,POINT (-92.56630005299968 38.635790776000476),29,AST,AST4_1,CRDRATE,OVERALL,OVR
244646,2010,2010,TN,Tennessee,SEDD; SID,Asthma,Emergency department visit rate for asthma,"cases per 10,000",Age-adjusted Rate,,,Overall,Overall,POINT (-85.77449091399967 35.68094058000048),47,AST,AST2_1,AGEADJRATE,OVERALL,OVR


#### DataValueUnit

Lets also pare down the dataset to include only measures that are a percentage of per-capita residents of the state:

In [17]:
keep_categories = ['%']
asthma_df = prune_rows(asthma_df, 'DataValueUnit', keep_categories)
assert len(asthma_df['DataValueUnit'].unique()) == 1
asthma_df.sample(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
640459,2018,2018,IN,Indiana,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Age-adjusted Prevalence,9.9,9.9,Overall,Overall,POINT (-86.14996019399968 39.766910452000445),18,AST,AST1_1,AGEADJPREV,OVERALL,OVR
583839,2017,2017,FL,Florida,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Age-adjusted Prevalence,7.5,7.5,Overall,Overall,POINT (-81.92896053899966 28.932040377000476),12,AST,AST1_1,AGEADJPREV,OVERALL,OVR
499510,2012,2012,AL,Alabama,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Age-adjusted Prevalence,8.5,8.5,Overall,Overall,POINT (-86.63186076199969 32.84057112200048),1,AST,AST1_1,AGEADJPREV,OVERALL,OVR
1020954,2011,2011,WY,Wyoming,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Age-adjusted Prevalence,9.1,9.1,Overall,Overall,POINT (-108.10983035299967 43.23554134300048),56,AST,AST1_1,AGEADJPREV,OVERALL,OVR
1061395,2020,2020,OH,Ohio,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Age-adjusted Prevalence,10.0,10.0,Overall,Overall,POINT (-82.40426005599966 40.06021014100048),39,AST,AST1_1,AGEADJPREV,OVERALL,OVR


#### DataValueType

We also only need one overall measure type for the dataset. Currently we have these two:

In [18]:
asthma_df['DataValueType'].unique()

array(['Age-adjusted Prevalence', 'Crude Prevalence'], dtype=object)

But lets only use 'Crude Prevalence', defined by the data dictionary as "The measured number of deaths, cases of conditions, diseases or hospitalizations during a specific year – specified as rates per 1,000, per 10,000, per 100,000 or rates per 1,000,000 persons."

In [19]:
keep_categories = ['Crude Prevalence']
asthma_df = prune_rows(asthma_df, 'DataValueType', keep_categories)
assert len(asthma_df['DataValueType'].unique()) == 1
asthma_df.sample(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
720314,2014,2014,MN,Minnesota,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Crude Prevalence,8.4,8.4,Overall,Overall,POINT (-94.79420050299967 46.35564873600049),27,AST,AST1_1,CRDPREV,OVERALL,OVR
913279,2018,2018,SC,South Carolina,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Crude Prevalence,9.1,9.1,Overall,Overall,POINT (-81.04537120699968 33.998821303000454),45,AST,AST1_1,CRDPREV,OVERALL,OVR
753197,2013,2013,MT,Montana,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Crude Prevalence,8.6,8.6,Overall,Overall,POINT (-109.42442064499971 47.06652897200047),30,AST,AST1_1,CRDPREV,OVERALL,OVR
730466,2019,2019,MN,Minnesota,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Crude Prevalence,8.3,8.3,Overall,Overall,POINT (-94.79420050299967 46.35564873600049),27,AST,AST1_1,CRDPREV,OVERALL,OVR
824112,2016,2016,NY,New York,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Crude Prevalence,9.5,9.5,Overall,Overall,POINT (-75.54397042699964 42.82700103200045),36,AST,AST1_1,CRDPREV,OVERALL,OVR


#### Year

The 'YearStart' and 'YearEnd' columns look to all span one year, in which case we can drop those and replace with a single 'Year' column. Lets check this:

In [70]:
check = check_year(asthma_df)
check.all()

True

No multi-year rows are in this dataframe.

As a final check, lets see the overview of our dataset:

In [75]:
asthma_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 510 entries, 478485 to 1081867
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   YearStart                  510 non-null    int64  
 1   YearEnd                    510 non-null    int64  
 2   LocationAbbr               510 non-null    object 
 3   LocationDesc               510 non-null    object 
 4   DataSource                 510 non-null    object 
 5   Topic                      510 non-null    object 
 6   Question                   510 non-null    object 
 7   DataValueUnit              510 non-null    object 
 8   DataValueType              510 non-null    object 
 9   DataValue                  509 non-null    object 
 10  DataValueAlt               509 non-null    float64
 11  StratificationCategory1    510 non-null    object 
 12  Stratification1            510 non-null    object 
 13  GeoLocation                510 non-null  

It looks like we are missing one DataValue. Lets see which one:

In [77]:
asthma_df[asthma_df['DataValue'].isnull()]

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
800207,2019,2019,NJ,New Jersey,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Crude Prevalence,,,Overall,Overall,POINT (-74.27369128799967 40.13057004800049),34,AST,AST1_1,CRDPREV,OVERALL,OVR


The 2019 prevalence data is missing for NJ. We'll leave it as Nan because it would be misleading to impute any other data, and because it still acts as a marker for missed data collection, so I don't want to drop it.

### Cancer

Lets create the cancer topic dataset and take a look at the set of questions in it.

In [25]:
cancer_df = cdi_df[cdi_df.Topic == 'Cancer']
cancer_df.sample(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
447987,2008,2012,SC,South Carolina,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Age-adjusted Rate,41.5,41.5,Race/Ethnicity,Hispanic,POINT (-81.04537120699968 33.998821303000454),45,CAN,CAN8_1,AVGANNAGEADJRATE,RACE,HIS
710701,2013,2013,MI,Michigan,BRFSS,Cancer,Papanicolaou smear use among adult women aged ...,%,Age-adjusted Prevalence,,,Race/Ethnicity,"Multiracial, non-Hispanic",POINT (-84.71439026999968 44.6613195430005),26,CAN,CAN2_1,AGEADJPREV,RACE,MRC
433124,2008,2012,PA,Pennsylvania,Death Certificate,Cancer,"Invasive cancer (all sites combined), mortality",,Average Annual Number,,,Race/Ethnicity,"White, non-Hispanic",POINT (-77.86070029399963 40.79373015200048),42,CAN,CAN4_2,AVGANNNMBR,RACE,WHT
1029052,2020,2020,CT,Connecticut,BRFSS,Cancer,Papanicolaou smear use among adult women aged ...,%,Age-adjusted Prevalence,82.8,82.8,Race/Ethnicity,"White, non-Hispanic",POINT (-72.64984095199964 41.56266102000046),9,CAN,CAN2_1,AGEADJPREV,RACE,WHT
367771,2013,2017,MD,Maryland,Statewide central cancer registries,Cancer,"Cancer of the colon and rectum (colorectal), i...",,Average Annual Number,99.0,99.0,Race/Ethnicity,Asian or Pacific Islander,POINT (-76.60926011099963 39.29058096400047),24,CAN,CAN7_1,AVGANNNMBR,RACE,APIO


In keeping with the dataframe cleanup process above, we'll take the following steps to prune data:
1. Filter by relevant Question
2. StratificationCategory1, keeping ['Overall']
3. DataValueUnit, keeping ['%'] (or units that represent comparative measures)
4. DataValueType, keeping ['Crude Prevalence'] (or types that represent comparative measures)
5. Year: check if keeping multi-year data makes sense

#### Question

In [26]:
cancer_df['Question'].unique()

array(['Cancer of the prostate, mortality',
       'Mammography use among women aged 50-74 years',
       'Invasive cancer (all sites combined), incidence',
       'Invasive cancer of the cervix, incidence',
       'Invasive cancer of the female breast, incidence',
       'Invasive cancer (all sites combined), mortality',
       'Invasive cancer of the prostate, incidence',
       'Cancer of the oral cavity and pharynx, mortality',
       'Cancer of the female cervix, mortality',
       'Cancer of the female breast, mortality',
       'Invasive cancer of the oral cavity or pharynx, incidence',
       'Cancer of the lung and bronchus, mortality',
       'Invasive melanoma, incidence', 'Melanoma, mortality',
       'Cancer of the colon and rectum (colorectal), incidence',
       'Cancer of the lung and bronchus, incidence',
       'Cancer of the colon and rectum (colorectal), mortality',
       'Fecal occult blood test, sigmoidoscopy, or colonoscopy among adults aged 50-75 years',
      

Looks like we should keep 'Cancer of the lung and bronchus, incidence', 'Cancer of the lung and bronchus, mortality'

In [27]:
nrows = cdi_df.shape[0]
keep_questions = ['Cancer of the lung and bronchus, incidence', 'Cancer of the lung and bronchus, mortality']
cancer_df = prune_rows(cancer_df, 'Question', keep_questions)
assert cancer_df.shape[0] < nrows

#### StratificationCategory1

In [28]:
cancer_df['StratificationCategory1'].unique()

array(['Race/Ethnicity', 'Gender', 'Overall'], dtype=object)

We only want to keep the 'Overall' category

In [29]:
nrows = cancer_df.shape[0]
keep_categories = ['Overall']
cancer_df = prune_rows(cancer_df, 'StratificationCategory1', keep_categories)
assert cancer_df.shape[0] < nrows

#### DataValueUnit

In [30]:
cancer_df['DataValueUnit'].unique()

array(['per 100,000', nan], dtype=object)

Lets look at the rows with nan

In [31]:
nan_DataValueType = cancer_df[cancer_df['DataValueUnit'].isnull()]['DataValueType'].unique()
nan_DataValueType

array(['Average Annual Number'], dtype=object)

Well, these all belong to the 'Average Annual Number' type. The gamut of available DataValueUnit's in the entire dataset are:

In [32]:
cdi_df['DataValueUnit'].unique()

array([nan, 'cases per 1,000,000', 'Number', '%', 'pack sales per capita',
       'cases per 100,000', 'cases per 1,000', 'cases per 10,000',
       'Years', 'per 100,000'], dtype=object)

It seems that the 'Number' unit is generic enough to impute here.

In [33]:
cancer_df['DataValueUnit'].fillna(value='Number', inplace=True)
cancer_df.head()

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
294354,2010,2014,AL,Alabama,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Age-adjusted Rate,69.8,69.8,Overall,Overall,POINT (-86.63186076199969 32.84057112200048),1,CAN,CAN8_1,AVGANNAGEADJRATE,OVERALL,OVR
294417,2011,2015,AR,Arkansas,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence",Number,Average Annual Number,2834.0,2834.0,Overall,Overall,POINT (-92.27449074299966 34.74865012400045),5,CAN,CAN8_1,AVGANNNMBR,OVERALL,OVR
294440,2010,2014,AL,Alabama,Death Certificate,Cancer,"Cancer of the lung and bronchus, mortality","per 100,000",Average Annual Age-adjusted Rate,55.6,55.6,Overall,Overall,POINT (-86.63186076199969 32.84057112200048),1,CAN,CAN8_2,AVGANNAGEADJRATE,OVERALL,OVR
294441,2008,2012,AZ,Arizona,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Age-adjusted Rate,53.4,53.4,Overall,Overall,POINT (-111.76381127699972 34.865970280000454),4,CAN,CAN8_1,AVGANNAGEADJRATE,OVERALL,OVR
294533,2011,2015,AR,Arkansas,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Crude Rate,95.8,95.8,Overall,Overall,POINT (-92.27449074299966 34.74865012400045),5,CAN,CAN8_1,AVGANNCRDRATE,OVERALL,OVR


#### DataValueType

In [34]:
cancer_df['DataValueType'].unique()

array(['Average Annual Age-adjusted Rate', 'Average Annual Number',
       'Average Annual Crude Rate'], dtype=object)

We don't want 'Average Annual Age-adjusted Rate' (we are aiming to keep only overall values), so let's remove that.

In [35]:
nrows = cancer_df.shape[0]
keep_categories = ['Average Annual Number','Average Annual Crude Rate']
cancer_df = prune_rows(cancer_df, 'DataValueType', keep_categories)
assert cancer_df.shape[0] < nrows

It looks like we should keep the Average Annual Crude Rate. This is the state average relative to 100,000 (cases per 100K). This data is comparable between states since the overall numbers mean little without knowing the state population measures that CDC used.

In [36]:
nrows = cancer_df.shape[0]
keep_categories = ['Average Annual Crude Rate']
cancer_df = prune_rows(cancer_df, 'DataValueType', keep_categories)
assert cancer_df.shape[0] < nrows

In [37]:
cancer_df.DataValueType.unique()

array(['Average Annual Crude Rate'], dtype=object)

In [38]:
# create seperate dataframes for cancer incidenence and mortality.
# exclude LocationAbbr == US
cancer_incidence_df = cancer_df[(cancer_df.QuestionID == 'CAN8_1') & (cancer_df.LocationAbbr != 'US')]
cancer_mortality_df = cancer_df[(cancer_df.QuestionID == 'CAN8_2') & (cancer_df.LocationAbbr != 'US')]

In [39]:
print('incidence: '+cancer_incidence_df.sample(5)['DataSource'])
print('mortality: '+cancer_mortality_df.sample(5)['DataSource'])

411010    incidence: Statewide central cancer registries
344458    incidence: Statewide central cancer registries
398970    incidence: Statewide central cancer registries
379006    incidence: Statewide central cancer registries
400648    incidence: Statewide central cancer registries
Name: DataSource, dtype: object
343778    mortality: Death Certificate
407769    mortality: Death Certificate
321250    mortality: Death Certificate
334939    mortality: Death Certificate
425642    mortality: Death Certificate
Name: DataSource, dtype: object


From this, it's clear that incidence data is from Statewide central cancer registries and mortality data was derived from death certificates. Nevada is missing incidence data for all periods. I'll leave the Nan numbers in place, as imputing any value is likely to misinform.

#### Year

The dates for all cancer stats are collected over a four year period. I'm not sure if this will be problematic when comparing to annual data. I'll leave it be for now, but keep the following cell, which shows how to implement intervals over the collection period.

In [42]:
start = 2008
end = 2012
start_dt = pd.to_datetime(start, format='%Y')
end_dt = pd.to_datetime(end, format='%Y')
start_dts = pd.to_datetime(cancer_df.YearStart, format='%Y')
end_dts = pd.to_datetime(cancer_df.YearEnd, format='%Y')
#pd.date_range(start=start_dt, end=end_dt, periods=6, freq='Y')
#range = pd.date_range(start=start_dt, end=end_dt, periods=6, inclusive='left')
pd.interval_range(start=start, periods=4, closed='right')


IntervalIndex([(2008, 2009], (2009, 2010], (2010, 2011], (2011, 2012]], dtype='interval[int64, right]')

In [78]:
cancer_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 714 entries, 294533 to 490034
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   YearStart                  714 non-null    int64  
 1   YearEnd                    714 non-null    int64  
 2   LocationAbbr               714 non-null    object 
 3   LocationDesc               714 non-null    object 
 4   DataSource                 714 non-null    object 
 5   Topic                      714 non-null    object 
 6   Question                   714 non-null    object 
 7   DataValueUnit              714 non-null    object 
 8   DataValueType              714 non-null    object 
 9   DataValue                  707 non-null    object 
 10  DataValueAlt               707 non-null    float64
 11  StratificationCategory1    714 non-null    object 
 12  Stratification1            714 non-null    object 
 13  GeoLocation                714 non-null   

We are missing some DataValue values, lets find out which ones:

In [79]:
cancer_df[cancer_df['DataValue'].isnull()]

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
407203,2012,2016,NV,Nevada,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Crude Rate,,,Overall,Overall,POINT (-117.07184056399967 39.493240390000494),32,CAN,CAN8_1,AVGANNCRDRATE,OVERALL,OVR
410957,2011,2015,NV,Nevada,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Crude Rate,,,Overall,Overall,POINT (-117.07184056399967 39.493240390000494),32,CAN,CAN8_1,AVGANNCRDRATE,OVERALL,OVR
411010,2010,2014,NV,Nevada,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Crude Rate,,,Overall,Overall,POINT (-117.07184056399967 39.493240390000494),32,CAN,CAN8_1,AVGANNCRDRATE,OVERALL,OVR
413793,2014,2018,NV,Nevada,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Crude Rate,,,Overall,Overall,POINT (-117.07184056399967 39.493240390000494),32,CAN,CAN8_1,AVGANNCRDRATE,OVERALL,OVR
413850,2009,2013,NV,Nevada,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Crude Rate,,,Overall,Overall,POINT (-117.07184056399967 39.493240390000494),32,CAN,CAN8_1,AVGANNCRDRATE,OVERALL,OVR
416697,2013,2017,NV,Nevada,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Crude Rate,,,Overall,Overall,POINT (-117.07184056399967 39.493240390000494),32,CAN,CAN8_1,AVGANNCRDRATE,OVERALL,OVR
416740,2008,2012,NV,Nevada,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Crude Rate,,,Overall,Overall,POINT (-117.07184056399967 39.493240390000494),32,CAN,CAN8_1,AVGANNCRDRATE,OVERALL,OVR


Nevada is missing all of its incidence data. But let's keep these rows in place as markers and not try to impute any misleading values.

### Overarching Conditions

As we saw earlier, the 'Overarching Conditions' topic has some multi-year rows. Lets see what these are.

In [43]:
cdi_df[cdi_df['Topic'] == 'Overarching Conditions']['Question'].unique()

array(['Premature mortality among adults aged 45-64 years',
       'High school completion among adults aged 18-24 years', 'Poverty',
       'High school completion among women aged 18-44 years',
       'Health insurance coverage before pregnancy',
       'Poverty among women aged 18-44 years',
       'Gini Index of income inequality',
       'Life expectancy at age 65 years', 'Life expectancy at birth',
       'Recent physically unhealthy days among adults aged >= 18 years',
       'Fair or poor self-rated health status among adults aged >= 18 years',
       'Current lack of health insurance among adults aged 18-64 years',
       'Recent activity limitation among adults aged >= 18 years',
       'Prevalence of sufficient sleep among adults aged >= 18 years',
       'Self-rated health status among women aged 18-44 years',
       'Current health care coverage among women aged 18-44 years'],
      dtype=object)

There are some interesting data here, but none that are specific to lung health. Let's leave this data out.

### COPD

In [83]:
# establish the COPD dataframe
copd_df = cdi_df[cdi_df['Topic'] == 'Chronic Obstructive Pulmonary Disease']
copd_df.shape

(136602, 20)

In keeping with the dataframe cleanup process above, we'll take the following steps to prune data:
1. Filter by relevant Question
2. StratificationCategory1, keeping ['Overall']
3. DataValueUnit, keeping ['%'] (or units that represent comparative measures)
4. DataValueType, keeping ['Crude Prevalence'] (or types that represent comparative measures)
5. Year: check if keeping multi-year data makes sense

#### Filter by Question
Choose questions that might be insightful.

In [84]:
cdi_df[cdi_df['Topic'] == 'Chronic Obstructive Pulmonary Disease']['Question'].unique()

array(['Mortality with chronic obstructive pulmonary disease as underlying or contributing cause among adults aged >= 45 years',
       'Hospitalization for chronic obstructive pulmonary disease as any diagnosis among Medicare-eligible persons aged >= 65 years',
       'Mortality with chronic obstructive pulmonary disease as underlying cause among adults aged >= 45 years',
       'Emergency department visit rate for chronic obstructive pulmonary disease as any diagnosis',
       'Hospitalization for chronic obstructive pulmonary disease as first-listed diagnosis among Medicare-eligible persons aged >= 65 years',
       'Hospitalization for chronic obstructive pulmonary disease as first-listed diagnosis',
       'Hospitalization for chronic obstructive pulmonary disease as any diagnosis',
       'Emergency department visit rate for chronic obstructive pulmonary disease as first-listed diagnosis',
       'Prevalence of chronic obstructive pulmonary disease among adults >= 18',
       'Pn

Below are the subset of questions to explore. Some of these will be thrown out upon closer inspection:

In [85]:
keep_categories = ['Mortality with chronic obstructive pulmonary disease as underlying or contributing cause among adults aged >= 45 years',
       'Mortality with chronic obstructive pulmonary disease as underlying cause among adults aged >= 45 years',
       'Emergency department visit rate for chronic obstructive pulmonary disease as any diagnosis',
       'Hospitalization for chronic obstructive pulmonary disease as first-listed diagnosis',
       'Hospitalization for chronic obstructive pulmonary disease as any diagnosis',
       'Emergency department visit rate for chronic obstructive pulmonary disease as first-listed diagnosis',
       'Prevalence of chronic obstructive pulmonary disease among adults >= 18',
       'Prevalence of current smoking among adults >= 18 with diagnosed chronic obstructive pulmonary disease',
       'Prevalence of current smoking among adults >= 45 years with diagnosed chronic obstructive pulmonary disease',
       'Prevalence of chronic obstructive pulmonary disease among adults >= 45 years']

In [86]:
# filter for selected questions 
nrows = copd_df.shape[0]
copd_df = prune_rows(copd_df, 'Question', keep_categories)
assert copd_df.shape[0] < nrows

copd_df.sample(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
117284,2015,2015,ME,Maine,SEDD; SID,Chronic Obstructive Pulmonary Disease,Emergency department visit rate for chronic ob...,"cases per 10,000",Age-adjusted Rate,,,Overall,Overall,POINT (-68.98503133599962 45.254228894000505),23,COPD,COPD6_1,AGEADJRATE,OVERALL,OVR
9493,2015,2015,AK,Alaska,NVSS,Chronic Obstructive Pulmonary Disease,Mortality with chronic obstructive pulmonary d...,"cases per 100,000",Crude Rate,73.2,73.2,Overall,Overall,POINT (-147.72205903599973 64.84507995700051),2,COPD,COPD1_1,CRDRATE,OVERALL,OVR
165861,2015,2015,ND,North Dakota,NVSS,Chronic Obstructive Pulmonary Disease,Mortality with chronic obstructive pulmonary d...,,Number,339.0,339.0,Overall,Overall,POINT (-100.11842104899966 47.47531977900047),38,COPD,COPD1_1,NMBR,OVERALL,OVR
609525,2019,2019,HI,Hawaii,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of current smoking among adults >= ...,%,Crude Prevalence,28.9,28.9,Race/Ethnicity,"Multiracial, non-Hispanic",POINT (-157.85774940299973 21.304850435000446),15,COPD,COPD3_0,CRDPREV,RACE,MRC
649828,2014,2014,IN,Indiana,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of current smoking among adults >= ...,%,Age-adjusted Prevalence,46.3,46.3,Race/Ethnicity,"White, non-Hispanic",POINT (-86.14996019399968 39.766910452000445),18,COPD,COPD3_0_1,AGEADJPREV,RACE,WHT


#### Filter by StratificationCategory1

In [87]:
copd_df['StratificationCategory1'].unique()

array(['Overall', 'Race/Ethnicity', 'Gender'], dtype=object)

Let's keep the 'Overall' category

In [88]:
# filter for selected categories 
nrows = copd_df.shape[0]
keep_categories = ['Overall']
copd_df = prune_rows(copd_df, 'StratificationCategory1', keep_categories)
assert copd_df.shape[0] < nrows

#### Filter by DataValueUnit
keeping ['%'] (or units that represent comparative measures)

In [89]:
copd_df['DataValueUnit'].unique()

array(['cases per 100,000', nan, 'cases per 10,000', 'Number', '%'],
      dtype=object)

Lets take a look at each of the DataValueUnit rows, keeping a list of ones to keep.

In [90]:
keep_categories = []

In [91]:
# cases per 100K
copd_df[copd_df['DataValueUnit'] == 'cases per 100,000']['Question'].unique()

array(['Mortality with chronic obstructive pulmonary disease as underlying or contributing cause among adults aged >= 45 years',
       'Mortality with chronic obstructive pulmonary disease as underlying cause among adults aged >= 45 years'],
      dtype=object)

These are all mortality measures normalized to cases per 100K of the state population. We need these.

In [92]:
keep_categories.append('cases per 100,000')

In [93]:
# cases per 10,000
copd_df[copd_df['DataValueUnit'] == 'cases per 10,000']['Question'].unique()

array(['Hospitalization for chronic obstructive pulmonary disease as first-listed diagnosis',
       'Emergency department visit rate for chronic obstructive pulmonary disease as any diagnosis',
       'Emergency department visit rate for chronic obstructive pulmonary disease as first-listed diagnosis',
       'Hospitalization for chronic obstructive pulmonary disease as any diagnosis'],
      dtype=object)

These are the data on Emergency department visit rates. We should keep these.

In [94]:
# 'Number'
copd_df[copd_df['DataValueUnit'] == 'Number']['Question'].unique()

array(['Mortality with chronic obstructive pulmonary disease as underlying or contributing cause among adults aged >= 45 years',
       'Mortality with chronic obstructive pulmonary disease as underlying cause among adults aged >= 45 years'],
      dtype=object)

These are absolute numbers by state, we'll prune these in favor of normalized values.

In [95]:
# '%'
copd_df[copd_df['DataValueUnit'] == '%']['Question'].unique()

array(['Prevalence of current smoking among adults >= 45 years with diagnosed chronic obstructive pulmonary disease',
       'Prevalence of current smoking among adults >= 18 with diagnosed chronic obstructive pulmonary disease',
       'Prevalence of chronic obstructive pulmonary disease among adults >= 18',
       'Prevalence of chronic obstructive pulmonary disease among adults >= 45 years'],
      dtype=object)

These are relative measures on smoking and COPD. We definately want these.

In [96]:
keep_categories.append('%')

In [97]:
# 'Nan'
copd_df[copd_df['DataValueUnit'].isnull()]['Question'].unique()

array(['Mortality with chronic obstructive pulmonary disease as underlying or contributing cause among adults aged >= 45 years',
       'Mortality with chronic obstructive pulmonary disease as underlying cause among adults aged >= 45 years',
       'Hospitalization for chronic obstructive pulmonary disease as first-listed diagnosis',
       'Emergency department visit rate for chronic obstructive pulmonary disease as any diagnosis',
       'Hospitalization for chronic obstructive pulmonary disease as any diagnosis',
       'Emergency department visit rate for chronic obstructive pulmonary disease as first-listed diagnosis'],
      dtype=object)

In [98]:
copd_df[copd_df['DataValueUnit'].isnull()].sample(5)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
146720,2014,2014,MS,Mississippi,SEDD; SID,Chronic Obstructive Pulmonary Disease,Emergency department visit rate for chronic ob...,,Number,,,Overall,Overall,POINT (-89.53803082499968 32.745510099000455),28,COPD,COPD6_1,NMBR,OVERALL,OVR
76403,2017,2017,IA,Iowa,SEDD; SID,Chronic Obstructive Pulmonary Disease,Hospitalization for chronic obstructive pulmon...,,Number,6547.0,6547.0,Overall,Overall,POINT (-93.81649055599968 42.46940091300047),19,COPD,COPD5_1,NMBR,OVERALL,OVR
50633,2012,2012,DE,Delaware,NVSS,Chronic Obstructive Pulmonary Disease,Mortality with chronic obstructive pulmonary d...,,Number,824.0,824.0,Overall,Overall,POINT (-75.57774116799965 39.008830667000495),10,COPD,COPD1_2,NMBR,OVERALL,OVR
94044,2016,2016,IN,Indiana,SEDD; SID,Chronic Obstructive Pulmonary Disease,Hospitalization for chronic obstructive pulmon...,,Number,,,Overall,Overall,POINT (-86.14996019399968 39.766910452000445),18,COPD,COPD5_2,NMBR,OVERALL,OVR
34439,2015,2015,CT,Connecticut,NVSS,Chronic Obstructive Pulmonary Disease,Mortality with chronic obstructive pulmonary d...,,Number,2619.0,2619.0,Overall,Overall,POINT (-72.64984095199964 41.56266102000046),9,COPD,COPD1_2,NMBR,OVERALL,OVR


These look to be absolute number versions of data that we are already keeping that have been population adjusted. We don't need these.

Prune the COPD dataframe, keeping our identified records:

In [99]:
# filter for selected categories 
nrows = copd_df.shape[0]
copd_df = prune_rows(copd_df, 'DataValueUnit', keep_categories)
assert copd_df.shape[0] < nrows

#### Filter by DataValueType
keeping ['Crude Prevalence'] (or types that represent comparative measures)

In [100]:
copd_df['DataValueType'].unique()

array(['Age-adjusted Rate', 'Crude Rate', 'Age-adjusted Prevalence',
       'Crude Prevalence'], dtype=object)

We can immediately filter out 'Age-adjusted Rate' and 'Age-adjusted Prevalence'. The other types need to be explored.

In [101]:
# filter for selected types 
nrows = copd_df.shape[0]
keep_categories = ['Number','Crude Rate','Crude Prevalence']
copd_df = prune_rows(copd_df, 'DataValueType', keep_categories)
assert copd_df.shape[0] < nrows

Lets take a look at each of the remaining DataValueType rows:

In [102]:
# Crude Rate
copd_df[copd_df['DataValueType'] == 'Crude Rate']['Question'].unique()

array(['Mortality with chronic obstructive pulmonary disease as underlying or contributing cause among adults aged >= 45 years',
       'Mortality with chronic obstructive pulmonary disease as underlying cause among adults aged >= 45 years'],
      dtype=object)

Mortality measures. We keep these.

In [103]:
# Crude Prevalence
copd_df[copd_df['DataValueType'] == 'Crude Prevalence']['Question'].unique()

array(['Prevalence of current smoking among adults >= 18 with diagnosed chronic obstructive pulmonary disease',
       'Prevalence of current smoking among adults >= 45 years with diagnosed chronic obstructive pulmonary disease',
       'Prevalence of chronic obstructive pulmonary disease among adults >= 18',
       'Prevalence of chronic obstructive pulmonary disease among adults >= 45 years'],
      dtype=object)

Prevalence data for both COPD and smoking. These will be kept.

#### Filter by Year

Check if we have multi-year data and if it makes sense to keep.

In [104]:
multi_year_topics(copd_df)

set()

No multi-year data for COPD dataframe.

In [105]:
copd_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3060 entries, 490 to 1082300
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   YearStart                  3060 non-null   int64  
 1   YearEnd                    3060 non-null   int64  
 2   LocationAbbr               3060 non-null   object 
 3   LocationDesc               3060 non-null   object 
 4   DataSource                 3060 non-null   object 
 5   Topic                      3060 non-null   object 
 6   Question                   3060 non-null   object 
 7   DataValueUnit              3060 non-null   object 
 8   DataValueType              3060 non-null   object 
 9   DataValue                  3056 non-null   object 
 10  DataValueAlt               3056 non-null   float64
 11  StratificationCategory1    3060 non-null   object 
 12  Stratification1            3060 non-null   object 
 13  GeoLocation                3060 non-null   

The following columns contain incomplete data (fewer values than rows) and need to be examined for potential imputation: 
DataValueUnit, DataValue, DataValueAlt, DataValueFootnoteSymbol, DatavalueFootnote, LowConfidenceLimit, HighConfidenceLimit, GeoLocation.

In [106]:
copd_df[copd_df.DataValue.isnull()]

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
798873,2019,2019,NJ,New Jersey,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,%,Crude Prevalence,,,Overall,Overall,POINT (-74.27369128799967 40.13057004800049),34,COPD,COPD2_0_1,CRDPREV,OVERALL,OVR
800122,2019,2019,NJ,New Jersey,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of current smoking among adults >= ...,%,Crude Prevalence,,,Overall,Overall,POINT (-74.27369128799967 40.13057004800049),34,COPD,COPD3_0_1,CRDPREV,OVERALL,OVR
800158,2019,2019,NJ,New Jersey,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,%,Crude Prevalence,,,Overall,Overall,POINT (-74.27369128799967 40.13057004800049),34,COPD,COPD2_0,CRDPREV,OVERALL,OVR
805898,2019,2019,NJ,New Jersey,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of current smoking among adults >= ...,%,Crude Prevalence,,,Overall,Overall,POINT (-74.27369128799967 40.13057004800049),34,COPD,COPD3_0,CRDPREV,OVERALL,OVR


The 2019 data is missing for NJ. Leave is as Nan - any other imputed value would be misleading.

### Tobacco

In [68]:
cdi_df[cdi_df['Topic'] == 'Tobacco']['Question'].unique()

array(['Amount of tobacco product excise tax',
       'Current smoking among adults aged >= 18 years',
       'Sale of cigarette packs',
       'Current smokeless tobacco use among youth',
       'Cigarette smoking before pregnancy',
       'Percent tobacco revenue to fund at CDC recommended level',
       'Current cigarette smoking among youth',
       'Secondary schools that have a comprehensive tobacco-free school policy in place',
       'Proportion of the population protected by a comprehensive smoke-free policy prohibiting smoking in all indoor areas of workplaces and public places, including restaurants and bars',
       'States that allow stronger local tobacco control and prevention laws',
       'States with strong polices that require retail licenses to sell tobacco products',
       'Current smokeless tobacco use among adults aged >= 18 years',
       'Pneumococcal vaccination among noninstitutionalized adults aged 18-64 years who smoke',
       'Quit attempts in the past y

There is definately interesting data here - but we already have smoking prevalence data in the COPD dataset. I don't see a need right now to dig deeper that than.

We have three dataframes that we have partitioned:
- asthma_df
- cancer_df
- copd_df

Lets see if we can align these and then join them

In [112]:
print('asthma dataframe has shape: ', asthma_df.shape)
print('cancer dataframe has shape: ', cancer_df.shape)
print('COPD dataframe has shape: ', copd_df.shape)

asthma dataframe has shape:  (510, 20)
cancer dataframe has shape:  (714, 20)
COPD dataframe has shape:  (3060, 20)


Colums all align, so lets concat these together and look

In [123]:
temp = pd.concat([asthma_df, cancer_df])
cdi_df = pd.concat([temp,copd_df]).reset_index(drop=True)

In [125]:
cdi_df.sample(10)

Unnamed: 0,YearStart,YearEnd,LocationAbbr,LocationDesc,DataSource,Topic,Question,DataValueUnit,DataValueType,DataValue,DataValueAlt,StratificationCategory1,Stratification1,GeoLocation,LocationID,TopicID,QuestionID,DataValueTypeID,StratificationCategoryID1,StratificationID1
61,2016,2016,DC,District of Columbia,BRFSS,Asthma,Current asthma prevalence among adults aged >=...,%,Crude Prevalence,9.8,9.8,Overall,Overall,POINT (-77.036871 38.907192),11,AST,AST1_1,CRDPREV,OVERALL,OVR
707,2011,2015,ID,Idaho,Death Certificate,Cancer,"Cancer of the lung and bronchus, mortality","per 100,000",Average Annual Crude Rate,39.3,39.3,Overall,Overall,POINT (-114.3637300419997 43.682630005000476),16,CAN,CAN8_2,AVGANNCRDRATE,OVERALL,OVR
2216,2019,2019,OH,Ohio,NVSS,Chronic Obstructive Pulmonary Disease,Mortality with chronic obstructive pulmonary d...,"cases per 100,000",Crude Rate,136.9,136.9,Overall,Overall,POINT (-82.40426005599966 40.06021014100048),39,COPD,COPD1_1,CRDRATE,OVERALL,OVR
2959,2017,2017,MA,Massachusetts,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,%,Crude Prevalence,5.6,5.6,Overall,Overall,POINT (-72.08269067499964 42.27687047000046),25,COPD,COPD2_0,CRDPREV,OVERALL,OVR
582,2011,2015,CT,Connecticut,Statewide central cancer registries,Cancer,"Cancer of the lung and bronchus, incidence","per 100,000",Average Annual Crude Rate,74.1,74.1,Overall,Overall,POINT (-72.64984095199964 41.56266102000046),9,CAN,CAN8_1,AVGANNCRDRATE,OVERALL,OVR
2623,2016,2016,GA,Georgia,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of current smoking among adults >= ...,%,Crude Prevalence,35.5,35.5,Overall,Overall,POINT (-83.62758034599966 32.83968109300048),13,COPD,COPD3_0,CRDPREV,OVERALL,OVR
1302,2016,2016,CA,California,NVSS,Chronic Obstructive Pulmonary Disease,Mortality with chronic obstructive pulmonary d...,"cases per 100,000",Crude Rate,173.3,173.3,Overall,Overall,POINT (-120.99999953799971 37.63864012300047),6,COPD,COPD1_2,CRDRATE,OVERALL,OVR
2805,2019,2019,IN,Indiana,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of current smoking among adults >= ...,%,Crude Prevalence,40.3,40.3,Overall,Overall,POINT (-86.14996019399968 39.766910452000445),18,COPD,COPD3_0_1,CRDPREV,OVERALL,OVR
2995,2012,2012,MD,Maryland,BRFSS,Chronic Obstructive Pulmonary Disease,Prevalence of chronic obstructive pulmonary di...,%,Crude Prevalence,7.9,7.9,Overall,Overall,POINT (-76.60926011099963 39.29058096400047),24,COPD,COPD2_0_1,CRDPREV,OVERALL,OVR
1229,2010,2010,AR,Arkansas,NVSS,Chronic Obstructive Pulmonary Disease,Mortality with chronic obstructive pulmonary d...,"cases per 100,000",Crude Rate,146.3,146.3,Overall,Overall,POINT (-92.27449074299966 34.74865012400045),5,COPD,COPD1_1,CRDRATE,OVERALL,OVR


All seems to be in order.

#### Save dataframe
Save the data back to the interim store, overwriting what is there

In [126]:
# save the data to interim dir
datapath = '../data/interim/'
save_file(cdi_df, 'US_Chronic_Disease_Indicators_CDI.csv', datapath)

Writing file.  "../data/interim/US_Chronic_Disease_Indicators_CDI.csv"
