This notebook collects the code that reads the Food Atlas datasets and the CDC health indicator datasets from a GitHub repository, integrates the datasets, and cleans the data.

In [None]:
# Merge the food atlas datasets into one wide table indexed by FIPS code.
import pandas as pd

url_folder = 'https://raw.githubusercontent.com/cathyxinxyz/Capstone_Project_1/master/Datasets/Food_atlas/'
filenames = ['ACCESS', 'ASSISTANCE', 'HEALTH', 'INSECURITY', 'LOCAL',
             'PRICES_TAXES', 'RESTAURANTS', 'SOCIOECONOMIC', 'STORES']

dfs = []
for position, filename in enumerate(filenames):
    frame = pd.read_csv("{}{}.csv".format(url_folder, filename),
                        index_col='FIPS', encoding="ISO-8859-1")
    # Every file repeats the 'State' and 'County' columns; keep them only
    # from the first file so the merged table does not contain duplicates.
    dfs.append(frame if position == 0 else frame.drop(columns=['State', 'County']))

# Align the per-topic tables side by side on their FIPS index (outer join).
df_merge = pd.concat(dfs, axis=1, join='outer')

In [None]:
# Preview the first five rows of the merged table (head() defaults to 5 rows).
print(df_merge.head())

Check columns for missing values

In [None]:
# Summary statistics per column; the 'count' row reports non-null values,
# which gives a first impression of how much data is missing.
df_merge.describe()

In [None]:
# Number of missing values in each column.
number_null_values_percol = df_merge.isna().sum()

# Columns with more than 100 missing values.
# NOTE(review): the variable name says "50" but the threshold applied is 100;
# the name is kept because the following cell refers to it.
too_sparse = number_null_values_percol.gt(100)
cols_with_over_50_null_values = number_null_values_percol[too_sparse]
print(cols_with_over_50_null_values.index)

In [None]:
# Remove the sparse columns identified in the previous cell.
sparse_column_names = cols_with_over_50_null_values.index.tolist()
df_merge = df_merge.drop(columns=sparse_column_names)

In [None]:
# List the labels of the columns that remain in df_merge
# (this prints the column Index itself, not just a count).
print (df_merge.columns)

Categorize the columns into five groups: category data ('State' and 'County'), count data, percent data, # per 1,000 pop, and percent change.

Columns to keep: category data ('State' and 'County'), percent data, # per 1,000 pop, and percent change. Count data are removed because they are not adjusted for population size.

The column names are highly abbreviated and hard to read, so we need to extract their meaning from the variable information provided by Food_atlas.

In [None]:
from collections import defaultdict

# Variable metadata published alongside the food atlas data.
url = 'https://raw.githubusercontent.com/cathyxinxyz/Capstone_Project_1/master/Datasets/Food_atlas/variable_info.csv'
var_info_df = pd.read_csv(url, encoding="ISO-8859-1")

# Map each unit label (e.g. 'Percent') to the list of variable codes measured in it.
var_info_dict = defaultdict(list)
for units, variable_code in zip(var_info_df['Units'], var_info_df['Variable Code']):
    var_info_dict[units].append(variable_code)


Further filter the variables based on the following principles:
i. keep variables that are adjusted by population size: '% change', 'Percent', '# per 1,000 pop', 'Percentage points';
ii. keep the variables that are most valuable for the analysis;
iii. keep variables whose values are valid: e.g. no negative values for variables with units of 'Percent' or '# per 1,000 pop'.


In [None]:
# Keep only variables expressed in population-adjusted units; every variable
# that variable_info lists under any other unit is removed from df_merge.
units_to_keep = ['% change', 'Percent', '# per 1,000 pop', 'Percentage points']
# Identifier columns that must survive the filter whatever unit they are listed under.
protected_columns = ['State', 'County']

# The previous condition compared the whole list var_info_dict[k] against the
# strings 'State'/'County', which is always unequal, so it protected nothing.
# The check is now applied to each variable code individually.
columns_to_remove = set()
for units, variable_codes in var_info_dict.items():
    if units in units_to_keep:
        continue
    columns_to_remove.update(
        code for code in variable_codes if code not in protected_columns
    )

# A boolean column mask keeps the original column order
# (Index.difference returned the remaining columns sorted alphabetically).
df_merge = df_merge.loc[:, ~df_merge.columns.isin(columns_to_remove)]

In [None]:
#view variables
for idx in var_info_df.index:
    k=var_info_df['Units'][idx]
    k1=var_info_df['Category Code'][idx]
    k2=var_info_df['Sub_subcategory Name'][idx]
    var=var_info_df['Variable Code'][idx]
    
    if var in df_merge.columns:
        print (k1,k2,k,var)

In [None]:
# Report the current (rows, columns) size of df_merge.
print (df_merge.shape)

In [None]:
# The four per-capita expenditure variables quoted in the string below
# (units: Dollars) are not part of the selection that follows.
'''Restaurant Availability and Expenditures,RESTAURANTS,Expenditures per capita,fast food,2007*,PC_FFRSALES07,CNTY10,Dollars
Restaurant Availability and Expenditures,RESTAURANTS,Expenditures per capita,fast food,2012*,PC_FFRSALES12,CNTY10,Dollars
Restaurant Availability and Expenditures,RESTAURANTS,Expenditures per capita,restaurants,2007*,PC_FSRSALES07,CNTY10,Dollars
Restaurant Availability and Expenditures,RESTAURANTS,Expenditures per capita,restaurants,2012*,PC_FSRSALES12,CNTY10,Dollars'''

# Columns retained for the analysis, in the order they will appear in df_merge.
columns_to_keep = [
    # low access to stores
    'PCH_LACCESS_POP_10_15', 'PCT_LACCESS_POP10', 'PCT_LACCESS_POP15',
    'PCH_LACCESS_LOWI_10_15', 'PCT_LACCESS_LOWI10', 'PCT_LACCESS_LOWI15',
    'PCT_LACCESS_CHILD10', 'PCT_LACCESS_CHILD15',
    'PCH_LACCESS_SENIORS_10_15', 'PCT_LACCESS_SENIORS10', 'PCT_LACCESS_SENIORS15',
    # grocery, supercenter, convenience and specialized stores
    'PCH_GROC_09_14', 'GROCPTH09', 'GROCPTH14',
    'SUPERCPTH09', 'SUPERCPTH14',
    'PCH_CONVS_09_14', 'CONVSPTH09', 'CONVSPTH14', 'PCH_CONVSPTH_09_14',
    'SPECSPTH09', 'SPECSPTH14',
    # SNAP and WIC stores
    'PCH_SNAPS_12_16', 'SNAPSPTH12', 'SNAPSPTH16', 'PCH_SNAPSPTH_12_16',
    'PCH_WICS_08_12', 'WICSPTH08', 'WICSPTH12', 'PCH_WICSPTH_08_12',
    # fast-food and full-service restaurants
    'PCH_FFR_09_14', 'FFRPTH09', 'FFRPTH14', 'PCH_FFRPTH_09_14',
    'PCH_FSR_09_14', 'FSRPTH09', 'FSRPTH14',
    # CACFP assistance
    'PCT_CACFP09', 'PCT_CACFP15', 'PCH_CACFP_09_15',
    # food insecurity
    'FOODINSEC_10_12', 'FOODINSEC_13_15', 'CH_FOODINSEC_12_15',
    'VLFOODSEC_10_12', 'VLFOODSEC_13_15', 'CH_VLFOODSEC_12_15',
    # recreation facilities
    'RECFACPTH09', 'RECFACPTH14',
    # demographics and poverty rate
    'PCT_NHWHITE10', 'PCT_NHBLACK10', 'PCT_HISP10', 'PCT_NHASIAN10',
    'PCT_NHNA10', 'PCT_NHPI10', 'PCT_65OLDER10', 'PCT_18YOUNGER10', 'POVRATE15',
    # adult diabetes and obesity
    'PCT_DIABETES_ADULTS08', 'PCT_DIABETES_ADULTS13',
    'PCT_OBESE_ADULTS08', 'PCT_OBESE_ADULTS13',
    # identifiers
    'State', 'County',
]
df_merge = df_merge.loc[:, columns_to_keep]


In [None]:
# Size of the table after the column selection.
print(df_merge.shape)

# Print category, sub-subcategory, units and code for every documented
# variable that is still present in df_merge.
variable_rows = zip(
    var_info_df['Units'],
    var_info_df['Category Code'],
    var_info_df['Sub_subcategory Name'],
    var_info_df['Variable Code'],
)
for k, k1, k2, var in variable_rows:
    if var not in df_merge.columns:
        continue
    print(k1, k2, k, var)

In [None]:
# Divide the dataframe into two: df_tp holds variables measured at a single
# year ('Percent', '# per 1,000 pop'); df_pr holds variables that describe a
# change between years ('% change', 'Percentage points').
# Both frames also carry the 'State' and 'County' identifier columns.
timepoint_units = ['Percent', '# per 1,000 pop']
change_units = ['% change', 'Percentage points']

var_timepoint = list()
var_percentchange = list()
for k, var in zip(var_info_df['Units'], var_info_df['Variable Code']):
    if var not in df_merge.columns:
        continue
    if k in timepoint_units:
        var_timepoint.append(var)
    elif k in change_units:
        var_percentchange.append(var)

var_timepoint.extend(['State', 'County'])
var_percentchange.extend(['State', 'County'])

# Take explicit copies: these frames are modified later (invalid values are
# overwritten with NaN), and writing into a plain selection of df_merge
# triggers pandas' SettingWithCopyWarning.
df_tp = df_merge[var_timepoint].copy()
df_pr = df_merge[var_percentchange].copy()


In [None]:
# Sizes of the time-point and change frames, then the change frame's columns.
print(df_tp.shape, df_pr.shape, df_pr.columns, sep='\n')

In [None]:
# Check whether each column has valid values and replace invalid ones with NaN:
#   * 'Percent' columns must lie between 0 and 100
#   * '# per 1,000 pop' columns must lie between 0 and 1000
#   * 'Percentage points' columns must lie between -100 and 100
import numpy as np

# unit -> (frame holding the columns of that unit, lowest valid, highest valid)
valid_ranges = {
    'Percent': (df_tp, 0, 100),
    '# per 1,000 pop': (df_tp, 0, 1000),
    'Percentage points': (df_pr, -100, 100),
}

for idx in var_info_df.index:
    k = var_info_df['Units'][idx]
    k1 = var_info_df['Category Code'][idx]
    k2 = var_info_df['Sub_subcategory Name'][idx]
    var = var_info_df['Variable Code'][idx]

    if k not in valid_ranges:
        continue
    frame, lower, upper = valid_ranges[k]
    if var not in frame.columns:
        continue

    # Combine both bounds with `|` and explicit parentheses. The earlier
    # `ser + df[var] > limit` was evaluated as `(ser + df[var]) > limit`, so
    # values below the lower bound were never counted; and for
    # 'Percentage points' the upper-bound mask was overwritten by the
    # lower-bound mask before it was used.
    invalid = (frame[var] < lower) | (frame[var] > upper)
    n_invalid = int(invalid.sum())
    if n_invalid > 0:
        print((k1, k2, var, n_invalid))
        # .loc writes into the frame itself; chained indexing
        # (frame[var][mask] = ...) may write to a temporary copy instead.
        frame.loc[invalid, var] = np.nan

        

In [None]:
# Break df_tp into two sets: df_tp_1 holds each measure at its earlier time
# point and df_tp_2 holds each measure at its later time point.
#
# Variables belonging to the same measure share the same
# (Category Code, Sub_subcategory Name) pair and differ only in the last two
# digits of their code, which indicate the year of the measure.
var_grouped_by_measures = defaultdict(list)
measure_rows = zip(
    var_info_df['Category Code'],
    var_info_df['Sub_subcategory Name'],
    var_info_df['Variable Code'],
)
for k1, k2, var in measure_rows:
    if var not in df_tp.columns or var in ['State', 'County']:
        continue
    year_suffix = float(var[-2:])
    var_grouped_by_measures[(k1, k2)].append((var, year_suffix))

early_measure_list = ['State', 'County']
late_measure_list = ['State', 'County']
for members in var_grouped_by_measures.values():
    # Order each group by year: the first entry is the earliest measurement and
    # the last is the latest (the same variable when the group has one member).
    members.sort(key=lambda pair: pair[1])
    earliest_code, _ = members[0]
    latest_code, _ = members[-1]
    early_measure_list.append(earliest_code)
    late_measure_list.append(latest_code)

df_tp_1 = df_tp[early_measure_list]
df_tp_2 = df_tp[late_measure_list]

In [None]:
# Sizes of the early, late and change frames, one per line.
print(df_tp_1.shape, df_tp_2.shape, df_pr.shape, sep='\n')

In [None]:
# Column labels of the early, late and change frames, one Index per line.
print(df_tp_1.columns, df_tp_2.columns, df_pr.columns, sep='\n')

In [None]:
# Readable names for the atlas variable codes. Each key is a variable code
# without its trailing year digits.
# The keys 'PCT_NHASIAN' and 'PCT_DIABETES_ADULTS' were previously spelled
# 'PCT_NHASIAN1' and 'PCT_DIABETES_ADULT'; they matched only because the
# lookup is substring-based and are now consistent with the other keys.
var_trans_dict = {
    'PCT_LACCESS_POP': 'Low_Access_Overall',
    'PCT_LACCESS_LOWI': 'Low_Access_Low_income',
    'PCT_LACCESS_CHILD': 'Low_Access_Children',
    'PCT_LACCESS_SENIORS': 'Low_Access_Senior',
    'GROCPTH': 'Grocery',
    'SUPERCPTH': 'Supercenter',
    'CONVSPTH': 'Convenience',
    'SPECSPTH': 'Specialized',
    'SNAPSPTH': 'SNAP_store',
    'WICSPTH': 'WIC_store',
    'FFRPTH': 'Fast_food',
    'FSRPTH': 'Full_service',
    'PC_FFRSALES': 'Expend_fast_food',
    'PC_FSRSALES': 'Expend_full_service',
    'PCT_CACFP': 'Assistance',
    'FOODINSEC': 'Low_insecurity',
    'VLFOODSEC': 'Very_low_insecurity',
    'RECFACPTH': 'Recreation_facility',
    'PCT_NHWHITE': 'White',
    'PCT_NHBLACK': 'Black',
    'PCT_HISP': 'Hispanic',
    'PCT_NHASIAN': 'Asian',
    'PCT_NHNA': 'American Indian or Alaska Native',
    'PCT_NHPI': 'Hawaiian or Pacific Islander',
    'PCT_65OLDER': '>=65',
    'PCT_18YOUNGER': '<18',
    'POVRATE': 'Poverty_rate',
    'PCT_DIABETES_ADULTS': 'Adult_db',
    'PCT_OBESE_ADULTS': 'Adult_ob',
}


def translate_columns(columns, translations=var_trans_dict,
                      passthrough=('State', 'County')):
    """Return a readable name for every column label, in the same order.

    Parameters
    ----------
    columns : iterable of str
        Column labels to translate (e.g. ``df.columns``).
    translations : dict
        Maps a variable-code fragment to its readable name. A column is
        translated by the first key that occurs inside its label.
    passthrough : tuple of str
        Labels that are returned unchanged.

    Returns
    -------
    list of str
        Exactly one name per input column. A column that matches no key keeps
        its original label, so the result always has the same length as the
        input and can be assigned straight back to ``df.columns``.
    """
    readable = []
    for column in columns:
        if column in passthrough:
            readable.append(column)
            continue
        matches = [name for code, name in translations.items() if code in column]
        # Taking at most one match per column keeps names aligned with their
        # columns. Appending every match (or none) shifted later names or
        # produced a list of the wrong length.
        readable.append(matches[0] if matches else column)
    return readable


cols = list(df_tp_1.columns)
new_cols = translate_columns(cols)
print(new_cols)
df_tp_1.columns = new_cols

cols = list(df_tp_2.columns)
new_cols = translate_columns(cols)
df_tp_2.columns = new_cols

In [None]:
# First five rows of the early, late and change frames, printed in turn.
print(df_tp_1.head(), df_tp_2.head(), df_pr.head(), sep='\n')

In [None]:
# Write the early and late time-point frames to CSV.
# NOTE(review): these are absolute paths on one specific machine; the target
# folder must already exist for the writes to succeed.
csv_targets = [
    (df_tp_1, 'C:/Users/cathy/Capstone_Project_1/Datasets/Food_atlas/df_tp_1.csv'),
    (df_tp_2, 'C:/Users/cathy/Capstone_Project_1/Datasets/Food_atlas/df_tp_2.csv'),
]
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path)

Integrate CDC Datasets together

In [None]:
# TODO: integrate the CDC health indicator datasets (work in progress)