This notebook collects the code that reads the Food Atlas datasets and the CDC health indicator datasets from the GitHub repository, integrates them, and cleans the data.

In [None]:
#merge food atlas datasets into one
import pandas as pd

url_folder='https://raw.githubusercontent.com/cathyxinxyz/Capstone_Project_1/master/Datasets/Food_atlas/'
filenames=['ACCESS','ASSISTANCE','HEALTH','INSECURITY','LOCAL','PRICES_TAXES','RESTAURANTS','SOCIOECONOMIC','STORES']

# Read each table with FIPS as the row index. Only the first table keeps its
# 'State' and 'County' columns; they are dropped from every other table so the
# merged frame carries them exactly once.
dfs=[]
for position, filename in enumerate(filenames):
    table=pd.read_csv(url_folder+filename+".csv", index_col='FIPS', encoding="ISO-8859-1")
    if position>0:
        table=table.drop(columns=['State', 'County'])
    dfs.append(table)

# Column-wise outer join on the FIPS index.
df_merge=pd.concat(dfs, axis=1, join='outer')

In [None]:
# Preview the first five rows of the merged table.
print(df_merge.head(n=5))

Check columns for missing values

In [None]:
# Summary statistics of the merged table (displayed as the cell output); the
# "count" row reports non-null entries per column, a first look at missing data.
df_merge.describe()

In [None]:
# Number of missing entries in every column of the merged table.
number_null_values_percol=df_merge.isna().sum()
# Columns with more than 100 missing values.
# NOTE: the variable name says "50" but the threshold applied is 100; the name
# is kept because the following cell refers to it.
too_sparse=number_null_values_percol.gt(100)
cols_with_over_50_null_values=number_null_values_percol.loc[too_sparse]
print(cols_with_over_50_null_values.index)

In [None]:
# Remove the columns flagged above as having too many missing values.
df_merge=df_merge.drop(columns=cols_with_over_50_null_values.index.tolist())

In [None]:
#check number of remaining columns
# A bare `df_merge.shape` is only displayed when it is the last expression of a
# cell, so it is printed explicitly here to actually show the column count.
print(df_merge.shape)
print(df_merge.columns)

Categorize the columns into five groups: category data ('State' and 'County'), count data, percent data, # per 1,000 pop, and percent change.

Columns to keep: category data ('State' and 'County'), percent data, # per 1,000 pop, and percent change. Count data are removed because they are not adjusted for population size.

The column names are highly abbreviated and hard to read, so we need to extract their meaning from the variable information provided with the Food Atlas.

In [None]:
from collections import defaultdict

# Load the Food Atlas variable description table.
url='https://raw.githubusercontent.com/cathyxinxyz/Capstone_Project_1/master/Datasets/Food_atlas/variable_info.csv'
var_info_df=pd.read_csv(url,encoding="ISO-8859-1")
print(var_info_df.head(5))

# Map each unit label to the list of variable codes measured in that unit.
var_info_dict=defaultdict(list)
for unit, code in zip(var_info_df['Units'], var_info_df['Variable Code']):
    var_info_dict[unit].append(code)

print(var_info_dict.keys())


Further filter the variables according to the following principles:
i. keep variables that are adjusted for population size: '% change', 'Percent', '# per 1,000 pop', 'Percentage points';
ii. keep the variables that are most valuable for the analysis;
iii. keep variables whose values are valid, e.g. no negative values for variables with units 'Percent' or '# per 1,000 pop'.


In [None]:
#units to keep: '% change', 'Percent', '# per 1,000 pop','Percentage points'
units_to_keep=['% change', 'Percent', '# per 1,000 pop','Percentage points']

# Drop every variable whose unit is not in the keep-list.
# The original condition also tested `var_info_dict[k] not in ['State','County']`,
# which compares a list with strings and is therefore always True; it has been
# removed without changing the result. 'State' and 'County' are only dropped if
# they appear as variable codes under an excluded unit.
# Note: Index.difference returns the remaining labels sorted by default, so the
# column order of df_merge becomes alphabetical after this step.
for unit, codes in var_info_dict.items():
    if unit not in units_to_keep:
        df_merge=df_merge[df_merge.columns.difference(codes)]

In [None]:
#view variables
# List category, sub-subcategory, unit and code for every documented variable
# that is still a column of df_merge.
described_variables=zip(var_info_df['Category Code'],
                        var_info_df['Sub_subcategory Name'],
                        var_info_df['Units'],
                        var_info_df['Variable Code'])
for category, sub_subcategory, unit, code in described_variables:
    if code not in df_merge.columns:
        continue
    print(category, sub_subcategory, unit, code)

In [None]:
# Show (rows, columns) of df_merge at this point in the notebook.
print (df_merge.shape)

In [None]:
#subset of variables chosen for analysis
#the variables listed below are dropped, grouped by theme
dropped_variables=(
    # ASSISTANCE for students
    ['PCT_NSLP09', 'PCT_NSLP15', 'PCH_NSLP_09_15',
     'PCT_FREE_LUNCH09', 'PCT_REDUCED_LUNCH09',
     'PCT_SBP09', 'PCT_SBP15', 'PCH_SBP_09_15',
     'PCT_SFSP09', 'PCT_SFSP15', 'PCH_SFSP_09_15']
    # ASSISTANCE SNAP participants/eligible pop Percent
    + ['SNAP_PART_RATE08', 'SNAP_PART_RATE13']
    # PRICES_TAXES ('SODATAX_VENDM14' was listed twice; once is enough)
    + ['SODATAX_STORES14', 'SODATAX_VENDM14',
       'CHIPSTAX_STORES14', 'FOOD_TAX14', 'CHIPSTAX_VENDM14']
    # HEALTH adult diabetes and obesity variables
    + ['PCT_DIABETES_ADULTS08', 'PCT_DIABETES_ADULTS13',
       'PCT_OBESE_ADULTS08', 'PCT_OBESE_ADULTS13']
)

df_merge=df_merge.drop(columns=dropped_variables)


In [135]:
print(df_merge.shape)
#view variables
# Same listing as before, now reflecting the columns left after the manual drop.
for category, sub_subcategory, unit, code in zip(var_info_df['Category Code'],
                                                 var_info_df['Sub_subcategory Name'],
                                                 var_info_df['Units'],
                                                 var_info_df['Variable Code']):
    if code in df_merge.columns:
        print(category, sub_subcategory, unit, code)

(3143, 96)
ACCESS nan % change PCH_LACCESS_POP_10_15
ACCESS nan Percent PCT_LACCESS_POP10
ACCESS nan Percent PCT_LACCESS_POP15
ACCESS Low income % change PCH_LACCESS_LOWI_10_15
ACCESS Low income Percent PCT_LACCESS_LOWI10
ACCESS Low income Percent PCT_LACCESS_LOWI15
ACCESS no car % change PCH_LACCESS_HHNV_10_15
ACCESS no car Percent PCT_LACCESS_HHNV10
ACCESS no car Percent PCT_LACCESS_HHNV15
ACCESS SNAP Percent PCT_LACCESS_SNAP15
ACCESS Children Percent PCT_LACCESS_CHILD10
ACCESS Children Percent PCT_LACCESS_CHILD15
ACCESS Seniors % change PCH_LACCESS_SENIORS_10_15
ACCESS Seniors Percent PCT_LACCESS_SENIORS10
ACCESS Seniors Percent PCT_LACCESS_SENIORS15
ACCESS White Percent PCT_LACCESS_WHITE15
ACCESS Black Percent PCT_LACCESS_BLACK15
ACCESS Hispanic ethnicity Percent PCT_LACCESS_HISP15
ACCESS Asian Percent PCT_LACCESS_NHASIAN15
ACCESS American Indian or Alaska Native Percent PCT_LACCESS_NHNA15
ACCESS Hawaiian or Pacific Islander Percent PCT_LACCESS_NHPI15
ACCESS Multiracial Percent PCT

In [None]:
#divide dataframe into two: one with variables measured at a single year (df_tp)
#and one with variables expressed as a change between years (df_pr)
timepoint_units=('Percent', '# per 1,000 pop')
change_units=('% change', 'Percentage points')

var_timepoint=list()
var_percentchange=list()
for unit, var in zip(var_info_df['Units'], var_info_df['Variable Code']):
    if var not in df_merge.columns:
        continue
    if unit in timepoint_units:
        var_timepoint.append(var)
    elif unit in change_units:
        var_percentchange.append(var)

# Both frames carry the State/County labels as their last two columns.
var_timepoint.extend(['State','County'])
var_percentchange.extend(['State','County'])

# Take explicit copies: these frames are modified in place by later cleaning,
# and a plain column selection of df_merge would trigger pandas'
# SettingWithCopyWarning on assignment.
df_tp=df_merge[var_timepoint].copy()
df_pr=df_merge[var_percentchange].copy()


In [None]:
# Dimensions of the time-point and change frames, then the time-point columns.
for frame in (df_tp, df_pr):
    print(frame.shape)
print(df_tp.columns)

In [None]:
#check whether each column has valid values:
#   'Percent'           -> values must lie between 0 and 100
#   '# per 1,000 pop'   -> values must lie between 0 and 1000
#   'Percentage points' -> values must lie between -100 and 100
#any value that falls outside its range is replaced with NaN

#Replace invalid values with np.nan
import numpy as np

# Closed valid interval for each unit that is checked.
valid_ranges={'Percent': (0, 100),
              '# per 1,000 pop': (0, 1000),
              'Percentage points': (-100, 100)}

for k1, k2, k, var in zip(var_info_df['Category Code'],
                          var_info_df['Sub_subcategory Name'],
                          var_info_df['Units'],
                          var_info_df['Variable Code']):
    # 'Percent' and '# per 1,000 pop' variables are cleaned in df_tp,
    # 'Percentage points' variables in df_pr; other units are not checked.
    if k in ('Percent', '# per 1,000 pop'):
        frame=df_tp
    elif k=='Percentage points':
        frame=df_pr
    else:
        continue
    if var not in frame.columns:
        continue

    low, high=valid_ranges[k]
    # Element-wise OR with explicit parentheses: `a or b` on Series raises, and
    # `ser + col > high` parses as `(ser + col) > high`. NaN compares False on
    # both sides, so existing missing values are not counted as invalid.
    invalid=(frame[var]<low) | (frame[var]>high)
    n_invalid=int(invalid.sum())
    if n_invalid>0:
        print((k1,k2,var,n_invalid))
        # Whole-column assignment instead of chained indexing
        # (`frame[var][mask] = ...`), which may write to a temporary copy.
        frame[var]=frame[var].mask(invalid, np.nan)

        

In [None]:
#break df_tp into two sets: variables measured at the earlier time point: df_tp_1; and variables measured at the later time point: df_tp_2

#collect (variable, year) pairs per group; the year is read from the last two characters of the variable code
# NOTE(review): groups are keyed by (category code, sub-subcategory name), not by
# the variable-name stem. If one key covers several different measures, only the
# earliest-year and latest-year variable of that key are selected -- confirm this
# is intended. A group with a single variable contributes it to both sets.
var_grouped_by_measures=defaultdict(list)
for category, sub_subcategory, var in zip(var_info_df['Category Code'],
                                          var_info_df['Sub_subcategory Name'],
                                          var_info_df['Variable Code']):
    if var not in df_tp.columns or var in ['State','County']:
        continue
    var_grouped_by_measures[(category, sub_subcategory)].append((var, float(var[-2:])))

# Order each group by year (stable sort, in place), then take its first and last member.
for members in var_grouped_by_measures.values():
    members.sort(key=lambda pair: pair[1])

early_measure_list=['State','County']+[members[0][0] for members in var_grouped_by_measures.values()]
late_measure_list=['State','County']+[members[-1][0] for members in var_grouped_by_measures.values()]

df_tp_1=df_tp[early_measure_list]
df_tp_2=df_tp[late_measure_list]

In [None]:
# Dimensions of the early and late time-point frames.
for frame in (df_tp_1, df_tp_2):
    print(frame.shape)

In [133]:
# Write the early and late time-point frames to CSV.
# NOTE: the targets are absolute paths on a local H: drive, so this cell only
# runs on a machine where that folder exists.
for frame, target in ((df_tp_1, 'H:/Github/Capstone_Project_1/Datasets/Food_atlas/df_tp_1.csv'),
                      (df_tp_2, 'H:/Github/Capstone_Project_1/Datasets/Food_atlas/df_tp_2.csv')):
    frame.to_csv(target)

In [134]:
# Write the change-over-time frame to CSV (absolute path on a local H: drive).
df_pr.to_csv(path_or_buf='H:/Github/Capstone_Project_1/Datasets/Food_atlas/df_pr.csv')

Integrate CDC Datasets together