This is the collection of codes that read food atlas datasets and CDC health indicator datasets from Github repository, integrate datasets and cleaning data

In [109]:
#merge food atlas datasets into one
import pandas as pd

Overall_folder='C:/Users/cathy/Capstone_project_1/'

dfs=list()
url_folder='https://raw.githubusercontent.com/cathyxinxyz/Capstone_Project_1/master/Datasets/Food_atlas/'

filenames=['ACCESS','ASSISTANCE','HEALTH','INSECURITY','LOCAL','PRICES_TAXES','RESTAURANTS','SOCIOECONOMIC','STORES']
for i,filename in enumerate(filenames):
    filepath=url_folder+filename+".csv"   
    d=pd.read_csv(filepath,index_col='FIPS',encoding="ISO-8859-1")
    #append datasets to the list and drop the redundent columns:'State' and 'County'
    if i!=0:
        dfs.append(d.drop(['State', 'County'], axis=1))
    else:
        dfs.append(d)

#merge datasets
df_merge=pd.concat(dfs, join='outer', axis=1)

In [None]:
print (df_merge.head(5))

Check columns for missing values

In [None]:
df_merge.describe()

In [110]:
number_null_values_percol=df_merge.isnull().sum(axis=0)
#columns with over 100 missing values
cols_with_over_10_percent_null_values=number_null_values_percol[number_null_values_percol>0.1*df_merge.shape[0]]
print (cols_with_over_10_percent_null_values.index)

Index(['PC_WIC_REDEMP08', 'PC_WIC_REDEMP12', 'PCH_PC_WIC_REDEMP_08_12',
       'REDEMP_WICS08', 'REDEMP_WICS12', 'PCH_REDEMP_WICS_08_12', 'PCT_HSPA15',
       'PCT_LOCLSALE07', 'PCH_DIRSALES_07_12', 'PCH_PC_DIRSALES_07_12',
       'PCH_FMRKT_09_16', 'PCH_FMRKTPTH_09_16', 'FMRKT_SNAP16',
       'PCT_FMRKT_SNAP16', 'FMRKT_WIC16', 'PCT_FMRKT_WIC16', 'FMRKT_WICCASH16',
       'PCT_FMRKT_WICCASH16', 'FMRKT_SFMNP16', 'PCT_FMRKT_SFMNP16',
       'FMRKT_CREDIT16', 'PCT_FMRKT_CREDIT16', 'FMRKT_FRVEG16',
       'PCT_FMRKT_FRVEG16', 'FMRKT_ANMLPROD16', 'PCT_FMRKT_ANMLPROD16',
       'FMRKT_BAKED16', 'PCT_FMRKT_BAKED16', 'FMRKT_OTHERFOOD16',
       'PCT_FMRKT_OTHERFOOD16', 'PCH_VEG_FARMS_07_12', 'VEG_ACRES07',
       'VEG_ACRES12', 'PCH_VEG_ACRES_07_12', 'VEG_ACRESPTH07',
       'VEG_ACRESPTH12', 'PCH_VEG_ACRESPTH_07_12', 'PCH_FRESHVEG_FARMS_07_12',
       'FRESHVEG_ACRES07', 'FRESHVEG_ACRES12', 'PCH_FRESHVEG_ACRES_07_12',
       'FRESHVEG_ACRESPTH07', 'FRESHVEG_ACRESPTH12',
       'PCH_FRESHVEG_A

In [111]:
#drop these columns first
df_merge=df_merge.drop(list(cols_with_over_10_percent_null_values.index), axis=1)

In [112]:
df_merge.shape

(3143, 209)

In [None]:
#check number of remaining columns
print (df_merge.columns)

categorizes columns into three groups: category data ('State' and 'County'), count data, percent data, # per 1000 pop, and percent change

columns to keep: category data ('State' and 'County'), percent data, # per 1000 pop, and percent change; remove count data because it is not adjusted by population size

Each column name is highly abstract and unreadable, need to extract info from the variable information provided by Food_atlas

In [None]:
from collections import defaultdict

url='https://raw.githubusercontent.com/cathyxinxyz/Capstone_Project_1/master/Datasets/Food_atlas/variable_info.csv'
var_info_df=pd.read_csv(url,encoding="ISO-8859-1")
var_info_dict=defaultdict(list)
for idx in var_info_df.index:
    k=var_info_df['Units'][idx]
    var_info_dict[k].append(var_info_df['Variable Code'][idx])


further filter varaibles based on following principles:
i. keep variables that are adjusted by population size: '% change', 'Percent', '# per 1,000 pop','Percentage points';
ii. keep variables that are mostly valuable for analysis
iii. keep variables where values are valid: e.g. no negative values for variables with units as 'Percent' or '# per 1,000 pop'.


In [113]:
#units to keep: '% change', 'Percent', '# per 1,000 pop','Percentage points'
for k in var_info_dict.keys():
    if k not in ['Percent', '# per 1,000 pop','Dollars'] and var_info_dict[k] not in ['State','County']:
        df_merge = df_merge[df_merge.columns.difference(var_info_dict[k])]
        
#print (df_merge.shape)

In [114]:
#view variables
for idx in var_info_df.index:
    k=var_info_df['Units'][idx]
    k1=var_info_df['Category Code'][idx]
    k2=var_info_df['Sub_subcategory Name'][idx]
    var=var_info_df['Variable Code'][idx]
    
    if var in df_merge.columns:
        print (k1,k2,k,var)

ACCESS Overall Percent PCT_LACCESS_POP10
ACCESS Overall Percent PCT_LACCESS_POP15
ACCESS Low income Percent PCT_LACCESS_LOWI10
ACCESS Low income Percent PCT_LACCESS_LOWI15
ACCESS no car Percent PCT_LACCESS_HHNV10
ACCESS no car Percent PCT_LACCESS_HHNV15
ACCESS SNAP Percent PCT_LACCESS_SNAP15
ACCESS Children Percent PCT_LACCESS_CHILD10
ACCESS Children Percent PCT_LACCESS_CHILD15
ACCESS Seniors Percent PCT_LACCESS_SENIORS10
ACCESS Seniors Percent PCT_LACCESS_SENIORS15
ACCESS White Percent PCT_LACCESS_WHITE15
ACCESS Black Percent PCT_LACCESS_BLACK15
ACCESS Hispanic ethnicity Percent PCT_LACCESS_HISP15
ACCESS Asian Percent PCT_LACCESS_NHASIAN15
ACCESS American Indian or Alaska Native Percent PCT_LACCESS_NHNA15
ACCESS Hawaiian or Pacific Islander Percent PCT_LACCESS_NHPI15
ACCESS Multiracial Percent PCT_LACCESS_MULTIR15
STORES Grocery # per 1,000 pop GROCPTH09
STORES Grocery # per 1,000 pop GROCPTH14
STORES Supercenters # per 1,000 pop SUPERCPTH09
STORES Supercenters # per 1,000 pop SUPERCP

In [115]:

#view variables
for idx in var_info_df.index:
    k=var_info_df['Units'][idx]
    k1=var_info_df['Category Code'][idx]
    k2=var_info_df['Sub_subcategory Name'][idx]
    var=var_info_df['Variable Code'][idx]
    
    if var in df_merge.columns:
        print (k1,k2,k,var)

ACCESS Overall Percent PCT_LACCESS_POP10
ACCESS Overall Percent PCT_LACCESS_POP15
ACCESS Low income Percent PCT_LACCESS_LOWI10
ACCESS Low income Percent PCT_LACCESS_LOWI15
ACCESS no car Percent PCT_LACCESS_HHNV10
ACCESS no car Percent PCT_LACCESS_HHNV15
ACCESS SNAP Percent PCT_LACCESS_SNAP15
ACCESS Children Percent PCT_LACCESS_CHILD10
ACCESS Children Percent PCT_LACCESS_CHILD15
ACCESS Seniors Percent PCT_LACCESS_SENIORS10
ACCESS Seniors Percent PCT_LACCESS_SENIORS15
ACCESS White Percent PCT_LACCESS_WHITE15
ACCESS Black Percent PCT_LACCESS_BLACK15
ACCESS Hispanic ethnicity Percent PCT_LACCESS_HISP15
ACCESS Asian Percent PCT_LACCESS_NHASIAN15
ACCESS American Indian or Alaska Native Percent PCT_LACCESS_NHNA15
ACCESS Hawaiian or Pacific Islander Percent PCT_LACCESS_NHPI15
ACCESS Multiracial Percent PCT_LACCESS_MULTIR15
STORES Grocery # per 1,000 pop GROCPTH09
STORES Grocery # per 1,000 pop GROCPTH14
STORES Supercenters # per 1,000 pop SUPERCPTH09
STORES Supercenters # per 1,000 pop SUPERCP

In [116]:
#devide dataframe into two: one with variables measured at one year and one with variables as percent change、
var_timepoint=list()
var_percentchange=list()
for idx in var_info_df.index:
    k=var_info_df['Units'][idx]
    k1=var_info_df['Category Code'][idx]
    k2=var_info_df['Sub_subcategory Name'][idx]
    var=var_info_df['Variable Code'][idx]
    
    if var in df_merge.columns: 
        var_timepoint.append(var)

var_timepoint.extend(['State','County'])
var_percentchange.extend(['State','County'])
df_tp=df_merge[var_timepoint]


In [117]:
print (df_tp.shape)

(3143, 93)


In [118]:
#check weather each column has valid values:
####### columns with units 'Percent' should have values between 0 and 100, any value that fall out of this range should be changed to NaN values
###### 
######
######

#Replace invalid values with np.nan
import numpy as np

for idx in var_info_df.index:
    k=var_info_df['Units'][idx]
    k1=var_info_df['Category Code'][idx]
    k2=var_info_df['Sub_subcategory Name'][idx]
    var=var_info_df['Variable Code'][idx]
    
    if k =='Percent' and var in df_tp.columns: 
        ser=df_tp[var]<0 
        ser=ser+df_tp[var]>100
        if ser.sum()>0:
            print ((k1,k2,var,ser.sum()))
            df_tp[var][(df_tp[var]<0)|(df_tp[var]>100)]=np.nan
    elif k=='# per 1,000 pop' and var in df_tp.columns:
        ser=df_tp[var]<0
        ser=ser+df_tp[var]>1000
        if ser.sum()>0:
            print ((k1,k2,var,ser.sum()))
            df_tp[var][(df_tp[var]<0)|(df_tp[var]>1000)]=np.nan


        

('ACCESS', 'Overall', 'PCT_LACCESS_POP10', 25)
('ACCESS', 'Overall', 'PCT_LACCESS_POP15', 22)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self._update_inplace(new_data)
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  exec(code_obj, self.user_global_ns, self.user_ns)


In [119]:
#break df_tp into two sets: variables measured at the earlier time point: df_tp_1; and variables measured at the earlier time point: df_tp_2

#group the same measure into tuples, the same measure share the same name except the last two digits which indicate the year of the measure
var_grouped_by_measures=defaultdict(list)
early_measure_list=['State','County']
late_measure_list=['State','County']
for idx in var_info_df.index:
    k=var_info_df['Units'][idx]
    k1=var_info_df['Category Code'][idx]
    k2=var_info_df['Sub_subcategory Name'][idx]
    var=var_info_df['Variable Code'][idx]
    
    if var in df_tp.columns and var not in ['State','County']:
        var_grouped_by_measures[(k1,k2)].append((var, float(var[-2:])))
    
for v in var_grouped_by_measures.values():
    v.sort(key=lambda tup: tup[-1])
    late_measure_list.append(v[-1][0])
    
df_tp_later=df_tp[late_measure_list]

In [120]:
print (df_tp_later.shape)

(3143, 60)


In [104]:
print (df_tp_later.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3143 entries, 1001 to 56045
Data columns (total 83 columns):
State                     3143 non-null object
County                    3143 non-null object
PCT_LACCESS_POP15         3102 non-null float64
PCT_LACCESS_LOWI15        3123 non-null float64
PCT_LACCESS_HHNV15        3140 non-null float64
PCT_LACCESS_SNAP15        3123 non-null float64
PCT_LACCESS_CHILD15       3124 non-null float64
PCT_LACCESS_SENIORS15     3124 non-null float64
PCT_LACCESS_WHITE15       3124 non-null float64
PCT_LACCESS_BLACK15       3124 non-null float64
PCT_LACCESS_HISP15        3124 non-null float64
PCT_LACCESS_NHASIAN15     3124 non-null float64
PCT_LACCESS_NHNA15        3124 non-null float64
PCT_LACCESS_NHPI15        3124 non-null float64
PCT_LACCESS_MULTIR15      3124 non-null float64
PCH_GROCPTH_09_14         3128 non-null float64
PCH_SUPERCPTH_09_14       3004 non-null float64
PCH_CONVSPTH_09_14        3124 non-null float64
PCH_SPECSPTH_09_14        2

In [121]:
df_tp_later.to_csv(Overall_folder+'Datasets/Food_atlas/df_tp_new.csv')

Integrate CDC Datasets together

In [None]:
import pandas as pd
dfs=list()
sub_folder=Overall_folder+'/Datasets/CDC/'
filenames=['Diabetes_prevalence',
           'Obesity_prevalence',
           'Physical_inactive_prevalence']

In [None]:
for filename in filenames:
    filepath=sub_folder+filename+".csv"   
    df=pd.read_csv(filepath,index_col='FIPS')
    
    
    if 'Diabetes' in filename:
        df.columns=df.columns.astype(str)+'_db'
    elif 'Obesity' in filename:
        df.columns=df.columns.astype(str)+'_ob'
    elif 'Physical' in filename:
        df.columns=df.columns.astype(str)+'_phy'
    dfs.append(df)
#merge datasets


In [None]:
CDC_merge=pd.concat(dfs, join='outer', axis=1)

In [None]:
CDC_merge.info()

In [None]:
#Find out the non numeric entries in CDC_merge
for c in CDC_merge.columns:
    num_non_numeric=sum(CDC_merge.applymap(lambda x: isinstance(x, (int, float)))[c])
    if num_non_numeric>0:
        print(c, num_non_numeric, CDC_merge[pd.to_numeric(CDC_merge[c], errors='coerce').isnull()])

In [None]:
#It turns out that some entries are 'No Data' or NaN, so I replace the 'No Data' with NaN values
CDC_merge=CDC_merge.replace('No Data', np.nan)
CDC_merge=CDC_merge.astype(float)

In [None]:
#now check the CDC_merge
CDC_merge.info()

In [122]:
#choose the latest prevalence of diabetes, obesity and physical inactivity to merge with df_tp 
combined=pd.concat([df_tp_later, CDC_merge[['2013_db','2013_ob','2013_phy']]], join='inner',axis=1)

In [123]:
combined.rename(columns={'2013_db': 'prevalence of diabetes', '2013_ob': 'prevalence of obesity', '2013_phy':'prevalence of physical inactivity'}, inplace=True)

Integrating geography dataset

In [None]:
df_rbcodes=pd.read_csv(Overall_folder+'Datasets/geography/FIPS_RUCC_Code_2013.csv', index_col='FIPS')

In [None]:
df_rbcodes.info()

In [None]:
combined=pd.concat([combined, df_rbcodes[['RUCC_2013']]], join='inner',axis=1)

In [None]:
combined.info()

In [None]:
df_divisions=pd.read_csv(Overall_folder+'Datasets/geography/state_region.csv')

In [None]:
df_divisions.info()

In [None]:
division_dict=df_divisions.set_index('State Code').to_dict()

In [None]:
division_dict

In [124]:
combined['Division']=combined['State'].map(division_dict['Division'])
combined['Region']=combined['State'].map(division_dict['Region'])

In [125]:
combined.head(5)

Unnamed: 0_level_0,State,County,PCT_LACCESS_POP15,PCT_LACCESS_LOWI15,PCT_LACCESS_HHNV15,PCT_LACCESS_SNAP15,PCT_LACCESS_CHILD15,PCT_LACCESS_SENIORS15,PCT_LACCESS_WHITE15,PCT_LACCESS_BLACK15,...,PCT_65OLDER10,PCT_18YOUNGER10,MEDHHINC15,POVRATE15,CHILDPOVRATE15,prevalence of diabetes,prevalence of obesity,prevalence of physical inactivity,Division,Region
FIPS,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1001,AL,Autauga,32.062255,11.991125,3.351332,4.608749,8.460485,3.996279,23.163613,7.726582,...,11.995382,26.777959,56580.0,12.7,18.8,13.0,34.1,28.6,East South Central,South
1003,AL,Baldwin,16.767489,5.424427,1.905114,1.2989,3.844936,3.06184,13.981393,1.942757,...,16.771185,22.987408,52387.0,12.9,19.6,10.4,27.4,22.3,East South Central,South
1005,AL,Barbour,22.10556,10.739667,4.329378,4.303147,3.758341,3.001695,10.302934,9.88136,...,14.236807,21.906982,31433.0,32.0,45.2,18.4,44.4,31.8,East South Central,South
1007,AL,Bibb,4.230324,2.601627,2.821427,0.67671,1.015242,0.600865,2.35877,1.828933,...,12.68165,22.696923,40767.0,22.2,29.3,14.8,40.3,33.9,East South Central,South
1009,AL,Blount,6.49738,2.88015,3.336414,0.812727,1.58872,0.882583,5.909147,0.167201,...,14.722096,24.608353,50487.0,14.7,22.2,14.1,34.6,28.0,East South Central,South


In [126]:
combined.to_csv(Overall_folder+'Datasets/Combined_data_new.csv')