In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Load the 'Food Access Research Atlas' worksheet of the DataDownload2015 workbook.
df = pd.read_excel(
    io='DataDownload2015.xlsx',
    sheet_name='Food Access Research Atlas',
)
# Preview the first rows to confirm the sheet parsed as expected.
df.head(n=5)

In [None]:
# Index the atlas by its 'CensusTract' column so rows can be aligned by tract ID.
df = df.set_index('CensusTract')
# Keep the label as a one-column DataFrame (not a Series) so it can be
# joined back onto the feature columns by index.
target_df = df.loc[:, ['LILATracts_halfAnd10']]
target_df.head(n=5)

In [None]:
# Narrow the atlas to the columns used as features, then re-attach the
# label column by index (both frames share the same tract index).
df = df.loc[:, [
    'OHU2010',
    'PovertyRate',
    'MedianFamilyIncome',
    'TractKids',
    'TractSeniors',
    'TractWhite',
    'TractBlack',
    'TractAsian',
    'TractNHOPI',
    'TractAIAN',
    'TractOMultir',
    'TractHispanic',
    'TractHUNV',
    'TractSNAP',
]].join(target_df)
df.head(n=5)

##### target label info:
`LILATracts_halfAnd10` flags low-income census tracts where a significant number (at least 500 people) or share (at least 33 percent) of the population lives more than ½ mile (urban areas) or more than 10 miles (rural areas) from the nearest supermarket, supercenter, or large grocery store. Using this measure, an estimated 54.4 million people, or 17.7 percent of the U.S. population, live in tracts that are both low-income and low-access, i.e. more than ½ mile (urban) or 10 miles (rural) from the nearest supermarket.

In [None]:
# Load the 500 Cities census-tract-level CSV (2018 release, per the file name).
health_stats = pd.read_csv(
    filepath_or_buffer='500_Cities__Census_Tract-level_Data__GIS_Friendly_Format___2018_release.csv',
)
health_stats.head(n=5)

In [None]:
# Index the health table by its 'TractFIPS' column so it can be joined
# against other tract-indexed tables.
health_stats = health_stats.set_index('TractFIPS')
health_stats.head(n=5)

In [None]:
# Left join on the index: every row of `health_stats` is kept, and the
# columns of `df` are appended where the index values match (NaN otherwise).
combined_data = health_stats.join(other=df, how='left')
combined_data.head(n=5)

In [None]:
# Spot-check the joined rows whose 'PlaceName' is exactly "New York".
combined_data.loc[combined_data['PlaceName'].eq("New York")]

In [None]:
# (rows, columns) of the joined table, as a sanity check on the join above.
combined_data.shape

In [None]:
# Give the label column a short, generic name for the rest of the analysis.
combined_data = combined_data.rename(columns={'LILATracts_halfAnd10': 'target'})
combined_data.head(n=5)

In [None]:
# Collect every column whose name ends in "CI"
# (presumably the confidence-interval columns — TODO confirm against the data dictionary).
to_drop = [name for name in combined_data.columns if name.endswith("CI")]

In [None]:
# Remove the columns collected in `to_drop` and preview the slimmer table.
combined_data = combined_data.drop(columns=to_drop)
combined_data.head(n=5)

#### Converting demographic and household count columns to proportions

In [None]:
# Convert raw counts into proportions of the tract total.
#
# The columns are named explicitly instead of being picked by position
# (previously `columns[-12:-3]` and `columns[-3:-1]`): positional slices
# silently select the wrong columns as soon as a column is added, removed
# or reordered upstream. The names below are the columns those slices
# resolve to with the current column layout.
#
# NOTE: this cell rescales `combined_data` in place, so running it twice
# divides twice. Re-run the cells that build `combined_data` first.

# Person counts -> share of the tract's 'Population2010'.
dem_columns = pd.Index([
    'TractKids', 'TractSeniors', 'TractWhite', 'TractBlack', 'TractAsian',
    'TractNHOPI', 'TractAIAN', 'TractOMultir', 'TractHispanic',
])
# Household counts -> share of the tract's 'OHU2010'.
household_columns = pd.Index(['TractHUNV', 'TractSNAP'])

# A zero denominator would yield +/-inf, which then poisons correlations and
# plots; treat such rows as missing instead.
population = combined_data['Population2010'].replace(0, np.nan)
housing_units = combined_data['OHU2010'].replace(0, np.nan)

for column in dem_columns:
    combined_data[column] = combined_data[column] / population
for column in household_columns:
    combined_data[column] = combined_data[column] / housing_units

In [None]:
# Preview the table again to inspect the rescaled columns.
combined_data.head()

In [None]:
# Correlation heatmap over everything after the first four columns
# (assumed to be the state/place identifier columns — TODO confirm).
#
# Non-numeric columns are filtered out explicitly: since pandas 2.0
# `DataFrame.corr()` raises on them instead of silently dropping them.
# `select_dtypes` reproduces the old "numeric and bool only" behaviour on
# every pandas version.
plt.figure(figsize=(10, 10))
plt.title('Pairwise correlation of numeric columns')
sns.heatmap(
    combined_data.iloc[:, 4:].select_dtypes(include=['number', 'bool']).corr(),
    center=0,  # centre the colour map on zero correlation
)

In [None]:
# Absolute pairwise correlations between the numeric columns.
# `combined_data` also holds string columns (e.g. 'PlaceName'); pandas >= 2.0
# raises if they reach `corr()`, so restrict to numeric/bool columns first.
# This matches what older pandas versions did implicitly.
correlations = combined_data.select_dtypes(include=['number', 'bool']).corr().abs()

In [None]:
# Keep only the strict upper triangle of the correlation matrix (k=1 skips
# the diagonal) so each pair of columns is looked at exactly once.
# The mask is built with the builtin `bool`: the `np.bool` alias was
# deprecated in NumPy 1.20 and removed in 1.24, where it raises AttributeError.
upper = correlations.where(np.triu(np.ones(correlations.shape, dtype=bool), k=1))
# Flag a column when it correlates above 0.90 (in absolute value) with any
# column that precedes it in the matrix.
potential_drop = [column for column in upper.columns if (upper[column] > .90).any()]

In [None]:
# Columns flagged as highly correlated (|r| > 0.90) with an earlier column.
potential_drop

In [None]:
# Absolute correlations among the flagged columns only, to see which of
# them are redundant with each other.
combined_data.loc[:, potential_drop].corr().abs()

In [None]:
# Candidate feature matrix: everything except the identifier columns and the label.
X = combined_data.drop(
    ['StateAbbr', 'PlaceName', 'PlaceFIPS', 'Place_TractID', 'target'],
    axis='columns',
)

In [None]:
# Scatter every candidate feature in `X` against the target on a random
# subsample of rows, one small panel per feature.

# Fixed seed so the figure is reproducible across kernel restarts; the sample
# size is capped so frames with fewer than 500 rows do not raise.
test = combined_data.sample(n=min(500, len(combined_data)), random_state=42)

# Size the grid from the number of features instead of hard-coding 9 x 5,
# which fails with a ValueError once X has more than 45 columns.
n_cols = 5
n_rows = -(-len(X.columns) // n_cols)  # ceiling division

plt.figure(figsize=(20, 20))
for i, val in enumerate(X.columns):
    plt.subplot(n_rows, n_cols, i + 1)  # subplot indices are 1-based
    plt.scatter(test[val], test.target)
    plt.title(val)

# Lay the panels out once, after all of them exist (not on every iteration).
plt.tight_layout()
plt.show()