In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt


# Create Feature Vectors for Cosine Similarity with AU3
All of these figures are rough estimates taken from US Census and American Community Survey (ACS) data.

In [3]:
# Folder holding the 2012 extracts read in this cell. The location is
# machine-specific; change it here once instead of in every read_csv call.
HEATMAP_DATA_DIR = "/Users/camilledunning/Desktop/ds3-zillow/datasets/heatmap_data"


def _read_heatmap_table(file_name):
    """Load one CSV from HEATMAP_DATA_DIR, leaving out its first data row.

    Every extract is read with its first row skipped; presumably that row is a
    second, descriptive header line -- TODO confirm against the raw files.
    """
    return pd.read_csv(HEATMAP_DATA_DIR + "/" + file_name).iloc[1:]


def _strip_area_suffix(area_name):
    """Remove the ' Metro Area' / ' Micro Area' suffix from an area name."""
    return area_name.replace(' Metro Area', '').replace(' Micro Area', '')


# Start the feature table from the education extract. rename/astype each return
# a new frame, so the column additions below write to an independent object
# rather than to a slice of the frame returned by read_csv.
final_df = (
    _read_heatmap_table("educational_attainment_2012.csv")[['NAME', 'S1501_C01_014E']]
    .rename(columns={'S1501_C01_014E': 'High School Graduation Rate'})
    .astype({'High School Graduation Rate': float})
)

# NOTE(review): every column below is attached by row index, not by area name.
# This assumes all five extracts list the same areas in the same order -- verify.
employment = _read_heatmap_table("employment_2012.csv")['S2301_C04_026E']
final_df['Unemployment Rate'] = employment.astype(float)

# Share uninsured = S2701_C02_001E / S2701_C01_001E (integer counts -> float ratio).
insurance_counts = _read_heatmap_table("health_insurance_2012.csv")[
    ['S2701_C01_001E', 'S2701_C02_001E']
].astype(int)
health_insurance = insurance_counts['S2701_C02_001E'].div(insurance_counts['S2701_C01_001E'])
final_df['Percent Uninsured for Health'] = health_insurance

income_table = _read_heatmap_table("median_income_2012.csv")
# Household counts are reused as the denominator for the food stamp share.
num_households = income_table['S1903_C01_001E'].astype(int)
median_income = income_table['S1903_C02_001E']
final_df['Median Household Income'] = median_income.astype(int)

food_stamps = (
    _read_heatmap_table("food_stamps_2012.csv")['S2201_C02_001E']
    .astype(int)
    .div(num_households)
)
final_df['Percentage of Households on Food Stamps'] = food_stamps

# Sort alphabetically and shorten the names to the bare "City, ST" form.
final_df = final_df.sort_values(by='NAME')
final_df['NAME'] = final_df['NAME'].map(_strip_area_suffix)
final_df


Unnamed: 0,NAME,High School Graduation Rate,Unemployment Rate,Percent Uninsured for Health,Median Household Income,Percentage of Households on Food Stamps
15,"Aberdeen, WA",86.6,23.7,0.173578,42057,0.216212
16,"Abilene, TX",83.5,17.5,0.187588,43407,0.136550
17,"Adrian, MI",91.0,25.8,0.111519,48224,0.140139
18,"Aguadilla-Isabela-San Sebasti?n, PR",66.0,31.8,0.059501,15339,0.454933
19,"Akron, OH",90.6,21.9,0.111187,49731,0.149729
...,...,...,...,...,...,...
10,"York-Hanover, PA",87.5,17.0,0.091445,55648,0.090488
11,"Youngstown-Warren-Boardman, OH-PA",88.7,13.6,0.118995,40686,0.179525
12,"Yuba City, CA",80.2,24.0,0.158815,45646,0.136201
13,"Yuma, AZ",71.6,20.6,0.255714,39485,0.219310


In [4]:
# Project folder holding the AU3 results and the metro lookup table. The
# location is machine-specific; change it here rather than in each read_csv.
ZILLOW_PROJECT_DIR = "/Users/camilledunning/Desktop/ds3-zillow"

# Load the AU3 scores without the last row of the file and rename the code
# column so it matches the key used in metro_data.
au3 = (
    pd.read_csv(ZILLOW_PROJECT_DIR + "/AU3_results.csv")
    .iloc[:-1]
    .rename(columns={'CBSA_Codes': 'RegionName'})
)

# Cast the region codes of every row after the first to int. This is a single
# .loc write on the frame itself: the previous chained form
# (au3['RegionName'].iloc[1:] = ...) assigns into a temporary Series and is not
# guaranteed to reach `au3` (it is a no-op under pandas copy-on-write).
# NOTE(review): the first row is deliberately left uncast, as before -- confirm
# whether it should be converted as well.
rows_after_first = au3.index[1:]
au3.loc[rows_after_first, 'RegionName'] = au3.loc[rows_after_first, 'RegionName'].astype(int)

metro_data = pd.read_csv(ZILLOW_PROJECT_DIR + "/datasets/metro_data.csv")[['RegionName', 'MetroName']]

# Attach the metro name to each AU3 row, collapse the repeated rows produced by
# the merge, then keep only the score and the name.
au3 = pd.merge(au3, metro_data, on='RegionName')
au3 = au3.drop_duplicates().drop(columns=['Unnamed: 0', 'RegionName'])
au3 = au3.rename(columns={'MetroName': 'NAME'})
au3

Unnamed: 0,AU3,NAME
0,0.795075,"Ada, OK"
261,6.300922,"Adrian, MI"
522,5.965741,"Akron, OH"
783,5.699291,"Albany, GA"
1044,5.281652,"Albany, OR"
...,...,...
166912,6.255158,"Youngstown, OH"
167173,6.139532,"Yuba City, CA"
167434,6.139876,"Yuma, AZ"
167695,4.602732,"Zanesville, OH"


In [5]:
# Join the AU3 scores onto the feature table by area name and discard every
# row that has a missing value in any column after the join.
final_df_au3 = au3.merge(final_df, on='NAME', how='right', suffixes=('', '_y')).dropna()

# Swap the first two columns so that NAME leads and AU3 follows it.
col_list = list(final_df_au3)
col_list[0], col_list[1] = col_list[1], col_list[0]

# `.ix` was removed in pandas 1.0; `.loc` performs the same label-based column
# selection. The explicit copy gives later cells an independent frame to
# assign columns into.
final_df_au3 = final_df_au3.loc[:, col_list].copy()
final_df_au3

Unnamed: 0,NAME,AU3,High School Graduation Rate,Unemployment Rate,Percent Uninsured for Health,Median Household Income,Percentage of Households on Food Stamps
0,"Adrian, MI",6.300922,91.0,25.8,0.111519,48224,0.140139
1,"Akron, OH",5.965741,90.6,21.9,0.111187,49731,0.149729
2,"Albany, GA",5.699291,79.9,19.3,0.206726,34469,0.238583
3,"Albuquerque, NM",5.356778,88.0,13.4,0.153148,46725,0.156425
4,"Alexandria, LA",4.384656,84.6,8.9,0.164420,40896,0.214571
...,...,...,...,...,...,...,...
241,"Worcester, MA",5.764700,89.9,16.5,0.041914,62505,0.130972
242,"Yakima, WA",4.417720,70.7,12.8,0.231651,43942,0.248458
243,"Yuba City, CA",6.139532,80.2,24.0,0.158815,45646,0.136201
244,"Yuma, AZ",6.139876,71.6,20.6,0.255714,39485,0.219310


## Normalise, Create 'Good/Bad' Measure

In [6]:
def normalise(col):
    """Min-max rescale `col` so its smallest value maps to 0 and its largest to 1.

    Parameters
    ----------
    col : pandas Series (or DataFrame, rescaled column by column)
        Numeric values to rescale.

    Returns
    -------
    Same type as `col`, holding ``(col - min) / (max - min)``.
    A column whose values are all identical has a zero range; it is returned as
    all 0.0 instead of the NaN that dividing 0 by 0 would produce.
    """
    lowest = col.min()
    spread = col.max() - lowest
    if np.ndim(spread) == 0:
        # Scalar range (Series input): divide by 1 when there is no spread.
        if spread == 0:
            spread = 1
    else:
        # One range per column (DataFrame input): patch only the zero ranges.
        spread = spread.where(spread != 0, 1)
    return (col - lowest) / spread

def reverse_measure(col):
    """Return the complement of `col`, i.e. one minus each value."""
    flipped = 1 - col
    return flipped

# Min-max rescale every column except the first one (NAME).
for feature in final_df_au3.columns[1:]:
    final_df_au3[feature] = normalise(final_df_au3[feature])

# Higher statistics are 'worse' for these columns, so each is replaced by its
# complement after rescaling.
reverse_measure_cols = [
    'Unemployment Rate',
    'Percent Uninsured for Health',
    'Percentage of Households on Food Stamps',
]

for worse_when_high in reverse_measure_cols:
    final_df_au3[worse_when_high] = reverse_measure(final_df_au3[worse_when_high])

final_df_au3


Unnamed: 0,NAME,AU3,High School Graduation Rate,Unemployment Rate,Percent Uninsured for Health,Median Household Income,Percentage of Households on Food Stamps
0,"Adrian, MI",0.472119,0.849693,0.430464,0.745593,0.461355,0.650064
1,"Akron, OH",0.456980,0.837423,0.516556,0.746685,0.496239,0.621239
2,"Albany, GA",0.444946,0.509202,0.573951,0.432802,0.142960,0.354174
3,"Albuquerque, NM",0.429476,0.757669,0.704194,0.608827,0.426657,0.601115
4,"Alexandria, LA",0.385569,0.653374,0.803532,0.571794,0.291729,0.426345
...,...,...,...,...,...,...,...
241,"Worcester, MA",0.447900,0.815951,0.635762,0.974275,0.791926,0.677616
242,"Yakima, WA",0.387062,0.226994,0.717439,0.350912,0.362237,0.324493
243,"Yuba City, CA",0.464829,0.518405,0.470199,0.590208,0.401681,0.661901
244,"Yuma, AZ",0.464845,0.254601,0.545254,0.271855,0.259068,0.412101


## Transpose DataFrame for Feature Creation

In [17]:
# Area names, kept under the same variable name as before; they become the
# column labels of the transposed frame.
new_header = final_df_au3['NAME']

# Use NAME as the index before transposing, so that only the numeric columns
# are flipped. Transposing with the string NAME column still in the frame
# turns every value into a generic Python object; this way the features keep
# their numeric dtype. Result: one row per feature, one column per area.
f = final_df_au3.set_index('NAME').T
f

NAME,"Adrian, MI","Akron, OH","Albany, GA","Albuquerque, NM","Alexandria, LA","Altoona, PA","Amarillo, TX","Anchorage, AK","Ann Arbor, MI","Appleton, WI",...,"Williamsport, PA","Wilmington, NC","Wilson, NC","Winston-Salem, NC","Wooster, OH","Worcester, MA","Yakima, WA","Yuba City, CA","Yuma, AZ","Zanesville, OH"
AU3,0.472119,0.45698,0.444946,0.429476,0.385569,0.187533,0.416473,0.41543,0.469004,0.370647,...,0.336301,0.45028,0.435019,0.430139,0.458806,0.4479,0.387062,0.464829,0.464845,0.395419
High School Graduation Rate,0.849693,0.837423,0.509202,0.757669,0.653374,0.834356,0.592025,0.898773,0.923313,0.898773,...,0.748466,0.794479,0.469325,0.653374,0.717791,0.815951,0.226994,0.518405,0.254601,0.720859
Unemployment Rate,0.430464,0.516556,0.573951,0.704194,0.803532,0.551876,0.907285,0.587196,0.589404,0.812362,...,0.512141,0.516556,0.589404,0.611479,0.845475,0.635762,0.717439,0.470199,0.545254,0.573951
Percent Uninsured for Health,0.745593,0.746685,0.432802,0.608827,0.571794,0.801137,0.426751,0.509792,0.857325,0.936759,...,0.838896,0.580028,0.480283,0.557622,0.660635,0.974275,0.350912,0.590208,0.271855,0.658347
Median Household Income,0.461355,0.496239,0.14296,0.426657,0.291729,0.300109,0.419921,1.0,0.64899,0.737784,...,0.403162,0.502442,0.202009,0.323164,0.45492,0.791926,0.362237,0.401681,0.259068,0.253999
Percentage of Households on Food Stamps,0.650064,0.621239,0.354174,0.601115,0.426345,0.555971,0.710947,0.759484,0.711399,0.831091,...,0.699349,0.689063,0.505477,0.632779,0.714047,0.677616,0.324493,0.661901,0.412101,0.369015


In [24]:
# Count the rows of `f` that come after its first row.
non_au3_features = len(f.index[1:])

# Take the first row of `f` and repeat each of its values
# `non_au3_features` times in a row, giving a single-column frame.
first_row_values = f.iloc[0].values
new_f = pd.DataFrame(np.repeat(first_row_values, repeats=non_au3_features, axis=0))
new_f

Unnamed: 0,0
0,0.472119
1,0.472119
2,0.472119
3,0.472119
4,0.472119
...,...
1225,0.395419
1226,0.395419
1227,0.395419
1228,0.395419
