#### Set Up

In [1]:
# Import dependencies
import pandas as pd
import numpy as np
import sklearn.neighbors

# Read in files
stations = pd.read_csv("output/stations_data_clean.csv")
housing = pd.read_csv("output/re_data_clean.csv")

# Keep relevant columns
stations = stations[['station_name', 'lat_field', 'lon_field', 'zipcode']] 
housing = housing[['address', 'borough', 'lat', 'long', 'zipcode']] 

#### Find the closest train station to each housing record

In [2]:
# Find the absolute value of each coordinate pair
def dist(lat1, long1, lat2, long2):
    return np.abs((lat1-lat2)+(long1-long2))

In [3]:
# Extract all lat values and save to variable
lat_column = housing.loc[:,'lat']
lats = lat_column.values


# Extract all long values and save to variable
long_column = housing.loc[:,'long']
longs = long_column.values

In [4]:
# Apply lambda function across each column and if 1 apply the function to the row
distances = stations.apply(
    lambda row: dist(lats, longs, row['lat_field'], row['lon_field']), 
    axis=1)

distances

0      [0.12590200000001062, 0.06620200000000409, 0.0...
1      [0.1153149999999954, 0.05561499999998887, 0.02...
2      [0.10819999999999652, 0.048499999999989996, 0....
3      [0.09921200000000852, 0.03951200000000199, 0.0...
4      [0.09012899999999746, 0.030428999999990936, 0....
                             ...                        
489    [0.41165699999999816, 0.4713570000000047, 0.50...
490    [0.4325369999999893, 0.4922369999999958, 0.527...
491    [0.44660999999999973, 0.5063100000000063, 0.54...
492    [0.4762969999999882, 0.5359969999999947, 0.570...
493    [0.4626179999999849, 0.5223179999999914, 0.557...
Length: 494, dtype: object

In [5]:
# Use idxmin to calculate the closest station name

def find_station(lat, long):
    distances = stations.apply(
        lambda row: dist(lat, long, row['lat_field'], row['lon_field']), 
        axis=1)
    return stations.loc[distances.idxmin(), 'station_name']

In [6]:
# Find the closest station name to each recorded sale
closest_station = housing.apply(
    lambda row: find_station(row['lat'], row['long']), 
    axis=1)

In [7]:
closest_station

0                      8 av
1        lexington av/59 st
2                     52 st
3                      8 av
4              queens plaza
                ...        
37223              avenue u
37224              avenue u
37225              avenue u
37226              avenue u
37227              avenue u
Length: 37228, dtype: object

In [8]:
# Append list of closest stations to the homes DataFrame
housing['nearest_station'] = closest_station
housing

Unnamed: 0,address,borough,lat,long,zipcode,nearest_station
0,"219 w 14th st, unit 2f",manhattan,40.7465,-74.0094,10011,8 av
1,"315 w 70th st, unit 6a",manhattan,40.7769,-73.9801,10023,lexington av/59 st
2,"306 w 100th st, unit 56",manhattan,40.7999,-73.9683,10025,52 st
3,"38 w 9th st, unit 1",manhattan,40.7465,-74.0094,10011,8 av
4,"48 w 86th st, unit 2",manhattan,40.7859,-73.9742,10024,queens plaza
...,...,...,...,...,...,...
37223,"2906 brighton 12th st, unit 6f",brooklyn,40.5860,-73.9419,11235,avenue u
37224,272 corbin pl,brooklyn,40.5860,-73.9419,11235,avenue u
37225,2831 brighton 4th st,brooklyn,40.5860,-73.9419,11235,avenue u
37226,2743 brighton 7th st.,brooklyn,40.5860,-73.9419,11235,avenue u


In [9]:
stations

Unnamed: 0,station_name,lat_field,lon_field,zipcode
0,astoria-ditmars blvd,40.775036,-73.912034,11101
1,astoria blvd,40.770258,-73.917843,11102
2,30 av,40.766779,-73.921479,11102
3,broadway,40.761820,-73.925508,11101
4,36 av,40.756804,-73.929575,11106
...,...,...,...,...
489,prince's bay,40.525507,-74.200064,10309
490,pleasant plains,40.522410,-74.217847,10309
491,richmond valley,40.519631,-74.229141,10307
492,tottenville,40.512764,-74.251961,10307


#### Find the distance between two lists of geographic coordinates - Use Haversine Distance

In [10]:
# Convert latitude and longitude to radians and add these columns to the dataframe using np.radians

# Add columns with radians for latitude and longitude
housing[['lat_radians_housing','long_radians_housing']] = (
    np.radians(housing.loc[:,['lat','long']])
)

stations[['lat_radians_stations','long_radians_stations']] = (
    np.radians(stations.loc[:,['lat_field','lon_field']])
)


In [11]:
# Add unique ID column
housing['uniqueid'] = np.arange(len(housing))

In [12]:
dist = sklearn.neighbors.DistanceMetric.get_metric('haversine')
dist_matrix = (dist.pairwise
    (housing[['lat_radians_housing','long_radians_housing']],
     stations[['lat_radians_stations','long_radians_stations']])*3959
)
# Note that 3959 is the radius of the earth in miles
df_dist_matrix = (
    pd.DataFrame(dist_matrix,index=housing['uniqueid'], 
                 columns=stations['station_name'])
)

In [13]:
df_dist_matrix
# Get a series containing minimum value of each row
minValuesObj = df_dist_matrix.min(axis=1)
print('minimum value in each row : ')
print(minValuesObj)

minimum value in each row : 
uniqueid
0        0.559478
1        0.213128
2        0.031641
3        0.559478
4        0.217015
           ...   
37223    0.646048
37224    0.646048
37225    0.646048
37226    0.646048
37227    0.646048
Length: 37228, dtype: float64


In [14]:
# Append to housing dataframe
housing['distance_miles'] = minValuesObj
housing

Unnamed: 0,address,borough,lat,long,zipcode,nearest_station,lat_radians_housing,long_radians_housing,uniqueid,distance_miles
0,"219 w 14th st, unit 2f",manhattan,40.7465,-74.0094,10011,8 av,0.711161,-1.291708,0,0.559478
1,"315 w 70th st, unit 6a",manhattan,40.7769,-73.9801,10023,lexington av/59 st,0.711691,-1.291196,1,0.213128
2,"306 w 100th st, unit 56",manhattan,40.7999,-73.9683,10025,52 st,0.712093,-1.290990,2,0.031641
3,"38 w 9th st, unit 1",manhattan,40.7465,-74.0094,10011,8 av,0.711161,-1.291708,3,0.559478
4,"48 w 86th st, unit 2",manhattan,40.7859,-73.9742,10024,queens plaza,0.711848,-1.291093,4,0.217015
...,...,...,...,...,...,...,...,...,...,...
37223,"2906 brighton 12th st, unit 6f",brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37223,0.646048
37224,272 corbin pl,brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37224,0.646048
37225,2831 brighton 4th st,brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37225,0.646048
37226,2743 brighton 7th st.,brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37226,0.646048


In [15]:
housing.dtypes

address                  object
borough                  object
lat                     float64
long                    float64
zipcode                   int64
nearest_station          object
lat_radians_housing     float64
long_radians_housing    float64
uniqueid                  int64
distance_miles          float64
dtype: object

In [16]:
# Use np.where to create Bool column --> True denotes less than 1 mile from train (lat/long in housing is zipcode based)
housing['under_1_mile'] = np.where(housing['distance_miles'] <= 1, True, False)
housing.head()

Unnamed: 0,address,borough,lat,long,zipcode,nearest_station,lat_radians_housing,long_radians_housing,uniqueid,distance_miles,under_1_mile
0,"219 w 14th st, unit 2f",manhattan,40.7465,-74.0094,10011,8 av,0.711161,-1.291708,0,0.559478,True
1,"315 w 70th st, unit 6a",manhattan,40.7769,-73.9801,10023,lexington av/59 st,0.711691,-1.291196,1,0.213128,True
2,"306 w 100th st, unit 56",manhattan,40.7999,-73.9683,10025,52 st,0.712093,-1.29099,2,0.031641,True
3,"38 w 9th st, unit 1",manhattan,40.7465,-74.0094,10011,8 av,0.711161,-1.291708,3,0.559478,True
4,"48 w 86th st, unit 2",manhattan,40.7859,-73.9742,10024,queens plaza,0.711848,-1.291093,4,0.217015,True


In [17]:
# Generate binary values using get_dummies for crime category
dum_df = pd.get_dummies(housing, columns=["under_1_mile"], prefix=["under_1_mile_type_is"] )

# Merge with main df
housing_df = housing.merge(dum_df)
housing_df['under_1_mile'].value_counts()

True     25583
False    11645
Name: under_1_mile, dtype: int64

In [18]:
housing_df

Unnamed: 0,address,borough,lat,long,zipcode,nearest_station,lat_radians_housing,long_radians_housing,uniqueid,distance_miles,under_1_mile,under_1_mile_type_is_False,under_1_mile_type_is_True
0,"219 w 14th st, unit 2f",manhattan,40.7465,-74.0094,10011,8 av,0.711161,-1.291708,0,0.559478,True,0,1
1,"315 w 70th st, unit 6a",manhattan,40.7769,-73.9801,10023,lexington av/59 st,0.711691,-1.291196,1,0.213128,True,0,1
2,"306 w 100th st, unit 56",manhattan,40.7999,-73.9683,10025,52 st,0.712093,-1.290990,2,0.031641,True,0,1
3,"38 w 9th st, unit 1",manhattan,40.7465,-74.0094,10011,8 av,0.711161,-1.291708,3,0.559478,True,0,1
4,"48 w 86th st, unit 2",manhattan,40.7859,-73.9742,10024,queens plaza,0.711848,-1.291093,4,0.217015,True,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37223,"2906 brighton 12th st, unit 6f",brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37223,0.646048,True,0,1
37224,272 corbin pl,brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37224,0.646048,True,0,1
37225,2831 brighton 4th st,brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37225,0.646048,True,0,1
37226,2743 brighton 7th st.,brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37226,0.646048,True,0,1


In [19]:
housing_df.rename(columns={'under_1_mile_type_is_False': 'walkable_false', 'under_1_mile_type_is_True': 'walkable_true'}, inplace=True)


In [20]:
# give index a name 
housing_df.index.name = 'index'

In [21]:
housing_df

Unnamed: 0_level_0,address,borough,lat,long,zipcode,nearest_station,lat_radians_housing,long_radians_housing,uniqueid,distance_miles,under_1_mile,walkable_false,walkable_true
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,"219 w 14th st, unit 2f",manhattan,40.7465,-74.0094,10011,8 av,0.711161,-1.291708,0,0.559478,True,0,1
1,"315 w 70th st, unit 6a",manhattan,40.7769,-73.9801,10023,lexington av/59 st,0.711691,-1.291196,1,0.213128,True,0,1
2,"306 w 100th st, unit 56",manhattan,40.7999,-73.9683,10025,52 st,0.712093,-1.290990,2,0.031641,True,0,1
3,"38 w 9th st, unit 1",manhattan,40.7465,-74.0094,10011,8 av,0.711161,-1.291708,3,0.559478,True,0,1
4,"48 w 86th st, unit 2",manhattan,40.7859,-73.9742,10024,queens plaza,0.711848,-1.291093,4,0.217015,True,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
37223,"2906 brighton 12th st, unit 6f",brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37223,0.646048,True,0,1
37224,272 corbin pl,brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37224,0.646048,True,0,1
37225,2831 brighton 4th st,brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37225,0.646048,True,0,1
37226,2743 brighton 7th st.,brooklyn,40.5860,-73.9419,11235,avenue u,0.708359,-1.290530,37226,0.646048,True,0,1


In [22]:
housing_df.to_csv("output/walkscore.csv", index=False)