This notebook is used to prepare amenity data (grocery stores) into distance data that can be used for clustering and visualization.



---

### Imports

In [1]:
import os
import json
import pickle
import pandas as pd
import numpy as np
import scipy.spatial as spatial
import geopandas as gp
from shapely.geometry import Point, Polygon
import gzip

### Define working directory

In [2]:
ROOT = '/media/school/project/amenities'

### Read Gazetteer table

GEOIDS and lat/long for tract centers are pulled from this table.

In [3]:
gaz = pd.read_pickle(os.path.join(ROOT, '2018_5yr_cendatagov_GAZ_v3.pkl'))
gaz.GEOID = gaz.GEOID.astype(int)
gaz.columns = [x.strip() for x in gaz.columns]
gaz.head(1)

Unnamed: 0,USPS,GEOID,ALAND,AWATER,ALAND_SQMI,AWATER_SQMI,INTPTLAT,INTPTLONG
0,AL,1001020100,9817813,28435,3.791,0.011,32.481959,-86.491338


### Read Geopandas dataframe

This is not really used for processing but gives some evidence of sanity-checks.

In [4]:
with gzip.GzipFile(os.path.join(ROOT, 'all_census_tract_shapes.json.gz'), 'r') as f:
    TRACT_ALL = json.loads(f.read().decode('utf-8'))
gpdf = gp.GeoDataFrame.from_features(TRACT_ALL['features'])
gpdf.GEOID = gpdf.GEOID.astype(int)
gpdf.head(1)

Unnamed: 0,geometry,STATEFP,COUNTYFP,TRACTCE,AFFGEOID,GEOID,NAME,LSAD,ALAND,AWATER
0,"POLYGON ((-93.16468 30.21663, -93.16392 30.216...",22,19,980000,1400000US22019980000,22019980000,9800,CT,5398742,2339


### Read Grocery Store data.

In [5]:
gro = pd.read_csv(os.path.join(ROOT, 'grocery-stores.csv'))
gro.head(1)

Unnamed: 0,zip,city,state,name,address,address_combo,postalCode,lat,lat_cleaned,lon,lon_cleaned
0,60415,Chicago Ridge,IL,ALDI,:arlem & Southwest Hwy,":arlem & Southwest Hwy,60415",60415,41.700313,41.70031,-87.797054,-87.79705


In [6]:
gro.shape

(53518, 11)

### Prepare the first fiew fields of the `result` table.

This will be used to aggregate results throughout the code.

In [7]:
# result with start off with the gaz GEOID and lat/long columns
result = gaz[['GEOID', 'INTPTLAT', 'INTPTLONG']]
result.head()

Unnamed: 0,GEOID,INTPTLAT,INTPTLONG
0,1001020100,32.481959,-86.491338
1,1001020200,32.475758,-86.472468
2,1001020300,32.474024,-86.459703
3,1001020400,32.47103,-86.444835
4,1001020500,32.458922,-86.421826


The following is a test to see if all geoids in gaz have corresponding geometries in gpdf.

219 in gaz are not in gpdf. These are tracts that correspond to waterways (bays, lakes, etc). A few were spot checked, for the rest, it was confirmed that ALAND was 0 in 218 records. The last record has ALAND > 0, but corresponds to tampa bay.

In [8]:
if not os.path.exists(os.path.join(ROOT, 'gaz_to_gpdf_geoid_mapping.pickle')):
    gaz_to_gpdf_geoid_mapping = {}
    for i, row in result.iterrows():
        print('\r{} '.format(i+1), end='')
        geoid = row.GEOID
        lat = float(row.INTPTLAT)
        lon = float(row.INTPTLONG)
        pt = Point(lon, lat) # order here is important!!

        # set default mapping when point not found within geometry
        gaz_to_gpdf_geoid_mapping[geoid] = None

        # if there is a matching geoid, set that
        gpdf_geoid_match = gpdf[gpdf.GEOID.isin([geoid])]
        if gpdf_geoid_match.shape[0] == 1:
            gaz_to_gpdf_geoid_mapping[geoid] = gpdf_geoid_match.GEOID.values[0]
            print('FOUND')
        # otherwise, find if it falls in a geometry
        else:
            for j, row_gpdf in gpdf.iterrows():          
                poly = row_gpdf.geometry
                found_flag = None
                if pt.within(poly):
                    print('FOUND, extra processing')
                    found_flag = True
                    gaz_to_gpdf_geoid_mapping[geoid] = row_gpdf.GEOID
                    break
            if not found_flag:
                print('===NOT FOUND===')
    with open(os.path.join(ROOT, 'gaz_to_gpdf_geoid_mapping.pickle'), 'wb') as f:
        pickle.dump(gaz_to_gpdf_geoid_mapping, f, protocol=4)
else:
    with open(os.path.join(ROOT, 'gaz_to_gpdf_geoid_mapping.pickle'), 'rb') as handle:
        gaz_to_gpdf_geoid_mapping = pickle.load(handle)    

In [9]:
len([x for x,y in gaz_to_gpdf_geoid_mapping.items() if y is not None])

72837

In [10]:
len([x for x,y in gaz_to_gpdf_geoid_mapping.items() if y is None])

219

In [11]:
gaz_geo_set = set(gaz.GEOID)
gpdf_geo_set = set(gpdf.GEOID)
print('Number of gaz geoids in gpdf: {}'.format(
    len(gaz_geo_set.intersection(gpdf_geo_set))))
print('Number of gaz geoids NOT in gpdf: {}'.format(
    gaz.shape[0] - len(gaz_geo_set.intersection(gpdf_geo_set))))

Number of gaz geoids in gpdf: 72837
Number of gaz geoids NOT in gpdf: 219


Given that these are equivalent, we can say it is likely that there are no geometries for the bays, lakes, etc that fall in the 219.

### For each grocery store, identify the census tract geoid in which it falls.

In [12]:
if not os.path.exists(os.path.join(ROOT, 'grocery_stage_1.pkl')):
    geoid_matches = []

    # create a dict of subsets of a merged dataframe for faster processing below
    gaz_gpdf = gaz.merge(gpdf, how='left', on='GEOID')
    gaz_gpdf_subset_dict = {}
    for state in gaz_gpdf.USPS.unique():
        gaz_gpdf_subset_dict[state] = gaz_gpdf[gaz_gpdf.USPS==state]

    for i, row in gro.iterrows():
        print('\r{} '.format(i+1, end=''))
        pt = Point(row.lon_cleaned, row.lat_cleaned)
        state = row.state

        # subset to state to reduce search space
        gaz_gpdf_subset = gaz_gpdf_subset_dict[state]
        geoid_match = None # set default as None for no match
        # iterate through geometries
        for j, row_j in gaz_gpdf_subset.iterrows():
            # update geoid_match if found
            poly = row_j.geometry
            if not poly:
                continue
            if pt.within(poly):
                print('FOUND')
                geoid_match = row_j.GEOID
                break

        # append geoid_match
        geoid_matches.append(geoid_match)

        if not geoid_match:
            print('===NOT FOUND===')
    gro['GEOID_MATCH'] = geoid_matches
    gro.to_pickle(os.path.join(ROOT, 'grocery_stage_1.pkl'), protocol=4)
else:
    gro = pd.read_pickle(os.path.join(ROOT, 'grocery_stage_1.pkl'))

In [13]:
gro.head()

Unnamed: 0,zip,city,state,name,address,address_combo,postalCode,lat,lat_cleaned,lon,lon_cleaned,GEOID_MATCH
0,60415,Chicago Ridge,IL,ALDI,:arlem & Southwest Hwy,":arlem & Southwest Hwy,60415",60415,41.700313,41.70031,-87.797054,-87.79705,17031820000.0
1,40219,Louisville,KY,B & E Salvage Store,.,".,40219",40219,38.128021,38.12802,-85.679443,-85.67944,21111010000.0
2,79106,Amarillo,TX,Butch,.,".,79109",79109,35.190903,35.1909,-101.845847,-101.84585,48375010000.0
3,60901,Kankakee,IL,Jewl,...,"...,60914",60914,41.186157,41.18616,-87.893356,-87.89336,17091010000.0
4,54017,New Richmond,WI,Indomaret,@ Fresh market,"@ Fresh market,54017",54017,45.119296,45.1193,-92.537666,-92.53767,55109120000.0


In [14]:
within_geoid_counts = gro.groupby('GEOID_MATCH')['GEOID_MATCH'].count()
print(within_geoid_counts.min(), within_geoid_counts.max())
within_geoid_counts.name = 'N_GROCERY_WITHIN_TRACT'
within_geoid_counts = within_geoid_counts.reset_index()
within_geoid_counts

1 17


Unnamed: 0,GEOID_MATCH,N_GROCERY_WITHIN_TRACT
0,1.001020e+09,1
1,1.001020e+09,3
2,1.001021e+09,1
3,1.001021e+09,1
4,1.001021e+09,2
...,...,...
32702,5.603997e+10,2
32703,5.604198e+10,1
32704,5.604198e+10,1
32705,5.604198e+10,2


Append the results and perform check.

In [15]:
result = result.merge(within_geoid_counts,
                      how='left',
                      left_on='GEOID', right_on='GEOID_MATCH'
                     )
result

Unnamed: 0,GEOID,INTPTLAT,INTPTLONG,GEOID_MATCH,N_GROCERY_WITHIN_TRACT
0,1001020100,32.481959,-86.491338,,
1,1001020200,32.475758,-86.472468,,
2,1001020300,32.474024,-86.459703,1.001020e+09,1.0
3,1001020400,32.471030,-86.444835,,
4,1001020500,32.458922,-86.421826,1.001020e+09,3.0
...,...,...,...,...,...
73051,56043000200,43.878830,-107.669052,,
73052,56043000301,44.014369,-107.956379,,
73053,56043000302,44.028771,-107.950748,,
73054,56045951100,43.846213,-104.570020,,


In [16]:
result.GEOID_MATCH.nunique() == within_geoid_counts.GEOID_MATCH.nunique()

True

Fill missing data with 0 (i.e. no stores found within that tract).

Drop the GEOID_MATCH field. No longer needed.

In [17]:
result.N_GROCERY_WITHIN_TRACT = result.N_GROCERY_WITHIN_TRACT.fillna(0)
result = result.drop(columns=['GEOID_MATCH'])
result

Unnamed: 0,GEOID,INTPTLAT,INTPTLONG,N_GROCERY_WITHIN_TRACT
0,1001020100,32.481959,-86.491338,0.0
1,1001020200,32.475758,-86.472468,0.0
2,1001020300,32.474024,-86.459703,1.0
3,1001020400,32.471030,-86.444835,0.0
4,1001020500,32.458922,-86.421826,3.0
...,...,...,...,...
73051,56043000200,43.878830,-107.669052,0.0
73052,56043000301,44.014369,-107.956379,0.0
73053,56043000302,44.028771,-107.950748,0.0
73054,56045951100,43.846213,-104.570020,0.0


The following blocks of commented code were a first attempt to calculate stores that fell within a certain radius of a census tract center. This would have taken far too long to compute (~5 days).

In [18]:
### Now, compute the stores within radius of neighborhood centers
# N_GROCERY_WT_2_MI, N_GROCERY_WT_5_MI, N_GROCERY_WT_30_MI 


In [19]:
# from sklearn.metrics.pairwise import haversine_distances
# from math import radians

# # example
# bsas = [-34.83333, -58.5166646]
# paris = [49.0083899664, 2.53844117956]
# bsas_in_radians = [radians(_) for _ in bsas]
# paris_in_radians = [radians(_) for _ in paris]
# r = haversine_distances([bsas_in_radians, paris_in_radians])
# r = (r * 6371000/1000) # multiply by Earth radius to get kilometers
# r = (r * 0.621371)# km to mile
# r

In [20]:
# # Since doing many pairwise calcs, prep all items before the distance calcs are performed
# # will parallelize the distance calcs

# tract_prep = [] # store (i, prepared_location)
# # iterate through each tract
# for i, row in result.iterrows():
#     tract_lat = row.INTPTLAT
#     tract_lon = row.INTPTLONG
#     tract_in_rads = [radians(x) for x in [tract_lat, tract_lon]]
#     tract_prep.append((i, tract_in_rads))

# amen_prep = []
# for j, rowj in gro.iterrows():
#     am_lat = rowj.lat_cleaned
#     am_lon = rowj.lon_cleaned
#     am_in_rads = [radians(x) for x in [am_lat, am_lon]]
#     amen_prep.append((j, am_in_rads))
    
# print(len(tract_prep), len(amen_prep))

In [21]:
# iterator = ((x,y) for x in tract_prep for y in amen_prep) # build iterator
# distances = []
# count = 1.0
# total = len(tract_prep) * len(amen_prep)

# for a, b in iterator:
#     print('{}%'.format(round(count/total*100., 2), end=''))
#     i = a[0]
#     j = b[0]
#     r = haversine_distances([a[1], b[1]])[0][1] * 3958.754641
#     count += 1

### Calculate which stores (and number of stores) fall within some distance of the tract centers.

Create a transformer to project coordinates from latitude/longitude to a 2D plane of the USA.

In [22]:
from pyproj import Transformer
transformer = Transformer.from_crs("epsg:4326", "epsg:2163") # lat/lon to us

Project all of the tract locations. Store for use below.

In [23]:
# transform coordinate system
tract_prep = [] # store (i, prepared_location)
# iterate through each tract
for i, row in result.iterrows():
    print('\r{} of {}'.format(i+1, result.shape[0]), end='')
    tract_lat = row.INTPTLAT
    tract_lon = row.INTPTLONG
    x, y = transformer.transform(tract_lat, tract_lon)
    point = [x, y]
    tract_prep.append((i, point))
print('tracts complete\n')

1 of 730562 of 730563 of 730564 of 730565 of 730566 of 730567 of 730568 of 730569 of 7305610 of 7305611 of 7305612 of 7305613 of 7305614 of 7305615 of 7305616 of 7305617 of 7305618 of 7305619 of 7305620 of 7305621 of 7305622 of 7305623 of 7305624 of 7305625 of 7305626 of 7305627 of 7305628 of 7305629 of 7305630 of 7305631 of 7305632 of 7305633 of 7305634 of 7305635 of 7305636 of 7305637 of 7305638 of 7305639 of 7305640 of 7305641 of 7305642 of 7305643 of 7305644 of 7305645 of 7305646 of 7305647 of 7305648 of 7305649 of 7305650 of 7305651 of 7305652 of 7305653 of 7305654 of 7305655 of 7305656 of 7305657 of 7305658 of 7305659 of 7305660 of 7305661 of 7305662 of 7305663 of 7305664 of 7305665 of 7305666 of 7305667 of 7305668 of 7305669 of 7305670 of 7305671 of 7305672 of 7305673 of 7305674 of 7305675 of 7305676 of 7305677 of 7305678 of 7305679 of 7305680 of 7305681 of 7305682 of 7305683 of 7305684 of 73056

695 of 73056696 of 73056697 of 73056698 of 73056699 of 73056700 of 73056701 of 73056702 of 73056703 of 73056704 of 73056705 of 73056706 of 73056707 of 73056708 of 73056709 of 73056710 of 73056711 of 73056712 of 73056713 of 73056714 of 73056715 of 73056716 of 73056717 of 73056718 of 73056719 of 73056720 of 73056721 of 73056722 of 73056723 of 73056724 of 73056725 of 73056726 of 73056727 of 73056728 of 73056729 of 73056730 of 73056731 of 73056732 of 73056733 of 73056734 of 73056735 of 73056736 of 73056737 of 73056738 of 73056739 of 73056740 of 73056741 of 73056742 of 73056743 of 73056744 of 73056745 of 73056746 of 73056747 of 73056748 of 73056749 of 73056750 of 73056751 of 73056752 of 73056753 of 73056754 of 73056755 of 73056756 of 73056757 of 73056758 of 73056759 of 73056760 of 73056761 of 73056762 of 73056763 of 73056764 of 73056765 of 73056766 of 73056767 of 73056768 of 73056769 of 73056770 of 73056771 of 7305

1451 of 730561452 of 730561453 of 730561454 of 730561455 of 730561456 of 730561457 of 730561458 of 730561459 of 730561460 of 730561461 of 730561462 of 730561463 of 730561464 of 730561465 of 730561466 of 730561467 of 730561468 of 730561469 of 730561470 of 730561471 of 730561472 of 730561473 of 730561474 of 730561475 of 730561476 of 730561477 of 730561478 of 730561479 of 730561480 of 730561481 of 730561482 of 730561483 of 730561484 of 730561485 of 730561486 of 730561487 of 730561488 of 730561489 of 730561490 of 730561491 of 730561492 of 730561493 of 730561494 of 730561495 of 730561496 of 730561497 of 730561498 of 730561499 of 730561500 of 730561501 of 730561502 of 730561503 of 730561504 of 730561505 of 730561506 of 730561507 of 730561508 of 730561509 of 730561510 of 730561511 of 730561512 of 730561513 of 730561514 of 730561515 of 730561516 of 730561517 of 730561518 of 730561519 of 730561520 of 730561521 of 730561522 

2128 of 730562129 of 730562130 of 730562131 of 730562132 of 730562133 of 730562134 of 730562135 of 730562136 of 730562137 of 730562138 of 730562139 of 730562140 of 730562141 of 730562142 of 730562143 of 730562144 of 730562145 of 730562146 of 730562147 of 730562148 of 730562149 of 730562150 of 730562151 of 730562152 of 730562153 of 730562154 of 730562155 of 730562156 of 730562157 of 730562158 of 730562159 of 730562160 of 730562161 of 730562162 of 730562163 of 730562164 of 730562165 of 730562166 of 730562167 of 730562168 of 730562169 of 730562170 of 730562171 of 730562172 of 730562173 of 730562174 of 730562175 of 730562176 of 730562177 of 730562178 of 730562179 of 730562180 of 730562181 of 730562182 of 730562183 of 730562184 of 730562185 of 730562186 of 730562187 of 730562188 of 730562189 of 730562190 of 730562191 of 730562192 of 730562193 of 730562194 of 730562195 of 730562196 of 730562197 of 730562198 of 730562199 

2795 of 730562796 of 730562797 of 730562798 of 730562799 of 730562800 of 730562801 of 730562802 of 730562803 of 730562804 of 730562805 of 730562806 of 730562807 of 730562808 of 730562809 of 730562810 of 730562811 of 730562812 of 730562813 of 730562814 of 730562815 of 730562816 of 730562817 of 730562818 of 730562819 of 730562820 of 730562821 of 730562822 of 730562823 of 730562824 of 730562825 of 730562826 of 730562827 of 730562828 of 730562829 of 730562830 of 730562831 of 730562832 of 730562833 of 730562834 of 730562835 of 730562836 of 730562837 of 730562838 of 730562839 of 730562840 of 730562841 of 730562842 of 730562843 of 730562844 of 730562845 of 730562846 of 730562847 of 730562848 of 730562849 of 730562850 of 730562851 of 730562852 of 730562853 of 730562854 of 730562855 of 730562856 of 730562857 of 730562858 of 730562859 of 730562860 of 730562861 of 730562862 of 730562863 of 730562864 of 730562865 of 730562866 

3466 of 730563467 of 730563468 of 730563469 of 730563470 of 730563471 of 730563472 of 730563473 of 730563474 of 730563475 of 730563476 of 730563477 of 730563478 of 730563479 of 730563480 of 730563481 of 730563482 of 730563483 of 730563484 of 730563485 of 730563486 of 730563487 of 730563488 of 730563489 of 730563490 of 730563491 of 730563492 of 730563493 of 730563494 of 730563495 of 730563496 of 730563497 of 730563498 of 730563499 of 730563500 of 730563501 of 730563502 of 730563503 of 730563504 of 730563505 of 730563506 of 730563507 of 730563508 of 730563509 of 730563510 of 730563511 of 730563512 of 730563513 of 730563514 of 730563515 of 730563516 of 730563517 of 730563518 of 730563519 of 730563520 of 730563521 of 730563522 of 730563523 of 730563524 of 730563525 of 730563526 of 730563527 of 730563528 of 730563529 of 730563530 of 730563531 of 730563532 of 730563533 of 730563534 of 730563535 of 730563536 of 730563537 

73056 of 73056tracts complete



Project all of the grocery store locations for use below.

In [24]:
amen_prep = []
for j, rowj in gro.iterrows():
    print('\r{} of {}'.format(j+1, gro.shape[0]), end='')
    am_lat = rowj.lat_cleaned
    am_lon = rowj.lon_cleaned
    x, y = transformer.transform(am_lat, am_lon)
    point = [x, y]#Point(x, y)
    amen_prep.append((j, point))

print(len(tract_prep), len(amen_prep))

53518 of 5351873056 53518


Build a kd-tree of the projected amenities points.

In [25]:
points = np.array([x[1] for x in amen_prep])
print(points)
point_tree = spatial.cKDTree(points)
print(point_tree)

[[ 1008903.39597079  -291698.51000652]
 [ 1247265.06037619  -656209.377317  ]
 [ -168337.02677753 -1087554.82271786]
 ...
 [ 1686110.01810216  -300867.78168736]
 [  543273.62098321  -671832.77348739]
 [-1913137.92091447  -594763.41046583]]
<scipy.spatial.ckdtree.cKDTree object at 0x7fb264fe0f90>


Iterate through the projected tract coordinates. Query the kdtree with these coordinates and a distance. The units produced by the projection are in meters. We specify the number of miles to the miles_to_meters function to convert it to meters.

This code stores a list of grocery store indexes that fall within the radius of the census tract center. These lists are later saved to the final 'full' table.

In [26]:
def miles_to_meters(miles):
    meters = miles * 1609.34
    return meters

def meters_to_miles(meters):
    miles = meters / 1609.34
    return miles

from collections import defaultdict

# for each tract, get a list of amenity indexes that fall within x miles 
distance_results = defaultdict(list)
for result_idx, locationxy in tract_prep:
    print('\r {} of {}'.format(result_idx+1, result.shape[0]), end='')
    
    for dist in [2,5,10,25,50]: #miles
        point_list = point_tree.query_ball_point(locationxy, miles_to_meters(dist), p=np.inf)
        n_points = len(point_list)
        distance_results['dist{}'.format(dist)].append(point_list)

 73056 of 73056

Conver the dictionary to a dataframe.

The number in these column names specifies the number of miles that were searched (radius) from the tract centers.

In [27]:
distance_results_df = pd.DataFrame(distance_results)
distance_results_df

Unnamed: 0,dist2,dist5,dist10,dist25,dist50
0,[],"[39075, 46917, 39758, 49391, 13067, 13782]","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 17626, 7434, 32793, 9946, 6963, 9351, ..."
1,"[39075, 39758]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 42647, 17626, 7434, 32793, 9946, 6963,..."
2,"[39075, 39758, 49391]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 42647, 17626, 7434, 32793, 9946, 6963,..."
3,"[39075, 39758, 49391, 13067, 13782]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 42647, 17626, 7434, 32793, 9946, 6963,..."
4,"[39075, 39758, 49391, 13067, 13782, 19291]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[31676, 39075, 46337, 49392, 46917, 39758, 493...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 17626, 7434, 32793, 9946, 6963, 9351, ..."
...,...,...,...,...,...
73051,[],[],[],[],"[17848, 51693, 51691, 51692, 23670]"
73052,[],[],[],"[17848, 51693]","[17848, 51693, 51691, 51692]"
73053,[],[],[],[51691],"[17848, 51693, 51691, 51692]"
73054,[],[],[],"[37046, 39370, 37674]","[30666, 32811, 41611, 37046, 39370, 37674, 204..."


In [28]:
distance_results_df['dist2']

0                                                []
1                                    [39075, 39758]
2                             [39075, 39758, 49391]
3               [39075, 39758, 49391, 13067, 13782]
4        [39075, 39758, 49391, 13067, 13782, 19291]
                            ...                    
73051                                            []
73052                                            []
73053                                            []
73054                                            []
73055                         [37046, 39370, 37674]
Name: dist2, Length: 73056, dtype: object

Create weighted number for amenities within x distance

Weights go as 1 / distance (radius distance, in miles)

In [29]:
for col in distance_results_df.columns:
    weight = 1. / int(col[4:])
    new_col = str('wt_n_stores_dist_{}'.format(col[4:])).upper()
    distance_results_df[new_col] =\
        distance_results_df[col].apply(lambda x: len(x)) * weight
distance_results_df

Unnamed: 0,dist2,dist5,dist10,dist25,dist50,WT_N_STORES_DIST_2,WT_N_STORES_DIST_5,WT_N_STORES_DIST_10,WT_N_STORES_DIST_25,WT_N_STORES_DIST_50
0,[],"[39075, 46917, 39758, 49391, 13067, 13782]","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 17626, 7434, 32793, 9946, 6963, 9351, ...",0.0,1.2,1.4,2.40,2.40
1,"[39075, 39758]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 42647, 17626, 7434, 32793, 9946, 6963,...",1.0,1.8,1.5,2.40,2.44
2,"[39075, 39758, 49391]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 42647, 17626, 7434, 32793, 9946, 6963,...",1.5,2.0,1.5,2.40,2.44
3,"[39075, 39758, 49391, 13067, 13782]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 42647, 17626, 7434, 32793, 9946, 6963,...",2.5,2.0,1.6,2.40,2.44
4,"[39075, 39758, 49391, 13067, 13782, 19291]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[31676, 39075, 46337, 49392, 46917, 39758, 493...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 17626, 7434, 32793, 9946, 6963, 9351, ...",3.0,2.0,2.3,2.40,2.40
...,...,...,...,...,...,...,...,...,...,...
73051,[],[],[],[],"[17848, 51693, 51691, 51692, 23670]",0.0,0.0,0.0,0.00,0.10
73052,[],[],[],"[17848, 51693]","[17848, 51693, 51691, 51692]",0.0,0.0,0.0,0.08,0.08
73053,[],[],[],[51691],"[17848, 51693, 51691, 51692]",0.0,0.0,0.0,0.04,0.08
73054,[],[],[],"[37046, 39370, 37674]","[30666, 32811, 41611, 37046, 39370, 37674, 204...",0.0,0.0,0.0,0.12,0.32


Append the newly calc'd info to the `result` table

In [30]:
result = pd.concat([result, distance_results_df], axis=1)
result

Unnamed: 0,GEOID,INTPTLAT,INTPTLONG,N_GROCERY_WITHIN_TRACT,dist2,dist5,dist10,dist25,dist50,WT_N_STORES_DIST_2,WT_N_STORES_DIST_5,WT_N_STORES_DIST_10,WT_N_STORES_DIST_25,WT_N_STORES_DIST_50
0,1001020100,32.481959,-86.491338,0.0,[],"[39075, 46917, 39758, 49391, 13067, 13782]","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 17626, 7434, 32793, 9946, 6963, 9351, ...",0.0,1.2,1.4,2.40,2.40
1,1001020200,32.475758,-86.472468,0.0,"[39075, 39758]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 42647, 17626, 7434, 32793, 9946, 6963,...",1.0,1.8,1.5,2.40,2.44
2,1001020300,32.474024,-86.459703,1.0,"[39075, 39758, 49391]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 42647, 17626, 7434, 32793, 9946, 6963,...",1.5,2.0,1.5,2.40,2.44
3,1001020400,32.471030,-86.444835,0.0,"[39075, 39758, 49391, 13067, 13782]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[39075, 46337, 49392, 46917, 39758, 49391, 130...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 42647, 17626, 7434, 32793, 9946, 6963,...",2.5,2.0,1.6,2.40,2.44
4,1001020500,32.458922,-86.421826,3.0,"[39075, 39758, 49391, 13067, 13782, 19291]","[39075, 39758, 49391, 13067, 13782, 19291, 886...","[31676, 39075, 46337, 49392, 46917, 39758, 493...","[49388, 31676, 22192, 35804, 49400, 21223, 216...","[49388, 17626, 7434, 32793, 9946, 6963, 9351, ...",3.0,2.0,2.3,2.40,2.40
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
73051,56043000200,43.878830,-107.669052,0.0,[],[],[],[],"[17848, 51693, 51691, 51692, 23670]",0.0,0.0,0.0,0.00,0.10
73052,56043000301,44.014369,-107.956379,0.0,[],[],[],"[17848, 51693]","[17848, 51693, 51691, 51692]",0.0,0.0,0.0,0.08,0.08
73053,56043000302,44.028771,-107.950748,0.0,[],[],[],[51691],"[17848, 51693, 51691, 51692]",0.0,0.0,0.0,0.04,0.08
73054,56045951100,43.846213,-104.570020,0.0,[],[],[],"[37046, 39370, 37674]","[30666, 32811, 41611, 37046, 39370, 37674, 204...",0.0,0.0,0.0,0.12,0.32


Output this full table to a file. This is rather large. It is needed for the visualization but not for clustering.

In [31]:
result.to_pickle(os.path.join(ROOT, 'amenities_grocery_full_v1.pkl'), protocol=4)

Output a much smaller version (per disk storage) of the table without the list columns. This would be ideal to use for clustering.

In [32]:
result[[x for x in result.columns if x.isupper()]].to_pickle(
    os.path.join(ROOT, 'amenities_grocery_lite_v1.pkl'), protocol=4)

In [33]:
# show lite table
result[[x for x in result.columns if x.isupper()]]

Unnamed: 0,GEOID,INTPTLAT,INTPTLONG,N_GROCERY_WITHIN_TRACT,WT_N_STORES_DIST_2,WT_N_STORES_DIST_5,WT_N_STORES_DIST_10,WT_N_STORES_DIST_25,WT_N_STORES_DIST_50
0,1001020100,32.481959,-86.491338,0.0,0.0,1.2,1.4,2.40,2.40
1,1001020200,32.475758,-86.472468,0.0,1.0,1.8,1.5,2.40,2.44
2,1001020300,32.474024,-86.459703,1.0,1.5,2.0,1.5,2.40,2.44
3,1001020400,32.471030,-86.444835,0.0,2.5,2.0,1.6,2.40,2.44
4,1001020500,32.458922,-86.421826,3.0,3.0,2.0,2.3,2.40,2.40
...,...,...,...,...,...,...,...,...,...
73051,56043000200,43.878830,-107.669052,0.0,0.0,0.0,0.0,0.00,0.10
73052,56043000301,44.014369,-107.956379,0.0,0.0,0.0,0.0,0.08,0.08
73053,56043000302,44.028771,-107.950748,0.0,0.0,0.0,0.0,0.04,0.08
73054,56045951100,43.846213,-104.570020,0.0,0.0,0.0,0.0,0.12,0.32
