In [28]:
import geopy.distance
import pandas as pd
import numpy as np
import os
import csv
import shapefile
import matplotlib.pyplot as plt

## 1 - Creating the new dataframe

In [29]:
csv_file_path = 'Downloads/distance_matrix_by_tracts_CA.csv'
df = pd.read_csv(csv_file_path)

In [30]:
df.head()

Unnamed: 0.1,Unnamed: 0,origin,destination,distance
0,1,6071010700,4001944201,295.615635
1,2,6071940100,4001944201,294.930777
2,3,6071010700,4001944300,295.811483
3,4,6071940100,4001944300,294.872095
4,5,6071940100,4001944901,296.485464


In [31]:
len(df)

45414791

In [32]:
# First check - make sure there are no internal trips (i.e. tract in CA to tract in CA)
# Using 6071010700 and 6071940100

In [33]:
val1 = 6071010700
val2 = 6071940100
row_exists = (df['origin'] == val1) & (df['destination'] == val2)

In [34]:
if row_exists.any():
    print("Row exists")
else:
    print("Row does not exist")

Row exists


In [35]:
# Need to flag destinations in California
# State GEOID is 6. Hence numbers between 6000000000 and 6999999999 are in California
df['dest_CA'] = np.where((df['destination'] >= 6000000000) & (df['destination'] <= 6999999999) , 1, 0)

In [36]:
df.head()

Unnamed: 0.1,Unnamed: 0,origin,destination,distance,dest_CA
0,1,6071010700,4001944201,295.615635,0
1,2,6071940100,4001944201,294.930777,0
2,3,6071010700,4001944300,295.811483,0
3,4,6071940100,4001944300,294.872095,0
4,5,6071940100,4001944901,296.485464,0


In [37]:
# Filter out CA destinations
df_out=df[df['dest_CA']==0]

In [38]:
# Calculate the minimum distance for every origin
# Output: origin tract - minimum distance to out of state tract - tract
min_dist_out = df.loc[df_out.groupby('origin')['distance'].idxmin()]

#print(min_dist_out)

In [39]:
min_dist_out.head()

Unnamed: 0.1,Unnamed: 0,origin,destination,distance,dest_CA
44726069,44726070,6001400100,32005990000,146.885212,0
44726070,44726071,6001400200,32005990000,148.442731,0
44726071,44726072,6001400300,32005990000,148.951108,0
44726072,44726073,6001400400,32005990000,148.783168,0
44726073,44726074,6001400500,32005990000,149.101951,0


In [40]:
val1 = 6071010700
row_exists = (min_dist_out['origin'] == val1)
if row_exists.any():
    print("Row exists")
else:
    print("Row does not exist")

Row exists


In [41]:
min_dist_out = min_dist_out.reset_index()
final_df = min_dist_out[['origin','destination','distance']].copy()

In [42]:
final_df.head()

Unnamed: 0,origin,destination,distance
0,6001400100,32005990000,146.885212
1,6001400200,32005990000,148.442731
2,6001400300,32005990000,148.951108
3,6001400400,32005990000,148.783168
4,6001400500,32005990000,149.101951


In [43]:
len(final_df)

8038

In [44]:
# Flag: California has 8,057 tracts

In [45]:
final_df.to_csv('tract_to_border_distance.csv')

## 2 - Testing the output

Where are the destinations?

In [18]:
final_df['First2'] = final_df['destination'].astype(str).str[:2]

In [19]:
final_df['First2'].value_counts()

32    6462
40    1461
41     115
Name: First2, dtype: int64

In [20]:
# 32 is Nevada, # 41 is Oregon, #4 is Arizona

OD Pair testing

In [21]:
# 6001400100 is Alameda County, 32005990000 is Douglas, NV 

Aggregate plot of destinations

In [22]:
dest_count_df = final_df['destination'].value_counts().reset_index()
dest_count_df.columns = ['GEOID', 'Count']

In [23]:
dest_count_df['GEOID'] = dest_count_df['GEOID'].astype(str)
dest_count_df.head()

Unnamed: 0,GEOID,Count
0,32023960405,2903
1,32005001800,1206
2,32005990000,1191
3,4012020602,789
4,4027011403,423


In [24]:
#32023960405: Pahrump, NV
#32005001800: Stateline, NV
#32005990000: Glenbrook, NV
#4012020602: Cibola, AZ

In [25]:
# Plot the top occurring census tracts
# USE FOLIUM
import folium