In [1]:
import geopy.distance
import pandas as pd
import numpy as np
import os
import csv
#import shapefile
import matplotlib.pyplot as plt

## 1 - Creating the new dataframe

In [2]:
# Read the tract-level distance matrix from disk into a DataFrame.
csv_file_path = 'Downloads/distance_matrix_by_tracts_WA.csv'
df = pd.read_csv(filepath_or_buffer=csv_file_path)

In [3]:
# Preview the first rows to check the column layout of the loaded matrix.
df.head()

Unnamed: 0.1,Unnamed: 0,origin,destination,distance
0,1,53011040102,6015000101,296.841093
1,2,53011040201,6015000101,296.357495
2,3,53011040202,6015000101,298.009889
3,4,53011040203,6015000101,293.993493
4,5,53011040301,6015000101,292.287234


In [4]:
# Total number of rows in the loaded distance matrix.
len(df)

3235483

In [5]:
# First check - see whether the matrix contains internal trips (i.e. tract in WA to tract in WA)

In [6]:
# Spot check a single tract against itself: the mask is True for rows whose
# origin and destination are both this tract.
val1 = 53011040102
val2 = 53011040102
row_exists = df['origin'].eq(val1) & df['destination'].eq(val2)

In [7]:
# Report whether the mask matched at least one row.
print("Row exists" if row_exists.any() else "Row does not exist")

Row exists


In [8]:
# Flag destinations that are inside Washington.
# The Washington state code is 53, so destination GEOIDs between
# 53000000000 and 53999999999 (inclusive) are Washington tracts.
is_wa_destination = df['destination'].between(53000000000, 53999999999)
df['dest_WA'] = np.where(is_wa_destination, 1, 0)

In [9]:
# Preview again to confirm the dest_WA flag column is present.
df.head()

Unnamed: 0.1,Unnamed: 0,origin,destination,distance,dest_WA
0,1,53011040102,6015000101,296.841093,0
1,2,53011040201,6015000101,296.357495,0
2,3,53011040202,6015000101,298.009889,0
3,4,53011040203,6015000101,293.993493,0
4,5,53011040301,6015000101,292.287234,0


In [10]:
# Keep only the rows whose destination is not flagged as a Washington tract.
df_out = df.loc[df['dest_WA'].eq(0)]

In [11]:
# For every origin tract, find the row with the smallest distance among the
# out-of-state destinations, then pull those rows by label.
# Result: one row per origin with its nearest out-of-state tract and distance.
nearest_row_labels = df_out.groupby('origin')['distance'].idxmin()
min_dist_out = df.loc[nearest_row_labels]

In [12]:
# Preview the per-origin minimum-distance rows.
min_dist_out.head()

Unnamed: 0.1,Unnamed: 0,origin,destination,distance,dest_WA
78804,78805,53001950100,16057005100,70.205197,0
918311,918312,53001950200,41059950200,69.166795,0
927013,927014,53001950300,41059950800,64.47095,0
927014,927015,53001950400,41059950800,64.632924,0
927015,927016,53001950500,41059950800,63.139458,0


In [13]:
# Spot check: confirm this origin tract appears in the per-origin minimum table.
val1 = 53001950100
row_exists = min_dist_out['origin'].eq(val1)
print("Row exists" if row_exists.any() else "Row does not exist")

Row exists


In [14]:
# Renumber the rows 0..n-1 and keep only the three columns needed downstream.
# drop=True discards the old row labels instead of inserting them as an
# 'index' column, so re-running this cell is safe: without it a second run adds
# a 'level_0' column and a third run raises ValueError.
min_dist_out = min_dist_out.reset_index(drop=True)
final_df = min_dist_out[['origin', 'destination', 'distance']].copy()

In [15]:
# Preview the trimmed origin / destination / distance table.
final_df.head()

Unnamed: 0,origin,destination,distance
0,53001950100,16057005100,70.205197
1,53001950200,41059950200,69.166795
2,53001950300,41059950800,64.47095
3,53001950400,41059950800,64.632924
4,53001950500,41059950800,63.139458


In [16]:
# Number of rows in final_df.
len(final_df)

1454

In [17]:
# Flag: Washington has 1,458 tracts, but final_df has only 1,454 rows (4 origin tracts are missing)

In [18]:
# Save the result. index=False keeps the meaningless 0..n-1 row index out of
# the file; otherwise it is written as an unnamed first column and comes back
# as 'Unnamed: 0' when the CSV is read again.
final_df.to_csv('tract_to_border_distance_wa.csv', index=False)

## 2 - Testing the output

Where are the destinations?

In [19]:
# Extract the 2-digit state code from each destination tract GEOID.
# GEOIDs are stored as integers, so states with a single-digit code lose their
# leading zero (e.g. 6015000101); pad back to 11 digits before slicing so the
# prefix is '06' rather than '60'.
final_df['First2'] = final_df['destination'].astype(str).str.zfill(11).str[:2]

In [20]:
# Count how many rows fall under each destination state prefix.
final_df['First2'].value_counts()

First2
41    1295
16     159
Name: count, dtype: int64

In [21]:
# 41 is Oregon, 16 is Idaho

Most common destinations

In [23]:
# Tally how often each destination tract is the nearest out-of-state tract,
# as a two-column table: GEOID and Count (most frequent first).
dest_count_df = (
    final_df['destination']
    .value_counts()
    .rename_axis('GEOID')
    .reset_index(name='Count')
)

In [24]:
# Store GEOIDs as text. Pad to 11 digits so tracts in states with a
# single-digit code keep their leading zero (integer storage drops it).
dest_count_df['GEOID'] = dest_count_df['GEOID'].astype(str).str.zfill(11)
dest_count_df.head()

Unnamed: 0,GEOID,Count
0,41009970300,951
1,16055000401,93
2,41007950300,56
3,41059950800,47
4,41049970100,45


In [25]:
#41009970300: Columbia County, OR
#16055000401: Kootenai County, ID