# Gage Assignment to Zip Code
_Calvin Whealton_

The Data Incubator Capstone Project

This notebook completes the calculations to find the nearest stream gages to the zip code nominal location.

In [1]:
import numpy as np
import os
import pandas as pd
import geopandas as gpd
from geopy.distance import geodesic
from shapely import wkt

## Zip Code Processing to point lat-long coordinates

Reading in shapefile for zip codes as a shapefile as table. It includes the interpolation latitude and longitude of the zip code.

In [2]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/data/geo_data/tl_2019_us_zcta510')

zip_data = gpd.read_file('tl_2019_us_zcta510.shp')

In [3]:
zip_data.head()

Unnamed: 0,ZCTA5CE10,GEOID10,CLASSFP10,MTFCC10,FUNCSTAT10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,geometry
0,43451,43451,B5,G6350,S,63484186,157689,41.318301,-83.6174935,"POLYGON ((-83.70873 41.32733, -83.70815 41.327..."
1,43452,43452,B5,G6350,S,121522304,13721730,41.5157923,-82.9809454,"POLYGON ((-83.08698 41.53780, -83.08256 41.537..."
2,43456,43456,B5,G6350,S,9320975,1003775,41.63183,-82.8393923,"MULTIPOLYGON (((-82.83558 41.71082, -82.83515 ..."
3,43457,43457,B5,G6350,S,48004681,0,41.2673301,-83.4274872,"POLYGON ((-83.49650 41.25371, -83.48382 41.253..."
4,43458,43458,B5,G6350,S,2573816,39915,41.5304461,-83.2133648,"POLYGON ((-83.22229 41.53102, -83.22228 41.532..."


In [4]:
# converting to from string (read in)  to float to do conversion later
zip_data['x'] = zip_data['INTPTLON10'].astype(float)
zip_data['y'] = zip_data['INTPTLAT10'].astype(float)

In [5]:
# dropping the geometry and other columns that are not needed form the shapefile
zip_data.drop(['geometry','CLASSFP10','MTFCC10','FUNCSTAT10'],axis=1,inplace=True)


In [6]:
# using x and y values to set geometry
zip_data_latlong = gpd.GeoDataFrame(
                    zip_data, geometry=gpd.points_from_xy(zip_data.x, zip_data.y))

In [7]:
zip_data_latlong.head()

Unnamed: 0,ZCTA5CE10,GEOID10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,x,y,geometry
0,43451,43451,63484186,157689,41.318301,-83.6174935,-83.617493,41.318301,POINT (-83.61749 41.31830)
1,43452,43452,121522304,13721730,41.5157923,-82.9809454,-82.980945,41.515792,POINT (-82.98095 41.51579)
2,43456,43456,9320975,1003775,41.63183,-82.8393923,-82.839392,41.63183,POINT (-82.83939 41.63183)
3,43457,43457,48004681,0,41.2673301,-83.4274872,-83.427487,41.26733,POINT (-83.42749 41.26733)
4,43458,43458,2573816,39915,41.5304461,-83.2133648,-83.213365,41.530446,POINT (-83.21336 41.53045)


In [8]:
# setting to wgs84
zip_data_latlong.crs = {'init' :"EPSG:4269"}

  return _prepare_from_string(" ".join(pjargs))


In [9]:
zip_data_latlong.crs

<Geographic 2D CRS: +init=epsg:4269 +type=crs>
Name: NAD83
Axis Info [ellipsoidal]:
- lon[east]: Longitude (degree)
- lat[north]: Latitude (degree)
Area of Use:
- name: North America - NAD83
- bounds: (167.65, 14.92, -47.74, 86.46)
Datum: North American Datum 1983
- Ellipsoid: GRS 1980
- Prime Meridian: Greenwich

## Stream Gages

Reading in a dataset for the gages.

In [10]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/data/peak_data')

gage_locs = pd.read_csv('usgs_supp.txt',sep='\t',comment='#')


In [11]:
gage_locs.head()

Unnamed: 0,agency_cd,site_no,station_nm,dec_lat_va,dec_long_va,coord_acy_cd,dec_coord_datum_cd,state_cd,county_cd,alt_va,alt_acy_va,alt_datum_cd,basin_cd,contrib_drain_area_va
0,USGS,1010000,"St. John River at Ninemile Bridge, Maine",46.700556,-69.715556,S,NAD83,23.0,3.0,931.26,0.01,NGVD29,,1341.0
1,USGS,1010070,"Big Black River near Depot Mtn, Maine",46.893889,-69.751667,S,NAD83,23.0,3.0,885.0,20.0,NGVD29,,171.0
2,USGS,1010500,"St. John River at Dickey, Maine",47.113056,-69.088056,S,NAD83,23.0,3.0,590.38,0.01,NGVD29,,2680.0
3,USGS,1011000,"Allagash River near Allagash, Maine",47.069722,-69.079444,S,NAD83,23.0,3.0,604.6,0.01,NGVD29,,1229.0
4,USGS,1013500,"Fish River near Fort Kent, Maine",47.2375,-68.582778,S,NAD83,23.0,3.0,511.38,0.01,NGVD29,,873.0


In [12]:
# dropping locations without coordinates
gage_locs.dropna(subset=['dec_lat_va','dec_long_va'],inplace=True)

In [13]:
gage_locs_gpd = gpd.GeoDataFrame(gage_locs,geometry=gpd.points_from_xy(gage_locs.dec_long_va, gage_locs.dec_lat_va))

In [14]:
gage_locs_gpd.crs = {'init' :"EPSG:4269"}

In [15]:
gage_locs_gpd.head()

Unnamed: 0,agency_cd,site_no,station_nm,dec_lat_va,dec_long_va,coord_acy_cd,dec_coord_datum_cd,state_cd,county_cd,alt_va,alt_acy_va,alt_datum_cd,basin_cd,contrib_drain_area_va,geometry
0,USGS,1010000,"St. John River at Ninemile Bridge, Maine",46.700556,-69.715556,S,NAD83,23.0,3.0,931.26,0.01,NGVD29,,1341.0,POINT (-69.71556 46.70056)
1,USGS,1010070,"Big Black River near Depot Mtn, Maine",46.893889,-69.751667,S,NAD83,23.0,3.0,885.0,20.0,NGVD29,,171.0,POINT (-69.75167 46.89389)
2,USGS,1010500,"St. John River at Dickey, Maine",47.113056,-69.088056,S,NAD83,23.0,3.0,590.38,0.01,NGVD29,,2680.0,POINT (-69.08806 47.11306)
3,USGS,1011000,"Allagash River near Allagash, Maine",47.069722,-69.079444,S,NAD83,23.0,3.0,604.6,0.01,NGVD29,,1229.0,POINT (-69.07944 47.06972)
4,USGS,1013500,"Fish River near Fort Kent, Maine",47.2375,-68.582778,S,NAD83,23.0,3.0,511.38,0.01,NGVD29,,873.0,POINT (-68.58278 47.23750)


## Reprojecting dataframes for distance calcs
Decimal lat-long are generally not good for distance calculations. Will reproject the dataframe to a coordinate system more appropriate for distance calcluation.

In [17]:
gages_for_dist_calc = gpd.GeoDataFrame(gage_locs,columns=['site_no','geometry'])

In [24]:
gages_for_dist_calc.crs = {'init' :"EPSG:4269"}

  return _prepare_from_string(" ".join(pjargs))


In [29]:
gages_for_dist_calc = gages_for_dist_calc.to_crs('EPSG:2163')

In [30]:
gages_for_dist_calc['x'] = gages_for_dist_calc['geometry'].x
gages_for_dist_calc['y'] = gages_for_dist_calc['geometry'].y

In [31]:
zip_locs_dist = zip_data_latlong.to_crs('EPSG:2163')

In [32]:
gages_for_dist_calc.head()

Unnamed: 0,site_no,geometry,x,y
0,1010000,POINT (2241049.472 621122.518),2241049.0,621122.517752
1,1010070,POINT (2230437.221 640396.859),2230437.0,640396.858568
2,1010500,POINT (2266857.514 682270.118),2266858.0,682270.117604
3,1011000,POINT (2269310.429 677982.898),2269310.0,677982.898332
4,1013500,POINT (2296028.248 709900.183),2296028.0,709900.183249


In [120]:
from shapely.geometry import Point

def closest_n_gage_to_zip(point,gages,n,dmax=500000):
    # finds the closest gages
    # point = zip code location (shapely point)
    # gages = vector of points for gages (shapely points) with gage number as index
    # n = number of nearest points
    # point and gages assumed to be projected into appropriate distance unit
    # returns dataframe of the n closest points
    
    # setting up dataframe that will be returned
    dist_gage = pd.DataFrame(columns=['site_no','dist'])
    #print(gages.index[0:19])
    #dist_gage.set_index('site_no',inplace=True)
    
    true_false = np.array((gages['x'] < (dmax + point.x))).astype(int) + np.array((gages['x'] > (dmax - point.x))).astype(int) + np.array((gages['y'] < (dmax + point.y))).astype(int) + np.array((gages['y'] > (dmax - point.y))).astype(int)
    
    gages_use = gages.loc[true_false==4]
    
    #gages_use.set_index('site_no',inplace=True)
    
    # calculating all distances
    if gages_use.shape[0] != 0:
        for g_use in gages_use.index:
            #print(gage)
            if isinstance(gages.loc[g_use,'geometry'],Point):
                dist_gage.loc[g_use] = [gages_use.loc[g_use,'site_no'],point.distance(gages_use['geometry'][g_use])]
            else:
                dist_gage.loc[g_use] = [gages_use.loc[g_use,'site_no'][0],point.distance(gages_use['geometry'][g_use].iloc[0])] 
                
        # sorting the results
        dist_gage.sort_values(by=['dist'],inplace=True)

        return dist_gage.iloc[0:n]
    else:
        dist_gage['dist'] = np.zeros(shape=[10])
        dist_gage['gage'] = np.zeros(shape=[10])
        return dist_gage
    
    

In [124]:
from shapely.geometry import Point

def closest_n_gage_to_zip2(point,gages,n,dmax=500000):
    # finds the closest gages
    # point = zip code location (shapely point)
    # gages = vector of points for gages (shapely points) with gage number as index
    # n = number of nearest points
    # point and gages assumed to be projected into appropriate distance unit
    # returns dataframe of the n closest points
    
    # setting up dataframe that will be returned
    dist_gage = pd.DataFrame(columns=['site_no','dist'])
    #print(gages.index[0:19])
    #dist_gage.set_index('site_no',inplace=True)
    
    dist_gage['site_no'] = gages['site_no']
    
    dist_gage['dist'] = np.sqrt(np.power(np.array(point.x-gages['x']),2) + np.power(np.array(point.y-gages['y']),2))
    
    #true_false = np.array((gages['x'] < (dmax + point.x))).astype(int) + np.array((gages['x'] > (dmax - point.x))).astype(int) + np.array((gages['y'] < (dmax + point.y))).astype(int) + np.array((gages['y'] > (dmax - point.y))).astype(int)
    
    #gages_use = gages.loc[true_false==4]
    
    #gages_use.set_index('site_no',inplace=True)
    
    # calculating all distances
    #if gages_use.shape[0] != 0:
        #for g_use in gages_use.index:
            #print(gage)
            #if isinstance(gages.loc[g_use,'geometry'],Point):
                #dist_gage.loc[g_use] = [gages_use.loc[g_use,'site_no'],point.distance(gages_use['geometry'][g_use])]
            #else:
                #dist_gage.loc[g_use] = [gages_use.loc[g_use,'site_no'][0],point.distance(gages_use['geometry'][g_use].iloc[0])] 
                
    # sorting the results
    dist_gage.sort_values(by=['dist'],inplace=True)
    
    return dist_gage.iloc[0:n]

In [125]:
zip_locs_dist2 = zip_data_latlong.to_crs('EPSG:2163')

In [126]:
for j in range(10):
    zip_locs_dist2['gage'+str(j)] = 0
    zip_locs_dist2['dist'+str(j)] = 0

In [128]:
for ind in zip_locs_dist2.index:
    zip_loc2 = zip_locs_dist2.loc[ind,'geometry']
    
    closest_gages = closest_n_gage_to_zip2(zip_loc2,gages_for_dist_calc,10)
    
    for j in range(closest_gages.shape[0]):
        zip_locs_dist2.loc[ind,'dist'+str(j)] = closest_gages['dist'].values[j]
        zip_locs_dist2.loc[ind,'gage'+str(j)] = closest_gages.index[j]


In [None]:
# for ind in zip_locs_dist.index:
#     zip_loc = zip_locs_dist.loc[ind,'geometry']
    
#     closest_gages = closest_n_gage_to_zip2(zip_loc,gages_for_dist_calc,10)
    
#     for j in range(closest_gages.shape[0]):
#         zip_locs_dist.loc[ind,'dist'+str(j)] = closest_gages['dist'].values[j]
#         zip_locs_dist.loc[ind,'gage'+str(j)] = closest_gages.index[j]


In [132]:
zip_locs_dist2.head()

Unnamed: 0,ZCTA5CE10,GEOID10,ALAND10,AWATER10,INTPTLAT10,INTPTLON10,x,y,geometry,gage0,...,gage5,dist5,gage6,dist6,gage7,dist7,gage8,dist8,gage9,dist9
0,43451,43451,63484186,157689,41.318301,-83.6174935,-83.617493,41.318301,POINT (1357632.982 -273355.024),4767,...,4780,35048.269737,4770,38047.461696,4741,38127.199412,4781,38153.116744,4740,38912.761534
1,43452,43452,121522304,13721730,41.5157923,-82.9809454,-82.980945,41.515792,POINT (1405077.789 -240979.135),4770,...,4783,38992.381755,4784,42889.448497,4778,47669.895781,4782,48099.579127,4741,54583.146601
2,43456,43456,9320975,1003775,41.63183,-82.8393923,-82.839392,41.63183,POINT (1413951.159 -225837.387),4770,...,4780,48263.019518,4782,50798.462462,4785,51313.90721,4779,54115.851576,4739,56928.044679
3,43457,43457,48004681,0,41.2673301,-83.4274872,-83.427487,41.26733,POINT (1374212.851 -275772.617),4779,...,4770,30266.616017,4776,31432.801554,4762,32218.889964,4767,34624.114203,4768,35022.435948
4,43458,43458,2573816,39915,41.5304461,-83.2133648,-83.213365,41.530446,POINT (1385905.376 -243339.010),4770,...,4741,35956.931241,4740,40417.635485,4767,41341.050532,4768,41563.461857,4739,42369.956528


In [133]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/data/processed')
zip_locs_dist2.to_csv('zip_gage_dist.csv')