In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [61]:
def load_data(file_name):
    data = []
    with open(file_name, 'r') as file:
        data = file.read().split("#\n")

    if not data:
        print("File {} was unable to be read.".format(file_name))
    return data

In [62]:
def convert_to_df(raw_data):
    # The raw data comes in a giant string with tabs and newlines
    data_split = raw_data.split("\n")
    for i in range(len(data_split)):
        data_split[i] = data_split[i].split("\t")
        
    return pd.DataFrame(data_split[2:], columns = data_split[0])

In [70]:
def get_parameter_def(param_header, regex = "# +(\w+) +-+ +(.+)"):
    params_dict = {}
    params_type = {}
    params = param_header.split("\n")
    params_pattern = re.compile(regex)

    for param in params:
        a = params_pattern.search(param)

        if a: 
            #print(first_t, a.group(1), a.group(2))
            params_dict[a.group(1).lower()] = a.group(2)
            #print(a.group(1) + ":", a.group(2).split(',')[-1])

    return params_dict

## Load in the Quality Data to get the Site Numbers 

In [3]:
quality_file_name = "LA_Water_Quality_Data.txt"
data = load_data(quality_file_name)
print("~~~~~ Quality Data ~~~~~")
print("Number of sections:", len(data))
print("2nd section:", data[2], sep = "\n")
print("Number of characters in actual data:", len(data[15]))

Number of sections: 16
2nd section:
# U.S. Geological Survey
# 
# This file contains selected water-quality data for stations in the National Water Information 
# System water-quality database.  Explanation of codes found in this file are followed by
# the retrieved data.

Number of characters in actual data: 9736982


In [22]:
sites = data[13].split("\n")
sites_pattern = re.compile("# +USGS (\d+)")

In [27]:
site_ids = []
for site in sites:
    site_found = sites_pattern.search(site)
    if site_found:
        site_ids.append(site_found.group(1))
        
len(site_ids)

1212

## Load in the Site information to connect to Quality Data 

In [44]:
site_file_name = "LA_Water_Site_Data.txt"
site_data = load_data(site_file_name)
print("~~~~~ Site Data ~~~~~")
print("Number of sections:", len(site_data))
print("Number of characters in actual data:", len(site_data[10]))

~~~~~ Site Data ~~~~~
Number of sections: 11
Number of characters in actual data: 418205


    #  0:                               #  6:  
    #  1:                               #  7: query started 2018-09-...
    #  2: U.S. Geological Survey        #  8: there are 1212 sites...  
    #  3: Te Site File stores...        #  9:    
    #  4: The following selected...     # 10: Data!
    #  5: Param_id      - parameter      

In [65]:
site_info = convert_to_df(site_data[10])
#site_info.head(2)

In [79]:
site_params = get_parameter_def(site_data[5])
site_params

{'agency_cd': 'Agency',
 'site_no': 'Site identification number',
 'station_nm': 'Site name',
 'site_tp_cd': 'Site type',
 'lat_va': 'DMS latitude',
 'long_va': 'DMS longitude',
 'dec_lat_va': 'Decimal latitude',
 'dec_long_va': 'Decimal longitude',
 'coord_meth_cd': 'Latitude-longitude method',
 'coord_acy_cd': 'Latitude-longitude accuracy',
 'coord_datum_cd': 'Latitude-longitude datum',
 'dec_coord_datum_cd': 'Decimal Latitude-longitude datum',
 'district_cd': 'District code',
 'state_cd': 'State code',
 'county_cd': 'County code',
 'country_cd': 'Country code',
 'land_net_ds': 'Land net location description',
 'map_nm': 'Name of location map',
 'map_scale_fc': 'Scale of location map',
 'alt_va': 'Altitude of Gage/land surface',
 'alt_meth_cd': 'Method altitude determined',
 'alt_acy_va': 'Altitude accuracy',
 'alt_datum_cd': 'Altitude datum',
 'huc_cd': 'Hydrologic unit code',
 'basin_cd': 'Drainage basin code',
 'topo_cd': 'Topographic setting code',
 'data_types_cd': 'Flags for th

In [85]:
print("Different Districts:", set(site_info["district_cd"]))
print("Different Counties: ", set(site_info["county_cd"]))
print("Different Countries:", set(site_info["country_cd"]))
print("Different SiteTypes:", set(site_info["site_tp_cd"]))

Different Districts: {None, '36', '06'}
Different Counties:  {None, '037'}
Different Countries: {None, 'US'}
Different SiteTypes: {'GW', 'LK', 'FA-WDS', 'FA-WU', 'ST', 'FA-WWTP', 'OC-CO', 'FA-SPS', 'FA-OF', 'SP', 'ES', 'GW-HZ', 'LA-SH', 'AT', None, 'GW-TH', 'SB-UZ'}


In [103]:
from uszipcode import SearchEngine, Zipcode
search = SearchEngine(simple_zipcode=False)

342907
1180120
~~~~
341846
1182932
~~~~
341350
1174605
~~~~
341439
1175725
~~~~
340917
1175426
~~~~
340203
1180214
~~~~
340125
1180311
~~~~
341802
1181604
~~~~
340718
1181610
~~~~
340452
1181336
~~~~
341320
1181036
~~~~
340145
1180407
~~~~
340115
1180415
~~~~
334902
1181220
~~~~
334816
1181215
~~~~
340352
1183510
~~~~
340651
1184642
~~~~
340445
1184154
~~~~
340440
1184203
~~~~
340121
1184858
~~~~
340244
1185602
~~~~
342547
1182116
~~~~
342656
1183022
~~~~
342534
1183509
~~~~
343609
1183943
~~~~
343346
1183758
~~~~
343751
1184451
~~~~
335040.20
1182034.40
~~~~
335421.2
1180605.6
~~~~
334445.95
1181253.73
~~~~
334445.95
1181253.73
~~~~
334644.3
1175951.9
~~~~
334607
1180538
~~~~
334607
1180539
~~~~
334615.38
1181313.88
~~~~
334615.38
1181313.88
~~~~
334626
1180436
~~~~
334636.5
1181723.9
~~~~
334649.68
1181502.97
~~~~
334649.68
1181502.97
~~~~
334654.54
1181418.43
~~~~
334654.72
1181418.45
~~~~
334655.80
1181628.64
~~~~
334656.39
1181603.60
~~~~
334656.39
1181603.60
~~~~
334656.39
118160

335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
355943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180422
~~~~
335943
1180424
~~~~
335943
1180702
~~~~
335946
1180421
~~~~
335946
1180421
~~~~
335947.43
1181404.45
~~~~
335948
1180535
~~~~
335952
1181226
~~~~
335951.84
1181556.24
~~~~
335951.84
1181556.24
~~~~
335951.84
1181556.24
~~~~
335951.84
1181556.24
~~~~
335951.84
1181556.24
~~~~


~~~~


~~~~


~~~~


~~~~


~~~~


~~~~


~~~~
340023.5
1181255.5
~~~~
334729.1
11

343927
1180748
~~~~
343931
1181337
~~~~
343932.3
1181440.4
~~~~
343933
1181114
~~~~
343935
1181321
~~~~
343938
1181029
~~~~
343938
1181313
~~~~
343943
1180817
~~~~
343943
1180817
~~~~
343943
1180817
~~~~
343943
1180817
~~~~
343943.01
1180821.87
~~~~
343943
1181256
~~~~
343950
1181508
~~~~
343951
1180659
~~~~
343951
1180700
~~~~
343952.2
1180422.1
~~~~
344000
1181306
~~~~
344009.8
1182521
~~~~
344059.6
1182742.8
~~~~
344004.86
1180749.23
~~~~
344002
1181304
~~~~
344003
1180639
~~~~
344005.34
1180749.66
~~~~
344006.50
1180751.56
~~~~
344003
1182518
~~~~
344003
1182520
~~~~
344004
1181028
~~~~
344004.63
1180821.83
~~~~
344005
1180822
~~~~
344005
1180822
~~~~
344005
1180822
~~~~
344005
1180822
~~~~
344006
1175208
~~~~
344006
1175232
~~~~
344006
1175856
~~~~
344006
1180142
~~~~
344007.44
1180827.19
~~~~
344008.09
1180750.01
~~~~
344009
1182518
~~~~
344011
1180747
~~~~
344018
1182552
~~~~
344028.33
1181125.14
~~~~
344030
1181100
~~~~
344032
1175932
~~~~
344036
1180122
~~~~
344039
1175929
~~~

In [126]:
from uszipcode import Zipcode
for i in range(len(site_info[["dec_lat_va", "dec_long_va"]])):
    lat, long = site_info[["dec_lat_va", "dec_long_va"]].iloc[i]
    print(float(lat), float(long))
    result = search.by_coordinates(float(lat), float(long), radius=1.0)
    print(result)

34.48527184 -118.0231248
[]
34.31277828 -118.4931404
[]
34.2305606 -117.768947
[]
34.24416995 -117.9578424
[]
34.15472899 -117.9081183
[]
34.03417767 -118.0381225
[]
34.02362266 -118.0539563
[]
34.30055637 -118.2686878
[]
34.1216753 -118.270353
[SimpleZipcode(zipcode='90039', zipcode_type='Standard', major_city='Los Angeles', post_office_city='Los Angeles, CA', common_city_list=['Los Angeles'], county='Los Angeles County', state='CA', lat=34.12, lng=-118.26, timezone='Pacific', radius_in_miles=3.0, area_code_list=['213'], population=28514, population_density=7563.0, land_area_in_sqmi=3.77, water_area_in_sqmi=0.2, housing_units=12127, occupied_housing_units=11436, median_home_value=600000, median_household_income=64080, bounds_west=-118.278337, bounds_east=-118.22689, bounds_north=34.153261, bounds_south=34.082814), SimpleZipcode(zipcode='91204', zipcode_type='Standard', major_city='Glendale', post_office_city='Glendale, CA', common_city_list=['Glendale'], county='Los Angeles County', s

[SimpleZipcode(zipcode='90755', zipcode_type='Standard', major_city='Signal Hill', post_office_city='Signal Hill, CA', common_city_list=['Signal Hill', 'Long Beach'], county='Los Angeles County', state='CA', lat=33.8, lng=-118.17, timezone='Pacific', radius_in_miles=1.0, area_code_list=[], population=11074, population_density=5081.0, land_area_in_sqmi=2.18, water_area_in_sqmi=0.0, housing_units=4404, occupied_housing_units=4172, median_home_value=373400, median_household_income=71077, bounds_west=-118.184663, bounds_east=-118.149426, bounds_north=33.818916, bounds_south=33.789836)]
33.79369167 -118.172136
[SimpleZipcode(zipcode='90755', zipcode_type='Standard', major_city='Signal Hill', post_office_city='Signal Hill, CA', common_city_list=['Signal Hill', 'Long Beach'], county='Los Angeles County', state='CA', lat=33.8, lng=-118.17, timezone='Pacific', radius_in_miles=1.0, area_code_list=[], population=11074, population_density=5081.0, land_area_in_sqmi=2.18, water_area_in_sqmi=0.0, hou

[]
33.83694444 -118.241111
[]
33.83707428 -118.1170275
[SimpleZipcode(zipcode='90808', zipcode_type='Standard', major_city='Long Beach', post_office_city='Long Beach, CA', common_city_list=['Long Beach'], county='Los Angeles County', state='CA', lat=33.83, lng=-118.11, timezone='Pacific', radius_in_miles=3.0, area_code_list=['310', '323', '562', '657', '714'], population=38232, population_density=5568.0, land_area_in_sqmi=6.87, water_area_in_sqmi=0.1, housing_units=14567, occupied_housing_units=14167, median_home_value=497300, median_household_income=94335, bounds_west=-118.15411, bounds_east=-118.063162, bounds_north=33.847078, bounds_south=33.809915), SimpleZipcode(zipcode='90713', zipcode_type='Standard', major_city='Lakewood', post_office_city='Lakewood, CA', common_city_list=['Lakewood'], county='Los Angeles County', state='CA', lat=33.85, lng=-118.11, timezone='Pacific', radius_in_miles=2.0, area_code_list=['310', '323', '562', '657'], population=27925, population_density=8228.0,

[]
33.87666667 -118.3893333
[]
33.87666667 -118.3893333
[]
33.87666667 -118.3893333
[]
33.87666667 -118.3893333
[]
33.87666667 -118.3893333
[]
33.87666667 -118.3893333
[]
33.8777222 -118.1043056
[]
33.8777222 -118.1043056
[]
33.8777222 -118.1043056
[]
33.8777222 -118.1043056
[]
33.8777222 -118.1043056
[]
33.8777222 -118.1043056
[]
33.8797222 -118.3644444
[SimpleZipcode(zipcode='90278', zipcode_type='Standard', major_city='Redondo Beach', post_office_city='Redondo Beach, CA', common_city_list=['Redondo Beach'], county='Los Angeles County', state='CA', lat=33.87, lng=-118.37, timezone='Pacific', radius_in_miles=2.0, area_code_list=['424', '310'], population=40071, population_density=11120.0, land_area_in_sqmi=3.6, water_area_in_sqmi=0.0, housing_units=16655, occupied_housing_units=16009, median_home_value=675600, median_household_income=101575, bounds_west=-118.391371, bounds_east=-118.352347, bounds_north=33.894653, bounds_south=33.854356)]
33.8811111 -118.3616667
[SimpleZipcode(zipcode

[SimpleZipcode(zipcode='90250', zipcode_type='Standard', major_city='Hawthorne', post_office_city='Hawthorne, CA', common_city_list=['Hawthorne', 'Holly Park', 'Hollyglen'], county='Los Angeles County', state='CA', lat=33.91, lng=-118.35, timezone='Pacific', radius_in_miles=2.0, area_code_list=['310', '323', '424'], population=93193, population_density=13920.0, land_area_in_sqmi=6.7, water_area_in_sqmi=0.01, housing_units=32600, occupied_housing_units=31087, median_home_value=397500, median_household_income=45995, bounds_west=-118.378703, bounds_east=-118.313391, bounds_north=33.931965, bounds_south=33.894654)]
33.91175556 -118.3661778
[SimpleZipcode(zipcode='90250', zipcode_type='Standard', major_city='Hawthorne', post_office_city='Hawthorne, CA', common_city_list=['Hawthorne', 'Holly Park', 'Hollyglen'], county='Los Angeles County', state='CA', lat=33.91, lng=-118.35, timezone='Pacific', radius_in_miles=2.0, area_code_list=['310', '323', '424'], population=93193, population_density=1

[]
33.95466667 -118.0542222
[]
33.9545 -118.0542222
[]
33.95455556 -118.2309167
[]
33.95455556 -118.2309167
[]
33.95455556 -118.2309167
[]
33.95455556 -118.2309167
[]
33.95455556 -118.2309167
[]
33.95455556 -118.2309167
[]
33.9553 -118.1710778
[]
33.95662778 -118.4153333
[SimpleZipcode(zipcode='90045', zipcode_type='Standard', major_city='Los Angeles', post_office_city='Los Angeles, CA', common_city_list=['Los Angeles', 'Westchester'], county='Los Angeles County', state='CA', lat=33.96, lng=-118.4, timezone='Pacific', radius_in_miles=2.0, area_code_list=['323'], population=39480, population_density=3681.0, land_area_in_sqmi=10.73, water_area_in_sqmi=0.0, housing_units=16127, occupied_housing_units=15224, median_home_value=661700, median_household_income=77893, bounds_west=-118.439743, bounds_east=-118.368375, bounds_north=33.982796, bounds_south=33.92365)]
33.95662778 -118.4153333
[SimpleZipcode(zipcode='90045', zipcode_type='Standard', major_city='Los Angeles', post_office_city='Los A

[]
33.97857778 -118.061425
[SimpleZipcode(zipcode='90606', zipcode_type='Standard', major_city='Whittier', post_office_city='Whittier, CA', common_city_list=['Whittier', 'Los Nietos'], county='Los Angeles County', state='CA', lat=33.98, lng=-118.07, timezone='Pacific', radius_in_miles=2.0, area_code_list=['562'], population=32396, population_density=8564.0, land_area_in_sqmi=3.78, water_area_in_sqmi=0.08, housing_units=8931, occupied_housing_units=8633, median_home_value=328200, median_household_income=61800, bounds_west=-118.089113, bounds_east=-118.041524, bounds_north=34.0068, bounds_south=33.958773)]
33.97857778 -118.061425
[SimpleZipcode(zipcode='90606', zipcode_type='Standard', major_city='Whittier', post_office_city='Whittier, CA', common_city_list=['Whittier', 'Los Nietos'], county='Los Angeles County', state='CA', lat=33.98, lng=-118.07, timezone='Pacific', radius_in_miles=2.0, area_code_list=['562'], population=32396, population_density=8564.0, land_area_in_sqmi=3.78, water_a

[SimpleZipcode(zipcode='90660', zipcode_type='Standard', major_city='Pico Rivera', post_office_city='Pico Rivera, CA', common_city_list=['Pico Rivera'], county='Los Angeles County', state='CA', lat=33.99, lng=-118.08, timezone='Pacific', radius_in_miles=4.0, area_code_list=['562'], population=62928, population_density=8120.0, land_area_in_sqmi=7.75, water_area_in_sqmi=0.49, housing_units=17106, occupied_housing_units=16564, median_home_value=330000, median_household_income=57605, bounds_west=-118.121307, bounds_east=-118.05458, bounds_north=34.021917, bounds_south=33.950362)]
33.9952905 -118.073679
[SimpleZipcode(zipcode='90660', zipcode_type='Standard', major_city='Pico Rivera', post_office_city='Pico Rivera, CA', common_city_list=['Pico Rivera'], county='Los Angeles County', state='CA', lat=33.99, lng=-118.08, timezone='Pacific', radius_in_miles=4.0, area_code_list=['562'], population=62928, population_density=8120.0, land_area_in_sqmi=7.75, water_area_in_sqmi=0.49, housing_units=171

ValueError: could not convert string to float: 

In [117]:
help(search.by_coordinates)

Help on method by_coordinates in module uszipcode.search:

by_coordinates(lat, lng, radius=25.0, zipcode_type='Standard', sort_by='dist', ascending=True, returns=5) method of uszipcode.search.SearchEngine instance
    Search zipcode information near a coordinates on a map.
    
    Returns multiple results.
    
    :param lat: center latitude.
    :param lng: center longitude.
    :param radius: only returns zipcode within X miles from ``lat``, ``lng``.
    
    **中文文档**
    
    1. 计算出在中心坐标处, 每一经度和纬度分别代表多少miles.
    2. 以给定坐标为中心, 画出一个矩形, 长宽分别为半径的2倍多一点, 找到该
      矩形内所有的Zipcode.
    3. 对这些Zipcode计算出他们的距离, 然后按照距离远近排序。距离超过我们
      限定的半径的直接丢弃.

