In [1]:
import re
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [61]:
def load_data(file_name):
    data = []
    with open(file_name, 'r') as file:
        data = file.read().split("#\n")

    if not data:
        print("File {} was unable to be read.".format(file_name))
    return data

In [62]:
def convert_to_df(raw_data):
    # The raw data comes in a giant string with tabs and newlines
    data_split = raw_data.split("\n")
    for i in range(len(data_split)):
        data_split[i] = data_split[i].split("\t")
        
    return pd.DataFrame(data_split[2:], columns = data_split[0])

In [70]:
def get_parameter_def(param_header, regex = "# +(\w+) +-+ +(.+)"):
    params_dict = {}
    params_type = {}
    params = param_header.split("\n")
    params_pattern = re.compile(regex)

    for param in params:
        a = params_pattern.search(param)

        if a: 
            #print(first_t, a.group(1), a.group(2))
            params_dict[a.group(1).lower()] = a.group(2)
            #print(a.group(1) + ":", a.group(2).split(',')[-1])

    return params_dict

## Load in the Quality Data to get the Site Numbers 

In [3]:
quality_file_name = "LA_Water_Quality_Data.txt"
data = load_data(quality_file_name)
print("~~~~~ Quality Data ~~~~~")
print("Number of sections:", len(data))
print("2nd section:", data[2], sep = "\n")
print("Number of characters in actual data:", len(data[15]))

Number of sections: 16
2nd section:
# U.S. Geological Survey
# 
# This file contains selected water-quality data for stations in the National Water Information 
# System water-quality database.  Explanation of codes found in this file are followed by
# the retrieved data.

Number of characters in actual data: 9736982


In [22]:
sites = data[13].split("\n")
sites_pattern = re.compile("# +USGS (\d+)")

In [27]:
site_ids = []
for site in sites:
    site_found = sites_pattern.search(site)
    if site_found:
        site_ids.append(site_found.group(1))
        
len(site_ids)

1212

## Load in the Site information to connect to Quality Data 

In [44]:
site_file_name = "LA_Water_Site_Data.txt"
site_data = load_data(site_file_name)
print("~~~~~ Site Data ~~~~~")
print("Number of sections:", len(site_data))
print("Number of characters in actual data:", len(site_data[10]))

~~~~~ Site Data ~~~~~
Number of sections: 11
Number of characters in actual data: 418205


    #  0:                               #  6:  
    #  1:                               #  7: query started 2018-09-...
    #  2: U.S. Geological Survey        #  8: there are 1212 sites...  
    #  3: Te Site File stores...        #  9:    
    #  4: The following selected...     # 10: Data!
    #  5: Param_id      - parameter      

In [65]:
site_info = convert_to_df(site_data[10])
#site_info.head(2)

In [79]:
site_params = get_parameter_def(site_data[5])
site_params

{'agency_cd': 'Agency',
 'site_no': 'Site identification number',
 'station_nm': 'Site name',
 'site_tp_cd': 'Site type',
 'lat_va': 'DMS latitude',
 'long_va': 'DMS longitude',
 'dec_lat_va': 'Decimal latitude',
 'dec_long_va': 'Decimal longitude',
 'coord_meth_cd': 'Latitude-longitude method',
 'coord_acy_cd': 'Latitude-longitude accuracy',
 'coord_datum_cd': 'Latitude-longitude datum',
 'dec_coord_datum_cd': 'Decimal Latitude-longitude datum',
 'district_cd': 'District code',
 'state_cd': 'State code',
 'county_cd': 'County code',
 'country_cd': 'Country code',
 'land_net_ds': 'Land net location description',
 'map_nm': 'Name of location map',
 'map_scale_fc': 'Scale of location map',
 'alt_va': 'Altitude of Gage/land surface',
 'alt_meth_cd': 'Method altitude determined',
 'alt_acy_va': 'Altitude accuracy',
 'alt_datum_cd': 'Altitude datum',
 'huc_cd': 'Hydrologic unit code',
 'basin_cd': 'Drainage basin code',
 'topo_cd': 'Topographic setting code',
 'data_types_cd': 'Flags for th

In [85]:
print("Different Districts:", set(site_info["district_cd"]))
print("Different Counties: ", set(site_info["county_cd"]))
print("Different Countries:", set(site_info["country_cd"]))
print("Different SiteTypes:", set(site_info["site_tp_cd"]))

Different Districts: {None, '36', '06'}
Different Counties:  {None, '037'}
Different Countries: {None, 'US'}
Different SiteTypes: {'GW', 'LK', 'FA-WDS', 'FA-WU', 'ST', 'FA-WWTP', 'OC-CO', 'FA-SPS', 'FA-OF', 'SP', 'ES', 'GW-HZ', 'LA-SH', 'AT', None, 'GW-TH', 'SB-UZ'}


In [None]:
from uszipcode import SearchEngine, Zipcode
search = SearchEngine(simple_zipcode=False)

Start downloading data for rich info zipcode database, total size 450+MB ...
  10 MB finished ...
  20 MB finished ...


In [None]:
for i in range(len(site_info[["dec_lat_va", "dec_long_va"]])):
    lat, long = site_info[["dec_lat_va", "dec_long_va"]].iloc[i]
    print(float(lat), float(long))
    result = search.by_coordinates(float(lat), float(long), radius=1.0)
    print(result)

In [117]:
help(search.by_coordinates)

Help on method by_coordinates in module uszipcode.search:

by_coordinates(lat, lng, radius=25.0, zipcode_type='Standard', sort_by='dist', ascending=True, returns=5) method of uszipcode.search.SearchEngine instance
    Search zipcode information near a coordinates on a map.
    
    Returns multiple results.
    
    :param lat: center latitude.
    :param lng: center longitude.
    :param radius: only returns zipcode within X miles from ``lat``, ``lng``.
    
    **中文文档**
    
    1. 计算出在中心坐标处, 每一经度和纬度分别代表多少miles.
    2. 以给定坐标为中心, 画出一个矩形, 长宽分别为半径的2倍多一点, 找到该
      矩形内所有的Zipcode.
    3. 对这些Zipcode计算出他们的距离, 然后按照距离远近排序。距离超过我们
      限定的半径的直接丢弃.

