# Scraping USGS Gages
_Calvin Whealton_

Import the libraries used in this notebook.

In [1]:
import os
import pandas as pd
import requests

Change to the data directory. `usgs_gages.txt` lists the USGS gages that have been pre-screened to have more than 20 years of peak-flow data.

In [2]:
# Work from the folder that holds the pre-screened gage list.
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/peak_data')

# Tab-delimited file; lines starting with '#' are treated as comments.
gage_df = pd.read_csv(
    'usgs_gages.txt',
    delimiter='\t',
    comment='#',
)
gage_df.head()

Unnamed: 0,agency_cd,site_no,station_nm\n
0,USGS,1010000,"St. John River at Ninemile Bridge, Maine\n"
1,USGS,1010070,"Big Black River near Depot Mtn, Maine\n"
2,USGS,1010500,"St. John River at Dickey, Maine\n"
3,USGS,1011000,"Allagash River near Allagash, Maine\n"
4,USGS,1011500,"St. Francis River near Connors, New Brunswick\n"


Download the peak-flow record of every USGS gage in the list and save each one as a CSV file.

In [52]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/peak_data/gages')

# Download the peak-flow table of every gage in gage_df and save it in the
# current directory as '<site number, zero-padded to 8 digits>.csv'.
#
# Gages whose request or parse fails are collected here instead of aborting
# the whole (long) scrape, so they can be inspected or retried afterwards.
# Each entry is (gage number, error description).
failed_gages = []

for site_no, agency_cd in zip(gage_df['site_no'], gage_df['agency_cd']):
    gage_no = "{:08d}".format(site_no)  # add leading zeros, if needed

    # formatting url for request
    url = ('https://nwis.waterdata.usgs.gov/nwis/peak?site_no=' + gage_no
           + '&agency_cd=' + agency_cd + '&format=rdb')

    # reading data from url; a network error (OSError family) or an
    # unparsable response (ValueError family, which includes the pandas
    # parser errors) for one gage must not stop the remaining downloads
    try:
        gage_data_temp = pd.read_csv(url, sep='\t', comment='#')
    except (OSError, ValueError) as err:
        failed_gages.append((gage_no, repr(err)))
        continue

    # checking if it is a bad url: such a response parses to a header that
    # starts with the "No sites/data found" message. Compare on the prefix so
    # that surrounding whitespace does not matter.
    if any(str(col).strip().startswith('No sites/data found')
           for col in gage_data_temp.columns):
        continue

    # dropping the first row, which describes the length of each variable.
    # Positional slicing keeps the remaining index labels unchanged and does
    # not raise when the table has no rows at all.
    gage_data_temp = gage_data_temp.iloc[1:]

    # saving
    gage_data_temp.to_csv(gage_no + '.csv')

if failed_gages:
    print('{} gage(s) could not be downloaded; see failed_gages'.format(len(failed_gages)))
  

Loop over the downloaded gage files to calculate the frequency of floods. The analysis will use the time period 1990-2018 (note that the return-period table built below has columns for 1970-2019). The frequency model is the log-normal distribution without a time-trend parameter.

In [56]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/peak_data/gages')

# Only the files written by the scraping step ('<site number>.csv') are gage
# records. os.listdir() returns every entry of the folder, including
# 'ts_rps.csv' (which this notebook saves into this same folder further down)
# and any hidden files; those would be miscounted or fail to parse.
# Sorting makes the order reproducible, since os.listdir() order is arbitrary.
list_files = sorted(
    name for name in os.listdir()
    if name.endswith('.csv') and name[:-len('.csv')].isdigit()
)

# total number of peak-flow rows over all gage files
counter = sum(len(pd.read_csv(name)) for name in list_files)

print(counter)

662477


In [57]:
# Average number of saved rows per gage: 662477 is the total printed by the
# previous cell; 13852 is presumably the number of gages - TODO confirm where
# this number comes from and compute it from the data instead of typing it in.
662477/13852

47.82536817788045

## Making a dataframe with return periods

In [2]:
from scipy.stats import norm
import numpy as np
import datetime

In [3]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/peak_data/gages')

# Keep only the per-gage files ('<site number>.csv'). os.listdir() returns
# every entry of the folder, including 'ts_rps.csv', which this notebook saves
# into this same folder further down; on a re-run that file would be treated
# as a gage record and fail because it has no 'peak_va' column.
# Sorting makes the processing order reproducible (os.listdir() order is
# arbitrary).
list_files = sorted(
    name for name in os.listdir()
    if name.endswith('.csv') and name[:-len('.csv')].isdigit()
)

# column names of the return-period table: the gage id followed by one column
# per calendar year, 1970 through 2019
yr_list = ['Gage'] + [str(year) for year in range(1970, 2020)]

In [4]:
# Empty table (no rows yet) whose columns are the names held in yr_list.
ts_rps = pd.DataFrame(data=None, columns=yr_list)
ts_rps.head()

Unnamed: 0,Gage,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019


In [5]:
# index used in counting rows of dataframe
ind = 0
    
for i in range(len(list_files)):
    # reading in file
    gage_data_temp = pd.read_csv(list_files[ind])
    
    # calculating log-space mean and standard deviation for peak values
    log_std = np.log(gage_data_temp['peak_va']).std()
    log_mean = np.log(gage_data_temp['peak_va']).mean()
    
    # calculating all return periods of the events
    rps = 1/(1-norm.cdf(np.log(gage_data_temp['peak_va']), loc=log_mean, scale=log_std))
    yrs = gage_data_temp['peak_dt'].str.slice(start=0,stop=4)
    
    if len(yrs) != len(yrs.unique()): # condition with more than one flood in a calendar year
        rps_use = []
        yrs_use = []
        for k in yrs.unique():
            inds1 = (yrs == k)
            if len(inds1) != 1:
                yrs_use.append(k)
                rps_use.append(max(rps[inds1]))
            else:
                yrs_use.append(k)
                rps_use.append(rps[inds1])
    else:
        yrs_use = list(yrs)
        rps_use = list(rps)
    
    # assigning an row of zero values
    ts_rps.loc[ts_rps.shape[0]] = 0
    # getting the gage number
    gage_temp = list_files[i].split('.')[0]
    # assigning gage number
    ts_rps.loc[ind]['Gage'] = gage_temp
    
    # putting calculated return periods in the results
    for j in range(len(yrs_use)):
        if yrs_use[j] in ts_rps.columns:
            ts_rps.loc[ind][yrs_use[j]] = rps_use[j]
    
    # incrementing index
    ind += 1

  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = (x >= _b) & cond0
  result = getattr(ufunc, method)(*inputs, **kwargs)
  x = np.asarray((x - loc)/scale, dtype=dtyp)
  del sys.path[0]


In [6]:
# (rows, columns): one row was added per processed gage file; the columns are
# 'Gage' plus one per year from 1970 through 2019.
ts_rps.shape

(13706, 51)

In [7]:
# NOTE(review): this writes into the current working directory, which the
# os.chdir call above set to the 'gages' folder. A later os.listdir() of that
# folder will therefore also return 'ts_rps.csv' alongside the gage files.
ts_rps.to_csv('ts_rps.csv')

In [None]:
# TODO: join the spatial location of each gage