# Scraping USGS Gages
_Calvin Whealton_

This notebook is used to process the USGS.

In [1]:
import os
import pandas as pd
import requests

Changing directory. The usgs_gages.txt is a file that has been pre-screened for having over 20 years of peak flow data

In [2]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/data/peak_data')
gage_df = pd.read_csv('usgs_gages.txt', sep="\t",comment='#')
gage_df.head()

Unnamed: 0,agency_cd,site_no,station_nm\n
0,USGS,1010000,"St. John River at Ninemile Bridge, Maine\n"
1,USGS,1010070,"Big Black River near Depot Mtn, Maine\n"
2,USGS,1010500,"St. John River at Dickey, Maine\n"
3,USGS,1011000,"Allagash River near Allagash, Maine\n"
4,USGS,1011500,"St. Francis River near Connors, New Brunswick\n"


Taking all data from usgs gages

In [3]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/data/peak_data/gages')

for i in gage_df.index:
    gage_no = "{:08d}".format(gage_df['site_no'][i]) # add leading zeros, if needed
    agency_cd = gage_df['agency_cd'][i] # USGS or other agency
    
    # formatting url for request
    url = 'https://nwis.waterdata.usgs.gov/nwis/peak?site_no='+gage_no+'&agency_cd=' + agency_cd + '&format=rdb'
    
    # reading data from url
    gage_data_temp = pd.read_csv(url,sep='\t',comment='#')
    
    # checking if it is a bad url
    if list(gage_data_temp.columns) != ['No sites/data found using the selection criteria specified ']:
        # dropping row that describes length of variable
        gage_data_temp.drop(index=0,inplace=True)
        # name for saving
        name = gage_no+'.csv'
        # saving
        gage_data_temp.to_csv(name)
  

KeyboardInterrupt: 

Looping over the dataset to calculate the frequency of floods. Will use the time period 1990-2018. The frequency model used will be the log-normal distribution without a time trend parameter.

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/peak_data/gages')
list_files = os.listdir()

counter = 0

for i in list_files:
    gage_data_temp = pd.read_csv(i)
    counter += gage_data_temp.shape[0]
    
print(counter)

In [None]:
662477/13852

## making dataframe with return periods

In [4]:
from scipy.stats import norm
import numpy as np
import datetime

In [38]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/data/peak_data/gages')
import glob

def listdir_nohidden(path):
    return glob.glob(os.path.join(path, '*'))

list_files = listdir_nohidden('/Users/calvinwhealton/Documents/GitHub/tdi_capstone/data/peak_data/gages')

yr_list = ['Gage']
for i in range(1970,2020):
    yr_list.append(str(i))

In [39]:
ts_rps = pd.DataFrame(columns=yr_list)
ts_rps.head()

Unnamed: 0,Gage,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019


In [47]:
# index used in counting rows of dataframe
ind = 0
    
for i in range(len(list_files)):
    # reading in file
    gage_data_temp = pd.read_csv(list_files[i])
    gage_data_temp = gage_data_temp.loc[pd.notnull(gage_data_temp['peak_va'])]
    
    if gage_data_temp.shape[0] != 0:
        # calculating log-space mean and standard deviation for peak values
        peaks = np.array(gage_data_temp['peak_va'].values)
        if min(peaks) == 0:
            peaks[peaks == 0] = 0.99*min(peaks[peaks > 0])

        log_std = np.log(peaks).std()
        log_mean = np.log(peaks).mean()

        # calculating all return periods of the events
        rps = 1/(1-norm.cdf(np.log(gage_data_temp['peak_va']), loc=log_mean, scale=log_std))
        yrs = gage_data_temp['peak_dt'].str.slice(start=0,stop=4)

        if len(yrs) != len(yrs.unique()): # condition with more than one flood in a calendar year
            rps_use = []
            yrs_use = []
            for k in yrs.unique():
                inds1 = (yrs == k)
                if len(inds1) != 1:
                    yrs_use.append(k)
                    rps_use.append(max(rps[inds1]))
                else:
                    yrs_use.append(k)
                    rps_use.append(rps[inds1])
        else:
            yrs_use = list(yrs)
            rps_use = list(rps)

        # assigning an row of zero values
        ts_rps.loc[ts_rps.shape[0]] = 0
        # getting the gage number
        gage_temp = list_files[i].split("/").pop().split(".")[0]
        # assigning gage number
        ts_rps.loc[ind]['Gage'] = gage_temp

        # putting calculated return periods in the results
        for j in range(len(yrs_use)):
            if yrs_use[j] in ts_rps.columns:
                ts_rps.loc[ind][yrs_use[j]] = rps_use[j]
    
        # incrementing index
        ind += 1



In [48]:
ts_rps.head(20)

Unnamed: 0,Gage,1970,1971,1972,1973,1974,1975,1976,1977,1978,...,2010,2011,2012,2013,2014,2015,2016,2017,2018,2019
0,1123500,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,14330000,8.08053,19.2417,11.2193,0.0,16.9228,1.67803,0.0,9.21908,0.0,...,1.38721,9.17989,5.30817,0.0,19.2417,0.0,1.58443,2.65549,0.0,0.0
2,1672500,1.18902,2.19705,61.153,1.92672,0.0,8.96566,1.91966,0.0,4.93684,...,0.0,3.07106,0.0,1.53552,5.36156,1.09478,1.4819,0.0,0.0,0.0
3,6800500,1.03174,13.8018,1.42261,1.39686,3.02766,1.60203,1.1636,1.42663,11.6419,...,29.4834,4.24819,1.31784,1.10408,3.00502,2.30459,5.59732,2.13512,5.63623,407.128
4,1127000,5.87653,1.05118,5.33145,3.55282,1.91852,1.24687,5.597,1.59335,22.2376,...,129.081,3.72341,0.0,3.02233,3.55282,1.20376,1.16182,1.38445,2.31667,2.73031
5,3336900,1.91224,1.66998,0.0,2.42294,2.0901,1.34112,1.38453,1.17256,1.18064,...,0.0,2.30495,1.00625,7.06495,1.91224,9.68871,0.0,0.0,9.04643,0.0
6,1394500,1.61369,18.6274,1.59871,131.72,0.0,13.2392,1.34342,4.74116,0.0,...,7.57473,1737.47,0.0,1.82945,16.4032,2.60811,2.29382,0.0,4.95816,2.39338
7,11203200,5.14674,1.14572,0.0,3.67915,3.64871,1.53977,1.1413,1.03023,4.55112,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,14187000,0.0,3.05763,100.745,0.0,0.0,0.0,0.0,0.0,0.0,...,1.18037,1.98357,3.65152,0.0,1.58877,2.00479,0.0,1.82461,0.0,2.88437
9,2226190,21.2604,3.34977,1.53774,2.38017,1.41771,4.77761,6.19001,1.5021,1.26938,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [41]:
ts_rps.shape

(13295, 51)

In [49]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data')
ts_rps.to_csv('ts_rps_2020-08-15.csv')