# Scraping USGS Gages
_Calvin Whealton_

This notebook downloads annual peak-flow records for a pre-screened list of USGS gages and converts each recorded peak into a return period.

In [None]:
import os
import pandas as pd
from scipy.stats import norm
import numpy as np
import datetime
import glob

Changing directory and loading the gage list. The file `usgs_gages.txt` has been pre-screened to include only gages with over 20 years of peak-flow data.

In [None]:
# Work from the gage-data folder so the station list can be opened by its bare file name.
gage_dir = '/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/gage_data'
os.chdir(gage_dir)

# Tab-delimited station list; lines beginning with '#' are treated as comments and skipped.
gage_df = pd.read_csv('usgs_gages.txt', comment='#', sep="\t")
gage_df.head()

Downloading the peak-flow record of every USGS gage in the list and saving one CSV file per gage.

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/gage_data/peak_flows')

# columns that the later return-period cells read from every saved file;
# a response without them is not a usable peak-flow table
required_cols = {'peak_dt', 'peak_va'}

# (gage number, reason) for every gage that could not be saved, so that one
# failed request does not abort the whole download and can be retried later
failed_gages = []

# looping over all gages
for i in gage_df.index:
    gage_no = "{:08d}".format(gage_df['site_no'][i]) # add leading zeros, if needed
    agency_cd = gage_df['agency_cd'][i] # USGS or other agency

    # formatting url for request
    url = 'https://nwis.waterdata.usgs.gov/nwis/peak?site_no='+gage_no+'&agency_cd=' + agency_cd + '&format=rdb'

    # reading data from url; network errors surface as OSError subclasses,
    # empty or malformed bodies as pandas parser errors
    try:
        gage_data_temp = pd.read_csv(url,sep='\t',comment='#')
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError) as err:
        failed_gages.append((gage_no, repr(err)))
        continue

    # checking if it is a bad url: the "no sites/data found" reply (and any
    # other non-data reply) parses into a frame without the peak columns
    if not required_cols.issubset(gage_data_temp.columns):
        failed_gages.append((gage_no, 'no peak-flow table in response'))
        continue

    # dropping the first row, which describes the length of each variable;
    # positional slicing also tolerates a response that has no rows at all
    gage_data_temp = gage_data_temp.iloc[1:]

    # saving under the zero-padded gage number
    gage_data_temp.to_csv(gage_no+'.csv')

print('{} of {} gages could not be saved'.format(len(failed_gages), len(gage_df.index)))
  

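Looping over the dataset to calculate the frequency of floods. The analysis will use the time period 1990-2018 (the return-period table built further below keeps columns for 1970-2019). The frequency model is the log-normal distribution without a time-trend parameter. The first cell below simply counts the peak-flow records across all gage files.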

In [None]:
os.chdir('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/gage_data/peak_flows')

# Only the per-gage CSV files: os.listdir() would also return hidden entries
# (names starting with '.'), which pd.read_csv cannot parse. Sorted so the
# processing order does not depend on the file system.
list_files = sorted(glob.glob('*.csv'))

# total number of peak-flow rows across all gage files
counter = sum(len(pd.read_csv(file_name)) for file_name in list_files)

print(counter)

In [None]:
# calculating average number of years per gage
# NOTE(review): both figures are typed in by hand, presumably the row count
# printed by the previous cell and the number of gage files -- confirm before reuse.
total_peak_rows = 662477
number_of_gages = 13852
total_peak_rows / number_of_gages

## making dataframe with return periods

In [None]:
# Move into the folder holding the downloaded per-gage peak-flow files.
# Only "real" gage files are wanted from it, i.e. nothing whose name begins with '.'.
peak_flow_dir = '/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/gage_data/peak_flows'
os.chdir(peak_flow_dir)

def listdir_nohidden(path):
    """List the non-hidden entries of a directory as full paths.

    Parameters
    ----------
    path : str
        Directory to list.

    Returns
    -------
    list of str
        `path` joined with each entry name, sorted alphabetically. The '*'
        pattern does not match names starting with '.', so hidden entries are
        left out. An empty list is returned when nothing matches.
    """
    # glob returns matches in file-system order; sorting makes every run
    # process the files, and therefore build its results, in the same order
    return sorted(glob.glob(os.path.join(path, '*')))

# full paths of every visible file in the peak-flow folder
list_files = listdir_nohidden('/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/gage_data/peak_flows')

# column labels of the results table: the gage identifier first,
# then one label per calendar year from 1970 through 2019
yr_list = ['Gage'] + [str(year) for year in range(1970, 2020)]

In [None]:
# Empty results table whose columns are the labels in yr_list
# ('Gage' followed by one column per year). The next cell fills it
# with one row per gage.
ts_rps = pd.DataFrame(columns=yr_list)
ts_rps.head()

In [None]:
def _annual_return_periods(gage_data):
    """Fit a log-normal model to one gage's peaks and return return periods by year.

    Parameters
    ----------
    gage_data : pandas.DataFrame
        Peak-flow table with a 'peak_va' column (peak values) and a string
        'peak_dt' column whose first four characters are the calendar year.

    Returns
    -------
    dict or None
        Maps each calendar year (4-character string) to the largest return
        period among that year's peaks. None when the gage has no positive
        peak value, in which case no log-normal fit is possible.
    """
    gage_data = gage_data.loc[pd.notnull(gage_data['peak_va'])]

    # np.array makes a copy, and the float dtype keeps the zero replacement
    # below from being truncated when the column was read as integers
    peaks = np.array(gage_data['peak_va'].values, dtype=float)
    positive_peaks = peaks[peaks > 0]
    if positive_peaks.size == 0:
        return None

    # zero peaks have no logarithm; place them just below the smallest positive peak
    peaks[peaks == 0] = 0.99 * positive_peaks.min()

    # log-space mean and standard deviation of the peak values
    log_peaks = np.log(peaks)
    log_mean = log_peaks.mean()
    log_std = log_peaks.std()

    # return period = 1 / exceedance probability; sf is 1 - cdf evaluated
    # without the loss of precision in the upper tail. The same adjusted
    # values used for the fit are used here, so zero peaks stay finite.
    rps = 1 / norm.sf(log_peaks, loc=log_mean, scale=log_std)

    # a calendar year can hold more than one peak; keep the largest return
    # period per year (rows with a missing date are left out by groupby)
    yrs = gage_data['peak_dt'].str.slice(start=0, stop=4)
    return pd.Series(rps, index=yrs.values).groupby(level=0).max().to_dict()


# year columns of the results table (everything except the gage identifier)
year_columns = [col for col in ts_rps.columns if col != 'Gage']

# one dict per gage; the table is built once at the end instead of being
# grown row by row, and every value is written by label rather than through
# chained indexing, which can silently assign to a temporary copy
gage_rows = []

for path in list_files:
    rp_by_year = _annual_return_periods(pd.read_csv(path))
    if rp_by_year is None:
        continue

    # 0 marks a year for which the gage has no recorded peak
    row = dict.fromkeys(year_columns, 0)
    # putting calculated return periods in the results (table years only)
    row.update({yr: rp for yr, rp in rp_by_year.items() if yr in row})
    # gage number = file name up to the first '.'
    row['Gage'] = os.path.basename(path).split(".")[0]
    gage_rows.append(row)

# rebuilding from scratch keeps the cell safe to re-run
ts_rps = pd.DataFrame(gage_rows, columns=list(ts_rps.columns))

In [None]:
# preview the first 20 rows of the return-period table
ts_rps.head(20)

In [None]:
# (number of rows, number of columns) of the return-period table
ts_rps.shape

In [None]:
# Switch to the processed-data folder and write the return-period table there.
processed_dir = '/Users/calvinwhealton/Documents/GitHub/floods_housing_zipcode/data/processed_data'
os.chdir(processed_dir)

output_name = 'ts_rps_2020-08-15.csv'
ts_rps.to_csv(output_name)