# downloading and quality controlling gauge data

In this notebook, I have an example of the process I used to download and quality control gauge data from the Synoptic data API (https://synopticdata.com/weatherapi/)

To run the code, you'll need a Synoptic account. There is a free version available, which is what I use, although it's good to note that there are limitations on the amount and time period of data you can download with the free accounts

Last thing before we get going is to download this spreadsheet, which I use to translate from Synoptic network IDs to actual network names: https://docs.google.com/spreadsheets/d/1_6kWgqkqEPXW50-1HlkU6ylgGasvyzDO/edit?usp=sharing&ouid=112095056147865565966&rtpof=true&sd=true

In [None]:
# import libraries

import json
import os
import urllib.request as req
from datetime import datetime

import cartopy.crs as ccrs
import cartopy.feature
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import requests
import xarray as xr
from tqdm import trange

import resample_qc

pd.set_option('display.max_rows', None) # optional, for showing all rows in a table

### setup for running the notebook

In [None]:
# locations of the input files used by the rest of the notebook

# USER INPUT! path to network spreadsheet
netID_path = '/path/to/synoptic_netids.xlsx'
# table used further down to look up a network name from a Synoptic network ID
netids = pd.read_excel(io=netID_path)

# USER INPUT! whether or not to use atlas14 data for QC
using_atlas14 = False
# USER INPUT! path to atlas14 (if using, otherwise ignore)
atlas14_path = '/path/to/atlas14_data/'

In [None]:
# set up for downloading data from Synoptic

start = '202308180000'      # USER INPUT! start date and time in YYYYMMDDHHMM
end   = '202308230000'      # USER INPUT! end date and time in YYYYMMDDHHMM

# USER INPUT! API token associated with synoptic data account.
# The token is taken from the SYNOPTIC_API_TOKEN environment variable when it
# is set, so the secret does not have to be saved inside the notebook. If the
# variable is not set this falls back to '' exactly as before; replace the ''
# with your token only if you cannot use an environment variable.
api_token = os.environ.get('SYNOPTIC_API_TOKEN', '')

lat_bounds = [32, 38]       # USER INPUT! lower and upper latitude boundaries for study area
lon_bounds = [-123, -114]   # USER INPUT! lower and upper longitude boundaries for study area
units = 'metric'            # USER INPUT! units - 'metric' for mm and 'english' for inches

### Download gauge data from Synoptic API

In [None]:
# download gauge data for desired time period
#
# The study area is split into latitude bands and one API request is made per
# band; the per-band station lists are then merged into `station_list`.

band_width = 0.2 # target height of each latitude band, in degrees

# Split [south, north] into equal bands no taller than band_width. linspace pins
# the last edge to the northern boundary, so the whole study area is requested
# even when its extent is not an exact multiple of band_width. Rounding keeps
# floating-point noise (e.g. 32.800000000000004) out of the request URL.
lat_south, lat_north = lat_bounds
n_bands = max(1, int(np.ceil(round((lat_north - lat_south) / band_width, 6))))
lat_list = np.round(np.linspace(lat_south, lat_north, n_bands + 1), 6)

dict_list = [] # one list of station dictionaries per successfully downloaded band

for i in trange(len(lat_list) - 1): # loop over lat bands
    # define API request URL
    bbox = f'{lon_bounds[0]},{lat_list[i]},{lon_bounds[1]},{lat_list[i+1]}' # area for data download
    api_root = 'https://api.synopticdata.com/v2/stations/timeseries'
    api_params = (
        f'?token={api_token}&'
        f'start={start}&end={end}&'
        f'bbox={bbox}&'
        f'vars=precip&precip=1&all_reports=1&'
        f'units={units}'
    )
    api_request_url = api_root + api_params

    # request and download data; the context manager closes the connection
    # even if reading or parsing the response fails
    with req.urlopen(api_request_url) as response:
        data_dict = json.loads(response.read())

    # check for errors: any response without a STATION entry carries no data,
    # so report the API's message (when there is one) and move to the next band
    if 'STATION' not in data_dict:
        print(data_dict.get('SUMMARY', {}).get('RESPONSE_MESSAGE',
                                               'no STATION data in response'))
        continue

    # add data from lat band to list
    dict_list.append(data_dict['STATION'])

# combine data from all lat bands into one list. Neighbouring bands share an
# edge latitude, so a station lying exactly on an edge may be returned by both
# requests; keep only the first copy so every station ID appears once.
station_list = []
seen_stids = set()
for band_stations in dict_list:
    for station in band_stations:
        if station['STID'] in seen_stids:
            continue
        seen_stids.add(station['STID'])
        station_list.append(station)
print('number of stations: ', len(station_list))

### check that all stations downloaded have precipitation data

In [None]:
# report every station that is missing one of the two precipitation variables:
# first all stations without interval precip, then all without accumulated precip
for obs_key, message in (('precip_intervals_set_1d', 'no intervals: '),
                         ('precip_accumulated_set_1d', 'no accumulated: ')):
    for station in station_list:
        if obs_key in station['OBSERVATIONS']:
            continue
        print(message, station['STID'])

### resample and perform quality control on station data

NOTE: for help with the resample_qc.py functions, you can look at the resample_qc.py file, or read the docstring of a function. For example, running the line
? resample_qc.resample
shows the docstring of the resample function.

In [None]:
# Build the gauge information/metrics table (df_all) and collect the resampled
# hourly data of every gauge (resampled_data_dict, keyed by station ID).

# columns of the gauge information/metrics dataframe
summary_columns = ['STID','Name','Network','Latitude','Longitude',
                   'Storm_Total','Max_Intensity','QC']
# rows are collected here (keyed by position in station_list) and turned into a
# dataframe once after the loop, instead of growing the dataframe row by row
summary_rows = {}
# set up dictionary to store gauge data
resampled_data_dict = {}

# loop through each station
for i in trange(len(station_list)):
    # get station info
    station = station_list[i]                                       # select station
    stid = station['STID']                                          # station ID
    observations = station['OBSERVATIONS']                          # downloaded time series

    # a station without interval precipitation cannot be processed below;
    # skip it rather than letting a KeyError abort the whole loop
    if 'precip_intervals_set_1d' not in observations:
        print('skipping, no intervals: ', stid)
        continue

    name = station['NAME']                                          # station name
    lat = float(station['LATITUDE'])                                # latitude
    lon = float(station['LONGITUDE'])                               # longitude
    mnet_id = int(station['MNET_ID'])                               # mesonet ID

    # network name, from mesonet ID; fall back to a placeholder when the ID is
    # not listed in the spreadsheet instead of failing with an IndexError
    network_matches = netids.loc[netids['ID'] == mnet_id, 'Name'].values
    if len(network_matches) > 0:
        network = network_matches[0]
    else:
        network = f'unknown (MNET_ID {mnet_id})'

    # resample to hourly
    df_raw = pd.DataFrame.from_dict(observations)
    datetimes_raw = df_raw['date_time']
    precip_raw = df_raw['precip_intervals_set_1d']
    df_resampled = resample_qc.resample(precip_raw, datetimes_raw)

    # find maximum hourly intensity (from the resampled data) and total storm
    # accumulation (sum of the raw interval values)
    maximum = df_resampled.max()
    total = precip_raw.sum()

    # quality control
    qc_flag = resample_qc.quality_control(
        precip_raw, datetimes_raw,
        PFDS=using_atlas14,
        PFDS_folder=atlas14_path,
        lat=lat, lon=lon)

    # store station info for the dataframe
    summary_rows[i] = [stid, name, network, lat, lon, total, maximum, qc_flag]
    # store resampled hourly data to dictionary
    resampled_data_dict[stid] = df_resampled

# one row per processed station, indexed by its position in station_list
df_all = pd.DataFrame.from_dict(summary_rows, orient='index', columns=summary_columns)

### always good to check the data

In [None]:
# stations whose QC flag is 0 (passed quality control), smallest storm total first
df_all.loc[df_all['QC'].eq(0)].sort_values('Storm_Total')

In [None]:
# stations whose QC flag is not 0 (did NOT pass quality control), smallest storm total first
df_all.loc[df_all['QC'].ne(0)].sort_values('Storm_Total')

### save the results!

You can save the gauge information dataframe, but note that this does not contain the hourly gauge data, just the accumulations and max intensities. Below is also code for creating and saving a netCDF file, which stores gauge information (such as station ID, location, network, and QC flag), along with the resampled hourly data. 

In [None]:
# optional: write the gauge information table to a CSV file in the working directory
df_all.to_csv(path_or_buf='gauge_info_df.csv')

In [None]:
# prepare the hourly data for the netCDF file: every station's resampled
# series is placed on one common hourly time axis

# hourly time axis running from the download start up to (not including) the end
start_dt, end_dt = (datetime.strptime(stamp, '%Y%m%d%H%M') for stamp in (start, end))
time_array = np.arange(start_dt, end_dt, dtype='datetime64[h]')


def _values_on_time_axis(station_id):
    """Return one station's resampled values reindexed onto `time_array`."""
    series = resampled_data_dict[station_id].copy()  # copy so the stored data is left untouched
    series.index = series.index.values.astype('datetime64[h]')
    return series.reindex(time_array).values


# one array of values per row of df_all, in the same order as df_all
hourly_data = [_values_on_time_axis(station_id) for station_id in df_all['STID']]

In [None]:
# assemble the hourly data and the per-station information into one xarray dataset

# variables holding a single value per station: dataset variable -> df_all column
station_columns = {
    'total': 'Storm_Total',
    'maximum': 'Max_Intensity',
    'network': 'Network',
    'qc_flag': 'QC',
}

# the hourly series come first, followed by the per-station variables
dataset_vars = {'precip_int': (['station', 'time'], hourly_data)}
dataset_vars.update(
    (var_name, (['station'], df_all[column].values))
    for var_name, column in station_columns.items()
)

# stations are labelled by their ID; lon/lat are attached along the station dimension
dataset_coords = {
    'station': df_all['STID'].values,
    'time': time_array,
    'lon': ('station', df_all['Longitude'].values),
    'lat': ('station', df_all['Latitude'].values),
}

ds = xr.Dataset(data_vars=dataset_vars, coords=dataset_coords)

In [None]:
# write the dataset to a netCDF file in the working directory
ds.to_netcdf(path='mesowest_gaugedata_hourly.nc')