# Reading in citizen science gauge data (1-5 min)

### Notes
- Intense QC code read in, not sure how easy that will be, I had to clone it off Github and change line 10 of code as got an error with a package, proabbly due to Python versions. Think the package needs updating on Github but not sure who is still in charge of maintaining it. Need ETCCDI data (available in example data for Intense QC)


### What does the code do
- Reads in Urban Observatory rain gauge data
- Reads in National Green Infrastructure Facility rain gauge data
- Reads in Acomb Flood Group rain gauge data

### What does the code need to do?
1. Download the data
    - Try API
    - Sort out problems with wrong accumulation methods
    - Save data
2. Quality control data
    - Use intense-qc code
    - Remove failed gauges
    - Remove failed observations
    - Save Qc'ed data
    
### Outputs format
- `\root` folder path 
    - `\UO` folder path (1-5 minute rain gauge data)
        - `<station-id>_<eastings>_<northings>.csv` - individual (varying resolution) gauge data for station `<station-id>`
        - `qc` folder path (quality controlled rain gauge data)
            - `<station-id>_<eastings>_<northings>.csv` - individual (15 minute resolution) quality controlled rain gauge data for station `<station-id>`
    - `\NGIF` folder path (1 minute rain gauge data)
        - `<station-id>_<eastings>_<northings>.csv` - individual (varying resolution) gauge data for station `<station-id>`
        - `qc` folder path (quality controlled rain gauge data)
            - `<station-id>_<eastings>_<northings>.csv` - individual (15 minute resolution) quality controlled rain gauge data for station `<station-id>`
    - `\CS` folder path (citizen science rain gauge data)
        - `<station-id>_<eastings>_<northings>.csv` - individual (varying resolution) gauge data for station `<station-id>`
        - `qc` folder path (quality controlled rain gauge data)
            - `<station-id>_<eastings>_<northings>.csv` - individual (15 minute resolution) quality controlled rain gauge data for station `<station-id>`

In [3]:
# Import relevent packages
import pandas as pd
import numpy as np
import requests
from os.path import join, exists, split
import os
from datetime import datetime
import io

In [4]:
##### Inputs to change
start_date = "2023-06-20"
end_date = "2023-06-30"

out_path = r"C:\Users\Amy\OneDrive - Newcastle University (1)\Documents\PYRAMID\data\realtime"

# Bounding box for data 
e_l, n_l, e_u, n_u = [355000, 534000, 440000, 609000]
#lon_l, lat_l, lon_u, lat_u = [-2.6771176, 54.702623, -1.3749203, 55.361917]
bbox = [e_l, e_u, n_l, n_u]

# Quality control test data path
static_data_path = join(out_path, "static")
intense_path = join(static_data_path, "intense-qc")
etccdi_data_path = join(join(intense_path, "tests"), "etccdi_data")


In [5]:
# File paths
uo_outpath = join(out_path, "UO")
if not exists(uo_outpath):
    os.mkdir(uo_outpath)
    
cs_outpath = join(out_path, "CS")
if not exists(cs_outpath):
    os.mkdir(cs_outpath)
    
ngif_outpath = join(out_path, "NGIF")
if not exists(ngif_outpath):
    os.mkdir(ngif_outpath)
    
# new 15 minute timestamp
new_timestamp = pd.date_range(
    pd.to_datetime(start_date),
    pd.to_datetime(end_date) + pd.Timedelta(1, "d"),
    freq=str(15 * 60) + "s", 
    tz="UTC"
)

In [6]:
### NGIF rain gauge
easting = "424038"
northing = "564414"
sensor_loc = "Ensemble E Pit Gauge"
sensor_id = "Pit rain gauge"
sensor = sensor_loc.replace(" ", "-")

# convert timestamp
start_time = pd.to_datetime(start_date)
end_time = pd.to_datetime(end_date)

dates = pd.date_range(start_time, end_time)
dates = dates.format("%f")[1:]

tabs = []

for i in range(len(dates) - 1):
    path = "https://ngif.newcastle.ac.uk/download/Ensemble%20E/Pit%20Rain%20Gauge%23%401m/" + dates[i] + "/" + dates[i + 1]
    try:
        r = requests.get(path, verify=False)
        tab = pd.read_csv(io.StringIO(r.text))
        if tab.shape[0] > 0:
            tabs.append(tab)
    except:
        print("Failed", dates[i])
        
tabs = pd.concat(tabs, ignore_index=True)

high_res = 0.2 * pd.Series(tabs[tabs.columns[1]].values, index = pd.to_datetime(tabs.time)) # Stored as tips, 1 is one tip, gauge should have 0.2mm bucket?
high_res.index = high_res.index.tz_localize(None)
high_res = high_res.sort_index()
high_res.to_csv(join(ngif_outpath, sensor + "_" + str(easting) + "_" + str(northing) + ".csv"))



Failed 2023-06-23
Failed 2023-06-24
Failed 2023-06-25




Failed 2023-06-26
Failed 2023-06-27




In [None]:
### Urban Observatory gauge data (including CS Acomb data)

# Get list of rainfall stations

# convert timestamp
start_time = pd.to_datetime(start_date)
end_time = pd.to_datetime(end_date)

api_date_string_format = "%Y%m%d%H%M%S"

# get sensors
sensor_params = dict(
    variable='Rainfall')

r = requests.get('http://uoweb3.ncl.ac.uk/api/v1.1/sensors/csv/', sensor_params)
sensor_info = pd.read_csv(io.StringIO(r.text))

# Function to fix dogdy accumulation that is done by UO maintenence
def rescale(data):
    rescaled = pd.Series(np.nan, index=data.index)
    d = data.dropna()
    diffs = d.diff()
    cond = (diffs > 0) & (~np.isnan(diffs))
    rescaled.loc[diffs[cond].index] = diffs.loc[diffs[cond].index]
    return rescaled

for sensor in sensor_info["Sensor Name"]:

    # get sensor data
    data_params = dict(
        data_variable='Rainfall',  # variable=Daily%20Accumulation%20Rainfall%2CRainfall
        starttime=pd.to_datetime(start_date).strftime(api_date_string_format),
        endtime=pd.to_datetime(end_date).strftime(api_date_string_format)
    )

    path = 'http://uoweb3.ncl.ac.uk/api/v1.1/sensors/{sensor_name}/data/csv/'
    path = path.replace("{sensor_name}", sensor)

    r = requests.get(path, data_params)
    if r.status_code == 200:
        try:
            data = pd.read_csv(io.StringIO(r.text))
            
            if len(data) > 0:
                transformer = Transformer.from_crs("epsg:4326", "epsg:27700")
                easting, northing = transformer.transform(data["Sensor Centroid Latitude"].iloc[0],
                                                          data["Sensor Centroid Longitude"].iloc[0])
                
                data_df = pd.Series(data.Value.values, index=pd.to_datetime(data.Timestamp))
                rescaled = pd.DataFrame(index=data_df.index)
                rescaled[sensor] = np.nan
                
                if "ACOMB" in sensor:
                    
                    rescaled.loc[: "2022-05-17", sensor] = data_df["0"].loc[: "2022-05-17"]
                    rescaled.loc["2022-05-17" :, sensor] = rescale(data_df["0"].loc["2022-05-17" :])
                    rescaled.to_csv(join(cs_outpath, sensor + "_" + str(easting) + "_" + str(northing) + ".csv"))
                else:
                    if "FS_" not in sensor:
                        rescaled[sensor] = rescale(data_df["0"])
                    else:
                        rescaled[sensor] = data_df["0"]
                    
                    rescaled.to_csv(join(uo_outpath, sensor + "_" + str(easting) + "_" + str(northing) + ".csv"))
        except:
            print("Not worked.")

In [7]:
### Accumulate up gauge data

for root_filepath in [cs_outpath, ngif_outpath, uo_outpath]:
    
    outpath_15min = join(root_filepath, "15min")
    if not exists(outpath_15min):
        os.mkdir(outpath_15min)
        
    for f in os.listdir(root_filepath):
        if f.endswith(".csv"):
            tab = pd.read_csv(join(root_filepath, f), index_col=0)
            tab.index = pd.to_datetime(tab.index)
            tab_15min = 4 * tab.resample(str(15*60) + "s").sum() # should be mm/h?
            
            filled_gaps = pd.Series(np.nan, index=new_timestamp)
            filled_gaps.loc[tab_15min.index] = tab_15min.iloc[:, 0]
            
            filled_gaps.to_csv(join(outpath_15min, f))


KeyError: "None of [DatetimeIndex(['2023-06-20 00:00:00', '2023-06-20 00:15:00',\n               '2023-06-20 00:30:00', '2023-06-20 00:45:00',\n               '2023-06-20 01:00:00', '2023-06-20 01:15:00',\n               '2023-06-20 01:30:00', '2023-06-20 01:45:00',\n               '2023-06-20 02:00:00', '2023-06-20 02:15:00',\n               ...\n               '2023-06-30 21:30:00', '2023-06-30 21:45:00',\n               '2023-06-30 22:00:00', '2023-06-30 22:15:00',\n               '2023-06-30 22:30:00', '2023-06-30 22:45:00',\n               '2023-06-30 23:00:00', '2023-06-30 23:15:00',\n               '2023-06-30 23:30:00', '2023-06-30 23:45:00'],\n              dtype='datetime64[ns]', name='time', length=1056, freq='900S')] are in the [index]"

In [8]:
tab.index

DatetimeIndex(['2023-06-20 00:00:00', '2023-06-20 00:01:00',
               '2023-06-20 00:02:00', '2023-06-20 00:03:00',
               '2023-06-20 00:04:00', '2023-06-20 00:05:00',
               '2023-06-20 00:06:00', '2023-06-20 00:07:00',
               '2023-06-20 00:08:00', '2023-06-20 00:09:00',
               ...
               '2023-06-30 23:50:00', '2023-06-30 23:51:00',
               '2023-06-30 23:52:00', '2023-06-30 23:53:00',
               '2023-06-30 23:54:00', '2023-06-30 23:55:00',
               '2023-06-30 23:56:00', '2023-06-30 23:57:00',
               '2023-06-30 23:58:00', '2023-06-30 23:59:00'],
              dtype='datetime64[ns]', name='time', length=7968, freq=None)

In [None]:
# Quality control gauge data

# Import relevent packages
os.sys.path.append(r"C:\Users\Amy\OneDrive - Newcastle University (1)\Documents\Jupyter\intense-qc")
from intense import gauge, qc, utils
from pyproj import Transformer

for path in gauge_paths:

    qc_output_path = join(split(path)[0], "qc")
    if not exists(qc_output_path):
        os.mkdir(qc_output_path)

    output_file = join(qc_output_path, split(path)[1])

    data_raw = pd.read_csv(path, index_col=0, parse_dates=True).iloc[:, 0]
    data_raw = data_raw.sort_index()
    res_min = pd.Series(data_raw.dropna().index).diff().median().seconds / 60

    data_15min = data_raw.resample(str(60*15) + "s").sum() / 4 # gives mm/h every 15 min
    try:
        if data.index.dtype == "datetime64[ns, UTC]":
            data_1h = data_15min.resample("1h").mean()
        else:
            data_1h = data_15min.resample("1h").mean().tz_localize('UTC')
    except:
        print(name, "not read in.")

    station_id, eastings, northings = split(path)[1].split(".")[0].split("_")

    data = data_1h.dropna()
    loc = (eastings, northings)

    flags = get_gauge_flags(data, loc)

    # only include gauge data that is not flagged
    if not flags["gauge"]:
        # currenly ignoring year flag as not enough data

        # remove flagged observationns
        if len(flags["obs"]) > 0:
            cond = [idx in flags["obs"] for idx in data_15min.index.round("1h")]
            data_15min.loc[cond] = np.nan

        data_15min.tz_localize('UTC').to_csv(output_file)