# Reading in Environment Agency rain gauge data (15 min)

### Notes
- Intense QC code is read in; not sure how easy that will be. I had to clone it from GitHub and change line 10 of the code because I got an error with a package, probably due to Python versions. I think the package needs updating on GitHub, but I am not sure who is still in charge of maintaining it. It needs ETCCDI data (available in the example data for Intense QC).


### What does the code do?

1. Download the data
    - Try historic API
    - Try real-time API
    - Save data
    

### Outputs format
- `\root` folder path 
    - `\EA` folder path (15 minute rain gauge data)
        - `<station-id>_<eastings>_<northings>.csv` - individual 15-minute gauge data for station `<station-id>`
        - `\15min` folder path (15 minute rain gauge data) with filled in timestamp
            - `<station-id>_<eastings>_<northings>.csv` - individual 15-minute gauge data for station `<station-id>`

In [3]:
# Import relevant packages
import pandas as pd
import numpy as np
import requests
from os.path import join, exists
import os
from datetime import datetime
import io

In [2]:
##### Inputs to change
# Date range to download (inclusive, ISO "YYYY-MM-DD" strings).
start_date = "2023-06-20"
end_date = "2023-06-30"

# Root folder that the downloaded gauge data is written beneath.
out_path = r"C:\Users\Amy\OneDrive - Newcastle University (1)\Documents\PYRAMID\data\realtime"

# Bounding box for data: lower/upper eastings and northings.
e_l = 355000
n_l = 534000
e_u = 440000
n_u = 609000
# Same extent, ordered as [east lower, east upper, north lower, north upper].
bbox = [e_l, e_u, n_l, n_u]

In [4]:
# File paths
# Raw 15-minute gauge files go in <out_path>/EA; the versions re-indexed onto
# a complete 15-minute timestamp grid go in <out_path>/EA/15min.
ea_outpath = join(out_path, "EA")
ea_15min_outpath = join(ea_outpath, "15min")

# makedirs creates any missing parent folders (including out_path itself) and
# exist_ok=True makes re-running the cell harmless when they already exist.
os.makedirs(ea_outpath, exist_ok=True)
os.makedirs(ea_15min_outpath, exist_ok=True)

In [5]:
# Download the data
#
# 1. Fetch the list of rainfall stations and keep those inside the bounding box.
# 2. If the requested period starts within the last 28 days, query the
#    real-time readings API once per station.
# 3. Otherwise download the daily archive CSVs, keep the rows belonging to the
#    selected stations, and split them per station.
# Each station is written to <ea_outpath>/<station-id>_<easting>_<northing>.csv.

# Get list of rainfall stations
root = "http://environment.data.gov.uk/flood-monitoring"
response = requests.get(root + "/id/stations?parameter=rainfall")
response.raise_for_status()  # fail early with a clear HTTP error
r = response.json()["items"]
df = pd.json_normalize(r)

# Get stations in area
extent_stations = df.loc[(df.northing > n_l) & (df.northing < n_u) & (df.easting > e_l) & (df.easting < e_u)]
station_ids = extent_stations.stationReference.to_list()
names = np.array(extent_stations.stationReference + "_" + extent_stations.easting.astype(int).astype(
    str) + "_" + extent_stations.northing.astype(int).astype(str) + ".csv")

now = datetime.now()
# use real-time API if possible (last 28 days)
if now - pd.to_datetime(start_date) < pd.Timedelta("28d"):
    # NOTE(review): the readings endpoint pages its results (default page size
    # believed to be 500 rows); a multi-day window of 15-minute data exceeds
    # that, so an explicit _limit is requested. Confirm against the API docs.
    readings_url = (
        "http://environment.data.gov.uk/flood-monitoring/id/measures/"
        "{id}-rainfall-tipping_bucket_raingauge-t-15_min-mm/readings"
        "?parameter=rainfall&startdate={startDate}&enddate={endDate}&_limit=10000"
    )
    for station, name in zip(station_ids, names):
        path = readings_url.format(id=station, startDate=start_date, endDate=end_date)
        # Best effort per station: report the failure (with its cause) and
        # carry on with the remaining stations.
        try:
            response = requests.get(path)
            response.raise_for_status()
            readings = pd.json_normalize(response.json()["items"])
            if not readings.empty:
                data = pd.Series(readings.value.values, index=pd.to_datetime(readings.dateTime))
                data.to_csv(join(ea_outpath, name))
        except Exception as err:
            print(station, "not worked.", repr(err))

# if not use historical API (seems to only work for last year- not very historical)
else:
    dates = pd.date_range(start_date, end_date)
    iids = np.array(extent_stations.measures.str[0].str["@id"])

    # Accumulate across ALL days; the list must be created once, before the loop.
    full_data = []
    for date_ts in dates:
        date = date_ts.strftime("%Y-%m-%d")
        path = f"http://environment.data.gov.uk/flood-monitoring/archive/readings-full-{date}.csv"
        r = requests.get(path)
        if r.status_code == 200:
            day = pd.read_csv(io.StringIO(r.text), index_col=0, parse_dates=True)
            # Keep only the rows whose measure belongs to a selected station.
            full_data.append(day[day.measure.isin(iids)])
        else:
            print(date, "archive not available (HTTP", str(r.status_code) + ").")

    if full_data:
        full_data = pd.concat(full_data).groupby("stationReference")

        for station, name in zip(station_ids, names):
            try:
                # TODO: check units? Is this mm or mm/h because if it is mm it needs multiplying by 4?
                full_data.get_group(station).value.to_csv(join(ea_outpath, name))
            except (KeyError, OSError) as err:
                # KeyError: the station has no rows in the downloaded archives.
                print(station, "not worked.", repr(err))
    else:
        print("No archive data could be downloaded between", start_date, "and", end_date)

In [13]:
# Fill in the data in case of missing values (don't think there are any, but
# just in case): every station file is re-indexed onto a complete 15-minute
# UTC grid running from start_date to end_date + 1 day, with NaN wherever no
# reading exists, and written to the 15min folder under the same file name.
new_timestamp = pd.date_range(
    pd.to_datetime(start_date),
    pd.to_datetime(end_date) + pd.Timedelta(1, "d"),
    freq="15min",
    tz="UTC",
)

for f in os.listdir(ea_outpath):
    if not f.endswith(".csv"):
        continue

    tab = pd.read_csv(join(ea_outpath, f), index_col=0)
    # utc=True so that tz-naive timestamps can still be aligned with the
    # tz-aware grid (they are interpreted as UTC).
    tab.index = pd.to_datetime(tab.index, utc=True)

    values = tab.iloc[:, 0]
    # reindex needs unique labels; keep the last reading for a repeated timestamp.
    values = values[~values.index.duplicated(keep="last")]

    # reindex (rather than .loc assignment) ignores readings that fall outside
    # the grid instead of raising KeyError, and inserts NaN for missing slots.
    filled_in = values.reindex(new_timestamp)
    filled_in.name = None  # unnamed series, so the CSV header stays "0"

    filled_in.to_csv(join(ea_15min_outpath, f))