In [1]:
from os.path import join, exists
import os
import numpy as np
import pandas as pd

In [3]:
##### Inputs to change
# Period of interest as ISO date strings.
# NOTE(review): neither date is read by the cells visible here - confirm a later step uses them.
start_date = "2023-06-20"
end_date = "2023-06-30"

# Root data folder. The processing loop expects one sub-folder of CSVs per gauge
# source underneath it, and the QC code/data live under "static".
input_path = r"C:\Users\Amy\OneDrive - Newcastle University (1)\Documents\PYRAMID\data\realtime"
output_path = join(input_path, "output")
# exist_ok=True keeps this cell idempotent: re-running it (or racing another
# process) no longer depends on a separate exists() check.
os.makedirs(output_path, exist_ok=True)

# Bounding box for data: lower/upper eastings and northings
# (presumably British National Grid metres - TODO confirm).
e_l, n_l, e_u, n_u = [355000, 534000, 440000, 609000]
# Note the order change: bbox is [e_min, e_max, n_min, n_max].
bbox = [e_l, e_u, n_l, n_u]

# Quality control test data path
static_data_path = join(input_path, "static")
intense_path = join(static_data_path, "intense-qc")
etccdi_data_path = join(join(intense_path, "tests"), "etccdi_data")

# Adds intense folder location to path so can run correct version.
# Guarded so that re-running the cell does not pile up duplicate entries.
# NOTE(review): append() puts the folder at the END of sys.path, so an installed
# "intense" package would still win - confirm whether insert(0, ...) is intended.
if intense_path not in os.sys.path:
    os.sys.path.append(intense_path)
from intense import gauge, qc, utils
from pyproj import Transformer

In [4]:
def get_gauge_flags(data, loc, station_id=None):
    '''Apply the working Intense QC hourly checks to one gauge's data.

    Args:
        data: the gauge data (pd.Series with a datetime index).
        loc: the gauge location as (eastings, northings) in EPSG:27700.
            Values may be numbers or numeric strings.
        station_id: identifier passed to gauge.Gauge. Optional for backward
            compatibility: when omitted, the module-level ``station_id``
            variable is used, which is how existing two-argument callers work.

    Returns:
        dict with keys
            "gauge": bool, True when the whole-gauge checks do not sum to 0,
            "years": array of years selected by the annual checks,
            "obs":   index labels whose per-observation flags sum to > 0.

    Raises:
        NameError: station_id was not passed and no global of that name exists.
    '''
    if station_id is None:
        # Legacy path: the function used to read a global set by the calling
        # loop. Keep that working, but fail with an explicit message.
        try:
            station_id = globals()["station_id"]
        except KeyError:
            raise NameError(
                "station_id was not passed and no global 'station_id' is defined"
            ) from None

    # Coordinates parsed from file names arrive as strings; make them numeric
    # before handing them to the transformer.
    eastings, northings = (float(coord) for coord in loc)

    # convert coordinates to lat/lon
    transformer = Transformer.from_crs("epsg:27700", "epsg:4326")
    latitude, longitude = transformer.transform(eastings, northings)

    # create gauge object
    rain_gauge = gauge.Gauge(
        station_id,
        path_to_original_data="",
        latitude=latitude,
        longitude=longitude,
        original_timestep="15min",
        original_units="mm/h",
        new_units="mm/h",
        new_timestep="1h",
        data=data
    )
    rain_gauge.get_info()

    # create qc object
    test = qc.Qc(
        gauge=rain_gauge,
        etccdi_data_folder=etccdi_data_path
    )

    # Checks that don't work on existing data (kept for reference, not run):
    #   test.check_percentiles()
    #   test.check_k_largest()
    #   test.check_intermittency()
    #   test.change_in_min_val_check()  # Change in minimum value check, homogeneity
    #                                   # check to see if the resolution of the data
    #                                   # has changed. Change flag, flag years
    #   test.cwd_check()                # missing function in utils file???
    #   test.find_neighbours("hourly")  # frequency: must be either hourly, daily or
    #                                   # monthly. Names or names and paths of
    #                                   # neighbouring stations. Conditions are: must
    #                                   # be within 50km, at least 3 years overlap,
    #                                   # select the closest 10; we don't have three
    #                                   # years of data
    #   check_hourly_neighbours(), check_daily_neighbours(), check_monthly_neighbours()
    #   test.get_flags()                # runs all checks, fails at find_neighbours()

    ### run checks ###

    # Simple precipitation intensity index: SDII from ETCCDI and from gauge
    # values (sdii_gridded, sdii_gauge). Not sure how to use this yet, so any
    # value above an arbitrary threshold raises the flag.
    flagged_sdii = 0
    sdii_thresh = 100 # just arbitrary atm
    if any(np.array(test.get_sdii()) > sdii_thresh):
        flagged_sdii = 1

    # Flag data if any of these don't return 0:
    gauge_checks = [
        test.check_days_of_week(), # Checks if proportions of rainfall in each day is significantly different
        test.check_break_point(), # Pettitt breakpoint check
        flagged_sdii
    ]

    flagged_gauge = sum(gauge_checks) != 0
    #if flagged_gauge:
    #    print("Gauge", np.array(["days of week", "break point", "sdii"])[np.array(gauge_checks) == 1])
    #    print(test.check_break_point())

    # Flag individual data observations if don't return 0:
    obs_checks = pd.DataFrame(index=rain_gauge.data.index)
    # Checks if and to what degree the world record has been exceeded by each
    # rainfall value: 4, 3, 2 or 1 if exceeded by > 1.5x, 1.33x, 1.22x or 0x
    # respectively and 0 if not exceeded.
    obs_checks["world_record"] = test.world_record_check_ts()
    # Checks hourly values against maximum 1-day precipitation; magnitudes of
    # exceedance for each day.
    obs_checks["rx1day"] = test.rx1day_check_ts()
    # ETCCDI provide an index for maximum length of dry spell (Consecutive Dry
    # Days: maximum number of consecutive days with RR < 1mm). Looks for a
    # suspicious number of consecutive dry hours recorded; magnitudes of
    # exceedance of the length of the longest dry period.
    obs_checks["cdd"] = test.cdd_check()
    # Check daily accumulations. Suspect daily accumulations are flagged where a
    # recorded rainfall amount is preceded by 23 hours with no rain. A threshold
    # of 2x the mean wet day amount for the corresponding month is applied to
    # increase the chance of identifying accumulated values at the expense of
    # genuine, moderate events.
    obs_checks["daily_accums"] = test.daily_accums_check()
    # Check monthly accumulations. Flags month prior to high value.
    obs_checks["monthly_accums"] = test.monthly_accums_check()
    # Streaks: the same value repeated in a run. Currently records streaks of
    # 2hrs in a row or more over 2 x monthly mean rainfall (per the upstream
    # description, SDII is substituted for the monthly mean so thresholds are
    # independent of the rainfall time series).
    obs_checks["streaks"] = test.streaks_check()
    flagged_obs = obs_checks.index[obs_checks.sum(1) > 0]

    # Flags each individual year:
    year_flags = np.array([
        test.r99ptot_check_annual(), # Check against R99pTOT: annual total PRCP when RR > 99p. Magnitudes of exceedance for yearly 99th percentiles
        test.prcptot_check_annual() # Check against PRCPTOT: annual total precipitation in wet days. Magnitudes of exceedance for yearly totals
    ])

    # NOTE(review): this selects years whose summed flags are NEGATIVE, whereas
    # observations above are flagged on sums > 0 - confirm the intended sign.
    # It also assumes each annual check returns exactly one value per calendar
    # year between the first and last timestamp - TODO confirm.
    first_year = rain_gauge.data.index.min().year
    last_year = rain_gauge.data.index.max().year
    flagged_years = np.arange(first_year, last_year + 1)[year_flags.sum(0) < 0]

    return {"gauge" : flagged_gauge, "years" : flagged_years, "obs" : flagged_obs}

In [5]:
# Gauge data providers; each one has its own sub-folder under input_path.
gauge_sources = ["EA", "UO", "CS", "NGIF"]

# For every CSV of every source: QC the hourly means, then write the 15-minute
# data back out with flagged observations blanked. Gauges that fail a
# whole-gauge check are not written at all.
for source in gauge_sources:

    input_filepath = join(input_path, source)
    output_filepath = join(output_path, source)

    # Idempotent: safe on re-runs without a separate exists() check.
    os.makedirs(output_filepath, exist_ok=True)

    for f in os.listdir(input_filepath):

        if not f.endswith(".csv"):
            continue

        gauge_15min = pd.read_csv(join(input_filepath, f), index_col=0)
        gauge_15min.index = pd.to_datetime(gauge_15min.index, utc=True)

        # Hourly bins are labelled by their start, e.g. 10:00 covers 10:00-10:45.
        gauge_1hr = gauge_15min.resample("1h").mean()

        # File names follow "<station_id>_<eastings>_<northings>.csv".
        # station_id must stay a module-level name: get_gauge_flags reads it
        # from the global namespace when it is not passed explicitly.
        station_id, eastings, northings = f.split(".")[0].split("_")

        loc = (eastings, northings)
        flags = get_gauge_flags(gauge_1hr.iloc[:, 0].dropna(), loc)

        # Only include gauge data that is not flagged
        if flags["gauge"]:
            continue
        # currently ignoring year flag as not enough data

        # remove flagged observations
        if len(flags["obs"]) > 0:
            # floor(), not round(): each 15-minute reading must map to the
            # start of the hourly bin it was averaged into. round() would send
            # the :45 reading to the following hour and blank the wrong rows.
            cond = gauge_15min.index.floor("1h").isin(flags["obs"])
            gauge_15min.loc[cond] = np.nan

        gauge_15min.to_csv(join(output_filepath, f))

  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)


  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
  df3 = df1.append(df2)
