In [2]:
# Enable IPython's autoreload extension in mode 2: all imported modules are
# reloaded before each cell executes, so edits to .py files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
# Change the working directory to the parent folder (the root of the project),
# so that the relative "data/..." paths used in later cells resolve.
# NOTE(review): this relative chdir is not idempotent -- every re-run of this
# cell moves one more level up. Restart the kernel before running it again.
from os import chdir

chdir("../")

In [4]:
import matplotlib.pyplot as plt
import pandas as pd

from shapely.geometry import Point

In [5]:
# Load the daily temperature history of station 330020 and keep only the
# "max_temp" column (as a one-column DataFrame).
station_data = pd.read_parquet(
    f"data/input/stations/{330020}/daily_temp_history",
)[["max_temp"]]
# Last expression of the cell: displays the (truncated) frame as rich output.
station_data

Unnamed: 0_level_0,max_temp
date,Unnamed: 1_level_1
1950-01-01,
1950-01-02,
1950-01-03,
1950-01-04,
1950-01-05,
...,...
2023-09-15,26.3
2023-09-16,21.6
2023-09-17,11.7
2023-09-18,15.1


In [32]:
def compute_stations_ts_gap(
    save: bool = False,
    save_path: str = None,
    station_ids: list = None,
    stations_dir: str = "data/input/stations",
    start_date: str = "1981-01-01",
) -> pd.DataFrame:
    """Computes the max_temp time series gap for each station from ``start_date`` on.

    For every station the daily temperature history is read from
    ``{stations_dir}/{station_id}/daily_temp_history``, restricted to the rows
    whose index label is ``>= start_date``, and the number of missing
    ``max_temp`` values is counted.

    Parameters
    ----------
    save : bool, optional
        Boolean to decide whether to save the generated dataframe, by default False
    save_path : str, optional
        Path where to store generated dataframe, by default None.
        Required when ``save`` is True.
    station_ids : list, optional
        Station identifiers to process, by default None (the built-in list of
        stations below is used).
    stations_dir : str, optional
        Directory holding one sub-folder per station,
        by default "data/input/stations"
    start_date : str, optional
        First date (inclusive) of the analysed period, by default "1981-01-01"

    Returns
    -------
    pd.DataFrame
        Dataframe indexed by ``station_id`` containing the gap information for
        each station (TS days, data days, gap days, gap %). ``gap_%`` is NaN
        for a station without any row in the analysed period.

    Raises
    ------
    ValueError
        If ``save`` is True but no ``save_path`` is given.
    """
    # Fail fast: do not process every station only to crash when saving.
    if save and save_path is None:
        raise ValueError("save_path must be provided when save=True")

    print("Computing stations time series gap")

    if station_ids is None:
        station_ids = [
            180005,
            200006,
            220002,
            230001,
            270001,
            # 270002,
            270008,
            290004,
            300046,
            310024,
            320019,
            330002,
            330007,
            330020,
            330030,
            340031,
            340045,
            360011,
            360019,
            370033,
            380013,
            390006,
            400009,
            410005,
            430002,
            430004,
            450001,
            450004,
            450005,
            460001,
            470001,
            520006,
            550001,
            950001,
            950003,
        ]
    # Materialise once so any iterable can be used both for the loop and the index.
    station_ids = list(station_ids)

    records = []
    for station_id in station_ids:
        print(f"Processing station: {station_id}")
        max_temp = pd.read_parquet(
            f"{stations_dir}/{station_id}/daily_temp_history",
        )["max_temp"]

        # filter in start_date-X period
        # TODO: update 'ingest component' to begin with non-null max_temp TS
        max_temp = max_temp.loc[start_date:]

        # Count gaps explicitly instead of relying on value_counts() ordering:
        # value_counts() sorts by frequency, so positional access swaps the
        # data/gap counts for gap-dominated stations and raises IndexError
        # when a station has no gaps at all.
        ts_days = len(max_temp)
        gap_days = int(max_temp.isna().sum())
        data_days = ts_days - gap_days
        # The gap ratio is undefined for an empty period -> NaN, not a crash.
        gap_percentage = (gap_days * 100) / ts_days if ts_days else float("nan")

        records.append([ts_days, data_days, gap_days, gap_percentage])

    # Build the frame once (no row-by-row growth) and enforce the column dtypes.
    gap_df = pd.DataFrame(
        records,
        index=pd.Index(station_ids, name="station_id"),
        columns=["ts_days", "data_days", "gap_days", "gap_%"],
    ).astype(
        {
            "ts_days": int,
            "data_days": int,
            "gap_days": int,
            "gap_%": float,
        }
    )

    if save:
        gap_df.to_parquet(save_path)

    return gap_df


# Compute the gap report and write it to the reporting folder as parquet;
# the returned dataframe is displayed as this cell's output.
compute_stations_ts_gap(
    save=True,
    save_path="data/reporting/stations_ts_gap.parquet",
)

Computing stations time series gap
Processing station: 180005
Processing station: 200006
Processing station: 220002
Processing station: 230001
Processing station: 270001
Processing station: 270008
Processing station: 290004
Processing station: 300046
Processing station: 310024
Processing station: 320019
Processing station: 330002
Processing station: 330007
Processing station: 330020
Processing station: 330030
Processing station: 340031
Processing station: 340045
Processing station: 360011
Processing station: 360019
Processing station: 370033
Processing station: 380013
Processing station: 390006
Processing station: 400009
Processing station: 410005
Processing station: 430002
Processing station: 430004
Processing station: 450001
Processing station: 450004
Processing station: 450005
Processing station: 460001
Processing station: 470001
Processing station: 520006
Processing station: 550001
Processing station: 950001
Processing station: 950003


Unnamed: 0_level_0,ts_days,data_days,gap_days,gap_%
station_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
180005,15602,15600,2,0.012819
200006,15602,15509,93,0.596077
220002,15602,15598,4,0.025638
230001,15602,15597,5,0.032047
270001,15602,15596,6,0.038457
270008,6836,6350,486,7.109421
290004,15602,15589,13,0.083323
300046,2453,2162,291,11.863025
310024,3184,2460,724,22.738693
320019,1716,1491,225,13.111888
