In [123]:
import json
import copy
import numpy as np
import pandas as pd

from pathlib import Path
from scipy.stats import wasserstein_distance


In [8]:
# Directory holding the processed parquet tables (relative path, resolved
# against the kernel's working directory).
data_path = Path("../data/processed/")
# File names of the monthly and daily tables inside `data_path`.
months = "monthly.parquet"
days = "daily.parquet"


### Step 1 | Cleaning

- Input: una stazione S e un mese target $(Y^*, M^*)$ (es. 2025-02).
- Oggetto “mese”: l’insieme dei giorni di quel mese con vettori meteo (T, precip, umidità, vento, …).

- Output desiderato: una classifica dei mesi storici più simili + una spiegazione “perché”.

In [9]:
# Monthly table: load it, preview the first two rows, print the row count.
monthly_df = pd.read_parquet(data_path / months)
display(monthly_df.head(2))
print(len(monthly_df))

# Daily table: same preview and row count.
daily_df = pd.read_parquet(data_path / days)
display(daily_df.head(2))
print(len(daily_df))

Unnamed: 0,station_name,year,month,precipitation_sum,precipitation_max,precipitation_mean,temperature_min_mean,temperature_min_min,temperature_min_max,temperature_min_std,...,temperature_mean_coverage,temperature_max_coverage,humidity_min_coverage,humidity_mean_coverage,humidity_max_coverage,wind_speed_mean_coverage,wind_speed_max_coverage,wind_direction_max_coverage,solar_radiation_coverage,pressure_mean_coverage
0,Gemona del Friuli,1999,1,0.0,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Gemona del Friuli,1999,2,0.0,,,,,,,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


2172


Unnamed: 0,date,year,month,day,station_name,precipitation,temperature_min,temperature_mean,temperature_max,humidity_min,humidity_mean,humidity_max,wind_speed_mean,wind_speed_max,wind_direction_max,solar_radiation,pressure_mean
0,2004-01-01,2004,1,1,Piancavallo,,,,,,,,,,,,
1,2004-01-02,2004,1,2,Piancavallo,,,,,,,,,,,,


65929


1. Filtra una stazione alla volta (per ora).
2. Gestisci i missing:
   - per ogni variabile, definisci un minimo di giorni validi per considerare quel mese affidabile (es. ≥ 80% dei giorni);
   - sfrutta i campi `n_valid_days` / `coverage` già presenti, oppure calcolali dai dati giornalieri.
3. Decidi cosa fare con i mesi “scarsi”:
   - o li escludi,
   - oppure li tieni ma abbassi il peso/la confidenza del confronto.

Output: per la stazione S hai un set di mesi “validi” confrontabili + un report di copertura.


In [80]:
def is_feasible_to_check(station: str, year: int, month: int) -> bool:
    # Check if station exists
    if station not in monthly_df["station_name"].values:
        print(f"[ERROR] {station} does not exist.")
        return False

    # Filter for station, year, month
    filtered = monthly_df[
        (monthly_df["station_name"] == station) & 
        (monthly_df["year"] == year) & 
        (monthly_df["month"] == month)
    ]

    # Check if exactly one row found
    if len(filtered) != 1:
        print(f"[ERROR] No data found for {station}, {year}-{month:02d}")
        return False

    # Get the single row as Series
    this_month = filtered.iloc[0]

    # Define valid fields check
    valid_fields = [
        'precipitation_n_valid_days', 'temperature_min_n_valid_days',
        'temperature_mean_n_valid_days', 'temperature_max_n_valid_days',
        'humidity_min_n_valid_days', 'humidity_mean_n_valid_days',
        'humidity_max_n_valid_days', 'wind_speed_mean_n_valid_days',
        'wind_speed_max_n_valid_days', 'wind_direction_max_n_valid_days',
        'solar_radiation_n_valid_days', 'pressure_mean_n_valid_days'
    ]

    # Check all fields have >= 20 valid days
    is_valid = all(
        this_month[field] >= 20 for field in valid_fields if field in this_month.index
    )

    if is_valid:
        print(f"[OK] {station}, {year}-{month:02d} is feasible")
        return True
    else:
        print(f"[ERROR] Insufficient valid days for {station}, {year}-{month:02d}")
        print("Required: all fields >= 20 valid days")
        return False

In [138]:
def get_candidates(
    station: str,
    ref_year: int,
    ref_month: int,
    monthly_df: pd.DataFrame,
    daily_df: pd.DataFrame,
    min_coverage: float = 0.80,
) -> pd.DataFrame:
    """Return the daily rows of ``station`` that belong to usable comparison months.

    A month is usable when every coverage column listed below is at least
    ``min_coverage`` in ``monthly_df`` and it is not the reference month
    ``(ref_year, ref_month)``. The result is a copy of the matching rows of
    ``daily_df``, with their original index and columns.
    """
    # Coverage is a fraction, so the threshold does not depend on month length.
    coverage_fields = [
        'precipitation_coverage',
        'temperature_min_coverage',
        'temperature_mean_coverage',
        'temperature_max_coverage',
        'humidity_min_coverage',
        'humidity_mean_coverage',
        'humidity_max_coverage',
        'wind_speed_mean_coverage',
        'wind_speed_max_coverage',
        'wind_direction_max_coverage',
        'solar_radiation_coverage',
        'pressure_mean_coverage',
    ]

    # Monthly rows of the requested station only.
    station_months = monthly_df.loc[monthly_df["station_name"] == station]

    # A month qualifies only if ALL coverage columns reach the threshold.
    well_covered = (station_months[coverage_fields] >= min_coverage).all(axis=1)
    is_reference = (station_months["year"] == ref_year) & (station_months["month"] == ref_month)

    usable_months = station_months.loc[
        well_covered & ~is_reference, ["year", "month"]
    ].drop_duplicates()
    allowed_pairs = {tuple(pair) for pair in usable_months.to_numpy()}

    # Daily rows of the same station, kept when their (year, month) is allowed.
    station_days = daily_df.loc[daily_df["station_name"] == station]
    day_pairs = zip(station_days["year"].to_numpy(), station_days["month"].to_numpy())
    keep_rows = [pair in allowed_pairs for pair in day_pairs]

    return station_days.loc[keep_rows].copy()

In [139]:
def _wasserstein_safe(a: pd.Series, b: pd.Series, min_samples: int = 10):
    """Compute Wasserstein distance with basic safeguards."""
    a = a.dropna().astype(float)
    b = b.dropna().astype(float)
    if len(a) < min_samples or len(b) < min_samples:
        return np.nan
    return float(wasserstein_distance(a.values, b.values))

In [142]:
def get_similar_months(
    station: str,
    year: int,
    month: int,
    top_k: int = 10,
    min_samples_per_var: int = 10,
    variable_weights: dict | None = None,
):
    """Rank the station's other months by similarity to a reference month.

    For each candidate month returned by ``get_candidates`` and each variable
    in ``numeric_vars``, the Wasserstein distance between the reference
    month's daily values and the candidate's daily values is computed. The
    per-variable distances are combined into ``distance_total`` as a weighted
    mean over the variables whose distance is finite.

    Note that each per-variable distance is expressed in that variable's own
    units and no rescaling is applied before averaging, so variables with a
    larger numeric spread contribute more to ``distance_total`` unless
    ``variable_weights`` compensates for it.

    Parameters
    ----------
    station : str
        Station to analyse.
    year, month : int
        Reference month.
    top_k : int, default 10
        Maximum number of rows returned.
    min_samples_per_var : int, default 10
        Minimum non-missing daily values required on both sides for a
        variable's distance to be computed (otherwise it is NaN).
    variable_weights : dict, optional
        Mapping variable name -> weight; variables not listed get weight 1.0.

    Returns
    -------
    pd.DataFrame
        One row per candidate month, sorted by ascending ``distance_total``
        and truncated to ``top_k``. The frame is empty (but has the same
        columns) when the reference month is not feasible or when no
        candidate month yields a finite distance.
    """
    id_cols = ["date", "year", "month", "day", "station_name"]
    numeric_vars = [
        "temperature_mean", "temperature_max", "temperature_min",
        "precipitation", "humidity_mean"
    ]
    cols = id_cols + numeric_vars

    # Fixed output schema: keeps the result well-formed even with zero rows.
    out_cols = [
        "station_name", "reference_year", "reference_month",
        "candidate_year", "candidate_month",
        *[f"w_{v}" for v in numeric_vars],
        "distance_total", "n_valid_vars",
    ]

    if not is_feasible_to_check(station, year, month):
        return pd.DataFrame(columns=out_cols)

    # Reference month daily samples
    ref = daily_df.loc[
        (daily_df["station_name"] == station) &
        (daily_df["year"] == year) &
        (daily_df["month"] == month),
        cols
    ].copy()

    # Candidate months daily samples (same station, valid months, reference excluded)
    cand = get_candidates(station, year, month, monthly_df, daily_df)[cols].copy()

    # Weights (default = 1.0)
    user_weights = variable_weights or {}
    weights = {v: float(user_weights.get(v, 1.0)) for v in numeric_vars}

    results = []
    for (cand_year, cand_month), grp in cand.groupby(["year", "month"], sort=False):
        # Per-variable distances (NaN when either side has too few samples)
        per_var = {
            v: _wasserstein_safe(ref[v], grp[v], min_samples=min_samples_per_var)
            for v in numeric_vars
        }
        finite_vars = [v for v in numeric_vars if np.isfinite(per_var[v])]

        # Aggregate (weighted mean over finite distances)
        num = sum(weights[v] * per_var[v] for v in finite_vars)
        den = sum(weights[v] for v in finite_vars)

        row = {
            "station_name": station,
            "reference_year": year,
            "reference_month": month,
            "candidate_year": int(cand_year),
            "candidate_month": int(cand_month),
        }
        row.update({f"w_{v}": per_var[v] for v in numeric_vars})
        row["distance_total"] = (num / den) if den > 0 else np.nan
        row["n_valid_vars"] = len(finite_vars)
        results.append(row)

    # Passing `columns` avoids a KeyError in dropna() when `results` is empty
    # (a frame built from an empty list would have no "distance_total" column).
    out = pd.DataFrame(results, columns=out_cols).dropna(subset=["distance_total"])
    out = out.sort_values("distance_total", ascending=True).head(top_k).reset_index(drop=True)
    return out

In [143]:
# Example run: the 5 months closest to January 2025 at the "Monte Lussari" station.
res = get_similar_months(
    station="Monte Lussari",
    year=2025,
    month=1,
    top_k=5
)

res

[OK] Monte Lussari, 2025-01 is feasible


Unnamed: 0,station_name,reference_year,reference_month,candidate_year,candidate_month,w_temperature_mean,w_temperature_max,w_temperature_min,w_precipitation,w_humidity_mean,distance_total,n_valid_vars
0,Monte Lussari,2025,1,2025,2,0.951037,1.035945,0.729724,2.420737,4.50576,1.928641,5
1,Monte Lussari,2025,1,2023,1,1.688988,1.990323,1.57208,2.698999,5.044494,2.598977,5
2,Monte Lussari,2025,1,2023,3,2.122581,2.758817,1.545161,2.167742,4.709677,2.660796,5
3,Monte Lussari,2025,1,2022,11,3.527957,3.058817,3.656882,1.110968,3.172043,2.905333,5
4,Monte Lussari,2025,1,2021,12,1.667742,1.522581,1.570968,2.625806,7.645161,3.006452,5
