In [9]:
import sys

PROJECT_ROOT = Path.cwd().parent
sys.path.insert(0, str(PROJECT_ROOT))

import json
import numpy as np
import pandas as pd

from pathlib import Path
from collections import Counter

from src.schema.observations import DailyObservation, SCHEMA_VERSION

In [31]:
RAW_DIR = Path("../data/raw/arpa")

rows = []
for f in RAW_DIR.glob("*.json"):
    with open(f, "r", encoding="utf-8") as fh:
        data = json.load(fh)
    for _, records in data.items():
        rows.extend(records)

raw_df = pd.DataFrame(rows)

# normalizza missing
raw_df = raw_df.replace("-", np.nan)

raw_df.shape

(67442, 16)

In [32]:
COLUMN_MAP = {
    "anno": "year",
    "mese": "month",
    "giorno*": "day",
    "stazione": "station_name",
    "Pioggia mm": "precipitation",
    "Temp. min °C": "temperature_min",
    "Temp. med °C": "temperature_mean",
    "Temp. max °C": "temperature_max",
    "Umidita' min %": "humidity_min",
    "Umidita' med %": "humidity_mean",
    "Umidita' max %": "humidity_max",
    "Vento med km/h": "wind_speed_mean",
    "Vento max km/h": "wind_speed_max",
    "Dir. V. max °N": "wind_direction_max",
    "Radiaz. KJ/m2": "solar_radiation",
    "Press. med hPa": "pressure_mean",
}

df = raw_df.rename(columns=COLUMN_MAP)
display(df.head(1))

Unnamed: 0,month,day,precipitation,temperature_min,temperature_mean,temperature_max,humidity_min,humidity_mean,humidity_max,wind_speed_mean,wind_speed_max,wind_direction_max,solar_radiation,pressure_mean,year,station_name
0,1,1.0,,,,,,,,,,,,,2004,Piancavallo


In [38]:
NUMERIC_COLS = [
    "year", "month", "day",
    "precipitation",
    "temperature_min", "temperature_mean", "temperature_max",
    "humidity_min", "humidity_mean", "humidity_max",
    "wind_speed_mean", "wind_speed_max",
    "wind_direction_max",
    "solar_radiation",
    "pressure_mean",
]

for col in NUMERIC_COLS:
    if col in df.columns:
        df[col] = pd.to_numeric(df[col], errors="coerce")

df["station_name"] = df["station_name"].astype(str).str.strip()

In [39]:
df["date"] = pd.to_datetime(
    df[["day", "month", "year"]],
    errors="coerce"
)

In [40]:
df

Unnamed: 0,month,day,precipitation,temperature_min,temperature_mean,temperature_max,humidity_min,humidity_mean,humidity_max,wind_speed_mean,wind_speed_max,wind_direction_max,solar_radiation,pressure_mean,year,station_name,date
0,1.0,1.0,,,,,,,,,,,,,2004,Piancavallo,2004-01-01
1,1.0,2.0,,,,,,,,,,,,,2004,Piancavallo,2004-01-02
2,1.0,3.0,,,,,,,,,,,,,2004,Piancavallo,2004-01-03
3,1.0,4.0,,,,,,,,,,,,,2004,Piancavallo,2004-01-04
4,1.0,5.0,,,,,,,,,,,,,2004,Piancavallo,2004-01-05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
67437,12.0,17.0,,0.4,0.9,1.7,97.0,99.0,100.0,5.0,18.0,202.0,1492.0,827.6,2025,Monte Zoncolan,2025-12-17
67438,12.0,18.0,,-0.9,0.9,3.6,100.0,100.0,100.0,12.0,27.0,237.0,3913.0,831.6,2025,Monte Zoncolan,2025-12-18
67439,,,,,,,,,,,,,,,2025,Monte Zoncolan,NaT
67440,,,,,,,,,,,,,,,2025,Monte Zoncolan,NaT


In [41]:
# Missingness
missing_pct = df.isna().mean().sort_values(ascending=False) * 100
missing_pct

precipitation         39.697814
pressure_mean         30.144420
wind_direction_max    28.519320
wind_speed_mean       28.216838
wind_speed_max        26.729634
solar_radiation       23.805640
humidity_mean         16.517897
humidity_min          15.100383
humidity_max          14.897245
temperature_max       12.935559
temperature_min       12.517422
temperature_mean      12.456629
date                   2.126271
day                    2.126271
month                  0.685033
year                   0.000000
station_name           0.000000
dtype: float64

In [43]:
# Range checks
RANGES = {
    "temperature_min": (-50, 50),
    "temperature_mean": (-50, 50),
    "temperature_max": (-50, 50),
    "humidity_min": (0, 100),
    "humidity_mean": (0, 100),
    "humidity_max": (0, 100),
    "pressure_mean": (800, 1050),
}

range_report = []

for col, (lo, hi) in RANGES.items():
    s = df[col]
    range_report.append({
        "column": col,
        "min": s.min(),
        "max": s.max(),
        "violations": ((s < lo) | (s > hi)).sum()
    })

pd.DataFrame(range_report)

Unnamed: 0,column,min,max,violations
0,temperature_min,-21.5,28.5,0
1,temperature_mean,-19.6,31.6,0
2,temperature_max,-18.2,38.5,0
3,humidity_min,0.0,100.0,0
4,humidity_mean,7.0,100.0,0
5,humidity_max,7.0,100.0,0
6,pressure_mean,774.7,1044.0,79


In [44]:
df.duplicated(subset=["station_name", "date"]).sum()

np.int64(1426)

In [46]:
gap_summary = []

for station, g in df.dropna(subset=["date"]).groupby("station_name"):
    dates = g.sort_values("date")["date"].drop_duplicates()
    gaps = dates.diff().dt.days

    # totale giorni mancanti dentro i buchi: (gap - 1) sommato su gap > 1
    missing_total_days = (gaps[gaps > 1] - 1).sum()

    gap_summary.append({
        "station": station,
        "max_gap_days": gaps.max(),
        "gap_events": (gaps > 1).sum(),
        "missing_total_days": missing_total_days
    })

pd.DataFrame(gap_summary).sort_values("missing_total_days", ascending=False)

Unnamed: 0,station,max_gap_days,gap_events,missing_total_days
0,Gemona del Friuli,1.0,0,0.0
1,Lignano,1.0,0,0.0
2,Lignano Sabbiadoro,1.0,0,0.0
3,Monte Lussari,1.0,0,0.0
4,Monte Matajur,1.0,0,0.0
5,Monte Zoncolan,1.0,0,0.0
6,Musi,1.0,0,0.0
7,Piancavallo,1.0,0,0.0
