# Preverjanje urne popolnosti meritev

Notebook preveri, ali ima vsako merilno mesto zapis za vsako uro v svojem 훾asovnem razponu, ali so vsi 탑igi na polno uro ter ali imajo vse postaje enak 훾asovni za훾etek/konec.

In [25]:
from pathlib import Path
import pandas as pd

pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", 50)

In [None]:
DATA_DIR = Path("../data/original")
files = sorted(DATA_DIR.glob("*.csv"))
len(files), files[:3]

(0, [])

In [27]:
def check_hourly_completeness(csv_path: Path):
    df = pd.read_csv(csv_path)
    if "datetime" not in df.columns:
        raise ValueError(f"Datoteka {csv_path.name} nima stolpca 'datetime'.")

    dt = pd.to_datetime(df["datetime"], errors="coerce")
    invalid_count = int(dt.isna().sum())

    valid_dt = dt.dropna().sort_values()
    if valid_dt.empty:
        return {
            "station": csv_path.stem,
            "rows": len(df),
            "first": pd.NaT,
            "last": pd.NaT,
            "expected_hours": 0,
            "actual_unique_hours": 0,
            "missing_hours": 0,
            "duplicate_hours": 0,
            "invalid_datetime": invalid_count,
            "off_hour_timestamps": 0,
            "missing_list": []
        }

    off_hour_count = int(((valid_dt.dt.minute != 0) | (valid_dt.dt.second != 0) | (valid_dt.dt.microsecond != 0)).sum())

    unique_hours = valid_dt.drop_duplicates()
    unique_hours_index = pd.DatetimeIndex(unique_hours.to_numpy())
    full_index = pd.date_range(unique_hours_index.min(), unique_hours_index.max(), freq="h")
    missing = full_index.difference(unique_hours_index)

    return {
        "station": csv_path.stem,
        "rows": len(df),
        "first": unique_hours.min(),
        "last": unique_hours.max(),
        "expected_hours": len(full_index),
        "actual_unique_hours": len(unique_hours),
        "missing_hours": len(missing),
        "duplicate_hours": int(valid_dt.size - unique_hours.size),
        "invalid_datetime": invalid_count,
        "off_hour_timestamps": off_hour_count,
        "missing_list": missing.tolist()
    }

In [28]:
results = []
errors = []

for path in files:
    try:
        row = check_hourly_completeness(path)
        if isinstance(row, dict):
            results.append(row)
        else:
            errors.append({"file": path.name, "error": "Result is not a dict"})
    except Exception as e:
        errors.append({"file": path.name, "error": str(e)})

summary = pd.DataFrame(results)
if not summary.empty and "station" in summary.columns:
    summary = summary.sort_values("station").reset_index(drop=True)

if errors:
    print(f"Napake pri preverjanju: {len(errors)}")
    print(pd.DataFrame(errors).head(20))

if not summary.empty and {"first", "last"}.issubset(summary.columns):
    global_first = summary["first"].min()
    global_last = summary["last"].max()
    summary["same_global_start"] = summary["first"] == global_first
    summary["same_global_end"] = summary["last"] == global_last
else:
    summary["same_global_start"] = pd.Series(dtype=bool)
    summary["same_global_end"] = pd.Series(dtype=bool)

cols = [
    "station", "rows", "first", "last",
    "expected_hours", "actual_unique_hours",
    "missing_hours", "duplicate_hours",
    "invalid_datetime", "off_hour_timestamps",
    "same_global_start", "same_global_end"
]
for c in cols:
    if c not in summary.columns:
        summary[c] = pd.Series(dtype="object")

summary[cols]


Unnamed: 0,station,rows,first,last,expected_hours,actual_unique_hours,missing_hours,duplicate_hours,invalid_datetime,off_hour_timestamps,same_global_start,same_global_end


In [29]:
issues = summary[(summary["missing_hours"] > 0) | (summary["duplicate_hours"] > 0) | (summary["invalid_datetime"] > 0) | (summary["off_hour_timestamps"] > 0) | (~summary["same_global_start"]) | (~summary["same_global_end"])]
issues[["station", "missing_hours", "duplicate_hours", "invalid_datetime", "off_hour_timestamps", "same_global_start", "same_global_end"]]

Unnamed: 0,station,missing_hours,duplicate_hours,invalid_datetime,off_hour_timestamps,same_global_start,same_global_end


In [30]:
def missing_hours_table(summary_df: pd.DataFrame):
    rows = []
    for _, row in summary_df.iterrows():
        station = row["station"]
        for ts in row["missing_list"]:
            rows.append({"station": station, "missing_datetime": ts})
    if not rows:
        return pd.DataFrame(columns=["station", "missing_datetime"])
    return pd.DataFrame(rows).sort_values(["station", "missing_datetime"]).reset_index(drop=True)

missing_detail = missing_hours_table(summary)
missing_detail.head(50)

Unnamed: 0,station,missing_datetime


In [31]:
print("Globalni skupni razpon:")
print(f"- zacetek: {global_first}")
print(f"- konec:   {global_last}")
print()
print("Skupni pregled kontrol:")
print(f"- postaje z manjkajocimi urami: {(summary['missing_hours'] > 0).sum()}")
print(f"- postaje s podvojenimi urami:   {(summary['duplicate_hours'] > 0).sum()}")
print(f"- postaje z neveljavnim casom:  {(summary['invalid_datetime'] > 0).sum()}")
print(f"- postaje izven polne ure:      {(summary['off_hour_timestamps'] > 0).sum()}")
print(f"- postaje z drugim zacetkom:    {(~summary['same_global_start']).sum()}")
print(f"- postaje z drugim koncem:      {(~summary['same_global_end']).sum()}")

Globalni skupni razpon:
- zacetek: 2024-05-02 20:00:00
- konec:   2025-12-11 07:00:00

Skupni pregled kontrol:
- postaje z manjkajocimi urami: 0
- postaje s podvojenimi urami:   0
- postaje z neveljavnim casom:  0
- postaje izven polne ure:      0
- postaje z drugim zacetkom:    0
- postaje z drugim koncem:      0


In [32]:
OUTPUT_DIR = Path("../reports")
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

summary.drop(columns=["missing_list"]).to_csv(OUTPUT_DIR / "hourly_completeness_summary.csv", index=False)
missing_detail.to_csv(OUTPUT_DIR / "hourly_missing_hours_detail.csv", index=False)

print("Shranjeno:")
print(OUTPUT_DIR / "hourly_completeness_summary.csv")
print(OUTPUT_DIR / "hourly_missing_hours_detail.csv")

KeyError: "['missing_list'] not found in axis"