# Dopolnitev CSV in manjkajoče vrednosti po stolpcih

Notebook:
- dopolni vsako postajo na urni indeks in shrani v `../completed`
- izpiše manjkajoče vrednosti po stolpcih (in po postajah)
- shrani pregleden graf deležev manjkajočih vrednosti po stolpcih

In [None]:
from pathlib import Path
import pandas as pd
import matplotlib.pyplot as plt

# Raise pandas' display limits: show up to 200 rows and 100 columns.
pd.options.display.max_rows = 200
pd.options.display.max_columns = 100

In [None]:
# Input and output locations, relative to the current working directory.
INPUT_DIR = Path("../combined")
COMPLETED_DIR = Path("../completed")
REPORTS_DIR = Path("../reports")
PLOTS_DIR = REPORTS_DIR.joinpath("plots")

# Make sure every output directory exists before anything is written to it.
for _out_dir in (COMPLETED_DIR, REPORTS_DIR, PLOTS_DIR):
    _out_dir.mkdir(parents=True, exist_ok=True)

# Station CSVs to process, in sorted order; the bare expression shows the count.
files = list(INPUT_DIR.glob("*.csv"))
files.sort()
len(files)

In [None]:
def complete_station_csv(path: Path, output_dir: "Path | None" = None):
    """Reindex one station CSV onto a gap-free hourly grid and save it.

    The file is read and its ``datetime`` column is parsed. Rows whose
    timestamp is missing or cannot be parsed are dropped, duplicate
    timestamps are collapsed to the row that appears last in the file, and
    the frame is reindexed onto an hourly range spanning the first to the
    last valid timestamp. Hours without a source row become all-NaN rows.
    The result is written to ``<output_dir>/<station>.csv``.

    NOTE: the hourly grid is anchored at the earliest timestamp, so rows
    whose timestamp does not fall on that grid are not carried over by the
    reindex step.

    Args:
        path: CSV file of a single station (``Path`` or ``str``). The file
            stem is used as the station name; the file must contain a
            ``datetime`` column.
        output_dir: Directory that receives the completed CSV. Defaults to
            the module-level ``COMPLETED_DIR``. Created if it is missing.

    Returns:
        A ``(meta, completed)`` tuple. ``meta`` is a dict with the keys
        ``station``, ``rows_original`` (rows with a valid timestamp, before
        de-duplication), ``rows_completed``, ``inserted_rows`` (completed
        rows minus de-duplicated rows) and ``invalid_datetime`` (rows whose
        timestamp was missing or unparseable). ``completed`` is the hourly
        frame with ``datetime`` as its first column; it is empty when the
        file has no valid timestamp.
    """
    path = Path(path)
    station = path.stem

    # Resolve the default lazily so an explicit output_dir never touches the
    # module-level constant.
    out_dir = COMPLETED_DIR if output_dir is None else Path(output_dir)
    out_dir.mkdir(parents=True, exist_ok=True)
    target = out_dir / f"{station}.csv"

    df = pd.read_csv(path)
    df["datetime"] = pd.to_datetime(df["datetime"], errors="coerce")
    invalid_datetime = int(df["datetime"].isna().sum())

    # A stable sort keeps rows with equal timestamps in file order, so that
    # keep="last" below reliably selects the last occurrence in the file.
    # (The default quicksort gives no such guarantee.)
    df = df.dropna(subset=["datetime"]).sort_values("datetime", kind="mergesort")
    if df.empty:
        # Nothing usable: still emit a header-only CSV with "datetime" first.
        empty = pd.DataFrame(columns=["datetime"] + [c for c in df.columns if c != "datetime"])
        empty.to_csv(target, index=False)
        return {
            "station": station,
            "rows_original": 0,
            "rows_completed": 0,
            "inserted_rows": 0,
            "invalid_datetime": invalid_datetime,
        }, empty

    before_rows = len(df)
    df = df.drop_duplicates(subset=["datetime"], keep="last")

    # Build the full hourly range and align the data to it; missing hours
    # show up as rows filled with NaN.
    full_index = pd.date_range(df["datetime"].min(), df["datetime"].max(), freq="h")
    completed = df.set_index("datetime").reindex(full_index)
    completed.index.name = "datetime"
    completed = completed.reset_index()

    completed.to_csv(target, index=False)

    return {
        "station": station,
        "rows_original": before_rows,
        "rows_completed": len(completed),
        "inserted_rows": int(len(completed) - len(df)),
        "invalid_datetime": invalid_datetime,
    }, completed

In [None]:
# Complete every station file, collecting the per-station metadata and the
# completed frames (keyed by station name) for the missing-value analysis.
meta_rows = []
completed_frames = {}
for path in files:
    meta, completed = complete_station_csv(path)
    meta_rows.append(meta)
    completed_frames[meta["station"]] = completed

# Explicit columns keep the summary well-formed when no input files were
# found; without them the frame has no "station" column and the sort below
# raises KeyError.
completion_summary = (
    pd.DataFrame(
        meta_rows,
        columns=[
            "station",
            "rows_original",
            "rows_completed",
            "inserted_rows",
            "invalid_datetime",
        ],
    )
    .sort_values("station")
    .reset_index(drop=True)
)
completion_summary

In [None]:
# Tally missing values for every (station, column) pair. The "datetime"
# column is skipped so that only the remaining columns are counted.
missing_rows = []
for station, cdf in completed_frames.items():
    total_rows = len(cdf)
    for col in cdf.columns:
        if col == "datetime":
            continue
        missing_count = int(cdf[col].isna().sum())
        missing_rows.append({
            "station": station,
            "column": col,
            "total_rows": total_rows,
            "missing_count": missing_count,
            # Guard against empty station frames (no rows to divide by).
            "missing_pct": (missing_count / total_rows * 100) if total_rows else 0.0,
        })

# Explicit columns keep the table well-formed when nothing was tallied (no
# stations, or frames with only a "datetime" column); without them the sort
# and groupby below raise KeyError on the missing columns.
missing_by_station_col = (
    pd.DataFrame(
        missing_rows,
        columns=["station", "column", "total_rows", "missing_count", "missing_pct"],
    )
    .sort_values(["station", "column"])
    .reset_index(drop=True)
)

# Overall share per column: summed counts over summed rows across stations
# (a row-weighted share, not the mean of the per-station percentages).
missing_by_col = missing_by_station_col.groupby("column", as_index=False)[["total_rows", "missing_count"]].sum()
missing_by_col["missing_pct"] = missing_by_col["missing_count"] / missing_by_col["total_rows"] * 100
missing_by_col = missing_by_col.sort_values("missing_pct", ascending=False).reset_index(drop=True)

missing_by_col

In [None]:
# Overview plot: share of missing values per column, one bar per column.
fig, ax = plt.subplots(figsize=(9, 4.8))
ax.bar(
    missing_by_col["column"],
    missing_by_col["missing_pct"],
    color="#2a9d8f",
)
ax.set_title("Delez manjkajocih vrednosti po stolpcih")
ax.set_xlabel("Stolpec")
ax.set_ylabel("Manjkajoce [%]")
ax.grid(axis="y", alpha=0.25)
# Slant the column names so that long labels do not overlap.
plt.setp(ax.get_xticklabels(), rotation=30, ha="right")
fig.tight_layout()
# Save before showing the figure.
fig.savefig(PLOTS_DIR / "missing_pct_by_column.png", dpi=160)
plt.show()

In [None]:
# Each report table paired with the CSV file it is written to.
report_targets = (
    (completion_summary, REPORTS_DIR / "completion_inserted_rows_summary.csv"),
    (missing_by_station_col, REPORTS_DIR / "missing_by_station_column.csv"),
    (missing_by_col, REPORTS_DIR / "missing_by_column_overall.csv"),
)

# Write all reports first; the confirmation is printed only after every
# file has been saved.
for report_frame, report_path in report_targets:
    report_frame.to_csv(report_path, index=False)

print("Shranjeno:")
for _, report_path in report_targets:
    print(report_path)
print(PLOTS_DIR / "missing_pct_by_column.png")