In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

# Location of the raw weather CSV, relative to the notebook's working directory.
DATA_PATH = "../data/weather_kriviyrih.csv"

# Read the file untouched; parsing and cleaning happen in later cells.
df = pd.read_csv(filepath_or_buffer=DATA_PATH)

# Preview the first five rows.
df.head(5)

In [None]:
# Structural overview of the loaded table: (rows, columns), the column names,
# and finally the per-column dtypes / non-null counts.
print(df.shape)
print(list(df.columns))
df.info()

In [None]:
# Parse timestamps and put the rows in chronological order. The stable sort
# keeps the file order among rows that share a timestamp, so the
# de-duplication below is deterministic.
if "time" in df.columns:
    df["time"] = pd.to_datetime(df["time"])
    df = df.sort_values("time", kind="stable").reset_index(drop=True)

# Data-quality report, computed before any cleaning so it describes the raw file.
dups = df.duplicated(subset=["time"]).sum() if "time" in df.columns else df.duplicated().sum()
miss = df.isna().mean().sort_values(ascending=False)

print("Duplicates:", dups)
display(miss.head(10))

# Keep a single row per timestamp: the windowing further down treats every row
# as one time step, so repeated timestamps would distort the windows.
if "time" in df.columns:
    df = df.drop_duplicates(subset=["time"], keep="first").reset_index(drop=True)

# Interpolate numeric columns only; datetime/object columns cannot be
# meaningfully interpolated and newer pandas versions warn or raise on them.
num_cols = df.select_dtypes(include="number").columns
if len(num_cols) > 0:
    df[num_cols] = df[num_cols].interpolate(limit_direction="both")

# Fill whatever is still missing (e.g. non-numeric columns) from neighbouring
# rows. DataFrame.bfill()/ffill() replace the deprecated fillna(method=...).
df = df.bfill().ffill()

In [None]:
# Summary statistics (one row per variable) for whichever of the core weather
# variables are actually present in the table.
_core_variables = (
    "temperature_2m",
    "relativehumidity_2m",
    "windspeed_10m",
    "precipitation_probability",
)
cols = [name for name in _core_variables if name in df.columns]
df[cols].describe().transpose()

In [None]:
# Plot only the trailing 30 days when a time column exists and the table has
# more than 24*30 rows; otherwise plot a copy of the whole table.
_has_time = "time" in df.columns
if _has_time and len(df) > 24*30:
    _cutoff = df["time"].max() - pd.Timedelta(days=30)
    last = df[df["time"] >= _cutoff]
else:
    last = df.copy()

fig = plt.figure(figsize=(12,4))

# Use timestamps on the x axis when available, a plain row counter otherwise.
if "time" in last.columns:
    _x_values = last["time"]
else:
    _x_values = np.arange(len(last))

plt.plot(_x_values, last["temperature_2m"], label="temp (°C)")
plt.title("Temperature trend")
plt.legend()
plt.xticks(rotation=30)
plt.tight_layout()
plt.show()

In [None]:
# Calendar features derived from the timestamp. Hour of day and day of week
# are additionally projected onto the unit circle (sin/cos) so that the first
# and last value of each cycle end up next to each other.
if "time" in df.columns:
    _stamp = df["time"].dt
    df["hour"] = _stamp.hour
    df["dow"] = _stamp.dayofweek
    for _base, _period in (("hour", 24), ("dow", 7)):
        _angle = 2*np.pi*df[_base]/_period
        df[f"{_base}_sin"] = np.sin(_angle)
        df[f"{_base}_cos"] = np.cos(_angle)

df.head()

In [None]:
from sklearn.model_selection import train_test_split

# Window geometry: rows of history fed to the model / rows to predict.
input_hours, horizon = 48, 24

# Candidate model inputs; only the ones present in the table are used.
_candidate_features = (
    "temperature_2m",
    "relativehumidity_2m",
    "windspeed_10m",
    "precipitation_probability",
    "hour_sin", "hour_cos", "dow_sin", "dow_cos",
)
features = [name for name in _candidate_features if name in df.columns]

target = "temperature_2m"

# Fail early if the target is missing or fewer than four inputs are available.
assert target in df.columns, f"Не знайдено target колонку: {target}"
assert len(features) >= 4, f"Замало фіч: {features}"

# float32 matrices: `values` has one column per feature, `y` is the target series.
values = df[features].to_numpy().astype("float32")
y = df[target].to_numpy().astype("float32")

def make_windows(values, y, input_hours=48, horizon=24):
    """Cut aligned series into (history, future) sliding-window pairs.

    For every split position ``i`` the input sample is
    ``values[i - input_hours:i]`` and the target sample is ``y[i:i + horizon]``.
    Positions run from ``input_hours`` up to and including
    ``len(values) - horizon``, so the final window ends exactly on the last row.

    Parameters
    ----------
    values : array-like
        Feature rows, one entry per time step (first axis is time).
    y : array-like
        Target series with the same length as ``values``.
    input_hours : int, default 48
        Number of consecutive rows in each input window.
    horizon : int, default 24
        Number of consecutive target rows following each input window.

    Returns
    -------
    (X, Y) : tuple of float32 ndarrays
        ``X`` has shape ``(n_windows, input_hours, ...)`` and ``Y`` has shape
        ``(n_windows, horizon, ...)``. When the series is too short for a single
        window, both arrays are empty but keep their trailing dimensions.

    Raises
    ------
    ValueError
        If ``values`` and ``y`` have different lengths.
    """
    values = np.asarray(values, dtype="float32")
    y = np.asarray(y, dtype="float32")
    if len(values) != len(y):
        raise ValueError(
            f"values and y must have the same length, got {len(values)} and {len(y)}"
        )

    # Number of split positions i with input_hours <= i <= len(values) - horizon.
    n_windows = len(values) - input_hours - horizon + 1
    if n_windows <= 0:
        # Not enough rows: return correctly shaped empty arrays instead of (0,).
        empty_X = np.empty((0, input_hours) + values.shape[1:], dtype="float32")
        empty_Y = np.empty((0, horizon) + y.shape[1:], dtype="float32")
        return empty_X, empty_Y

    split_points = range(input_hours, input_hours + n_windows)
    Xs = np.stack([values[i - input_hours:i] for i in split_points])
    Ys = np.stack([y[i:i + horizon] for i in split_points])
    return Xs, Ys

# Build the supervised dataset from the feature matrix and target series;
# the trailing expression displays both array shapes.
X, Y = make_windows(values, y, input_hours=input_hours, horizon=horizon)
(X.shape, Y.shape)

In [None]:
# Chronological 70 / 15 / 15 split (no shuffling): the first windows go to
# training, the following ones to validation, the remainder to testing.
n = len(X)
train_end, val_end = int(n*0.7), int(n*0.85)

_train_part = slice(None, train_end)
_val_part = slice(train_end, val_end)
_test_part = slice(val_end, None)

X_train, Y_train = X[_train_part], Y[_train_part]
X_val, Y_val = X[_val_part], Y[_val_part]
X_test, Y_test = X[_test_part], Y[_test_part]

print(X_train.shape, X_val.shape, X_test.shape)

In [None]:
# Persist the split windows together with the metadata needed to interpret
# them (feature order, window length, forecast horizon).
WINDOWS_PATH = "../artifacts/dataset_windows.npz"
os.makedirs(os.path.dirname(WINDOWS_PATH), exist_ok=True)

np.savez(
    WINDOWS_PATH,
    X_train=X_train, Y_train=Y_train,
    X_val=X_val, Y_val=Y_val,
    X_test=X_test, Y_test=Y_test,
    # Stored as a plain unicode array rather than dtype=object: object arrays
    # are pickled inside the archive and force readers to pass allow_pickle=True.
    features=np.array(features),
    input_hours=input_hours, horizon=horizon,
)
print(f"Saved: {WINDOWS_PATH}")