In [39]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from datetime import datetime, timedelta
import joblib
from tensorflow.keras.models import load_model
import keras.backend as K

# Read the observations CSV into the working DataFrame
df = pd.read_csv("data_until_2024.csv")

# Restore the saved preprocessing objects in one pass, in this order:
# categorical encoder, input-feature scaler, target scaler
encoder, scaler_X, scaler_y = (
    joblib.load(artifact_path)
    for artifact_path in ("encoder2.pkl", "scaler_X2.pkl", "scaler_y2.pkl")
)

# Custom MSE, passed to load_model under the name "mse"
def mse(y_true, y_pred):
    """Return the mean of the squared differences between targets and predictions."""
    squared_error = K.square(y_true - y_pred)
    return K.mean(squared_error)

# Restore the trained network; "mse" is resolved through custom_objects
model = load_model(
    "weather_forecasting_model2.h5",
    custom_objects={"mse": mse},
)

# Coerce each ID column to a numeric dtype (unparseable values become NaN)
for id_column in ("province_id", "region_id", "station_id"):
    df[id_column] = pd.to_numeric(df[id_column], errors="coerce")

# Parse the Date column as day-month-year; values that do not match become NaT
df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y", errors='coerce')

# Debugging aid: show every row whose date is missing after parsing
unparsed_dates = df["Date"].isna()
if unparsed_dates.any():
    print("Warning: Some dates could not be converted! Check your dataset.")
    print(df[unparsed_dates])

# Fail fast when the name columns used by the selection menus are absent.
# Printing alone is not enough: the very next statement indexes these columns
# and would otherwise die with a less helpful KeyError.
required_name_columns = ("province_name", "region_name", "station_name")
missing_name_columns = [
    column for column in required_name_columns if column not in df.columns
]
if missing_name_columns:
    print("Error: Ensure dataset contains 'province_name', 'region_name', and 'station_name' columns!")
    raise KeyError(f"Dataset is missing required column(s): {missing_name_columns}")

def _choose_option(options, id_column, name_column, label):
    """List the available options and prompt until a listed ID is entered.

    Args:
        options: DataFrame with one row per selectable (ID, name) pair.
        id_column: Name of the column holding the numeric IDs.
        name_column: Name of the column holding the display names.
        label: Singular noun used in the messages ("Province", "Region",
            "Station"); the list heading appends an "s" to it.

    Returns:
        A ``(chosen_id, chosen_name)`` tuple: the ID as an ``int`` and the
        name taken from the first row carrying that ID.

    Raises:
        ValueError: If ``options`` is empty, because no input could ever be
            accepted and the prompt would loop forever.
    """
    if options.empty:
        raise ValueError(f"No {label.lower()} options are available to choose from.")

    print(f"\nAvailable {label}s:")
    for option_id, option_name in zip(options[id_column], options[name_column]):
        print(f"ID: {int(option_id)} - Name: {option_name}")

    while True:
        # Only the int() conversion can raise ValueError, so keep the try narrow
        try:
            chosen_id = int(input(f"\nEnter {label} ID: "))
        except ValueError:
            print("Please enter a numeric value.")
            continue

        matching_names = options.loc[options[id_column] == chosen_id, name_column]
        if not matching_names.empty:
            return chosen_id, matching_names.values[0]
        print(f"Invalid {label} ID! Please enter a valid one.")


# Step 1: Get Province ID (all provinces present in the dataset)
df_provinces = df[["province_id", "province_name"]].drop_duplicates().dropna()
province_id, province_name = _choose_option(
    df_provinces, "province_id", "province_name", "Province"
)

# Step 2: Get Region ID (only regions of the selected province)
df_regions = df[df["province_id"] == province_id][["region_id", "region_name"]].drop_duplicates().dropna()
region_id, region_name = _choose_option(
    df_regions, "region_id", "region_name", "Region"
)

# Step 3: Get Station ID (only stations of the selected province and region)
df_stations = df[(df["province_id"] == province_id) & (df["region_id"] == region_id)][["station_id", "station_name"]].drop_duplicates().dropna()
station_id, station_name = _choose_option(
    df_stations, "station_id", "station_name", "Station"
)

# Step 4: Get Date
def get_user_input_date(read_line=None):
    """Prompt until the user enters a valid date in DD-MM-YYYY format.

    Args:
        read_line: Optional callable that receives the prompt text and returns
            the typed line. Defaults to the built-in ``input``.

    Returns:
        The parsed date as a ``pandas.Timestamp``. A missing value (NaT) is
        never returned.
    """
    if read_line is None:
        read_line = input

    while True:
        input_date = read_line("Enter the end date for predictions (DD-MM-YYYY): ").strip()
        try:
            prediction_date = pd.to_datetime(input_date, format='%d-%m-%Y')
        except ValueError:
            prediction_date = pd.NaT

        # pandas maps blank / null-like strings to NaT without raising, so a
        # missing result has to be rejected explicitly as well.
        if not pd.isna(prediction_date):
            return prediction_date
        print("Invalid date format. Please enter the date in DD-MM-YYYY format.")

# Start prediction from 01-01-2025 (explicit format avoids day/month guessing)
start_date = pd.to_datetime("01-01-2025", format="%d-%m-%Y")

# User-selected end date. Anything before start_date (or a missing date) would
# leave the forecast loop with nothing to do and the final lookup without a
# matching row, so keep asking until the date is usable.
end_date = get_user_input_date()
while pd.isna(end_date) or end_date < start_date:
    print(f"End date must be on or after {start_date.strftime('%d-%m-%Y')}.")
    end_date = get_user_input_date()

# Rows belonging to the selected province / region / station
df_filtered = df[(df["province_id"] == province_id) &
                 (df["region_id"] == region_id) &
                 (df["station_id"] == station_id)]

# The seven days immediately before the first forecast day, in date order
window_start = start_date - pd.Timedelta(days=7)
df_last_7_days = df_filtered[(df_filtered["Date"] >= window_start) &
                             (df_filtered["Date"] < start_date)].sort_values(by="Date")

# df_filtered is already restricted to the selected station, so no further
# filtering or numeric re-coercion of station_id is needed here.
df_station = df_filtered

# Latitude and longitude of the selected station. The first row exists because
# station_id was validated against rows matching these same filters.
station_info = df_station.iloc[0]
latitude = station_info["latitude"]
longitude = station_info["longitude"]


# The seven calendar days that should precede 01-01-2025, newest first
forecast_start = pd.to_datetime("01-01-2025")
last_7_dates = [forecast_start - pd.Timedelta(days=offset) for offset in range(1, 8)]

# Days from that list that have no row in the current window
known_dates = df_last_7_days["Date"].values
missing_dates = [date for date in last_7_dates if date not in known_dates]

# Define numerical features (excluding latitude and longitude)
numerical_features = [
    "Min Temperature", "Max Temperature", "Average Temperature",
    "Average Humidity", "Rainfall", "Sunshine Duration",
    "Max Wind Speed", "Wind Direction at Max Speed",
    "Average Wind Speed",
]

# Synthesized rows are collected here and concatenated afterwards
new_data_list = []

# The window is not modified while filling, so its bounds are fixed
one_day = timedelta(days=1)
earliest_known = df_last_7_days["Date"].min()
latest_known = df_last_7_days["Date"].max()

for missing_date in missing_dates:
    prev_data_row = None

    # Forward fill: walk back day by day to the closest earlier day with data
    previous_day = missing_date - one_day
    while prev_data_row is None and previous_day >= earliest_known:
        candidate = df_last_7_days[df_last_7_days["Date"] == previous_day]
        if candidate.empty:
            previous_day -= one_day
        else:
            prev_data_row = candidate

    # Backward fill: nothing earlier exists, so walk forward to the closest later day
    if prev_data_row is None:
        next_day = missing_date + one_day
        while prev_data_row is None and next_day <= latest_known:
            candidate = df_last_7_days[df_last_7_days["Date"] == next_day]
            if candidate.empty:
                next_day += one_day
            else:
                prev_data_row = candidate

    # Neither direction produced a donor row: report it and leave the gap
    if prev_data_row is None:
        print(f"⚠️ No data available to fill missing date: {missing_date.strftime('%d-%m-%Y')}")
        continue

    # Clone the donor day's values and stamp them with the missing date
    new_data = prev_data_row.iloc[0].copy()
    new_data["Date"] = missing_date
    new_data_list.append(new_data)




# Append the synthesized rows (if any) to the window
if new_data_list:
    new_data_df = pd.DataFrame(new_data_list)
    df_last_7_days = pd.concat([df_last_7_days, new_data_df], ignore_index=True)

# Make sure Date is a datetime column again (new rows may have been added),
# then restore chronological order
df_last_7_days = (
    df_last_7_days
    .assign(Date=pd.to_datetime(df_last_7_days["Date"]))
    .sort_values(by="Date")
)

# Numeric model inputs: weather measurements, then station coordinates and IDs
numerical_features = [
    "Min Temperature",
    "Max Temperature",
    "Average Temperature",
    "Average Humidity",
    "Rainfall",
    "Sunshine Duration",
    "Max Wind Speed",
    "Wind Direction at Max Speed",
    "Average Wind Speed",
    "latitude",
    "longitude",
    "region_id",
    "province_id",
    "station_id",
]

from datetime import timedelta

# Roll the forecast forward one day at a time until end_date is reached:
# each predicted day is appended to the window and the oldest day is dropped.
# NOTE(review): target_columns is not defined in the visible part of this
# file; presumably an earlier notebook cell provides it — confirm.
window_size = 7
# The encoded column names do not change between iterations
wind_feature_names = encoder.get_feature_names_out(["Most Wind Direction"])

while df_last_7_days["Date"].max() < end_date:
    # The model input is reshaped to (1, window_size, n_features) below. A
    # window of any other length (for example when some missing dates could
    # not be filled) is reported clearly instead of failing inside reshape.
    if len(df_last_7_days) != window_size:
        raise ValueError(
            f"Expected {window_size} rows in the input window, "
            f"found {len(df_last_7_days)}; cannot build the model input."
        )

    # -------- Prepare Input --------
    X_input_numerical = df_last_7_days[numerical_features]

    wind_direction_encoded = encoder.transform(df_last_7_days[["Most Wind Direction"]])
    if hasattr(wind_direction_encoded, "toarray"):
        # An encoder configured for sparse output returns a SciPy matrix,
        # which has to be densified before building a DataFrame from it
        wind_direction_encoded = wind_direction_encoded.toarray()
    wind_direction_df = pd.DataFrame(wind_direction_encoded, columns=wind_feature_names)

    X_input_df = pd.concat([X_input_numerical.reset_index(drop=True), wind_direction_df.reset_index(drop=True)], axis=1)

    # Calendar features derived from each window row's date
    X_input_df['Year'] = df_last_7_days['Date'].dt.year.values
    X_input_df['Month'] = df_last_7_days['Date'].dt.month.values
    X_input_df['Day'] = df_last_7_days['Date'].dt.day.values
    X_input_df['DayOfWeek'] = df_last_7_days['Date'].dt.dayofweek.values

    X_input_scaled = scaler_X.transform(X_input_df)
    X_input_reshaped = X_input_scaled.reshape(1, window_size, X_input_scaled.shape[1])

    # -------- Predict Next Day --------
    prediction_date = df_last_7_days["Date"].max() + timedelta(days=1)

    predicted_weather = model.predict(X_input_reshaped)
    predicted_weather_original = scaler_y.inverse_transform(predicted_weather)

    # The trailing columns of the output are the encoded wind direction;
    # everything before them is treated as the numeric targets
    num_encoded_features = wind_direction_encoded.shape[1]
    predicted_weather_numerical = predicted_weather_original[:, :-num_encoded_features]
    predicted_weather_wind_direction = predicted_weather_original[:, -num_encoded_features:]

    predicted_wind_direction = encoder.inverse_transform(predicted_weather_wind_direction)

    # -------- Construct Predicted Row --------
    # Station identity and coordinates are carried over from the latest row
    predicted_row = {
        "Date": prediction_date,
        "province_id": df_last_7_days["province_id"].iloc[-1],
        "region_id": df_last_7_days["region_id"].iloc[-1],
        "station_id": df_last_7_days["station_id"].iloc[-1],
        "latitude": df_last_7_days["latitude"].iloc[-1],
        "longitude": df_last_7_days["longitude"].iloc[-1],
        "Most Wind Direction": predicted_wind_direction[0][0]
    }

    for name, value in zip(target_columns, predicted_weather_numerical[0]):
        predicted_row[name] = round(value, 2)

    # -------- Update Dataset --------
    df_last_7_days = pd.concat([df_last_7_days, pd.DataFrame([predicted_row])], ignore_index=True)
    df_last_7_days = df_last_7_days.sort_values("Date").reset_index(drop=True)
    df_last_7_days = df_last_7_days.iloc[1:]  # drop the oldest row to maintain 7-day window

    print(f"Predicted and added weather for: {prediction_date.strftime('%Y-%m-%d')}")

# After the while-loop finishes, look up the row dated end_date in the window
final_row = df_last_7_days[df_last_7_days["Date"] == end_date].copy()

# Without a matching row nothing below can work (e.g. the end date lies outside
# the window), so stop with a clear message instead of an opaque lookup error.
if final_row.empty:
    raise ValueError(f"No forecast row is available for the requested end date: {end_date}")

# Numeric inputs of that row; not used by the printout below
X_numerical = final_row[numerical_features]

# Round-trip the wind direction through the encoder to obtain the decoded label
X_wind_encoded = encoder.transform(final_row[["Most Wind Direction"]])
X_wind_decoded = encoder.inverse_transform(X_wind_encoded)

# Print the values
# NOTE(review): target_columns is not defined in the visible part of this
# file; presumably an earlier notebook cell provides it — confirm.
print(f"\n**Prediction Date:** {end_date.strftime('%d-%m-%Y')}\n")
for name in target_columns:
    print(f"{name}: {final_row[name].values[0]:.2f}")

# Print decoded wind direction
print(f"Most Wind Direction: {X_wind_decoded[0][0]}")






Available Provinces:
ID: 1 - Name: Nanggroe Aceh Darussalam
ID: 2 - Name: Sumatera Utara
ID: 10 - Name: Kep. Riau
ID: 4 - Name: Riau
ID: 3 - Name: Sumatera Barat
ID: 5 - Name: Jambi
ID: 6 - Name: Sumatera Selatan
ID: 9 - Name: Kep. Bangka Belitung
ID: 7 - Name: Bengkulu
ID: 8 - Name: Lampung
ID: 34 - Name: Kalimantan Utara
ID: 23 - Name: Kalimantan Timur
ID: 20 - Name: Kalimantan Barat
ID: 21 - Name: Kalimantan Tengah
ID: 22 - Name: Kalimantan Selatan
ID: 16 - Name: Banten
ID: 11 - Name: DKI Jakarta
ID: 12 - Name: Jawa Barat
ID: 13 - Name: Jawa Tengah
ID: 14 - Name: DI Yogyakarta
ID: 15 - Name: Jawa Timur
ID: 24 - Name: Sulawesi Utara
ID: 25 - Name: Sulawesi Tengah
ID: 28 - Name: Gorontalo
ID: 29 - Name: Sulawesi Barat
ID: 26 - Name: Sulawesi Selatan
ID: 27 - Name: Sulawesi Tenggara
ID: 17 - Name: Bali
ID: 18 - Name: Nusa Tenggara Barat
ID: 19 - Name: Nusa Tenggara Timur
ID: 31 - Name: Maluku Utara
ID: 33 - Name: Papua Barat
ID: 32 - Name: Papua
ID: 30 - Name: Maluku

Enter Province I