In [None]:
# Free-text description of this run; it is written to the run's log file further down.
about_the_run = input("About the run: ")

# Run mode -> 0: "single_run", 1: "keras_tuner", 2: "load_best_model"
run = 2
# Number of the saved run whose model is loaded (re-assigned again just before prediction).
load_run_number = 45
# 1 switches data/helper paths to the Google Drive mount; any other value uses local paths.
colab = 0
model_choice = "tf"
# Share of the exported rows (in percent) that is sampled before prediction.
percentage = 100
epochs = 10
batch_size = 32
limit_of_nans_in_a_timestep = 120
# Each entry is used as the `frac` of rows kept when sampling every month from April to November.
percentage_of_data_in_summer_months = [0.4]
# Number of parts passed to divide_range() for the one-hot coordinate groups.
divide_latitude_in_these_many_parts = 5   # latitude has 10 values
divide_longitude_in_these_many_parts = 6  # longitude has 14 values

# Configurations

In [None]:
import xarray as xr
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import r2_score
import plotly.express as px
import math
import time
from sklearn.linear_model import LinearRegression
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import optimizers
from kerastuner.tuners import RandomSearch
import logging
import os
import sys
from contextlib import contextmanager
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, SimpleRNN
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras import optimizers
from keras.models import load_model

In [None]:
# for the WSL conda environment
# 1. python kernel is named wslminiconda3 (Python 3.11.14)
# 2. the environment is named base and is in the directory /home/arhab/wslminiconda3
# 3. Do not use the python kernel named base

In [None]:
def get_next_folder_name(base_folder):
    """Return the first ``<base_folder>/run_<n>`` path (n = 1, 2, 3, ...) that does not exist.

    The function only probes the file system with ``os.path.exists``; it does
    not create the folder it returns.
    """
    candidate_index = 1
    candidate_path = os.path.join(base_folder, f"run_{candidate_index}")
    while os.path.exists(candidate_path):
        candidate_index += 1
        candidate_path = os.path.join(base_folder, f"run_{candidate_index}")
    return candidate_path

# Base folder under which every run gets its own "run_<n>" sub-folder.
base_directory = r"D:\thesis_data\notebooks\model_runs"

# Pick the first unused run folder and create it.
folder_name = get_next_folder_name(base_directory)
os.makedirs(folder_name, exist_ok=True)

# The folder always ends in "run_<n>", so the run number is whatever follows the LAST underscore.
# (Indexing a fixed position of split('_') only worked while the base path contained exactly
# three underscores and silently broke for any other base directory.)
run_number = f"run_{folder_name.rsplit('_', 1)[-1]}"
print(run_number)

In [None]:
# Set up logging: every message goes to <run folder>/log.txt and to the notebook output.
log_file = os.path.join(folder_name, 'log.txt')
logger = logging.getLogger(folder_name)
logger.setLevel(logging.INFO)

# logging.getLogger() returns the same object for the same name, so re-running this cell
# would otherwise stack another pair of handlers and every message would be emitted twice.
# Remove (and close) whatever is already attached before adding fresh handlers.
for stale_handler in list(logger.handlers):
    logger.removeHandler(stale_handler)
    stale_handler.close()

formatter = logging.Formatter('%(asctime)s - %(levelname)s - %(message)s')

# File handler: persists the log inside the run folder.
file_handler = logging.FileHandler(log_file)
file_handler.setFormatter(formatter)
logger.addHandler(file_handler)

# Stream handler: shows the same messages in the Jupyter Notebook console.
stream_handler = logging.StreamHandler()
stream_handler.setFormatter(formatter)
logger.addHandler(stream_handler)

# Log information
logger.info("Logging information check")
logger.info(run_number)

In [None]:
# Write the run configuration to the log, one entry per setting.
config_log_lines = [
    f"About the run: {about_the_run}",
    f"run = {run}   =>  0:single_run  1:keras_tuner 2:load_best_model",
    f"model_choice ={model_choice}",
    f"percentage = {percentage}",
    f"epochs = {epochs}",
    f"batch_size = {batch_size}",
    f"percentage_of_data_in_summer_months = {percentage_of_data_in_summer_months}",
    f"divide_latitude_in_these_many_parts = {divide_latitude_in_these_many_parts}",
    f"divide_longitude_in_these_many_parts = {divide_longitude_in_these_many_parts}",
]
for config_log_line in config_log_lines:
    logger.info(config_log_line)


In [None]:
import contextlib
import sys

@contextlib.contextmanager
def stdout_redirected(new_stdout):
    """Temporarily replace ``sys.stdout`` with ``new_stdout``.

    The previous ``sys.stdout`` is put back when the ``with`` block ends,
    including when the block exits through an exception.
    """
    previous_stdout, sys.stdout = sys.stdout, new_stdout
    try:
        yield
    finally:
        sys.stdout = previous_stdout


In [None]:
# Execute the helper notebook into this namespace with the %run magic.
# NOTE(review): helpers that are called later without being defined in this notebook
# (get_dates, to_array, divide_range, get_min_max, remove_eta_lines) presumably come from
# functions.ipynb - verify.
if colab == 1:
    # Colab branch: mount Google Drive first, because the helper notebook is read from the mount.
    # !pip install cftime
    from google.colab import drive
    drive.mount('/content/drive')
    import cftime
    %run "/content/drive/My Drive/Colab Notebooks/main/functions.ipynb"
else: 
    # Local branch: the helper notebook is resolved relative to the current working directory.
    %run "functions.ipynb"

# Data Loading

In [None]:
# Climate scenario processed by this run. It selects the input files under
# cropped_data/<scenario>/ and names the exported CSV (ig_complete_df_<scenario>.csv).
# The prediction loop further down reads that CSV for "rcp26", "rcp45" and "rcp85",
# so the export has to have been produced for all three scenarios beforehand.
scenario = "rcp45"

In [None]:
# Step 1: Load NDSI Labels
# The local path is built with os.path.join instead of a literal ".\..." string so it is not
# tied to Windows path separators; the Colab path is unchanged.
if colab == 1:
    label_path = '/content/drive/My Drive/Colab Notebooks/cropped_data/label.nc'
else:
    label_path = os.path.join("cropped_data", "label.nc")
ndsi_ds = xr.open_mfdataset(label_path)

# Disabled date filtering, kept for reference:
# filtered_dates, nan_interpolation_df = get_filtered_dates_for_ndsi(percentage_limit=percentage_limit, pixel_interpolation_limit=pixel_interpolation_limit)
# ndsi_ds = ndsi_ds.sel(time=filtered_dates)
selected_dates = get_dates(ndsi_ds)

# NOTE: the label values are extracted BEFORE the interpolation below, so `ndsi_labels`
# holds the un-interpolated data; only `ndsi_ds` itself is gap-filled.
ndsi_labels = ndsi_ds['NDSI_Snow_Cover'].values

# Fill short NaN gaps linearly, first along longitude (max gap 4), then along latitude (max gap 3).
ndsi_ds = ndsi_ds.interpolate_na(dim='lon', method='linear', max_gap=4, use_coordinate=False)
ndsi_ds = ndsi_ds.interpolate_na(dim='lat', method='linear', max_gap=3, use_coordinate=False)

# Re-create the time coordinate as datetime64[ns], keeping its attributes.
ndsi_ds['time'] = xr.DataArray(ndsi_ds['time'].values.astype('datetime64[ns]'), dims='time', attrs=ndsi_ds['time'].attrs)
ndsi_ds.close()

In [None]:
ndsi_ds

In [None]:
# Step 2: Load Data
# Read every climate variable of the selected scenario and collect the raw arrays in `data`
# (data[i] belongs to climate_vars[i]).
climate_vars = ["tas", "pr", "hurs", "psl",  "rsds", "sfcWind"]
data = []

for parameter in climate_vars:
    file_name = f"{parameter}_{scenario}.nc"
    # Same rule as for the label file: colab == 1 -> Google Drive, anything else -> local files.
    # (Two separate `== 0` / `== 1` checks left `ds` unset, or pointing at the previous
    # variable, for any other value of `colab`.)
    if colab == 1:
        parameter_path = f'/content/drive/My Drive/Colab Notebooks/cropped_data/{scenario}/{file_name}'
    else:
        parameter_path = os.path.join("cropped_data", scenario, file_name)

    ds = xr.open_mfdataset(parameter_path)
    data.append(ds[parameter].values)
    ds.close()

# After the loop `ds` still refers to the dataset of the last variable in climate_vars;
# later cells read the time axis from it.
data = np.array(data)

In [None]:
ds

# Conversion to pandas from xarray

In [None]:
# Log the first three dimensions of the first feature array and of the label array.
# n_time_steps / n_lat / n_lon end up holding the label dimensions (the second entry).
for shape_label, array_shape in (("for feature", data[0].shape), ("for label", ndsi_labels.shape)):
    n_time_steps = array_shape[0]
    n_lat, n_lon = array_shape[1], array_shape[2]
    logger.info(shape_label)
    logger.info(f"{n_time_steps} , {n_lat}, {n_lon}")


In [None]:
# One to_array() call per climate variable; temp<i> is later stored under climate_vars[i].
temp0, temp1, temp2, temp3, temp4, temp5 = (
    to_array(data, variable_index) for variable_index in range(6)
)


In [None]:
dates = ds["time"].values
lats = ndsi_ds["lat"].values
lons = ndsi_ds["lon"].values

n_dates, n_lats, n_lons = len(dates), len(lats), len(lons)

# Build one coordinate entry per (date, lat, lon) cell, with date varying slowest and
# longitude fastest. The grid size is derived from the coordinate arrays; the previous
# hard-coded 140 was only right for a 10 x 14 grid and made the Date column a different
# length from Latitude/Longitude for any other grid.
# NOTE(review): this ordering presumably matches how to_array() flattens the data - verify.

#********** Dates *************
# Each date is repeated once for every grid cell.
dates_array_to_append = np.repeat(dates, n_lats * n_lons)

#********** Latitude *************
# Within one date every latitude is repeated for all longitudes; that pattern repeats per date.
lats_array_to_append = np.tile(np.repeat(lats, n_lons), n_dates)

#********** Longitude *************
# The full longitude row repeats for every (date, latitude) pair.
lons_array_to_append = np.tile(lons, n_dates * n_lats)


logger.info(f"Length of Dates: {len(dates_array_to_append)}")
logger.info(f"Length of Latitude: {len(lats_array_to_append)}")
logger.info(f"Length of Longitude: {len(lons_array_to_append)}")

# Data cleaning, feature engineering with pandas

In [None]:
# Coordinate columns first, then one column per climate variable.
dict_temp = {
    "Date": dates_array_to_append,
    "Latitude": lats_array_to_append,
    "Longitude": lons_array_to_append,
}
flattened_variables = (temp0, temp1, temp2, temp3, temp4, temp5)
for variable_position, flattened_values in enumerate(flattened_variables):
    dict_temp[climate_vars[variable_position]] = flattened_values

# Create the DataFrame and round both coordinate columns to two decimals.
df = pd.DataFrame(dict_temp)
for coordinate_column in ("Latitude", "Longitude"):
    df[coordinate_column] = df[coordinate_column].round(2)

# Drop rows containing NaN while keeping the original index labels.
drop_nan = "yes"
df = df.dropna(ignore_index=False)


In [None]:
# Calendar features: month number, its sine/cosine encoding over a 12-month cycle,
# and the ISO week number.
date_column = df["Date"]
month_numbers = date_column.dt.month
df = df.assign(
    Month=month_numbers,
    month_sin=np.sin(2 * np.pi * month_numbers / 12),
    month_cos=np.cos(2 * np.pi * month_numbers / 12),
    week_number=date_column.dt.isocalendar().week,
)

df

In [None]:
lats = np.unique(df.Latitude.values)
lons = np.unique(df.Longitude.values)

# Bin edges for the coordinate groups, produced by the divide_range() helper.
latitude_bins = divide_range(lats[0], lats[-1], divide_latitude_in_these_many_parts)   #10 values
longitude_bins = divide_range(lons[0], lons[-1], divide_longitude_in_these_many_parts) #14 values

# NOTE(review): pd.cut uses right-closed bins and excludes the lowest edge by default, so a
# coordinate exactly equal to the first bin edge gets NaN as its group (and all-zero dummies).
# Whether that can happen depends on the edges divide_range() returns - confirm.
df['Latitude_Group'] = pd.cut(df['Latitude'], bins=latitude_bins, labels=False)
df['Longitude_Group'] = pd.cut(df['Longitude'], bins=longitude_bins, labels=False)

# Apply one-hot encoding
latitude_dummies = pd.get_dummies(df['Latitude_Group'], prefix='Latitude', dtype=int)
longitude_dummies = pd.get_dummies(df['Longitude_Group'], prefix='Longitude', dtype=int)

# Concatenate one-hot encoded columns with the original DataFrame.
# `temp` keeps the frame that still has the group columns; `df` continues without them.
df = pd.concat([df, latitude_dummies, longitude_dummies], axis=1)
temp = df
df = df.drop(["Latitude_Group", "Longitude_Group"], axis=1)

# Log which coordinates fell into which group, using the rows of a single date.
# The date is picked by position: the index kept its original labels through
# dropna(ignore_index=False), so a label lookup such as temp.Date[1] raises KeyError
# whenever row 1 was among the dropped rows.
reference_date = temp["Date"].iloc[0]
temp = temp[temp["Date"] == reference_date]
grouped_latitudes = temp.groupby('Latitude_Group')['Latitude'].unique().to_dict()
logger.info("Latitude Group")
logger.info("{\n" + ",\n".join(f" {key}: {value}" for key, value in grouped_latitudes.items()) + "\n}")

grouped_longitudes = temp.groupby('Longitude_Group')['Longitude'].unique().to_dict()
logger.info("Longitude Group")
logger.info("{\n" + ",\n".join(f" {key}: {value}" for key, value in grouped_longitudes.items()) + "\n}")


In [None]:
#bookmark
print(df.columns)

In [None]:
# Month and week_number are not kept as features; only the sine/cosine month encoding stays.
df = df.drop(columns=["Month", "week_number"])

In [None]:
# Min-max scale every climate variable with the bounds that get_min_max() returns
# for it from the min/max CSV.
parameter_array = ["tas", "pr", "hurs", "psl",  "rsds", "sfcWind"]
min_max_df = pd.read_csv("./min_max_of_all_parameters.csv")
for parameter in parameter_array:
    parameter_bounds = get_min_max(min_max_df, parameter)
    min_val, max_val = parameter_bounds
    value_range = max_val - min_val
    df[parameter] = (df[parameter] - min_val) / value_range

In [None]:
# Keep a second name for the fully prepared frame so the sampling loop can start from it.
# NOTE: this binds another name to the SAME DataFrame object - no copy is made, so any
# in-place change made through `df` is visible through `df_insurance` as well.
df_insurance = df

In [None]:
# b_array = [0.1, 0.15, 0.2, 0.25, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99]
# Every value b is the fraction of rows kept from each of the months April..November;
# the rows of January, February, March and December are always kept completely.
# Only the result of the LAST b remains in `df` after the loop.
zero_values_array = []
b_array = percentage_of_data_in_summer_months
for b in b_array:

    # Start every pass from a fresh copy. `df_insurance` is only another name for the
    # prepared frame, so adding the Month column in place would change the "backup" too.
    df_all_months = df_insurance.copy()
    df_all_months["Month"] = df_all_months["Date"].dt.month
    logger.info(f"Number of datapoints in Original df: {len(df_all_months)}")

    # Step 6: Display the final value of b
    logger.info(f"Value of b: {b}")

    # Rows that are kept in full.
    df_kept_months = df_all_months[df_all_months["Month"].isin([1, 2, 3, 12])]

    # Sample a fraction b of every remaining month (4..11). The Month column computed above
    # is reused instead of re-deriving the month from Date for each of the eight months.
    final_sampled_data = pd.concat([
        df_all_months[df_all_months["Month"] == month].sample(frac=b, random_state=1)
        for month in range(4, 12)
    ])

    # Combine both parts and restore the original row order.
    df = pd.concat([final_sampled_data, df_kept_months], ignore_index=False)
    df = df.sort_index()
    logger.info(f"Number of datapoints after sampling df: {len(df)}")


In [None]:
# Export the prepared frame for the current scenario, but only for full (100 %) runs.
# The index is written as well (pandas default), which is why the file comes back with an
# extra "Unnamed: 0" column that the prediction loop drops again.
if percentage == 100:
    df.to_csv(f"./ig_complete_df_{scenario}.csv")

#bookmark2

# Prediction: load the saved model and predict for each scenario

In [None]:
# Settings for the prediction step. These re-assign the values chosen in the configuration
# cell at the top: all exported rows are used and the model of run 51 is loaded.
percentage, load_run_number = 100, 51

In [None]:
scenario_array = ["rcp26", "rcp45", "rcp85"]
prediction_array = []

# --- Loop-invariant setup -------------------------------------------------------------
# Device configuration and model loading do not depend on the scenario, so they are done
# once here instead of being repeated (and the model re-read from disk) for every scenario.
physical_devices = tf.config.list_physical_devices('GPU')
if len(physical_devices) > 0:
    logger.info('GPU found. Running on GPU.')
    # Memory growth is configured before the model is loaded, as in the first pass of the
    # previous per-scenario version.
    tf.config.experimental.set_memory_growth(physical_devices[0], True)
else:
    logger.info('No GPU found. Running on CPU.')

# Same location as before, but derived from base_directory so it follows the run folders.
model = load_model(os.path.join(base_directory, f"run_{load_run_number}", "model.h5"))

# --- One prediction per scenario --------------------------------------------------------
# NOTE: the loop re-binds the notebook-level names `scenario` and `df`; after it they hold
# the last scenario and its frame, which later cells use for Date/Latitude/Longitude.
for scenario in scenario_array:
    df = pd.read_csv(f"./ig_complete_df_{scenario}.csv")
    df = df.sample(frac=percentage / 100, random_state=1)
    df = df.sort_index()
    logger.info(f"Number of rows: {len(df)}")

    # Everything except the identifying columns is a model input.
    X_og = df.drop(["Date", "Latitude", "Longitude", "Month", "Unnamed: 0"], axis=1).values
    temp_shape = X_og.shape[1]
    X = X_og.reshape(-1, 1, temp_shape)  # (rows, 1, features)
    logger.info(f"X shape is: {X.shape}")

    # Convert data to TensorFlow tensors
    X_tensor = tf.convert_to_tensor(X, dtype=tf.float32)

    y = model.predict(X_tensor)
    remove_eta_lines(run_number)
    prediction_array.append(y)



In [None]:
# Turn the list of per-scenario predictions into one array; entry i belongs to
# scenario_array[i] because predictions were appended in scenario order.
# NOTE(review): this presumably relies on every scenario yielding the same number of rows - verify.
prediction_array = np.array(prediction_array)

# Plotting

In [None]:
# Assemble one table holding the coordinates of every row plus one prediction column per
# scenario. Date/Latitude/Longitude are read from `df` as left behind by the scenario loop.
comparison_columns = {
    "Date": df["Date"].values,
    "Latitude": df["Latitude"].values,
    "Longitude": df["Longitude"].values,
}
for scenario_position in range(3):
    column_name = f"{scenario_array[scenario_position]}"
    comparison_columns[column_name] = prediction_array[scenario_position].flatten()

comparison_df = pd.DataFrame(comparison_columns)
comparison_df['Date'] = pd.to_datetime(comparison_df['Date'])

# Optional export, currently disabled:
# df_image_path = os.path.join(folder_name, 'val_vs_predicted.csv')
# comparison_df.to_csv(df_image_path)
comparison_df

In [None]:
# comparison_df, scale = comparison_df.sort_values(by='Date'), "Original without mean"
# fig = px.scatter(comparison_df, x="Date" , y=scenario)
# fig.update_layout(title=f'{scenario} | {scale}', width = 1800, height = 400)

# # plotly_image_path = os.path.join(folder_name, 'Original_without_mean.png')
# # fig.write_image(plotly_image_path)

# fig

In [None]:
# Time series of all scenarios at one fixed grid point.
# Coordinates are matched with a small tolerance instead of exact float equality, so the
# selection does not silently come back empty when a value differs in its last bits.
centre_lat, centre_lon = 46.58, 11.15
is_centre = (
    np.isclose(comparison_df['Latitude'], centre_lat, rtol=0, atol=1e-4)
    & np.isclose(comparison_df['Longitude'], centre_lon, rtol=0, atol=1e-4)
)
comparison_df_1 = comparison_df[is_centre]
scale = "Centre without mean"
comparison_df_1 = comparison_df_1.sort_values(by='Date')
fig = px.line(comparison_df_1, x="Date" , y = scenario_array)
fig.update_layout(title=f'{scenario_array} | {scale}', width = 1800, height = 400)

# plotly_image_path = os.path.join(folder_name, 'Centre_without_mean.png')
# fig.write_image(plotly_image_path)

fig

In [None]:
# top left
# `lats` / `lons` are the sorted unique coordinates computed in the feature-engineering cell.
temp_lat =  lats[-3]
temp_lon = lons[2]
# Tolerance match instead of exact float equality: the coordinates in comparison_df come
# from the CSV while `lats`/`lons` come from the in-memory frame, so their dtypes may differ.
is_selected = (
    np.isclose(comparison_df['Latitude'], temp_lat, rtol=0, atol=1e-4)
    & np.isclose(comparison_df['Longitude'], temp_lon, rtol=0, atol=1e-4)
)
comparison_df_1 = comparison_df[is_selected]
scale = "Top left"
comparison_df_1 = comparison_df_1.sort_values(by='Date')
fig = px.line(comparison_df_1, x="Date" , y = scenario_array)
fig.update_layout(title=f'{scenario_array} | {scale}', width = 1800, height = 400)

# Save the figure into the run folder under the name of the selected point.
plotly_image_path = os.path.join(folder_name, f"{scale}.png")
fig.write_image(plotly_image_path)

fig

In [None]:
# top right
# `lats` / `lons` are the sorted unique coordinates computed in the feature-engineering cell.
temp_lat =  lats[-2]
temp_lon = lons[-2]
# Tolerance match instead of exact float equality: the coordinates in comparison_df come
# from the CSV while `lats`/`lons` come from the in-memory frame, so their dtypes may differ.
is_selected = (
    np.isclose(comparison_df['Latitude'], temp_lat, rtol=0, atol=1e-4)
    & np.isclose(comparison_df['Longitude'], temp_lon, rtol=0, atol=1e-4)
)
comparison_df_1 = comparison_df[is_selected]
scale = "Top Right"
comparison_df_1 = comparison_df_1.sort_values(by='Date')
fig = px.line(comparison_df_1, x="Date" , y = scenario_array)
fig.update_layout(title=f'{scenario_array} | {scale}', width = 1800, height = 400)

# Save the figure into the run folder under the name of the selected point.
plotly_image_path = os.path.join(folder_name, f"{scale}.png")
fig.write_image(plotly_image_path)

fig

In [None]:
# bottom left
# `lats` / `lons` are the sorted unique coordinates computed in the feature-engineering cell.
temp_lat =  lats[1]
temp_lon = lons[2]
# Tolerance match instead of exact float equality: the coordinates in comparison_df come
# from the CSV while `lats`/`lons` come from the in-memory frame, so their dtypes may differ.
is_selected = (
    np.isclose(comparison_df['Latitude'], temp_lat, rtol=0, atol=1e-4)
    & np.isclose(comparison_df['Longitude'], temp_lon, rtol=0, atol=1e-4)
)
comparison_df_1 = comparison_df[is_selected]
scale = "Bottom left"
comparison_df_1 = comparison_df_1.sort_values(by='Date')
fig = px.line(comparison_df_1, x="Date" , y = scenario_array)
fig.update_layout(title=f'{scenario_array} | {scale}', width = 1800, height = 400)

# Save the figure into the run folder under the name of the selected point.
plotly_image_path = os.path.join(folder_name, f"{scale}.png")
fig.write_image(plotly_image_path)

fig

In [None]:
# bottom right
# `lats` / `lons` are the sorted unique coordinates computed in the feature-engineering cell.
temp_lat =  lats[1]
temp_lon = lons[-4]
# Tolerance match instead of exact float equality: the coordinates in comparison_df come
# from the CSV while `lats`/`lons` come from the in-memory frame, so their dtypes may differ.
is_selected = (
    np.isclose(comparison_df['Latitude'], temp_lat, rtol=0, atol=1e-4)
    & np.isclose(comparison_df['Longitude'], temp_lon, rtol=0, atol=1e-4)
)
comparison_df_1 = comparison_df[is_selected]
scale = "Bottom right"
comparison_df_1 = comparison_df_1.sort_values(by='Date')
fig = px.line(comparison_df_1, x="Date" , y = scenario_array)
fig.update_layout(title=f'{scenario_array} | {scale}', width = 1800, height = 400)

# Save the figure into the run folder under the name of the selected point.
plotly_image_path = os.path.join(folder_name, f"{scale}.png")
fig.write_image(plotly_image_path)

fig

In [None]:
# Mean over all grid points per day, one series per scenario.
# Work on a copy: a plain assignment is only another name for comparison_df, so the
# strftime() below used to overwrite the shared 'Date' column with strings and changed
# what every later (or re-run) cell saw.
comparison_df2 = comparison_df.copy()
comparison_df2['Date'] = pd.to_datetime(comparison_df2['Date'], dayfirst=True)
scale = "Average Daily"

# Group by calendar day (as 'dd-mm-YYYY' text), then turn the keys back into datetimes.
comparison_df2['Date'] = comparison_df2['Date'].dt.strftime('%d-%m-%Y')
comparison_df2 = comparison_df2.groupby('Date')[scenario_array].mean().reset_index()
comparison_df2['Date'] = pd.to_datetime(comparison_df2['Date'], format='%d-%m-%Y')
comparison_df2 = comparison_df2.sort_values(by='Date')

fig = px.scatter(comparison_df2, x="Date" , y=scenario_array)
fig.update_layout(title=f'{scenario_array} | {scale}', width = 1800, height = 400)

plotly_image_path = os.path.join(folder_name, f"{scale}.png")
fig.write_image(plotly_image_path)
fig

In [None]:
# Same daily means as the scatter plot of the previous cell, drawn as lines.
fig = px.line(comparison_df2, x="Date" , y=scenario_array)
fig.update_layout(title=f'{scenario_array} | {scale}', width = 1800, height = 400)

# `scale` is still the label set by the previous cell, so "<scale>.png" is the file the
# scatter plot was just saved to. Use a distinct name so this figure does not overwrite it.
plotly_image_path = os.path.join(folder_name, f"{scale} line.png")
fig.write_image(plotly_image_path)
fig

In [None]:
# Mean over all grid points per calendar month, one series per scenario.
# Work on a copy: a plain assignment is only another name for comparison_df, so the
# strftime() below used to replace the shared 'Date' column with 'mm-YYYY' strings and
# threw away the day information for every later (or re-run) cell.
comparison_df_3 = comparison_df.copy()
comparison_df_3['Date'] = pd.to_datetime(comparison_df_3['Date'], dayfirst=True)
scale = "Average monthly"

# Group by month (as 'mm-YYYY' text), then parse the keys back with the same explicit format
# so the result does not depend on pandas' format guessing.
comparison_df_3['Date'] = comparison_df_3['Date'].dt.strftime('%m-%Y')
comparison_df_3 = comparison_df_3.groupby('Date')[scenario_array].mean().reset_index()
comparison_df_3['Date'] = pd.to_datetime(comparison_df_3['Date'], format='%m-%Y')
comparison_df_3 = comparison_df_3.sort_values(by='Date')

fig = px.line(comparison_df_3, x="Date" , y=scenario_array)
fig.update_layout(title=f'{scenario_array} | {scale}', width = 1800, height = 400)

plotly_image_path = os.path.join(folder_name, f"{scale}.png")
fig.write_image(plotly_image_path)

fig

In [None]:
# Mean over all grid points per calendar year, one series per scenario.
scale = "Average yearly"

# Copy first so comparison_df itself is left alone, then reduce every date to its year text.
comparison_df_4 = comparison_df.copy()
parsed_dates = pd.to_datetime(comparison_df_4['Date'], dayfirst=True)
comparison_df_4['Date'] = parsed_dates.dt.strftime('%Y')

# Average per year, turn the year text back into datetimes and order chronologically.
yearly_means = comparison_df_4.groupby('Date')[scenario_array].mean().reset_index()
yearly_means['Date'] = pd.to_datetime(yearly_means['Date'])
comparison_df_4 = yearly_means.sort_values(by='Date')

# Draw and save the figure.
fig = px.line(comparison_df_4, x="Date", y=scenario_array)
fig.update_layout(title=f'{scenario_array} | {scale}', width=1800, height=400)

plotly_image_path = os.path.join(folder_name, f"{scale}.png")
fig.write_image(plotly_image_path)

fig.show()