# Import all the necessary packages

In [58]:
import pandas as pd
import glob
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error
import math

# Load the data

In [6]:
import numpy as np
import os

# List all files in dir
files = os.listdir('/Users/annastonek/Desktop/Studium Unterlagen/AI_ML_Climate_Change/LamaH-CE/A_basins_total_upstrm/2_timeseries/daily')

# Select 0.5 of the files randomly 
random_files = np.random.choice(files, 100)
random_files

array(['ID_434.csv', 'ID_143.csv', 'ID_296.csv', 'ID_135.csv',
       'ID_485.csv', 'ID_772.csv', 'ID_804.csv', 'ID_437.csv',
       'ID_336.csv', 'ID_379.csv', 'ID_479.csv', 'ID_356.csv',
       'ID_731.csv', 'ID_589.csv', 'ID_47.csv', 'ID_647.csv',
       'ID_330.csv', 'ID_620.csv', 'ID_680.csv', 'ID_27.csv',
       'ID_326.csv', 'ID_611.csv', 'ID_191.csv', 'ID_112.csv',
       'ID_437.csv', 'ID_449.csv', 'ID_787.csv', 'ID_105.csv',
       'ID_641.csv', 'ID_867.csv', 'ID_166.csv', 'ID_586.csv',
       'ID_436.csv', 'ID_411.csv', 'ID_248.csv', 'ID_524.csv',
       'ID_573.csv', 'ID_173.csv', 'ID_520.csv', 'ID_466.csv',
       'ID_648.csv', 'ID_181.csv', 'ID_179.csv', 'ID_816.csv',
       'ID_676.csv', 'ID_45.csv', 'ID_663.csv', 'ID_731.csv',
       'ID_697.csv', 'ID_87.csv', 'ID_811.csv', 'ID_742.csv',
       'ID_749.csv', 'ID_52.csv', 'ID_153.csv', 'ID_50.csv', 'ID_387.csv',
       'ID_510.csv', 'ID_643.csv', 'ID_95.csv', 'ID_194.csv',
       'ID_307.csv', 'ID_749.csv', 'ID_359.csv',

In [16]:
# Get CSV files list from a folder
path = '/Users/annastonek/Desktop/Studium Unterlagen/AI_ML_Climate_Change/LamaH-CE/A_basins_total_upstrm/2_timeseries/daily/'
csv_files = []
for x in random_files:
    csv_files.append(os.path.join(path, x))

# Read each CSV file into DataFrame
# This creates a list of dataframes
counter = 0
df_list = []
for file in csv_files:
    aux = pd.read_csv(file, sep=';')
    aux['Region'] = counter
    df_list.append(aux)
    counter = counter + 1

# Pipeline

Now, we will create a regression tree from every dataset in the list and calculate the predictions.

In [53]:
predictions = []
test_values = []
for df in df_list:
    # separate the target column from the feature columns
    y = df[['prec']]
    X = df.drop(['prec', '2m_temp_min', '2m_temp_max', '2m_dp_temp_min', '2m_dp_temp_mean', '2m_dp_temp_max'], axis=1)
    # split the data
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
    
    # preprocessing - scale the data
    scaler = MinMaxScaler()
    X_train[['2m_temp_mean', '10m_wind_u', '10m_wind_v',
       'fcst_alb', 'lai_high_veg', 'lai_low_veg', 'swe',
       'surf_net_solar_rad_max', 'surf_net_solar_rad_mean',
       'surf_net_therm_rad_max', 'surf_net_therm_rad_mean', 'surf_press',
       'total_et', 'volsw_123', 'volsw_4']] = scaler.fit_transform(X_train[['2m_temp_mean', '10m_wind_u', '10m_wind_v',
       'fcst_alb', 'lai_high_veg', 'lai_low_veg', 'swe',
       'surf_net_solar_rad_max', 'surf_net_solar_rad_mean',
       'surf_net_therm_rad_max', 'surf_net_therm_rad_mean', 'surf_press',
       'total_et', 'volsw_123', 'volsw_4']])

    y_train = scaler.fit_transform(y_train)

    X_test[['2m_temp_mean', '10m_wind_u', '10m_wind_v',
       'fcst_alb', 'lai_high_veg', 'lai_low_veg', 'swe',
       'surf_net_solar_rad_max', 'surf_net_solar_rad_mean',
       'surf_net_therm_rad_max', 'surf_net_therm_rad_mean', 'surf_press',
       'total_et', 'volsw_123', 'volsw_4']] = scaler.fit_transform(X_test[['2m_temp_mean', '10m_wind_u', '10m_wind_v',
       'fcst_alb', 'lai_high_veg', 'lai_low_veg', 'swe',
       'surf_net_solar_rad_max', 'surf_net_solar_rad_mean',
       'surf_net_therm_rad_max', 'surf_net_therm_rad_mean', 'surf_press',
       'total_et', 'volsw_123', 'volsw_4']])

    y_test = scaler.fit_transform(y_test)
    
    #build tree
    tree = DecisionTreeRegressor().fit(X_train, y_train)
    #calculate predictions
    y_pred = tree.predict(X_test)
    
    predictions.append(y_pred)
    test_values.append(y_test)

# Evaluation

In [60]:
y_pred = pd.DataFrame(predictions)

y_true = []
for i in range(len(test_values)):
    y_true.append(np.array(test_values[0]).flatten())

y_true = pd.DataFrame(y_true)
print("MSE: ", mean_squared_error(y_true.mean(), y_pred.mean()))
print("RMSE: ", math.sqrt(mean_squared_error(y_true.mean(), y_pred.mean())))

MSE:  0.004984132715314812
RMSE:  0.07059839031674031
