# Requirements

In [5]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler, LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, KFold, RepeatedKFold, GridSearchCV, cross_val_predict
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import classification_report, accuracy_score, f1_score, confusion_matrix, mean_squared_error, mean_absolute_error, r2_score
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBClassifier, XGBRegressor

In [None]:
# Add as many imports as you need.

# Laboratory Exercise - Run Mode (8 points)

## Introduction
In this laboratory assignment, the focus is on time series forecasting, specifically targeting the prediction of the current **average sea-level pressure** in the city of Skopje. Your task involves employing bagging and boosting methods to forecast the average sea-level pressure. To accomplish this, you will use data from the preceding three days, consisting of average, minimal, and maximal temperatures, precipitation, as well as wind direction and speed, and the current season. By applying these ensemble learning techniques, you aim to enhance the accuracy and reliability of your predictions, gaining valuable insights into the temporal dynamics of sea-level pressure based on the given meteorological variables.

**Note: You are required to perform this laboratory assignment on your local machine.**

## The Weather Dataset

## Exploring the Weather Dataset
This dataset consists of daily weather records for the city of Skopje from January 1, 2021, to August 1, 2023. Each entry includes a unique station ID, city name, date, corresponding season (e.g., summer, winter), and various meteorological parameters such as average, minimum, and maximum temperatures in Celsius, precipitation in millimeters, average wind direction in degrees, average wind speed in kilometers per hour, and average sea-level pressure in hectopascals. The dataset offers comprehensive insights into the climatic conditions, allowing for analysis and exploration of weather patterns in Skopje over the specified time period.

The dataset comprises the following columns:
- station_id - unique ID for the weather station,
- city_name - name of the city where the station is located,
- date - date of the weather record,
- season - season corresponding to the date (e.g., summer, winter),
- avg_temp_c - average temperature in Celsius,
- min_temp_c - minimum temperature in Celsius,
- max_temp_c - maximum temperature in Celsius,
- precipitation_mm - precipitation in millimeters,
- avg_wind_dir_deg - average wind direction in degrees,
- avg_wind_speed_kmh - average wind speed in kilometers per hour, and
- avg_sea_level_pres_hpa - average sea-level pressure in hectopascals.

*Note: The dataset is complete, with no missing values in any of its entries.*

Load the dataset into a `pandas` data frame.

In [7]:
# Read the daily weather records from weather.csv (path is relative to the
# notebook's working directory) and display the resulting frame.
data = pd.read_csv('weather.csv')
data

Unnamed: 0,station_id,city_name,date,season,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,avg_wind_dir_deg,avg_wind_speed_kmh,avg_sea_level_pres_hpa
0,13588,Skopje,2021-01-01,Winter,5.1,0.5,13.2,0.0,330.0,5.9,1021.2
1,13588,Skopje,2021-01-02,Winter,3.0,-2.6,11.2,0.0,330.0,5.9,1021.2
2,13588,Skopje,2021-01-03,Winter,6.8,3.5,12.5,1.3,339.0,8.0,1017.8
3,13588,Skopje,2021-01-04,Winter,6.6,6.1,7.2,3.6,298.0,5.3,1011.3
4,13588,Skopje,2021-01-05,Winter,4.3,2.3,6.7,4.6,11.0,5.1,1014.5
...,...,...,...,...,...,...,...,...,...,...,...
938,13588,Skopje,2023-07-28,Summer,22.8,12.3,32.7,0.0,2.0,6.8,1014.6
939,13588,Skopje,2023-07-29,Summer,26.3,16.3,35.4,0.0,261.0,6.2,1011.7
940,13588,Skopje,2023-07-30,Summer,28.2,19.5,36.4,0.0,317.0,8.0,1009.8
941,13588,Skopje,2023-07-31,Summer,25.8,20.9,32.1,0.0,307.0,12.3,1010.9


Explore the dataset using visualizations of your choice.

In [9]:
# Summarise column names, dtypes and non-null counts.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 943 entries, 0 to 942
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   station_id              943 non-null    int64  
 1   city_name               943 non-null    object 
 2   date                    943 non-null    object 
 3   season                  943 non-null    object 
 4   avg_temp_c              943 non-null    float64
 5   min_temp_c              943 non-null    float64
 6   max_temp_c              943 non-null    float64
 7   precipitation_mm        943 non-null    float64
 8   avg_wind_dir_deg        943 non-null    float64
 9   avg_wind_speed_kmh      943 non-null    float64
 10  avg_sea_level_pres_hpa  943 non-null    float64
dtypes: float64(7), int64(1), object(3)
memory usage: 81.2+ KB


In [11]:
# Pairwise correlation between the numeric weather measurements
# (identifier and text columns are left out).
numeric_columns = [
    'avg_temp_c',
    'min_temp_c',
    'max_temp_c',
    'precipitation_mm',
    'avg_wind_dir_deg',
    'avg_wind_speed_kmh',
    'avg_sea_level_pres_hpa',
]
data.loc[:, numeric_columns].corr()

Unnamed: 0,avg_temp_c,min_temp_c,max_temp_c,precipitation_mm,avg_wind_dir_deg,avg_wind_speed_kmh,avg_sea_level_pres_hpa
avg_temp_c,1.0,0.955178,0.976177,-0.056634,0.025137,-0.065397,-0.393837
min_temp_c,0.955178,1.0,0.898194,0.04543,0.016872,-0.028296,-0.451426
max_temp_c,0.976177,0.898194,1.0,-0.117093,0.037166,-0.099911,-0.339306
precipitation_mm,-0.056634,0.04543,-0.117093,1.0,-0.01124,0.050021,-0.193343
avg_wind_dir_deg,0.025137,0.016872,0.037166,-0.01124,1.0,0.099502,0.088692
avg_wind_speed_kmh,-0.065397,-0.028296,-0.099911,0.050021,0.099502,1.0,-0.157845
avg_sea_level_pres_hpa,-0.393837,-0.451426,-0.339306,-0.193343,0.088692,-0.157845,1.0


Remove the highly correlated features.

In [38]:
# corr_data = pd.DataFrame(data[['avg_temp_c','min_temp_c', 'max_temp_c', 'precipitation_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh', 'avg_sea_level_pres_hpa']].corr())
# corr_data
# threshold = 0.85
# high_corr_pairs = []
# for i in range(len(corr_data.columns)):
#     for j in range(i + 1, len(corr_data.columns)):
#         if abs(corr_data.iloc[i, j]) > threshold:
#             high_corr_pairs.append((corr_data.index[i], corr_data.columns[j]))

# print("Highly correlated pairs:", high_corr_pairs)

# to_remove = set()
# for pair in high_corr_pairs:
#     to_remove.add(pair[1])  # Arbitrarily choose to remove the second feature

# print("Features to remove:", to_remove)

# reduced_corr_matrix = corr_data.drop(columns=to_remove, index=to_remove)
# print("Reduced Correlation Matrix:\n", reduced_corr_matrix)

In [13]:
# Remove the station/city identifier columns together with the min/max
# temperatures, which the correlation table shows to be almost collinear
# with avg_temp_c.
columns_to_drop = ['city_name', 'min_temp_c', 'max_temp_c', 'station_id']
data = data.drop(columns=columns_to_drop)
data

Unnamed: 0,date,season,avg_temp_c,precipitation_mm,avg_wind_dir_deg,avg_wind_speed_kmh,avg_sea_level_pres_hpa
0,2021-01-01,Winter,5.1,0.0,330.0,5.9,1021.2
1,2021-01-02,Winter,3.0,0.0,330.0,5.9,1021.2
2,2021-01-03,Winter,6.8,1.3,339.0,8.0,1017.8
3,2021-01-04,Winter,6.6,3.6,298.0,5.3,1011.3
4,2021-01-05,Winter,4.3,4.6,11.0,5.1,1014.5
...,...,...,...,...,...,...,...
938,2023-07-28,Summer,22.8,0.0,2.0,6.8,1014.6
939,2023-07-29,Summer,26.3,0.0,261.0,6.2,1011.7
940,2023-07-30,Summer,28.2,0.0,317.0,8.0,1009.8
941,2023-07-31,Summer,25.8,0.0,307.0,12.3,1010.9


Encode the categorical features.

In [15]:
def label_data(data: pd.DataFrame, columns: list) -> pd.DataFrame:
    """Return a copy of ``data`` with the given columns integer-encoded.

    Every listed column is cast to ``str`` and its distinct values are mapped
    to the codes ``0 .. n-1`` following the sorted (lexicographic) order of
    those strings, which is the same mapping a freshly fitted ``LabelEncoder``
    yields. The input frame is never modified.

    Parameters
    ----------
    data : pd.DataFrame
        Frame holding the columns to encode.
    columns : list
        Names of the columns to replace with integer codes.

    Returns
    -------
    pd.DataFrame
        A copy of ``data`` in which each listed column holds integer codes;
        all other columns are unchanged.

    Raises
    ------
    KeyError
        If a name in ``columns`` is not a column of ``data``.
    """
    data_copy = data.copy()

    for column in columns:
        # sort=True makes the codes depend only on the set of values, not on
        # the order in which they first appear in the column.
        codes, _ = pd.factorize(data_copy[column].astype(str), sort=True)
        data_copy[column] = codes

    return data_copy

In [17]:
# Replace the textual season names with integer codes (see label_data) and
# display the encoded frame.
data = label_data(data=data, columns=['season'])
data

Unnamed: 0,date,season,avg_temp_c,precipitation_mm,avg_wind_dir_deg,avg_wind_speed_kmh,avg_sea_level_pres_hpa
0,2021-01-01,3,5.1,0.0,330.0,5.9,1021.2
1,2021-01-02,3,3.0,0.0,330.0,5.9,1021.2
2,2021-01-03,3,6.8,1.3,339.0,8.0,1017.8
3,2021-01-04,3,6.6,3.6,298.0,5.3,1011.3
4,2021-01-05,3,4.3,4.6,11.0,5.1,1014.5
...,...,...,...,...,...,...,...
938,2023-07-28,2,22.8,0.0,2.0,6.8,1014.6
939,2023-07-29,2,26.3,0.0,261.0,6.2,1011.7
940,2023-07-30,2,28.2,0.0,317.0,8.0,1009.8
941,2023-07-31,2,25.8,0.0,307.0,12.3,1010.9


# Feature Extraction
Select the relevant features for prediction and apply a lag of one, two, and three days to each chosen feature (except `season`), creating a set of features representing the meteorological conditions from the previous three days. To maintain dataset integrity, eliminate any resulting missing values at the beginning of the dataset.

Hint: Use `df['column_name'].shift(period)`. Check the documentation at https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.shift.html.

In [19]:
# Index the records by date and order them by that index so that the
# row-wise shifts applied later step back one day at a time.
data = data.set_index('date').sort_index()
data

Unnamed: 0_level_0,season,avg_temp_c,precipitation_mm,avg_wind_dir_deg,avg_wind_speed_kmh,avg_sea_level_pres_hpa
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2021-01-01,3,5.1,0.0,330.0,5.9,1021.2
2021-01-02,3,3.0,0.0,330.0,5.9,1021.2
2021-01-03,3,6.8,1.3,339.0,8.0,1017.8
2021-01-04,3,6.6,3.6,298.0,5.3,1011.3
2021-01-05,3,4.3,4.6,11.0,5.1,1014.5
...,...,...,...,...,...,...
2023-07-28,2,22.8,0.0,2.0,6.8,1014.6
2023-07-29,2,26.3,0.0,261.0,6.2,1011.7
2023-07-30,2,28.2,0.0,317.0,8.0,1009.8
2023-07-31,2,25.8,0.0,307.0,12.3,1010.9


In [21]:
# Number of preceding days to expose as lagged columns.
lag = 3

# Variables that receive lagged copies; `season` is deliberately not lagged.
lagged_variables = [
    'avg_temp_c',
    'precipitation_mm',
    'avg_wind_dir_deg',
    'avg_wind_speed_kmh',
    'avg_sea_level_pres_hpa',
]

for i in range(1, lag + 1):
    # "<variable>_prev_<i>" holds the value recorded i rows (days) earlier.
    lagged_names = [f'{name}_prev_{i}' for name in lagged_variables]
    data[lagged_names] = data[lagged_variables].shift(i)

In [23]:
# Display the frame with the newly added *_prev_<i> lag columns.
data

Unnamed: 0_level_0,season,avg_temp_c,precipitation_mm,avg_wind_dir_deg,avg_wind_speed_kmh,avg_sea_level_pres_hpa,avg_temp_c_prev_1,precipitation_mm_prev_1,avg_wind_dir_deg_prev_1,avg_wind_speed_kmh_prev_1,...,avg_temp_c_prev_2,precipitation_mm_prev_2,avg_wind_dir_deg_prev_2,avg_wind_speed_kmh_prev_2,avg_sea_level_pres_hpa_prev_2,avg_temp_c_prev_3,precipitation_mm_prev_3,avg_wind_dir_deg_prev_3,avg_wind_speed_kmh_prev_3,avg_sea_level_pres_hpa_prev_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-01,3,5.1,0.0,330.0,5.9,1021.2,,,,,...,,,,,,,,,,
2021-01-02,3,3.0,0.0,330.0,5.9,1021.2,5.1,0.0,330.0,5.9,...,,,,,,,,,,
2021-01-03,3,6.8,1.3,339.0,8.0,1017.8,3.0,0.0,330.0,5.9,...,5.1,0.0,330.0,5.9,1021.2,,,,,
2021-01-04,3,6.6,3.6,298.0,5.3,1011.3,6.8,1.3,339.0,8.0,...,3.0,0.0,330.0,5.9,1021.2,5.1,0.0,330.0,5.9,1021.2
2021-01-05,3,4.3,4.6,11.0,5.1,1014.5,6.6,3.6,298.0,5.3,...,6.8,1.3,339.0,8.0,1017.8,3.0,0.0,330.0,5.9,1021.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2023-07-28,2,22.8,0.0,2.0,6.8,1014.6,20.7,0.0,316.0,16.0,...,28.2,0.0,242.0,13.4,1002.9,29.2,0.0,275.0,4.7,1007.7
2023-07-29,2,26.3,0.0,261.0,6.2,1011.7,22.8,0.0,2.0,6.8,...,20.7,0.0,316.0,16.0,1012.6,28.2,0.0,242.0,13.4,1002.9
2023-07-30,2,28.2,0.0,317.0,8.0,1009.8,26.3,0.0,261.0,6.2,...,22.8,0.0,2.0,6.8,1014.6,20.7,0.0,316.0,16.0,1012.6
2023-07-31,2,25.8,0.0,307.0,12.3,1010.9,28.2,0.0,317.0,8.0,...,26.3,0.0,261.0,6.2,1011.7,22.8,0.0,2.0,6.8,1014.6


In [25]:
# Count missing values per column; the shifts leave NaNs in the first rows.
data.isnull().sum()

season                           0
avg_temp_c                       0
precipitation_mm                 0
avg_wind_dir_deg                 0
avg_wind_speed_kmh               0
avg_sea_level_pres_hpa           0
avg_temp_c_prev_1                1
precipitation_mm_prev_1          1
avg_wind_dir_deg_prev_1          1
avg_wind_speed_kmh_prev_1        1
avg_sea_level_pres_hpa_prev_1    1
avg_temp_c_prev_2                2
precipitation_mm_prev_2          2
avg_wind_dir_deg_prev_2          2
avg_wind_speed_kmh_prev_2        2
avg_sea_level_pres_hpa_prev_2    2
avg_temp_c_prev_3                3
precipitation_mm_prev_3          3
avg_wind_dir_deg_prev_3          3
avg_wind_speed_kmh_prev_3        3
avg_sea_level_pres_hpa_prev_3    3
dtype: int64

In [27]:
# Discard every row that still contains a NaN, i.e. the leading rows for
# which not all lagged values exist, then preview the first ten rows.
data = data.dropna(axis='index')
data.head(n=10)

Unnamed: 0_level_0,season,avg_temp_c,precipitation_mm,avg_wind_dir_deg,avg_wind_speed_kmh,avg_sea_level_pres_hpa,avg_temp_c_prev_1,precipitation_mm_prev_1,avg_wind_dir_deg_prev_1,avg_wind_speed_kmh_prev_1,...,avg_temp_c_prev_2,precipitation_mm_prev_2,avg_wind_dir_deg_prev_2,avg_wind_speed_kmh_prev_2,avg_sea_level_pres_hpa_prev_2,avg_temp_c_prev_3,precipitation_mm_prev_3,avg_wind_dir_deg_prev_3,avg_wind_speed_kmh_prev_3,avg_sea_level_pres_hpa_prev_3
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2021-01-04,3,6.6,3.6,298.0,5.3,1011.3,6.8,1.3,339.0,8.0,...,3.0,0.0,330.0,5.9,1021.2,5.1,0.0,330.0,5.9,1021.2
2021-01-05,3,4.3,4.6,11.0,5.1,1014.5,6.6,3.6,298.0,5.3,...,6.8,1.3,339.0,8.0,1017.8,3.0,0.0,330.0,5.9,1021.2
2021-01-06,3,6.2,0.0,18.0,6.7,1017.2,4.3,4.6,11.0,5.1,...,6.6,3.6,298.0,5.3,1011.3,6.8,1.3,339.0,8.0,1017.8
2021-01-07,3,7.3,0.5,0.0,4.5,1015.2,6.2,0.0,18.0,6.7,...,4.3,4.6,11.0,5.1,1014.5,6.6,3.6,298.0,5.3,1011.3
2021-01-08,3,5.7,7.9,346.0,6.5,1009.7,7.3,0.5,0.0,4.5,...,6.2,0.0,18.0,6.7,1017.2,4.3,4.6,11.0,5.1,1014.5
2021-01-09,3,2.9,8.1,300.0,5.9,1016.8,5.7,7.9,346.0,6.5,...,7.3,0.5,0.0,4.5,1015.2,6.2,0.0,18.0,6.7,1017.2
2021-01-10,3,2.2,6.1,354.0,4.7,1017.1,2.9,8.1,300.0,5.9,...,5.7,7.9,346.0,6.5,1009.7,7.3,0.5,0.0,4.5,1015.2
2021-01-11,3,3.3,26.2,338.0,6.7,1014.2,2.2,6.1,354.0,4.7,...,2.9,8.1,300.0,5.9,1016.8,5.7,7.9,346.0,6.5,1009.7
2021-01-12,3,2.4,0.8,293.0,5.8,1013.5,3.3,26.2,338.0,6.7,...,2.2,6.1,354.0,4.7,1017.1,2.9,8.1,300.0,5.9,1016.8
2021-01-13,3,2.1,0.0,294.0,6.0,1012.5,2.4,0.8,293.0,5.8,...,3.3,26.2,338.0,6.7,1014.2,2.2,6.1,354.0,4.7,1017.1


In [77]:
# features = [f'avg_temp_c_prev{i}' for i in range(1, lag+1)] + \
#            [f'precipitation_mm_prev_{i}' for i in range(1, lag+1)] + \
#            [f'avg_wind_dir_deg_prev_{i}' for i in range(1, lag+1)] + \
#            [f'avg_wind_speed_kmh_prev_{i}' for i in range(1, lag+1)] + \
#            [f'avg_sea_level_pres_hpa_prev_{i}' for i in range(1, lag+1)]

# Second approach
# features = [f'{feature}_prev_{i}' for feature in ['avg_temp_c', 'precipitation_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh', 'avg_sea_level_pres_hpa'] for i in range(1, lag+1)]

In [29]:
# Base variable names. The modelling cells below use each name both as the
# prediction target and as the prefix of its "<name>_prev_<i>" input columns.
features = ['avg_temp_c', 'precipitation_mm', 'avg_wind_dir_deg', 'avg_wind_speed_kmh', 'avg_sea_level_pres_hpa']

## Dataset Splitting
Partition the dataset into training and testing sets with an 80:20 ratio.

**WARNING: DO NOT SHUFFLE THE DATASET.**



In [31]:
# Bagging baseline: for every base variable, fit a Random Forest that predicts
# the current value from that variable's own readings on the three previous
# days, then score it on the chronologically last 20% of the rows.
# NOTE(review): the assignment text asks for avg_sea_level_pres_hpa to be
# predicted from the lags of all variables plus `season`; this cell fits one
# own-lags-only model per variable instead - confirm that this is intended.
results = {}

for feature in features:
    lag_columns = [f'{feature}_prev_{day}' for day in (1, 2, 3)]
    X, Y = data[lag_columns], data[feature]

    # shuffle=False keeps time order: the test rows all come after the training rows.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)

    # random_state fixes the bootstrap sampling so the metrics below are
    # reproducible under Restart & Run All.
    rf_model = RandomForestRegressor(n_estimators=50, criterion='squared_error', max_depth=20, random_state=42)
    rf_model.fit(X_train, Y_train)

    y_pred_rf = rf_model.predict(X_test)

    mse = mean_squared_error(Y_test, y_pred_rf)
    mae = mean_absolute_error(Y_test, y_pred_rf)
    r2 = r2_score(Y_test, y_pred_rf)

    results[feature] = {
        "MSE": mse,
        "MAE": mae,
        "R2": r2
    }

# Report the test-set metrics per variable.
for feature, metrics in results.items():
    print(f"Feature: {feature}")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

Feature: avg_temp_c
  MSE: 4.9146
  MAE: 1.6902
  R2: 0.9253
Feature: precipitation_mm
  MSE: 8.9545
  MAE: 1.7090
  R2: -0.2662
Feature: avg_wind_dir_deg
  MSE: 10733.8633
  MAE: 79.1776
  R2: -0.0727
Feature: avg_wind_speed_kmh
  MSE: 14.0884
  MAE: 2.3940
  R2: -0.1351
Feature: avg_sea_level_pres_hpa
  MSE: 13.2135
  MAE: 2.6433
  R2: 0.7003


## Ensemble Learning Methods

### Bagging

Create an instance of a Random Forest model and train it using the `fit` function.

Use the trained model to make predictions for the test set.

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

### Boosting

Create an instance of an XGBoost model and train it using the `fit` function.

In [None]:
# Write your code here. Add as many boxes as you need.

Use the trained model to make predictions for the test set.

In [None]:
# Write your code here. Add as many boxes as you need.

Assess the performance of the model by using different metrics provided by the `scikit-learn` library.

In [35]:
# Boosting baseline: same protocol as the Random Forest cell - one XGBoost
# regressor per base variable, fed that variable's three previous daily
# values and scored on the chronologically last 20% of the rows.
# NOTE(review): the assignment text asks for avg_sea_level_pres_hpa to be
# predicted from the lags of all variables plus `season`; this cell fits one
# own-lags-only model per variable instead - confirm that this is intended.
results_xgb = {}

for feature in features:
    lag_columns = [f'{feature}_prev_{day}' for day in (1, 2, 3)]
    X, Y = data[lag_columns], data[feature]

    # shuffle=False keeps time order: the test rows all come after the training rows.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, shuffle=False)

    # random_state is pinned so the run stays reproducible even if sampling
    # options (subsample / colsample_*) are enabled later.
    xgb_model = XGBRegressor(n_estimators=50, max_depth=15, learning_rate=0.1, objective='reg:squarederror', random_state=42)
    xgb_model.fit(X_train, Y_train)

    y_pred_xgb = xgb_model.predict(X_test)

    mse_xgb = mean_squared_error(Y_test, y_pred_xgb)
    mae_xgb = mean_absolute_error(Y_test, y_pred_xgb)
    r2_xgb = r2_score(Y_test, y_pred_xgb)

    results_xgb[feature] = {
        "MSE": mse_xgb,
        "MAE": mae_xgb,
        "R2": r2_xgb
    }

# Report the test-set metrics per variable.
for feature, metrics in results_xgb.items():
    print(f"Feature: {feature}")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

Feature: avg_temp_c
  MSE: 6.3938
  MAE: 1.9301
  R2: 0.9028
Feature: precipitation_mm
  MSE: 10.8840
  MAE: 1.6446
  R2: -0.5390
Feature: avg_wind_dir_deg
  MSE: 12314.7475
  MAE: 82.4888
  R2: -0.2307
Feature: avg_wind_speed_kmh
  MSE: 18.0159
  MAE: 2.8683
  R2: -0.4516
Feature: avg_sea_level_pres_hpa
  MSE: 15.2991
  MAE: 2.9432
  R2: 0.6530


# Laboratory Exercise - Bonus Task (+ 2 points)

As part of the bonus task in this laboratory assignment, your objective is to fine-tune the maximum tree depth (`max_depth`) of the Random Forest model using grid search with cross-validation based on a time series split. This involves systematically experimenting with various values for `max_depth` and evaluating the model's performance using cross-validation. Upon determining the most suitable `max_depth` value, evaluate the model's performance on a test set for final assessment.

Hints:
- For grid search use the `GridSearchCV` class from the `scikit-learn` library. Check the documentation at https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html.
- For cross-validation use the `TimeSeriesSplit` from the `scikit-learn` library. Check the documentation at https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.TimeSeriesSplit.html.

## Dataset Splitting
Partition the dataset into training and testing sets with a 90:10 ratio.

**WARNING: DO NOT SHUFFLE THE DATASET.**

In [None]:
# Write your code here. Add as many boxes as you need.

## Fine-tuning the Random Forest Hyperparameter
Experiment with various values for `max_depth` and evaluate the model's performance using cross-validation.

In [None]:
# Write your code here. Add as many boxes as you need.

## Final Assessment of the Model Performance
Upon determining the most suitable `max_depth` value, evaluate the model's performance on a test set for final assessment.

In [37]:
from sklearn.model_selection import TimeSeriesSplit

In [47]:
# Bonus task: tune the Random Forest per base variable with a grid search that
# is cross-validated on expanding time-ordered folds, then score the selected
# model on the chronologically last 10% of the rows.
results = {}

# The search configuration does not depend on the variable, so build it once.
tcsv = TimeSeriesSplit(5)

param_grid = {'n_estimators': [10, 15, 20, 25, 30, 40, 50, 60],
              'max_depth': [3, 5, 10, 15, 20]}

for feature in features:
    lag_columns = [f'{feature}_prev_{day}' for day in (1, 2, 3)]
    X, Y = data[lag_columns], data[feature]

    # shuffle=False keeps time order: the test rows all come after the training rows.
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.1, shuffle=False)

    # random_state makes every candidate forest, and therefore the selected
    # hyperparameters and final metrics, reproducible across reruns.
    rf_model = RandomForestRegressor(random_state=42)

    grid_search = GridSearchCV(
        estimator=rf_model,
        param_grid=param_grid,
        scoring='neg_mean_squared_error',
        cv=tcsv
    )

    grid_search.fit(X_train, Y_train)

    best_max_depth = grid_search.best_params_['max_depth']
    # Despite its name, this holds the selected n_estimators value (an int).
    best_estimator = grid_search.best_params_['n_estimators']
    print(f'Best_parameters for {feature}: best_max_depth: {best_max_depth} best_estimator: {best_estimator}')

    # GridSearchCV refits the best configuration on the whole training set by
    # default (refit=True), so that model is reused instead of training again.
    final_model = grid_search.best_estimator_

    y_pred_rf = final_model.predict(X_test)

    mse = mean_squared_error(Y_test, y_pred_rf)
    mae = mean_absolute_error(Y_test, y_pred_rf)
    r2 = r2_score(Y_test, y_pred_rf)

    results[feature] = {
        "MSE": mse,
        "MAE": mae,
        "R2": r2
    }

print('Final model with best parameters')
for feature, metrics in results.items():
    print(f"Feature: {feature}")
    for metric, value in metrics.items():
        print(f"  {metric}: {value:.4f}")

Best_parameters for avg_temp_c: best_max_depth: 5 best_estimator: 25
Best_parameters for precipitation_mm: best_max_depth: 3 best_estimator: 10
Best_parameters for avg_wind_dir_deg: best_max_depth: 3 best_estimator: 40
Best_parameters for avg_wind_speed_kmh: best_max_depth: 3 best_estimator: 10
Best_parameters for avg_sea_level_pres_hpa: best_max_depth: 5 best_estimator: 60
Final model with best parameters
Feature: avg_temp_c
  MSE: 4.3817
  MAE: 1.5573
  R2: 0.8085
Feature: precipitation_mm
  MSE: 6.3339
  MAE: 1.5800
  R2: -0.0591
Feature: avg_wind_dir_deg
  MSE: 8347.3017
  MAE: 69.3854
  R2: 0.1543
Feature: avg_wind_speed_kmh
  MSE: 7.4361
  MAE: 1.9502
  R2: -0.0392
Feature: avg_sea_level_pres_hpa
  MSE: 4.5461
  MAE: 1.5952
  R2: 0.5769
