

## Libraries

In [None]:
import numpy as np
import pandas as pd
from scipy.stats import spearmanr
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
import seaborn as sns

## Loading data

- `X_train` and `X_test` both have $35$ columns that represent the same explanatory variables but over different time periods.

- `X_train` and `Y_train` share the same column `ID` - each row corresponds to a unique ID associated with a day and a country.

- The target of this challenge `TARGET` in `Y_train` corresponds to the price change for daily futures contracts of 24H electricity baseload.




In [None]:
# Read the three challenge files (training features, training targets, test features)
# from the working directory, in that order.
X_train, Y_train_clean, X_test = (
    pd.read_csv(csv_path) for csv_path in ('X_train.csv', 'Y_train.csv', 'X_test.csv')
)

# Model and train score

The benchmark for this challenge consists of a simple linear regression after a light cleaning of the data: the missing (NaN) values are simply filled with 0's and the `COUNTRY` column is dropped, i.e. the same model is used for France and Germany.

In [None]:
def _as_vector(values):
    """Flatten `values` to a 1-D array, keeping only the `TARGET` column if one exists."""
    columns = getattr(values, "columns", None)
    if columns is not None and "TARGET" in columns:
        # A targets frame may still carry `ID` next to `TARGET`; only the target is ranked.
        values = values["TARGET"]
    return np.asarray(values).ravel()


def metric_train(output, target=None):
    """Return the Spearman rank correlation between two samples.

    Parameters
    ----------
    output : array-like
        First sample. The model cells pass the true values here and the
        predictions second; Spearman correlation is symmetric, so either
        order gives the same score.
    target : array-like, optional
        Second sample. When omitted, the notebook-level `Y_train_clean` is
        used, which keeps the original single-argument call working.

    Returns
    -------
    float
        The `correlation` attribute of `scipy.stats.spearmanr`.
    """
    if target is None:
        target = Y_train_clean
    return spearmanr(_as_vector(output), _as_vector(target)).correlation

## Linear Regression

In [None]:
# The error metrics are imported here because this is the first cell that uses them;
# without this import the cell raises NameError on a fresh kernel.
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for validation (fixed seed so the split is repeatable).
# NOTE(review): this cell splits `Modified_X_train` while the other model cells split
# `X_train_clean` - confirm the different feature matrix is intentional.
X_train, X_val, Y_train, Y_val = train_test_split(Modified_X_train, Y_train_clean, test_size=0.2, random_state=42)

# Fit an ordinary least-squares baseline on the training split
lr = LinearRegression()
lr.fit(X_train, Y_train)

output_train = lr.predict(X_train)
output_val = lr.predict(X_val)

# Absolute error metrics on the validation split
mse = mean_squared_error(Y_val, output_val)
print("MSE:",mse)
mae = mean_absolute_error(Y_val, output_val)
print("MAE:",mae)

# Calculate Normalized Mean Absolute Error (NMAE): MAE divided by the range of the validation targets
range_y_val = np.max(Y_val) - np.min(Y_val)
nmae = mae / range_y_val
print("NMAE:", nmae)

# Calculate Normalized Mean Squared Error (NMSE): MSE divided by the variance of the validation targets
var_y_val = np.var(Y_val)
nmse = mse / var_y_val
print("NMSE:", nmse)



MSE: 1.1894424076353105
MAE: 0.5889078075102198
NMAE: 0.0517925434724655
NMSE: 0.9958721897575211


## Random Forest

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error

# Split the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train_clean, Y_train_clean, test_size=0.2, random_state=42)

# Initialize the Random Forest model.
# `random_state` is fixed so that bootstrap sampling and feature sub-sampling are
# repeatable; without it every run of this cell prints different scores.
model = RandomForestRegressor(
    n_estimators=200,   # Number of trees in the forest
    max_depth=10,       # Maximum depth of the trees
    min_samples_split=2, # Minimum number of samples required to split an internal node
    min_samples_leaf=1,  # Minimum number of samples required to be at a leaf node
    max_features='sqrt', # Number of features to consider when looking for the best split
    bootstrap=True,      # Whether bootstrap samples are used when building trees
    random_state=42      # Seed for reproducible results
)

# Fit the model on the training data
model.fit(X_train, Y_train)

# Make predictions on the validation data
output_val = model.predict(X_val)

# Calculate the custom evaluation metric (Spearman correlation) on the validation data
spearman_correlation = metric_train(Y_val, output_val)
print("Spearman correlation score on validation set:", spearman_correlation)

# Calculate mean squared error and mean absolute error
mse = mean_squared_error(Y_val, output_val)
print("MSE:", mse)
mae = mean_absolute_error(Y_val, output_val)
print("MAE:", mae)

# Calculate Normalized Mean Absolute Error (NMAE): MAE divided by the range of the validation targets
range_y_val = np.max(Y_val) - np.min(Y_val)
nmae = mae / range_y_val
print("NMAE:", nmae)

# Calculate Normalized Mean Squared Error (NMSE): MSE divided by the variance of the validation targets
var_y_val = np.var(Y_val)
nmse = mse / var_y_val
print("NMSE:", nmse)



Spearman correlation score on validation set: 0.18289780251846197
MSE: 1.2398234473032794
MAE: 0.6052393187863744
NMAE: 0.05322884724863274
NMSE: 1.038054203762006


In [None]:
# Read the importances stored on the fitted model and rank them, largest first
feature_importance = model.feature_importances_
sorted_idx = np.flip(np.argsort(feature_importance))

# Report every feature together with its 1-based rank
print("Feature Importance:")
for rank, idx in enumerate(sorted_idx, start=1):
    feature_name = X_train.columns[idx]
    print(f"{rank}. Feature {feature_name}: Importance = {feature_importance[idx]}")


Feature Importance:
1. Feature COUNTRY_FR: Importance = 0.05970705178117934
2. Feature COUNTRY_DE: Importance = 0.05868735762669619
3. Feature GAS_RET: Importance = 0.054989015837217665
4. Feature FR_GAS: Importance = 0.047685020488853896
5. Feature CARBON_RET: Importance = 0.04470285986916853
6. Feature FR_WINDPOW: Importance = 0.04343109464711406
7. Feature DE_SOLAR: Importance = 0.04139701336029036
8. Feature DE_WINDPOW: Importance = 0.041050594816436616
9. Feature DE_NET_IMPORT: Importance = 0.040360876573712484
10. Feature DE_RESIDUAL_LOAD: Importance = 0.03994235202017012
11. Feature DE_FR_EXCHANGE: Importance = 0.03781127775691077
12. Feature DE_CONSUMPTION: Importance = 0.035131270781504735
13. Feature DE_LIGNITE: Importance = 0.034897597068050214
14. Feature DE_GAS: Importance = 0.031608936102971524
15. Feature DE_WIND: Importance = 0.03049036142193737
16. Feature FR_WIND: Importance = 0.027894754090968556
17. Feature DAY_ID: Importance = 0.027475593417851878
18. Feature FR_SO

Here, we can see that the most important features are the country indicators (`COUNTRY_FR`, `COUNTRY_DE`), the gas features (`GAS_RET`, `FR_GAS`), the carbon return (`CARBON_RET`) and the French wind power (`FR_WINDPOW`).
These importances are computed from impurity reduction: features that lead to a greater impurity reduction when used in the tree nodes are considered more important.



In [None]:
from sklearn.inspection import permutation_importance

# Permutation importance of the fitted model on the validation split:
# 30 shuffles per feature, fixed seed. The targets are passed as a flat array.
y_val_flat = Y_val.drop(columns=['ID']).values.ravel()
result = permutation_importance(model, X_val, y_val_flat, n_repeats=30, random_state=42)

# Print the mean importance of each feature, in column order
feature_importance = result.importances_mean
for position, feature in enumerate(X_val.columns):
    print(f'{feature}:{feature_importance[position]}')

DAY_ID:-0.003948870445356686
DE_CONSUMPTION:-0.013795871448043565
FR_CONSUMPTION:-0.009045379708466564
DE_FR_EXCHANGE:-0.0022843741290210348
DE_NET_IMPORT:0.00190005036076261
FR_NET_IMPORT:-0.007387389701128932
DE_GAS:-0.009495720814810812
FR_GAS:-0.024484363344200566
DE_COAL:-0.0053515438069910255
FR_COAL:-0.0035897145494320883
DE_HYDRO:-0.004036565751573754
FR_HYDRO:-0.007062632081786865
DE_NUCLEAR:-0.01157822044447423
FR_NUCLEAR:-0.0030006513504041415
DE_SOLAR:-0.006081569693745573
FR_SOLAR:-0.00477751836129896
DE_WINDPOW:0.0011824340665952172
FR_WINDPOW:-0.007857326093125358
DE_LIGNITE:-0.011493258487166721
DE_RESIDUAL_LOAD:0.008649829427971767
DE_RAIN:-0.0017245262984826389
FR_RAIN:-0.003034705192578861
DE_WIND:-0.0033043604666919803
FR_WIND:-0.00392872646005881
DE_TEMP:-0.0033792531841365585
FR_TEMP:-0.0038913553384845034
GAS_RET:-0.009462270173136056
COAL_RET:-0.007707958989795409
CARBON_RET:-0.008673047214353564
COUNTRY_DE:0.002655419508387339
COUNTRY_FR:0.006827140981687389


## XGBoost

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train_clean, Y_train_clean, test_size=0.2, random_state=42)

# Initialize the XGBoost model with regularization parameters
# NOTE(review): `xgb` is not imported in this cell - presumably `import xgboost as xgb`
# ran in an earlier cell; confirm.
model = xgb.XGBRegressor(
    max_depth=2,       # Maximum tree depth
    min_child_weight=1,   # Minimum sum of instance weight needed in a child
    gamma=0,           # Minimum loss reduction required to make a further partition on a leaf node
    subsample=0.8,     # Subsample ratio of the training instances
    colsample_bytree=0.8, # Subsample ratio of columns when constructing each tree
    reg_alpha=0,       # L1 regularization term on weights
    reg_lambda=1       # L2 regularization term on weights
)

# Fit the model on the training data
model.fit(X_train, Y_train)

# Make predictions on the validation data
output_val = model.predict(X_val)

# Calculate the custom evaluation metric (Spearman correlation) on the validation data
spearman_correlation = metric_train(Y_val, output_val)
print("Spearman correlation score on validation set:", spearman_correlation)

# Calculate mean squared error and mean absolute error on the validation split.
# MAE must be computed on the same (Y_val, output_val) pair as MSE, not on
# predictions left over from another model.
mse = mean_squared_error(Y_val, output_val)
print("MSE:",mse)
mae = mean_absolute_error(Y_val, output_val)
print("MAE:",mae)

# Calculate Normalized Mean Absolute Error (NMAE): MAE divided by the range of the validation targets
range_y_val = np.max(Y_val) - np.min(Y_val)
nmae = mae / range_y_val
print("NMAE:", nmae)

# Calculate Normalized Mean Squared Error (NMSE): MSE divided by the variance of the validation targets
var_y_val = np.var(Y_val)
nmse = mse / var_y_val
print("NMSE:", nmse)

Spearman correlation score on validation set: 0.1338286458216426
MSE: 1.3246923780632476
MAE: 0.08839304488357316
NMAE: 0.00777388338448308
NMSE: 1.1091115390106603


## AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.model_selection import train_test_split
import numpy as np

# Split the data into training and validation sets
X_train, X_val, Y_train, Y_val = train_test_split(X_train_clean, Y_train_clean, test_size=0.2, random_state=42)

# Initialize the AdaBoost model
base_estimator = DecisionTreeRegressor(max_depth=2)  # Base estimator for AdaBoost
n_estimators = 100  # Number of boosting stages
# The base learner is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` in newer scikit-learn releases, and the
# positional form works with both spellings.
model = AdaBoostRegressor(base_estimator, n_estimators=n_estimators, random_state=42)

# Fit the model on the training data
model.fit(X_train, Y_train)

# Make predictions on the validation data
output_val = model.predict(X_val)

# Calculate the custom evaluation metric (Spearman correlation) on the validation data
spearman_correlation = metric_train(Y_val, output_val)
print("Spearman correlation score on validation set:", spearman_correlation)

# Calculate mean squared error and mean absolute error
mse = mean_squared_error(Y_val, output_val)
print("MSE:", mse)
mae = mean_absolute_error(Y_val, output_val)
print("MAE:", mae)

# Calculate Normalized Mean Absolute Error (NMAE): MAE divided by the range of the validation targets
range_y_val = np.max(Y_val) - np.min(Y_val)
nmae = mae / range_y_val
print("NMAE:", nmae)

# Calculate Normalized Mean Squared Error (NMSE): MSE divided by the variance of the validation targets
var_y_val = np.var(Y_val)
nmse = mse / var_y_val
print("NMSE:", nmse)





Spearman correlation score on validation set: 0.13447891650040628
MSE: 1.694381906694985
MAE: 0.9855346622064585
NMAE: 0.0866745969148385
NMSE: 1.4186376817188622
