In [14]:
import pandas as pd

# Load the merged estimate/observation table. The path is relative, so it
# resolves against the kernel's current working directory.
dec_merged = pd.read_csv('../DATASET/obs_est_merged/dec_merged.csv')

In [15]:
# Preview the loaded frame (Jupyter renders a truncated head/tail view).
dec_merged

Unnamed: 0,lat,lon,year,month,precip_est,precip_obs,bias_dec
0,45.0,-20.0,1982,1,114.242190,158.107760,-43.865570
1,45.0,-19.0,1982,1,109.765625,138.565060,-28.799435
2,45.0,-18.0,1982,1,106.218750,122.867584,-16.648834
3,45.0,-17.0,1982,1,105.335940,109.741210,-4.405270
4,45.0,-16.0,1982,1,103.375000,106.399536,-3.024536
...,...,...,...,...,...,...,...
230251,20.0,16.0,2017,6,0.351885,3.776550,-3.424665
230252,20.0,17.0,2017,6,0.426104,3.719330,-3.293226
230253,20.0,18.0,2017,6,1.090166,3.147125,-2.056959
230254,20.0,19.0,2017,6,1.437823,0.000000,1.437823


In [50]:
# Drop the observation column, then keep only the rows whose coordinates fall
# inside the window lon in [-18, 0] and lat in [20, 38] (bounds inclusive).
without_obs = dec_merged.drop(columns=['precip_obs'])
inside_window = (
    without_obs['lon'].between(-18, 0)
    & without_obs['lat'].between(20, 38)
)
dec_average = without_obs[inside_window]

In [51]:
# Show the rows sitting on the lower longitude bound (lon == -18) of the
# filtered frame.
dec_average[dec_average['lon'] == -18]

Unnamed: 0,lat,lon,year,month,precip_est,bias_dec
289,38.0,-18.0,1982,1,76.445310,37.123410
330,37.0,-18.0,1982,1,69.164060,34.366390
371,36.0,-18.0,1982,1,65.484375,-3.401425
412,35.0,-18.0,1982,1,67.460940,0.585480
453,34.0,-18.0,1982,1,71.710940,-6.635310
...,...,...,...,...,...,...
230053,24.0,-18.0,2017,6,1.277666,0.705462
230094,23.0,-18.0,2017,6,0.711260,0.253497
230135,22.0,-18.0,2017,6,0.269854,0.040972
230176,21.0,-18.0,2017,6,0.226885,-0.001997


In [52]:
# Preview the filtered frame.
dec_average

Unnamed: 0,lat,lon,year,month,precip_est,bias_dec
289,38.0,-18.0,1982,1,76.445310,37.123410
290,38.0,-17.0,1982,1,74.929690,21.062350
291,38.0,-16.0,1982,1,72.187500,3.183440
292,38.0,-15.0,1982,1,71.421875,5.347505
293,38.0,-14.0,1982,1,70.515625,11.593811
...,...,...,...,...,...,...
230231,20.0,-4.0,2017,6,0.629229,-2.861219
230232,20.0,-3.0,2017,6,0.605791,-0.138075
230233,20.0,-2.0,2017,6,0.613604,-0.473585
230234,20.0,-1.0,2017,6,0.898760,-3.907758


In [53]:
# Summary statistics for every numeric column of the filtered frame.
dec_average.describe()

Unnamed: 0,lat,lon,year,month,precip_est,bias_dec
count,77976.0,77976.0,77976.0,77976.0,77976.0,77976.0
mean,29.0,-9.0,1999.5,3.5,17.09062,5.001516
std,5.477261,5.477261,10.388361,1.707836,21.51938,18.777963
min,20.0,-18.0,1982.0,1.0,-3.890422e-19,-350.37476
25%,24.0,-14.0,1990.75,2.0,1.071596,0.087105
50%,29.0,-9.0,1999.5,3.5,5.938294,1.611873
75%,34.0,-4.0,2008.25,5.0,28.96681,10.587353
max,38.0,0.0,2017.0,6.0,202.6487,162.94824


In [54]:
# Predictors and target for the bias model. `month` is carried along in X and
# dropped by the modelling cells right before fitting.
feature_columns = ['lat', 'lon', 'month', 'precip_est']
X = dec_average[feature_columns]
y = dec_average['bias_dec']
# X = X.sort_values('month')

### Model selection

In [55]:
from sklearn.linear_model import LinearRegression
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from lightgbm import LGBMRegressor
from sklearn.model_selection import KFold

In [56]:
# Candidate regressors, all with library-default hyper-parameters; the
# seedable ones get a fixed seed so the comparison is repeatable.
models = {
    "Linear Regression": LinearRegression(),
    "XGBoost": XGBRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(random_state=42),
    "ExtraTrees Regressor": ExtraTreesRegressor(random_state=42),
    "LGBM Regressor": LGBMRegressor(random_state=42)
}
k = 4
# Plain K-fold without shuffling: each fold is a consecutive block of rows in
# X's current order. No groups are involved in the split.
kf = KFold(n_splits=k)
results = {}

# `month` is not used as a predictor; drop it once instead of twice per fold.
X_model = X.drop(columns=['month'])

for model_name, model in models.items():
    train_rmse_scores = []
    test_rmse_scores = []

    for train_index, test_index in kf.split(X_model):
        x_train_fold, x_test_fold = X_model.iloc[train_index], X_model.iloc[test_index]
        y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

        # Train on the fold and predict on both partitions
        model.fit(x_train_fold, y_train_fold)
        y_pred_train = model.predict(x_train_fold)
        y_pred_test = model.predict(x_test_fold)

        # RMSE = sqrt(MSE). The root is taken explicitly because the
        # `squared=False` shortcut of mean_squared_error was deprecated in
        # scikit-learn 1.4 and removed in 1.6.
        rmse_train = mean_squared_error(y_train_fold, y_pred_train) ** 0.5
        rmse_test = mean_squared_error(y_test_fold, y_pred_test) ** 0.5

        train_rmse_scores.append(rmse_train)
        test_rmse_scores.append(rmse_test)

    # Average over the folds that were actually evaluated
    avg_train_rmse = sum(train_rmse_scores) / len(train_rmse_scores)
    avg_test_rmse = sum(test_rmse_scores) / len(test_rmse_scores)

    results[model_name] = {
        "train_rmse": avg_train_rmse,
        "test_rmse": avg_test_rmse,
    }

# Report the fold-averaged scores per model
for model_name, metrics in results.items():
    print(f"{model_name} - Train RMSE: {metrics['train_rmse']}, Test RMSE: {metrics['test_rmse']}")

In [None]:
import plotly.graph_objects as go
import numpy as np
import plotly.io as pio

# Pull the fold-averaged RMSE values out of `results`, rounded to two decimals
# so they can double as bar labels.
# NOTE: `models` is rebound here from the estimator dict to a list of names.
data = results
models = list(data)
train_rmse = [round(data[name]['train_rmse'], 2) for name in models]
test_rmse = [round(data[name]['test_rmse'], 2) for name in models]

# Spread of the target and the standard error of its mean.
standard_deviation = np.std(y)
sample_size = len(y)
standard_error = standard_deviation / np.sqrt(sample_size)

In [None]:
# Grouped bars: train vs. test RMSE per model, each bar labelled with its value.
fig = go.Figure()
for bar_name, bar_values, bar_color in (
    ('Train RMSE', train_rmse, 'blue'),
    ('Test RMSE', test_rmse, 'red'),
):
    fig.add_trace(go.Bar(
        x=models,
        y=bar_values,
        name=bar_name,
        marker_color=bar_color,
        text=bar_values,
        # 'auto' lets plotly decide whether the label goes inside or outside the bar
        textposition='auto',
    ))

# Flat reference line at the standard deviation of the target, one point per model.
fig.add_trace(go.Scatter(
    x=models,
    y=[standard_deviation] * len(models),
    mode='lines+markers',
    name='Std',
    line=dict(color='orange', width=2),
))

fig.update_layout(
    barmode='group',
    title='RMSE',
    xaxis_title='Models',
    yaxis_title='Value',
    legend_title='Data',
    width=600,
)
fig.show()

### Hyperparameter tuning

In [None]:
from xgboost import XGBRegressor
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import make_scorer, mean_squared_error

# Estimator to tune. The variable keeps the name `xgb`, but the model is LightGBM.
# `subsample_freq=1` switches bagging on: with LightGBM's default of 0 the
# `subsample` values in the grid below are ignored, so a third of the
# candidates would be exact duplicates of each other.
xgb = LGBMRegressor(random_state=42, subsample_freq=1)

# Define the parameter grid
param_grid = {
    'num_leaves': [31, 41, 51],  # Increase in steps to see the effect
    'max_depth': [5, 10, 15],  # Adjust based on the complexity of the problem
    'learning_rate': [0.01, 0.05],  # Small steps to see incremental benefits
    'n_estimators': [100, 200],  # More trees can be better, but watch for overfitting
    'subsample': [0.8, 0.9, 1.0],  # Row subsampling rates (effective because subsample_freq > 0)
    'min_child_samples': [20, 30, 40]  # Increasing it can combat overfitting
}

# Built-in negated-RMSE scorer (same one the Bayesian-optimisation cell uses).
# It replaces make_scorer(mean_squared_error, squared=False), which no longer
# works once scikit-learn drops the `squared` argument (removed in 1.6).
scorer = 'neg_root_mean_squared_error'

# Time-ordered splitter: every test fold comes after its training rows
tscv = TimeSeriesSplit(n_splits=3)  # Adjust the number of splits as necessary

# Exhaustive search over the grid, scored with the splitter above
grid_search = GridSearchCV(estimator=xgb, param_grid=param_grid, scoring=scorer, cv=tscv, verbose=2)

# `month` is not used as a predictor
grid_search.fit(X.drop(columns=['month']), y)

# Get the best estimator and its parameters
best_xgb = grid_search.best_estimator_
best_params = grid_search.best_params_

print("Best parameters:", best_params)
print("Best RMSE:", -grid_search.best_score_)  # Note: 'best_score_' is negative, so take the negative of it

# Optionally, use the best model to make predictions or further analysis
# predictions = best_xgb.predict(X_test)
# rmse = np.sqrt(mean_squared_error(y_test, predictions))
# print("Test RMSE:", rmse)

Fitting 3 folds for each of 324 candidates, totalling 972 fits
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000112 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 1599, number of used features: 3
[LightGBM] [Info] Start training from score 4.398570
[CV] END learning_rate=0.01, max_depth=5, min_child_samples=20, n_estimators=100, num_leaves=31, subsample=0.8; total time=   0.0s
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000097 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 323
[LightGBM] [Info] Number of data points in the train set: 3198, number of used features: 3
[LightGBM] [Info] Start training from score 6.695936
[CV] END learning_rate=0.01, max_depth=5, min_child_samples=20, n_estimators=100, num_leaves=31, subsample=0.8; total time=   0.

KeyboardInterrupt: 

In [None]:
from bayes_opt import BayesianOptimization
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import TimeSeriesSplit, cross_val_score
import numpy as np

# `X` and `y` come from the earlier cells; `tscv` is the 3-split
# TimeSeriesSplit used for every evaluation of the objective below.
tscv = TimeSeriesSplit(n_splits=3)


def RF_evaluate(max_depth, n_estimators, min_samples_split, min_samples_leaf):
    """Objective for the optimiser: mean negated RMSE of a random forest.

    Each of the four hyper-parameters is cast with ``int()`` before the
    forest is built (seed fixed at 42). The forest is scored with
    ``cross_val_score`` on ``X`` without the ``month`` column against ``y``,
    using the module-level ``tscv`` splitter.

    Returns:
        The mean of the per-fold ``neg_root_mean_squared_error`` scores, so
        larger (closer to zero) is better.
    """
    forest = RandomForestRegressor(
        max_depth=int(max_depth),
        n_estimators=int(n_estimators),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        random_state=42,
    )
    features = X.drop(columns=['month'])
    fold_scores = cross_val_score(
        forest, features, y, cv=tscv, scoring='neg_root_mean_squared_error'
    )
    return np.mean(fold_scores)

# Search box for the optimiser: (lower, upper) bounds per hyper-parameter.
# RF_evaluate casts the sampled values to int before using them.
pbounds = {
    'max_depth': (1, 15),
    'n_estimators': (50, 150),
    'min_samples_split': (2, 10),
    'min_samples_leaf': (1, 4)
}
optimizer = BayesianOptimization(f=RF_evaluate, pbounds=pbounds, random_state=42)

# Maximise the objective: 10 initial points, then 70 further iterations.
optimizer.maximize(init_points=10, n_iter=70)

# Report the parameters of the best point found
best_point = optimizer.max
print("Best parameters:", best_point['params'])

|   iter    |  target   | max_depth | min_sa... | min_sa... | n_esti... |
-------------------------------------------------------------------------
| [0m1        [0m | [0m-11.77   [0m | [0m6.244    [0m | [0m3.852    [0m | [0m7.856    [0m | [0m109.9    [0m |
| [0m2        [0m | [0m-12.69   [0m | [0m3.184    [0m | [0m1.468    [0m | [0m2.465    [0m | [0m136.6    [0m |
| [95m3        [0m | [95m-11.41   [0m | [95m9.416    [0m | [95m3.124    [0m | [95m2.165    [0m | [95m147.0    [0m |
| [95m4        [0m | [95m-11.35   [0m | [95m12.65    [0m | [95m1.637    [0m | [95m3.455    [0m | [95m68.34    [0m |
| [0m5        [0m | [0m-11.99   [0m | [0m5.259    [0m | [0m2.574    [0m | [0m5.456    [0m | [0m79.12    [0m |
| [0m6        [0m | [0m-11.39   [0m | [0m9.566    [0m | [0m1.418    [0m | [0m4.337    [0m | [0m86.64    [0m |
| [0m7        [0m | [0m-11.56   [0m | [0m7.385    [0m | [0m3.356    [0m | [0m3.597    [0m | [0m10

In [None]:
# Hard-coded hyper-parameter sets for the XGBoost and random-forest alternatives.
best_params_xgb = dict(
    colsample_bytree=0.8,
    learning_rate=0.2,
    max_depth=7,
    n_estimators=300,
    subsample=0.8,
)
best_params_rf = dict(
    max_depth=10,
    min_samples_leaf=1,
    min_samples_split=2,
    n_estimators=150,
)

# Model used from here on: LightGBM with default hyper-parameters.
# Uncomment one of the lines below to use a tuned alternative instead.
# best_model = XGBRegressor(**best_params_xgb, random_state=42)
# best_model = RandomForestRegressor(**best_params_rf, random_state=42)
best_model = LGBMRegressor(random_state=42)

In [None]:
# from sklearn.model_selection import TimeSeriesSplit
# from sklearn.metrics import mean_squared_error
# import numpy as np

# final_results = {}

# # Define the number of splits
# tscv = TimeSeriesSplit(n_splits=5)

# # You can iterate over the splits
# for train_index, test_index in tscv.split(X):
#     # Use .iloc for positional indexing
#     x_train_fold, x_test_fold = X.iloc[train_index].drop(columns=['month']), X.iloc[test_index].drop(columns=['month'])
#     y_train_fold, y_test_fold = y.iloc[train_index], y.iloc[test_index]

#     best_model.fit(x_train_fold, y_train_fold)
#     y_pred_train = best_model.predict(x_train_fold)
#     y_pred_test = best_model.predict(x_test_fold)

#     rmse_train = mean_squared_error(
#         y_train_fold, y_pred_train, squared=False)
#     rmse_test = mean_squared_error(y_test_fold, y_pred_test, squared=False)

#     train_rmse_scores.append(rmse_train)
#     test_rmse_scores.append(rmse_test)

# avg_train_rmse = sum(train_rmse_scores) / k
# avg_test_rmse = sum(test_rmse_scores) / k

# final_results["metrics"] = {
#     "RMSE train": avg_train_rmse,
#     "RMSE test": avg_test_rmse,
# }
# print(final_results)

{'metrics': {'RMSE train': 4.276509256257327, 'RMSE test': 8.32618806517496}}


In [None]:
# rmse = np.sqrt(mean_squared_error(test['bias_dec'], y_sub))