In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, StandardScaler, MinMaxScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
# from sklearn.cluster import DBSCAN
from hdbscan import HDBSCAN
from sklearn.neighbors import NearestNeighbors
from collections import Counter
from imblearn.over_sampling import SMOTE

# sklearn.set_config(transform_output="pandas")
pd.set_option('display.max_columns', None)
# pd.set_option('display.max_rows', None)

In [2]:
flight_data_train = pd.read_csv('./flight_data_train_ts.csv')
flight_data_test = pd.read_csv('./flight_data_test_ts.csv')

In [3]:
flight_data_train['scheduledoffblocktime'] = pd.to_datetime(flight_data_train['scheduledoffblocktime'])
flight_data_test['scheduledoffblocktime'] = pd.to_datetime(flight_data_test['scheduledoffblocktime'])

flight_data_train.sort_values(by='scheduledoffblocktime', inplace=True)
flight_data_test.sort_values(by='scheduledoffblocktime', inplace=True)

In [4]:
departdatetime = flight_data_train['scheduledoffblocktime'].dt

flight_data_train['depart_day'] = departdatetime.day
flight_data_train['depart_month'] = departdatetime.month
flight_data_train['depart_dayofweek'] = departdatetime.dayofweek
flight_data_train['depart_minute'] = departdatetime.hour * 60 + departdatetime.minute
# Test
departdatetime = flight_data_test['scheduledoffblocktime'].dt
flight_data_test['depart_day'] = departdatetime.day
flight_data_test['depart_month'] = departdatetime.month
flight_data_test['depart_dayofweek'] = departdatetime.dayofweek
flight_data_test['depart_minute'] = departdatetime.hour * 60 + departdatetime.minute

flight_data_train.drop(columns=['scheduledoffblocktime'], axis=1, inplace=True)
flight_data_test.drop(columns=['scheduledoffblocktime'], axis=1, inplace=True)

In [5]:
X_train = flight_data_train.drop(columns=['delay_in_secs'], axis=1)
X_test = flight_data_test.drop(columns=['delay_in_secs'], axis=1)

y_train = flight_data_train['delay_in_secs']
y_test = flight_data_test['delay_in_secs']

In [6]:
X_train

Unnamed: 0,traffictypecode,airlinecode_iata,destination_iata,aircraft_iata,isconnecting,publicgatenumber,aircraftterminal,tmpf,dwpf,relh,drct,sknt,p01i,alti,vsby,gust,skyc1,skyc2,skyl1,skyl2,depart_day,depart_month,depart_dayofweek,depart_minute
0,PS,RA,KTM,332,0,F8,2,69.8,62.6,77.90,90.0,5.0,0.0,29.88,6.21,0,CAVOK,CAVOK,0.0,0.0,8,3,6,265
1,PS,ET,ADD,77W,0,D6,1,69.8,62.6,77.90,90.0,5.0,0.0,29.88,6.21,0,CAVOK,CAVOK,0.0,0.0,8,3,6,265
2,PS,6E,BOM,320,0,D7,1,69.8,62.6,77.90,90.0,5.0,0.0,29.88,6.21,0,CAVOK,CAVOK,0.0,0.0,8,3,6,270
3,PS,EK,MLE,77W,0,C14,3,68.0,60.8,77.75,110.0,6.0,0.0,29.88,5.59,0,NSC,CAVOK,0.0,0.0,8,3,6,275
4,PS,EK,CGK,77W,0,A8,3,68.0,60.8,77.75,110.0,6.0,0.0,29.88,5.59,0,NSC,CAVOK,0.0,0.0,8,3,6,275
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171716,PS,EK,IST,77W,0,B10,3,95.0,66.2,39.00,350.0,12.0,0.0,29.71,4.35,0,NSC,CAVOK,0.0,0.0,6,5,4,860
171715,PS,EK,BRU,77W,0,B29,3,95.0,66.2,39.00,350.0,12.0,0.0,29.71,4.35,0,NSC,CAVOK,0.0,0.0,6,5,4,860
171717,PS,EK,DUS,388,0,B17,3,95.0,66.2,39.00,350.0,12.0,0.0,29.71,4.35,0,NSC,CAVOK,0.0,0.0,6,5,4,860
171718,PS,EK,LIS,77W,0,B22,3,95.0,66.2,39.00,350.0,12.0,0.0,29.71,4.35,0,NSC,CAVOK,0.0,0.0,6,5,4,865


In [7]:
# from imblearn.over_sampling import SMOTENC

# smote = SMOTENC(random_state=42, categorical_features=[0, 1, 2, 3, 5, 16, 17])
# print('Original dataset shape %s' % Counter(y_train_cls))
# X_train, y_train_cls = smote.fit_resample(X_train, y_train_cls)
# print('Resampled dataset shape %s' % Counter(y_train_cls))


In [8]:
import pandas as pd
from category_encoders import CatBoostEncoder

high_cardinality_cols = ['airlinecode_iata', 
                         'destination_iata', 
                         'aircraft_iata', 
                         'publicgatenumber',]

catboost_encoder = CatBoostEncoder(cols=high_cardinality_cols, return_df=True)

X_train_encoded = catboost_encoder.fit_transform(X_train, y_train)

X_test_encoded = catboost_encoder.transform(X_test)

X_train = X_train_encoded
X_test = X_test_encoded

In [9]:
one_hot_column =  ['skyc1', 'skyc2', 'traffictypecode', 'aircraftterminal']

ohe = OneHotEncoder(drop='first', sparse_output=False)

encoded = ohe.fit_transform(X_train[one_hot_column])
encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(one_hot_column))
X_train = pd.concat([X_train.drop(columns=one_hot_column), encoded_df], axis=1)

encoded = ohe.transform(X_test[one_hot_column])
encoded_df = pd.DataFrame(encoded, columns=ohe.get_feature_names_out(one_hot_column))
X_test = pd.concat([X_test.drop(columns=one_hot_column), encoded_df], axis=1)



In [10]:
X_train

Unnamed: 0,airlinecode_iata,destination_iata,aircraft_iata,isconnecting,publicgatenumber,tmpf,dwpf,relh,drct,sknt,p01i,alti,vsby,gust,skyl1,skyl2,depart_day,depart_month,depart_dayofweek,depart_minute,skyc1_CAVOK,skyc1_FEW,skyc1_NSC,skyc1_OVC,skyc1_SCT,skyc1_VV,skyc2_CAVOK,skyc2_FEW,skyc2_NSC,skyc2_SCT,traffictypecode_PE,traffictypecode_PS,aircraftterminal_1,aircraftterminal_2,aircraftterminal_3
0,852.965706,852.965706,852.965706,0,852.965706,69.8,62.6,77.90,90.0,5.0,0.0,29.88,6.21,0,0.0,0.0,8,3,6,265,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0
1,852.965706,852.965706,852.965706,0,852.965706,69.8,62.6,77.90,90.0,5.0,0.0,29.88,6.21,0,0.0,0.0,8,3,6,265,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
2,852.965706,852.965706,852.965706,0,852.965706,69.8,62.6,77.90,90.0,5.0,0.0,29.88,6.21,0,0.0,0.0,8,3,6,270,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0
3,852.965706,852.965706,799.482853,0,852.965706,68.0,60.8,77.75,110.0,6.0,0.0,29.88,5.59,0,0.0,0.0,8,3,6,275,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
4,426.482853,852.965706,532.988569,0,852.965706,68.0,60.8,77.75,110.0,6.0,0.0,29.88,5.59,0,0.0,0.0,8,3,6,275,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
171716,619.078798,667.747416,594.074163,0,494.947951,95.0,66.2,39.00,350.0,12.0,0.0,29.71,4.35,0,0.0,0.0,6,5,4,860,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
171715,619.077152,483.704076,594.072602,0,596.575985,95.0,66.2,39.00,350.0,12.0,0.0,29.71,4.35,0,0.0,0.0,6,5,4,860,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
171717,619.091032,534.660685,879.874501,0,713.291934,95.0,66.2,39.00,350.0,12.0,0.0,29.71,4.35,0,0.0,0.0,6,5,4,860,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
171718,619.084519,672.484574,594.089845,0,520.819542,95.0,66.2,39.00,350.0,12.0,0.0,29.71,4.35,0,0.0,0.0,6,5,4,865,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [11]:
numerical_cols = list(set(X_test.columns) - set(['depart_day', 'depart_month', 'depart_dayofweek', 'depart_minute']))

In [12]:
scaler = MinMaxScaler(feature_range=(0, 1))
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [13]:
# Create cyclic features for month and day
X_train['depart_month_sin'] = np.sin(2 * np.pi * X_train['depart_month'] / 12)
X_train['depart_month_cos'] = np.cos(2 * np.pi * X_train['depart_month'] / 12)
X_train['depart_day_sin'] = np.sin(2 * np.pi * X_train['depart_day'] / 31) 
X_train['depart_day_cos'] = np.cos(2 * np.pi * X_train['depart_day'] / 31)
X_train['depart_dayofweek_sin'] = np.sin(2 * np.pi * X_train['depart_dayofweek'] / 7) 
X_train['depart_dayofweek_cos'] = np.cos(2 * np.pi * X_train['depart_dayofweek'] / 7)
X_train['depart_minute_sin'] = np.sin(2 * np.pi * X_train['depart_minute'] / 1440) 
X_train['depart_minute_cos'] = np.cos(2 * np.pi * X_train['depart_minute'] / 1440)

X_test['depart_month_sin'] = np.sin(2 * np.pi * X_test['depart_month'] / 12)
X_test['depart_month_cos'] = np.cos(2 * np.pi * X_test['depart_month'] / 12)
X_test['depart_day_sin'] = np.sin(2 * np.pi * X_test['depart_day'] / 31)
X_test['depart_day_cos'] = np.cos(2 * np.pi * X_test['depart_day'] / 31)
X_test['depart_dayofweek_sin'] = np.sin(2 * np.pi * X_test['depart_dayofweek'] / 7) 
X_test['depart_dayofweek_cos'] = np.cos(2 * np.pi * X_test['depart_dayofweek'] / 7)
X_test['depart_minute_sin'] = np.sin(2 * np.pi * X_test['depart_minute'] / 1440)
X_test['depart_minute_cos'] = np.cos(2 * np.pi * X_test['depart_minute'] / 1440)

# Drop original features
X_train = X_train.drop(['depart_month', 'depart_day', 'depart_minute', 'depart_dayofweek'], axis=1)
X_test = X_test.drop(['depart_month', 'depart_day', 'depart_minute', 'depart_dayofweek'], axis=1)


In [14]:
X_train.describe()

Unnamed: 0,airlinecode_iata,destination_iata,aircraft_iata,isconnecting,publicgatenumber,tmpf,dwpf,relh,drct,sknt,p01i,alti,vsby,gust,skyl1,skyl2,skyc1_CAVOK,skyc1_FEW,skyc1_NSC,skyc1_OVC,skyc1_SCT,skyc1_VV,skyc2_CAVOK,skyc2_FEW,skyc2_NSC,skyc2_SCT,traffictypecode_PE,traffictypecode_PS,aircraftterminal_1,aircraftterminal_2,aircraftterminal_3,depart_month_sin,depart_month_cos,depart_day_sin,depart_day_cos,depart_dayofweek_sin,depart_dayofweek_cos,depart_minute_sin,depart_minute_cos
count,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0,171720.0
mean,0.052304,0.08085,0.060056,0.038219,0.103533,0.462984,0.628277,0.488496,0.564679,0.245467,0.0,0.595309,0.935276,0.004414,0.057889,0.001186,0.712212,0.073987,0.167826,8.7e-05,0.030625,0.002318,0.998515,0.001205,7.6e-05,0.000134,0.036024,0.930503,0.187648,0.263493,0.547909,0.1289336,0.1622953,0.017237,-0.022374,-0.031271,-0.022132,0.09719,-0.104341
std,0.034403,0.044352,0.022813,0.191726,0.044924,0.193086,0.152289,0.186226,0.269624,0.131946,0.0,0.212523,0.155499,0.066293,0.167216,0.032233,0.452733,0.26175,0.373712,0.009346,0.172301,0.048087,0.038507,0.034699,0.008701,0.011572,0.18635,0.254298,0.390432,0.440529,0.497701,0.7206386,0.6616056,0.710155,0.703483,0.709325,0.703844,0.690874,0.708778
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,-1.0,-0.998717,-0.994869,-0.974928,-0.900969,-1.0,-1.0
25%,0.03503,0.050229,0.041992,0.0,0.06779,0.30303,0.525,0.361208,0.333333,0.148148,0.0,0.442105,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,-0.5,-0.5,-0.724793,-0.758758,-0.781831,-0.900969,-0.59131,-0.779884
50%,0.039249,0.072921,0.056056,0.0,0.103537,0.454545,0.625,0.494898,0.555556,0.222222,0.0,0.621053,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,1.224647e-16,6.123234000000001e-17,0.101168,-0.050649,0.0,-0.222521,0.300706,-0.321439
75%,0.053285,0.099985,0.074378,0.0,0.122752,0.606061,0.725,0.626381,0.805556,0.333333,0.0,0.778947,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.8660254,0.8660254,0.724793,0.688967,0.781831,0.62349,0.75184,0.67559
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.998717,1.0,0.974928,1.0,1.0,1.0


In [15]:
# neighbors = NearestNeighbors(n_neighbors=39*2)
# neighbors_fit = neighbors.fit(X_train)
# distances, indices = neighbors_fit.kneighbors(X_train)
# avg_distance = distances.mean(axis=1)
# plt.plot(np.sort(avg_distance))
# plt.show()

In [16]:
# sc = DBSCAN(eps=1, min_samples=39*2, algorithm='kd_tree')
# clusters = pd.DataFrame(sc.fit_predict(X_full), columns=['Cluster'])
# clusters['Cluster'].value_counts()

In [17]:
hdbscan_model = HDBSCAN(
    min_cluster_size=2000,      # Increase to avoid microclusters
    min_samples=5,              # Lower to reduce noise points
    cluster_selection_epsilon=0.6,  # Increase to reduce noise points
    cluster_selection_method='eom',  # 'eom' tends to produce more balanced clusters
    prediction_data=True
)
clusters_train = pd.DataFrame(hdbscan_model.fit_predict(X_train), columns=['Cluster'])
clusters_train['Cluster'].value_counts()

Cluster
 8     62171
 10    26308
 6     21399
 4     16205
-1     15443
 5      8272
 3      5466
 2      3812
 9      3781
 0      3117
 1      2937
 7      2809
Name: count, dtype: int64

In [18]:
from hdbscan import approximate_predict
clusters_test, _ = approximate_predict(hdbscan_model, X_test)

In [19]:
X_train['cluster'] = clusters_train['Cluster']
X_test['cluster'] = clusters_test

In [21]:
from sklearn.ensemble import RandomForestRegressor
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical

rf_models = {}
metrics = {}
model_weights_mse = {}
model_weights_cluster = {}

for cluster in np.unique(clusters_train):
    if cluster == -1:  # Ignore noise points (label -1)
        continue
    
    print(f'Cluster {cluster}')

    # Subset the training and test data for the cluster
    # Get cluster data
    X_cluster = X_train[X_train['cluster'] == cluster].drop(columns=['cluster'])
    y_cluster = y_train.loc[X_cluster.index]
    
    # Split into train and validation sets
    X_train_cluster, X_val_cluster, y_train_cluster, y_val_cluster = train_test_split(
        X_cluster, y_cluster, test_size=0.2, random_state=42
    )
    
    X_test_cluster = X_test[X_test['cluster'] == cluster].drop(columns=['cluster'])
    y_test_cluster = y_test.loc[X_test_cluster.index]

    # Define objective function for hyperopt
    param_space = {
        'n_estimators': Integer(50, 200),
        'max_depth': Integer(5, 50),
        'min_samples_split': Integer(2, 10),
        'min_samples_leaf': Integer(1, 5),
        'max_features': Categorical(['sqrt', 'log2']),
        'criterion': Categorical(['squared_error', 'poisson', 'friedman_mse', 'absolute_error']),
        # 'bootstrap': Categorical([True, False]),
        # 'max_samples': Categorical([0.6, 0.7, 0.8, 0.9, None])
        
    }

    # Use Bayesian optimization for hyperparameter tuning
    bayes_cv = BayesSearchCV(
        estimator=RandomForestRegressor(),
        search_spaces=param_space,
        n_iter=5,
        cv=3,
        n_jobs=-1,
        scoring='neg_mean_squared_error'
    )
    bayes_cv.fit(X_train_cluster, y_train_cluster)
    best_rf = bayes_cv.best_estimator_
    rf_models[cluster] = best_rf
    
    # Cluster weighted
    cluster_weight = len(y_cluster) / len(y_train)
    model_weights_cluster[cluster] = cluster_weight

    # Make predictions on the validation set and evaluate metrics
    y_val_pred = best_rf.predict(X_val_cluster)
    mse = mean_squared_error(y_val_cluster, y_val_pred)
    mae = mean_absolute_error(y_val_cluster, y_val_pred)
    r2 = r2_score(y_val_cluster, y_val_pred)
    model_weights_mse[cluster] = 1 / mse if mse != 0 else 1  # Assign weight to the model based on validation MSE

    # Print metrics
    print(f"Cluster {cluster} Validation Metrics:")
    print(f"Mean Squared Error: {mse}")
    print(f"Mean Absolute Error: {mae}")
    print(f"R2 Score: {r2}")
    print("==========================================")
    print()

Cluster 0


  _data = np.array(data, dtype=dtype, copy=copy,


Cluster 0 Validation Metrics:
Mean Squared Error: 3026051.7697909917
Mean Absolute Error: 946.8287838230034
R2 Score: 0.12338484197310351

Cluster 1
Cluster 1 Validation Metrics:
Mean Squared Error: 5085584.995736982
Mean Absolute Error: 1079.5982615268329
R2 Score: 0.08791931036987255

Cluster 2
Cluster 2 Validation Metrics:
Mean Squared Error: 4341664.44797545
Mean Absolute Error: 1006.0321402452888
R2 Score: 0.04422891387925609

Cluster 3


  _data = np.array(data, dtype=dtype, copy=copy,


Cluster 3 Validation Metrics:
Mean Squared Error: 2657067.180180161
Mean Absolute Error: 811.4117943307358
R2 Score: 0.11178584466360642

Cluster 4


  _data = np.array(data, dtype=dtype, copy=copy,


Cluster 4 Validation Metrics:
Mean Squared Error: 1418680.5658431926
Mean Absolute Error: 607.5891896758889
R2 Score: 0.11762756808420982

Cluster 5
Cluster 5 Validation Metrics:
Mean Squared Error: 1745803.1464771195
Mean Absolute Error: 663.034910275894
R2 Score: 0.08408234654505187

Cluster 6
Cluster 6 Validation Metrics:
Mean Squared Error: 3928889.9991499083
Mean Absolute Error: 996.6799927970342
R2 Score: 0.15081771151983014

Cluster 7
Cluster 7 Validation Metrics:
Mean Squared Error: 729802.9996113878
Mean Absolute Error: 601.0591747153484
R2 Score: 0.11010900241137767

Cluster 8
Cluster 8 Validation Metrics:
Mean Squared Error: 1910703.6530466336
Mean Absolute Error: 618.8049526636922
R2 Score: 0.08364803549791755

Cluster 9
Cluster 9 Validation Metrics:
Mean Squared Error: 7661512.447940543
Mean Absolute Error: 1275.9068443961069
R2 Score: 0.17014030863433482

Cluster 10


  _data = np.array(data, dtype=dtype, copy=copy,


Cluster 10 Validation Metrics:
Mean Squared Error: 3060192.736162648
Mean Absolute Error: 828.1311734863375
R2 Score: 0.07267333456323277



In [22]:
# Normalizing weights to sum to 1
total_weight_mse = sum(model_weights_mse.values())
model_weights_mse = {cluster: weight / total_weight_mse for cluster, weight in model_weights_mse.items()}
model_weights_mse


{0: 0.06587026600023328,
 1: 0.03919447520269469,
 2: 0.04591023497901982,
 3: 0.07501761208502605,
 4: 0.14050156166630748,
 5: 0.11417486296140185,
 6: 0.050733625795005094,
 7: 0.27312416516888627,
 8: 0.1043211670678396,
 9: 0.02601664310552539,
 10: 0.06513538596806052}

In [25]:
model_weights_cluster
# sum(model_weights_mse.values())

{0: 0.018151642208245982,
 1: 0.017103424178895876,
 2: 0.022198928488236663,
 3: 0.03183088749126485,
 4: 0.09436873980899138,
 5: 0.04817144188213371,
 6: 0.1246156533892383,
 7: 0.016358024691358025,
 8: 0.36204868390402983,
 9: 0.02201840204984859,
 10: 0.15320288842301422}

In [26]:
from copy import deepcopy
# Make final predictions on the test set using weighted average
final_predictions_mse_weighted = []
final_predictions_cluster_weighted = []
final_predictions_non_weighted = []
all_y_true = []
for idx in X_test.index:
    cluster = X_test.loc[idx, 'cluster']
    if cluster == -1: 
        continue

    # Give highest weight to the model of the cluster that the test data belongs to and then normalize the wights to 1
    weights_mse_updated = deepcopy(model_weights_mse)
    weights_mse_updated.update({cluster: 1})
    total_weight_mse = sum(weights_mse_updated.values())
    weights_mse_updated = {cluster: weight / total_weight_mse for cluster, weight in weights_mse_updated.items()}
    
    weights_cluster_updated = deepcopy(model_weights_cluster)
    weights_cluster_updated.update({cluster: 1})
    total_weight_cluster = sum(weights_cluster_updated.values())
    weights_cluster_updated = {cluster: weight / total_weight_cluster for cluster, weight in weights_cluster_updated.items()}
    
    # Get predictions from all models, weighted by their respective scores
    weighted_sum_mse = 0
    total_weight_mse = 0
    weighted_sum_cluster = 0
    total_weight_cluster = 0
    for model_cluster, model in rf_models.items():
        prediction = model.predict(X_test.drop(columns=['cluster']).loc[[idx]])[0]
        
        weight_mse = weights_mse_updated.get(model_cluster, 0)
        weighted_sum_mse += prediction * weight_mse
        total_weight_mse += weight_mse

        weight_cluster = weights_cluster_updated.get(model_cluster, 0)
        weighted_sum_cluster += prediction * weight_cluster
        total_weight_cluster += weight_cluster
        
        
    model = rf_models[cluster]
    prediction = model.predict(X_test.drop(columns=['cluster']).loc[[idx]])[0]
    final_predictions_non_weighted.append(round(prediction))   
    

    # Final prediction is the weighted average
    final_prediction_mse_weighted = round(weighted_sum_mse / total_weight_mse)
    final_predictions_mse_weighted.append(final_prediction_mse_weighted)
    final_prediction_cluster_weighted = round(weighted_sum_cluster / total_weight_cluster)
    final_predictions_cluster_weighted.append(final_prediction_cluster_weighted)
    all_y_true.append(y_test.loc[idx])


In [27]:
# Calculate overall metrics
overall_mse = mean_squared_error(all_y_true, final_predictions_mse_weighted)
overall_mae = mean_absolute_error(all_y_true, final_predictions_mse_weighted)
overall_r2 = r2_score(all_y_true, final_predictions_mse_weighted)

print("\nOverall Metrics (Weighted Average MSE Ensemble):")
print(f"Overall Mean Squared Error: {overall_mse}")
print(f"Overall Mean Absolute Error: {overall_mae}")
print(f"Overall R2 Score: {overall_r2}")

overall_mse = mean_squared_error(all_y_true, final_predictions_cluster_weighted)
overall_mae = mean_absolute_error(all_y_true, final_predictions_cluster_weighted)
overall_r2 = r2_score(all_y_true, final_predictions_cluster_weighted)

print("\nOverall Metrics (Weighted Average Cluster Size Ensemble):")
print(f"Overall Mean Squared Error: {overall_mse}")
print(f"Overall Mean Absolute Error: {overall_mae}")
print(f"Overall R2 Score: {overall_r2}")

overall_mse = mean_squared_error(all_y_true, final_predictions_non_weighted)
overall_mae = mean_absolute_error(all_y_true, final_predictions_non_weighted)
overall_r2 = r2_score(all_y_true, final_predictions_non_weighted)

print("\nOverall Metrics (Non-Weighted):")
print(f"Overall Mean Squared Error: {overall_mse}")
print(f"Overall Mean Absolute Error: {overall_mae}")
print(f"Overall R2 Score: {overall_r2}")


Overall Metrics (Weighted Average MSE Ensemble):
Overall Mean Squared Error: 2425735.8679831196
Overall Mean Absolute Error: 761.6869177482283
Overall R2 Score: 0.06221483198833222

Overall Metrics (Weighted Average Cluster Size Ensemble):
Overall Mean Squared Error: 2420554.9323990764
Overall Mean Absolute Error: 753.0492873636437
Overall R2 Score: 0.06421777246147597

Overall Metrics (Non-Weighted):
Overall Mean Squared Error: 2417208.1729437057
Overall Mean Absolute Error: 739.5603153117287
Overall R2 Score: 0.06551162370866837
