## Dynamic Ensemble Selection for Regression tasks 

#### Experiments 

In [4]:
# Load IPython's autoreload extension and use mode 2, so edited project modules
# are re-imported before each cell runs instead of requiring a kernel restart.
%load_ext autoreload
%autoreload 2

import warnings
# Blanket filter: hides every warning category for the rest of the session,
# including library warnings raised while fitting the models below.
warnings.filterwarnings('ignore') 

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [5]:
import time 
import pandas as pd
import numpy as np 
import seaborn as sns 

# static ensemble models 
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import AdaBoostRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor

# classical models 
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LogisticRegression 
from sklearn.linear_model import LinearRegression 
from sklearn.svm import SVR
from sklearn.linear_model import Ridge 
from sklearn.linear_model import Lasso 
from sklearn.tree import DecisionTreeRegressor

# metrics 
from sklearn.metrics import (mean_squared_error, 
                             mean_absolute_error, 
                             r2_score, 
                             mean_squared_log_error)

from sklearn.model_selection import train_test_split  
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn import datasets  

# xaidesreg (DES / DRS)
from xaidesreg import DES, DRS 

# Notebook-wide display options.
# pandas: never truncate the column list when a frame is rendered.
pd.set_option('display.max_columns', None)
# seaborn: apply the 'whitegrid' style to every plot.
sns.set(style='whitegrid')

#### Load dataset and evaluate DES / DRS

In [11]:
# Dataset selector: exactly one prefix is active at a time. Fold files are read
# from '<prefix><fold>tra.dat' (train) and '<prefix><fold>tst.dat' (test).
partition_name = './Datasets/Abalone/abalone-5-'
# partition_name = './Datasets/Concrete/concrete-5-'
# partition_name = './Datasets/Liver/liver-5-'
# partition_name = './Datasets/MachineCPU/machineCPU-5-'
# partition_name = './Datasets/Real_estate/Real_estate-5-'
# partition_name = './Datasets/Student_marks/student_marks-5-'
# partition_name = './Datasets/Wine_quality_red/winequality-red-5-'
# partition_name = './Datasets/Wine_quality_white/winequality-white-5-'
# partition_name = './Datasets/Yacht/yacht_hydrodynamics-5-'

n_fold = 5

# Test MSE per fold for each method. Both lists are re-created on every run of
# this cell, so re-executing it never appends to results from an earlier run.
errors_DES = []
errors_DRS = []

for p in range(1, n_fold+1):
    # Fresh, unfitted pool for every fold so no fitted state carries over
    # from one partition to the next. Commented entries are disabled members.
    pool_models = [
        CatBoostRegressor(verbose=False, random_state=42),
        # XGBRegressor(random_state=42),
        RandomForestRegressor(random_state=42),
        # verbose=-1 silences the "[LightGBM] [Info] ..." lines that were
        # otherwise printed on every fit and flooded the cell output.
        LGBMRegressor(random_state=42, verbose=-1),
        # Ridge(random_state=42),
        # DecisionTreeRegressor(random_state=42),
        LinearRegression(),
        # KNeighborsRegressor(),
        # SVR()
    ]

    name_train = f'{partition_name}{p}tra.dat'
    name_test = f'{partition_name}{p}tst.dat'
    print(name_train)
    print(name_test)

    # Files have no header row; the last column is the target.
    # NOTE(review): y is kept as a single-column DataFrame (iloc[:, -1:]), not
    # a 1-D Series; scikit-learn estimators normally expect 1-D targets --
    # confirm this is what xaidesreg needs before changing it.
    data_train = pd.read_csv(name_train, header=None)
    X_train = data_train.iloc[:, :-1]
    y_train = data_train.iloc[:, -1:]

    cols = X_train.columns

    # Hold out 20% of the training partition as the selection set (DSEL) that
    # DES / DRS are fitted on; the base models only ever see the other 80%.
    X_train, X_dsel, y_train, y_dsel = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42)

    data_test = pd.read_csv(name_test, header=None)
    X_test = data_test.iloc[:, :-1]
    y_test = data_test.iloc[:, -1:]

    # The scaler is fitted on the training split only, so neither DSEL nor
    # test statistics leak into the features. Results are wrapped back into
    # DataFrames (with a fresh 0..n-1 index) to keep the column labels.
    scaler = MinMaxScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=cols)
    X_dsel = pd.DataFrame(scaler.transform(X_dsel), columns=cols)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=cols)

    for model in pool_models:
        model.fit(X_train, y_train)

    # Both selectors share the same fitted pool and are fitted on DSEL.
    # NOTE(review): the two thresholds differ (0.1 vs 0.05) -- confirm that
    # this is intentional.
    der = DES(pool_regressors=pool_models,
              k=7,
              knn_metric='minkowski',
              metrics='mape',
              threshold=0.1)
    der.fit(X_dsel, y_dsel)

    drs = DRS(pool_regressors=pool_models,
              k=7,
              knn_metric='minkowski',
              metrics='mape',
              threshold=0.05)
    drs.fit(X_dsel, y_dsel)

    # Score each method on the untouched test partition of this fold.
    y_pred_DES = der.predict(X_test)
    errors_DES.append(mean_squared_error(y_test, y_pred_DES))

    y_pred_DRS = drs.predict(X_test)
    errors_DRS.append(mean_squared_error(y_test, y_pred_DRS))


# Cross-fold summary. np.std uses its default ddof=0 (population std).
mean_errors_DES = np.mean(errors_DES)
std_errors_DES = np.std(errors_DES)

print('Mean error DES', mean_errors_DES)
print('Std error DES', std_errors_DES)


mean_errors_DRS = np.mean(errors_DRS)
std_errors_DRS = np.std(errors_DRS)

print('Mean error DRS', mean_errors_DRS)
print('Std error DRS', std_errors_DRS)


./Datasets/Abalone/abalone-5-1tra.dat
./Datasets/Abalone/abalone-5-1tst.dat
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000185 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1286
[LightGBM] [Info] Number of data points in the train set: 2673, number of used features: 8
[LightGBM] [Info] Start training from score 9.944257
./Datasets/Abalone/abalone-5-2tra.dat
./Datasets/Abalone/abalone-5-2tst.dat
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000179 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1288
[LightGBM] [Info] Number of data points in the train set: 2673, number of used features: 8
[LightGBM] [Info] Start training from score 9.878788
./Datasets/Abalone/abalone-5-3tra.dat
./Datasets/Abalone/abalone-5-3tst.dat
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0002

### Single Models 

In [12]:
# Regressors that are evaluated on their own, outside any dynamic ensemble.
pool_models = [
    CatBoostRegressor(verbose=False, random_state=42),
    XGBRegressor(random_state=42),
    RandomForestRegressor(random_state=42),
    LGBMRegressor(random_state=42),
    Ridge(random_state=42),
    DecisionTreeRegressor(random_state=42),
    LinearRegression(),
    KNeighborsRegressor(),
    SVR(),
]

# One empty list of per-fold errors per model, keyed by estimator class name.
results = {type(estimator).__name__: [] for estimator in pool_models}

In [13]:

# Per-model list of test MSEs (one entry per fold), keyed by estimator class
# name. Keys are created on first use inside the loop, so this cell no longer
# depends on whichever `pool_models` list happens to be left in the kernel.
results = {}

# Dataset selector: exactly one prefix is active at a time. Fold files are read
# from '<prefix><fold>tra.dat' (train) and '<prefix><fold>tst.dat' (test).
partition_name = './Datasets/Abalone/abalone-5-'
# partition_name = './Datasets/Concrete/concrete-5-'
# partition_name = './Datasets/Liver/liver-5-'
# partition_name = './Datasets/MachineCPU/machineCPU-5-'
# partition_name = './Datasets/Real_estate/Real_estate-5-'
# partition_name = './Datasets/Student_marks/student_marks-5-'
# partition_name = './Datasets/Wine_quality_red/winequality-red-5-'
# partition_name = './Datasets/Wine_quality_white/winequality-white-5-'
# partition_name = './Datasets/Yacht/yacht_hydrodynamics-5-'

n_fold = 5

for p in range(1, n_fold+1):
    # Fresh, unfitted estimators for every fold.
    pool_models = [
        CatBoostRegressor(verbose=False, random_state=42),
        XGBRegressor(random_state=42),
        RandomForestRegressor(random_state=42),
        # verbose=-1 silences the "[LightGBM] [Info] ..." lines that were
        # otherwise printed on every fit and flooded the cell output.
        LGBMRegressor(random_state=42, verbose=-1),
        Ridge(random_state=42),
        DecisionTreeRegressor(random_state=42),
        LinearRegression(),
        KNeighborsRegressor(),
        SVR()
    ]

    name_train = f'{partition_name}{p}tra.dat'
    name_test = f'{partition_name}{p}tst.dat'
    print(name_train)
    print(name_test)

    # Files have no header row; the last column is the target, kept as a
    # single-column DataFrame (iloc[:, -1:]).
    data_train = pd.read_csv(name_train, header=None)
    X_train = data_train.iloc[:, :-1]
    y_train = data_train.iloc[:, -1:]

    cols = X_train.columns

    # Same 80/20 split (size and seed) as the DES / DRS experiment cell. The
    # held-out part is not used here -- presumably it is split off only so the
    # stand-alone models train on the same rows as the ensemble pool; confirm.
    X_train, X_dsel, y_train, y_dsel = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42)

    data_test = pd.read_csv(name_test, header=None)
    X_test = data_test.iloc[:, :-1]
    y_test = data_test.iloc[:, -1:]

    # The scaler is fitted on the training split only, so no test statistics
    # leak into the features. Only the splits that are actually used below
    # are transformed.
    scaler = MinMaxScaler().fit(X_train)
    X_train = pd.DataFrame(scaler.transform(X_train), columns=cols)
    X_test = pd.DataFrame(scaler.transform(X_test), columns=cols)

    for model in pool_models:
        model.fit(X_train, y_train)

        pred = model.predict(X_test)
        error = mean_squared_error(y_test, pred)

        # setdefault creates the list on first sight of a model, so a pool
        # change can never raise KeyError here.
        results.setdefault(type(model).__name__, []).append(error)
    

./Datasets/Abalone/abalone-5-1tra.dat
./Datasets/Abalone/abalone-5-1tst.dat
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000191 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1286
[LightGBM] [Info] Number of data points in the train set: 2673, number of used features: 8
[LightGBM] [Info] Start training from score 9.944257
./Datasets/Abalone/abalone-5-2tra.dat
./Datasets/Abalone/abalone-5-2tst.dat
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000187 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1288
[LightGBM] [Info] Number of data points in the train set: 2673, number of used features: 8
[LightGBM] [Info] Start training from score 9.878788
./Datasets/Abalone/abalone-5-3tra.dat
./Datasets/Abalone/abalone-5-3tst.dat
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.0001

In [14]:
# Labelled mean of the per-fold errors for every stand-alone model.
for name, fold_errors in results.items():
    print(f'Mean error {name}', np.mean(fold_errors))

Mean error CatBoostRegressor 4.654801480687647
Mean error XGBRegressor 5.35754589399621
Mean error RandomForestRegressor 4.772632764031745
Mean error LGBMRegressor 4.806793608191304
Mean error Ridge 5.028431261040495
Mean error DecisionTreeRegressor 9.201431395582041
Mean error LinearRegression 4.944613568087659
Mean error KNeighborsRegressor 5.168794923072515
Mean error SVR 4.996852069744589


In [15]:
# Bare per-model mean errors, one per line, in the insertion order of `results`.
for fold_errors in results.values():
    print(np.mean(fold_errors))

4.654801480687647
5.35754589399621
4.772632764031745
4.806793608191304
5.028431261040495
9.201431395582041
4.944613568087659
5.168794923072515
4.996852069744589


##### End of file 