In [20]:
import pandas as pd; pd.set_option('display.max_columns', 100)
import numpy as np

from tqdm.notebook import tqdm

import re

from functools import partial
from scipy.stats import mode

import matplotlib.pyplot as plt; plt.style.use('ggplot')
import seaborn as sns
import plotly.express as px

from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, FunctionTransformer, PowerTransformer, PolynomialFeatures, RobustScaler
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.impute import KNNImputer
from sklearn.multiclass import OneVsRestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, train_test_split, GridSearchCV, RepeatedStratifiedKFold, cross_val_score, cross_val_predict, RepeatedKFold
from sklearn.metrics import roc_auc_score, roc_curve, RocCurveDisplay, cohen_kappa_score, log_loss, f1_score, median_absolute_error, accuracy_score
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.feature_selection import RFE, RFECV
from sklearn.isotonic import IsotonicRegression
from sklearn.calibration import CalibrationDisplay
from sklearn.inspection import PartialDependenceDisplay, permutation_importance
from sklearn.ensemble import RandomForestRegressor, HistGradientBoostingRegressor, GradientBoostingRegressor, ExtraTreesRegressor, VotingRegressor, StackingRegressor
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVR
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from catboost import CatBoostRegressor
from sklego.linear_model import LADRegression

# import tensorflow as tf
# from tensorflow import keras
# import tensorflow_probability as tfp

In [4]:
train = pd.read_csv("train.csv", index_col="ID")
test = pd.read_csv("test.csv", index_col="ID")
sub = pd.read_csv("sample_submission.csv", index_col="ID")

In [8]:
X = train.drop(columns = "y", axis = 1)
Y = train['y']

skf = RepeatedKFold(n_splits = 10, n_repeats = 1, random_state = 42)

In [9]:
stand_tran = make_pipeline(PowerTransformer())
# stand_tran = make_pipeline(RobustScaler())

proccessor = make_column_transformer(
    (stand_tran, ('x_0','x_1', 'x_2', 'x_3', 'x_4', 'x_5', 'x_6', 'x_7', 'x_8', 'x_9', 'x_10')), 
    remainder = 'passthrough')

knn = make_pipeline(proccessor, KNeighborsRegressor())
knn

In [10]:
for i in [5, 10, 15, 20, 25, 30, 35, 40, 45, 50, 70, 100, 150, 200]:
    
    knn_cv_routine = cross_val_score(make_pipeline(proccessor, KNeighborsRegressor(n_neighbors = i)), X, Y,
                                     scoring = 'neg_median_absolute_error',
                                     cv = skf,
                                     n_jobs = -1)

    print(f"The average oof Median absolure error score of the k-NN model with {i} neighbors is {-1*knn_cv_routine.mean()}")

The average oof Median absolure error score of the k-NN model with 5 neighbors is 0.051210625957035204
The average oof Median absolure error score of the k-NN model with 10 neighbors is 0.04245971258789112
The average oof Median absolure error score of the k-NN model with 15 neighbors is 0.039074965970050356
The average oof Median absolure error score of the k-NN model with 20 neighbors is 0.037070512187499814
The average oof Median absolure error score of the k-NN model with 25 neighbors is 0.03654915084765662
The average oof Median absolure error score of the k-NN model with 30 neighbors is 0.0362910825292964
The average oof Median absolure error score of the k-NN model with 35 neighbors is 0.03630648067522983
The average oof Median absolure error score of the k-NN model with 40 neighbors is 0.03637587730468539
The average oof Median absolure error score of the k-NN model with 45 neighbors is 0.03673670384982373
The average oof Median absolure error score of the k-NN model with 50 ne

In [11]:
GB_cv_routine = cross_val_score(GradientBoostingRegressor(loss = 'absolute_error',
                                                          n_estimators = 300,
                                                          learning_rate = 0.1,
                                                          min_samples_leaf = 30,
                                                          max_depth = 5), 
                                X,
                                Y,
                                scoring = 'neg_median_absolute_error',
                                cv = skf, 
                                n_jobs = -1)

print(f"The average oof Median absolure error score of the GB model is {-1*GB_cv_routine.mean()}")

The average oof Median absolure error score of the GB model is 0.00523863127653712


In [12]:
HistGB_cv_routine = cross_val_score(HistGradientBoostingRegressor(loss = 'absolute_error',
                                                                  max_iter = 300,
                                                                  learning_rate = 0.1,
                                                                  min_samples_leaf = 30,
                                                                  max_depth = 5, 
                                                                  random_state = 1), 
                                    X,
                                    Y,
                                    scoring = 'neg_median_absolute_error',
                                    cv = skf, 
                                    n_jobs = -1)

print(f"The average oof Median absolure error score of the HistGB model is {-1*HistGB_cv_routine.mean()}")

The average oof Median absolure error score of the HistGB model is 0.007942206157801479


In [13]:
LGBM_cv_routine = cross_val_score(LGBMRegressor(objective = 'mae',
                                                n_estimators = 300,
                                                learning_rate = 0.1,
                                                colsample_bytree = 0.6), 
                                  X,
                                  Y,
                                  scoring = 'neg_median_absolute_error',
                                  cv = skf, 
                                  n_jobs = -1)

print(f"The average oof Median absolure error score of the LGBM model is {-1*LGBM_cv_routine.mean()}")

[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.009780 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 36107, number of used features: 12
[LightGBM] [Info] Start training from score 83.282822
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.019932 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 36106, number of used features: 12
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.020724 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3060[LightGBM] [Info] Start training from score 8

In [14]:
XGB_cv_routine = cross_val_score(XGBRegressor(objective = 'reg:absoluteerror',
                                              tree_method = 'hist',
                                              n_estimators = 300,
                                              learning_rate = 0.1,
                                              colsample_bytree = 0.6), 
                                 X,
                                 Y,
                                 scoring = 'neg_median_absolute_error',
                                 cv = skf, 
                                 n_jobs = -1)

print(f"The average oof Median absolure error score of the XGB model is {-1*XGB_cv_routine.mean()}")

The average oof Median absolure error score of the XGB model is 0.015207214355471167


In [15]:
Cat_cv_routine = cross_val_score(CatBoostRegressor(objective = 'MAE',
                                                   iterations = 300,
                                                   learning_rate = 0.1,
                                                   verbose = False), 
                                 X,
                                 Y,
                                 scoring = 'neg_median_absolute_error',
                                 cv = skf, 
                                 n_jobs = -1)

print(f"The average oof Median absolure error score of the CatBoost model is {-1*Cat_cv_routine.mean()}")

The average oof Median absolure error score of the CatBoost model is 0.01976266213410156


In [16]:
model_performance = pd.DataFrame()
model_performance['Model'] = ['GB', 'Hist', 'LGBM', 'XGB', 'Cat']
model_performance['10-folds oof Median Absolute Error'] = [-1*GB_cv_routine.mean(), -1*HistGB_cv_routine.mean(), -1*LGBM_cv_routine.mean(), -1*XGB_cv_routine.mean(), -1*Cat_cv_routine.mean()]
print(f"The followig table shows the performance of the considered models: \n\n{model_performance}")

The followig table shows the performance of the considered models: 

  Model  10-folds oof Median Absolute Error
0    GB                            0.005239
1  Hist                            0.007942
2  LGBM                            0.015908
3   XGB                            0.015207
4   Cat                            0.019763


In [17]:
md1 = make_pipeline(PowerTransformer(), GradientBoostingRegressor(**{'loss': 'absolute_error',
 'max_depth': 10,
 'learning_rate': 0.041599576923587865,
 'n_estimators': 139,
 'min_samples_leaf': 42,
 'min_samples_split': 11,
 'random_state': 1}))

md2 = make_pipeline(MinMaxScaler(), HistGradientBoostingRegressor(**{'loss': 'absolute_error',
 'l2_regularization': 0.027030940923710774,
 'early_stopping': False,
 'learning_rate': 0.019042088959167168,
 'max_iter': 949,
 'max_depth': 14,
 'max_bins': 255,
 'min_samples_leaf': 50,
 'max_leaf_nodes': 50}))

md3 = LGBMRegressor(**{'objective': 'mae',
 'n_estimators': 668,
 'learning_rate': 0.018499866546319983,
 'max_depth': 11,
 'reg_alpha': 0.4618095706853164,
 'reg_lambda': 0.07505699333277592,
 'num_leaves': 88,
 'subsample': 0.7994357898443023,
 'colsample_bytree': 0.7716446660869791})

md4 = make_pipeline(MinMaxScaler(), XGBRegressor(**{'objective': 'reg:absoluteerror',
 'tree_method': 'hist',
 'max_depth': 7,
 'learning_rate': 0.01912673399861771,
 'n_estimators': 960,
 'gamma': 1.3282085968831892,
 'min_child_weight': 28,
 'colsample_bytree': 0.6171930281823468,
 'subsample': 0.762767668956589}))

md5 = CatBoostRegressor(**{'objective': 'MAE',
 'iterations': 792,
 'learning_rate': 0.033323612065351636,
 'depth': 7,
 'random_strength': 0.04415624028064764,
 'bagging_temperature': 0.5522406534278442,
 'border_count': 241,
 'l2_leaf_reg': 8,
 'verbose': False,
 'task_type': 'CPU'})

md6 = make_pipeline(PowerTransformer(), SVR(kernel = 'rbf', 
                                            C = 10,
                                            gamma = 0.1, 
                                            epsilon = 0.01))

voting_regressor = VotingRegressor(estimators = [('GB', md1),
                                                 ('HGB', md2),
                                                 ('LGBM', md3),
                                                 ('XGB', md4),
                                                 ('Cat', md5)],
                                  n_jobs = -1, 
                                  weights = [0.002309, 0.254678, 0.363684, 0.300134, 0.074709])
voting_regressor

In [21]:
stacker = StackingRegressor(estimators = [('GB', md1),
                                          ('HGB', md2),
                                          ('LGBM', md3),
                                          ('XGB', md4),
                                          ('Cat', md5)],
                            n_jobs = -1, 
                            final_estimator = LADRegression())
stacker

In [25]:
vot_scores, vot_preds = list(), list()
stack_scores, stack_preds = list(), list()

for i, (train_idx, test_idx) in enumerate(skf.split(X, Y)):

    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    Y_train, Y_test = Y.iloc[train_idx], Y.iloc[test_idx]
    
    print('----------------------------------------------------------')
    
    ############
    ## Voting ##
    ############
    
    vot = voting_regressor.fit(X_train, Y_train)
    vot_pred = vot.predict(X_test)
    
    vot_score = median_absolute_error(Y_test, vot_pred)
    vot_scores.append(vot_score)
    
    print('Fold', i, '==> Voting Regressor oof median absolute error score is ==>', vot_score)
    
    vot_pred_test = vot.predict(test.drop(columns = 'ID'))
    vot_preds.append(vot_pred_test)
    
    #############
    ## Stacker ##
    #############
    
    stack = stacker.fit(X_train, Y_train)
    stack_pred = stack.predict(X_test)
    
    stack_score = median_absolute_error(Y_test, stack_pred)
    stack_scores.append(stack_score)
    
    print('Fold', i, '==> Stacking Regressor oof median absolute error score is ==>', stack_score)
    
    stack_pred_test = stack.predict(test.drop(columns = 'ID'))
    stack_preds.append(stack_pred_test)

----------------------------------------------------------
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000977 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 36106, number of used features: 12
[LightGBM] [Info] Start training from score 83.281799
Fold 0 ==> Voting Regressor oof median absolute error score is ==> 0.007393967212372843


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- y


In [27]:
voting_full = voting_regressor.fit(X, Y)

voting_pred_full = voting_full.predict(test.drop(columns = 'ID'))
voting_pred_cv = pd.DataFrame(vot_preds).apply(np.median, axis = 0)

voting = (voting_pred_full + voting_pred_cv) / 2
sub['y'] = voting

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001265 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3060
[LightGBM] [Info] Number of data points in the train set: 40118, number of used features: 12
[LightGBM] [Info] Start training from score 83.281563


ValueError: The feature names should match those that were passed during fit.
Feature names seen at fit time, yet now missing:
- y
