# Imports

In [None]:
import sys
sys.path.append('../')
from datetime import datetime
import pandas as pd

from run_all.main_preprocess import load_data, add_features
from utilities.utilities import get_latest_file

# Raise pandas' display limits: up to 500 rows / 500 columns, 1000 characters wide
for option_name, option_value in (
    ('display.max_rows', 500),
    ('display.max_columns', 500),
    ('display.width', 1000),
):
    pd.set_option(option_name, option_value)

# Preprocess

In [None]:
%%time
# Load the original data sources and combine them into a single DataFrame.
# load_data comes from run_all.main_preprocess; which sources it reads is not visible here.
df_dataset_WMO = load_data()

In [None]:
%%time
# Feature engineering: derive additional features from the loaded dataset.
# add_features comes from run_all.main_preprocess and receives the frame loaded above.
df_dataset_WMO_with_features = add_features(df_dataset_WMO)

## Optional: Write temporary result

In [None]:
# suffix_datetime = datetime.strftime(datetime.now(), format='%Y%m%d%H%M')

# df_dataset_WMO_with_features.to_parquet(f'../../data/df_preprocess_WMO_{suffix_datetime}.parquet.gzip',
#               compression='gzip')

# Train

## Optional: Load previous dataset

In [None]:
## Continue with loaded data from preprocess
#df = df_dataset_WMO_with_features.copy()

# ## HARDCODED
# datapath = '../../data/'
# filename = 'df_preprocess_WMO_202103211137.parquet.gzip'
# df = pd.read_parquet(datapath + filename)

# ## SELECT LAST FILE
# Ask get_latest_file for a parquet file in datapath whose name contains
# 'df_preprocess_' (presumably the most recent one — confirm in utilities.utilities).
datapath = '../../data/'
df = get_latest_file(filename_str_contains='df_preprocess_', datapath=datapath, filetype='parquet')

## Train model

### Train imports

In [None]:
# zorgen voor de juiste modules
import pandas as pd
import numpy as np
from datetime import datetime
import pickle

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, RepeatedKFold, GridSearchCV, cross_validate, KFold, cross_val_score
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, LassoCV, ElasticNet, BayesianRidge
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.svm import SVR
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestRegressor

from xgboost import XGBRegressor, XGBClassifier

from ColumnSelector import ColumnSelector

# Adjust how pandas renders DataFrames in the notebook
pd.set_option('display.max_rows', 500)       # show up to 500 rows
pd.set_option('display.max_columns', 500)    # show up to 500 columns
pd.set_option('display.width', 1000)         # display width in characters
pd.set_option('display.precision', 2)        # decimal precision of displayed values
pd.set_option('display.float_format', '{:.3f}'.format)  # render floats with 3 decimals

### Settings

In [None]:
## Dataframe parameters
# Location of the dataset
DF_LOCATION = 'C:/_NoBackup/Git/__JADS/WMO_execute_group_project/data/df_dataset_WMO.parquet.gzip'
# Location of all data
datapath = '../../data/'
# Loader used to read the dataset, e.g. read_parquet or read_csv
DF_READ = pd.read_parquet

## X & y parameters
# Columns that are removed from the X dataset: at least the y column, optionally more.
# X_DROP_VALUES = ['wmoclienten', 'eenpersoonshuishoudens', 'huishoudenszonderkinderen', 'huishoudensmetkinderen']
X_DROP_VALUES = ['wmoclienten', 'percentagewmoclienten','wmoclientenper1000inwoners','perioden']
# Column used as the y value
Y_VALUE = ['wmoclientenper1000inwoners']
# Test size for the train/test split
TEST_SIZE = 0.3
# Random state for the train/test split; a fixed seed (e.g. 42) keeps the split reproducible
RANDOM_STATE = 42

## Pipeline parameters
# Strategy and fill value for empty categorical columns
NAN_VALUES_CAT_STRATEGY = 'constant'
NAN_VALUES_CAT_VALUES = 'Missing'
# Strategy used to fill empty numeric columns, e.g. mean or median
NAN_VALUES_NUM_STRATEGY = 'mean'
# Optional explicit list of columns handed to ColumnSelector; None passes no selection
#COLS_SELECT = ['aantalinwoners', 'mannen', 'vrouwen', 'k0tot15jaar'
#               , 'k15tot25jaar', 'k25tot45jaar', 'k45tot65jaar', 'k65jaarofouder', 'gescheiden'
#               , 'verweduwd', 'westerstotaal', 'sterftetotaal', 'gemiddeldehuishoudensgrootte'
#               , 'gemiddeldewoningwaarde', 'koopwoningen', 'huurwoningentotaal', 'inbezitwoningcorporatie'
#               , 'gemiddeldinkomenperinkomensontvanger', 'k40personenmetlaagsteinkomen', 'k20personenmethoogsteinkomen'
#               , 'actieven1575jaar', 'k40huishoudensmetlaagsteinkomen', 'k20huishoudensmethoogsteinkomen'
#               , 'huishoudensmeteenlaaginkomen', 'personenpersoortuitkeringaow', 'rucultuurrecreatieoverigediensten'
#               , 'personenautosperhuishouden', 'matevanstedelijkheid']
COLS_SELECT = None

## Model parameters

# Cross-validation scheme used by the models, e.g. 10 or
# RepeatedKFold(n_splits=30, n_repeats=5, random_state=1)
CROSS_VALIDATE = 5
# Scoring metric used by the models
MODEL_SCORING = 'neg_mean_squared_error'

## Grid Search parameters

# Hyperparameter candidates used in the grid search
ALPHA = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
NEIGHBORS = [3, 5, 11, 19]
# NOTE(review): LinearRegression's `normalize` option was removed in
# scikit-learn 1.2 — confirm the installed version before tuning it.
NORMALIZE = [True, False]
# 'precomputed' is intentionally left out: that kernel expects a square,
# precomputed kernel matrix as input instead of the raw feature matrix.
KERNEL = ['linear', 'poly', 'rbf', 'sigmoid']
GAMMA = [0.5, 1, 1.5, 2, 5]
N_ESTIMATORS = [50,100,200]
# Was `0,1` (a comma typo producing the two values 0 and 1); C must be strictly positive
C_REGULARIZATION = [0.001, 0.01, 0.1, 1]

### Functions

In [None]:
# # functie maken om op basis van de cv scores, het beste RMSE model te selecteren 
# def get_best_model_rmsle(cv_scores):
#     """
#     Return best (most conservative) model from cross_validate object.
    
#     Uses np.argmax to find bottomright point == largest RMSE
#     """
#     index = np.argmax(np.sqrt(-cv_scores['train_neg_mean_squared_error']))
#     model = cv_scores['estimator'][index]
#     rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))
#     return (rmse)

In [None]:
# Helper that scores the best estimator found by a grid search with the RMSE
def rsme_from_gridsearch_best_estimator(grid_search, X=None, y=None):
    """
    Calculate the RMSE of the grid search's best estimator.

    Parameters:
        grid_search: fitted search object exposing ``best_estimator_``.
        X: features to predict on; when omitted the notebook-level
           ``X_test`` is used (the original behaviour).
        y: true target values; when omitted the notebook-level ``y_test``
           is used (the original behaviour).

    Returns:
        The square root of the mean squared error between ``y`` and the
        predictions of ``grid_search.best_estimator_`` on ``X``.
    """
    # Fall back to the global hold-out set so existing one-argument calls keep working
    features = X_test if X is None else X
    target = y_test if y is None else y
    predictions = grid_search.best_estimator_.predict(features)
    return np.sqrt(mean_squared_error(target, predictions))

In [None]:
def drop_nan_from_specific_columns(df, columns_to_check):
    """
    Drop, in place, every row of ``df`` that has a NaN in any of the given columns.

    Parameters:
        df: pandas DataFrame to clean; it is modified in place.
        columns_to_check: column labels that are inspected for NaN values.

    Returns:
        None. The rows are removed from ``df`` itself.
    """
    # `thresh` is deliberately not passed: recent pandas versions raise a
    # TypeError when `how` and `thresh` are supplied together, even as None.
    df.dropna(
        axis=0,
        how='any',
        subset=columns_to_check,
        inplace=True,
    )

### Load data

In [None]:
# ## Done before start of 'Train' chapter
# df = get_latest_file(mypath=datapath)

#### Stappen hieronder mogelijk verplaatsen naar prepare stap, later beoordelen

In [None]:
# def create_x_y_train_test(df, X_DROP_VALUES, Y_VALUE):
#     """
#     Creates X_train, X_test, y_train, y_test from pandas dataframe. 
#     Input:
#         A dataframe, 
#         List of X_values to drop
#         Y_value
#     """    
#     X = df.drop(X_DROP_VALUES, axis=1)
#     y = df[Y_VALUE]
#     X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = TEST_SIZE, random_state = RANDOM_STATE)
#     return X_train

In [None]:
#create_x_y_train_test(df,X_DROP_VALUES,Y_VALUE)

In [None]:
# (Re)load the preprocess dataset via get_latest_file so the steps below start from a fresh DataFrame
df = get_latest_file(filename_str_contains='df_preprocess_', datapath=datapath, filetype='parquet')

In [None]:
# Remove the rows whose target value (Y_VALUE) is empty; the helper modifies df in place.
drop_nan_from_specific_columns(df,Y_VALUE)

In [None]:
# Build the target y (the Y_VALUE column) and the feature frame X (everything except X_DROP_VALUES)
y = df[Y_VALUE]
X = df.drop(columns=X_DROP_VALUES)

In [None]:
# Split X and y into a train and a test part
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

In [None]:
# Separate the numeric from the categorical columns of X_train so each group can be transformed on its own
num_cols = X_train.select_dtypes(include=['int64', 'float64', 'float32', 'int32']).columns
cat_cols = X_train.select_dtypes(include='category').columns

### Pipelines

In [None]:
# pipelines (pl) maken voor imputing, scaling en OneHotEncoding per datatype 

# Register the placeholder used for missing values (NAN_VALUES_CAT_VALUES) as an
# explicit category of every categorical column; the category lists collected
# below are handed to the OneHotEncoder.
# Work on an explicit copy so the column assignments below do not write into a
# frame pandas may still treat as a slice of the frame it was split from.
X_train = X_train.copy()
for col in cat_cols:
    # `add_categories(..., inplace=True)` is gone in pandas 2.0, so assign the
    # result back instead. The membership check keeps the cell re-runnable:
    # adding an already existing category raises a ValueError.
    if NAN_VALUES_CAT_VALUES not in X_train[col].cat.categories:
        X_train[col] = X_train[col].cat.add_categories(NAN_VALUES_CAT_VALUES)
categories = [X_train[col].cat.categories for col in cat_cols]

# Preprocessing pipeline for the categorical columns:
# fill empty values with the configured constant, then one-hot encode
pl_ppc_cat = make_pipeline(
    SimpleImputer(
        missing_values=np.nan,
        strategy=NAN_VALUES_CAT_STRATEGY,
        fill_value=NAN_VALUES_CAT_VALUES,
    ),
    OneHotEncoder(categories=categories),
)

# Preprocessing pipeline for the numeric columns:
# select columns, fill empty values with the configured strategy, then standardize
pl_ppc_num = make_pipeline(
    ColumnSelector(cols=COLS_SELECT),
    SimpleImputer(
        missing_values=np.nan,
        strategy=NAN_VALUES_NUM_STRATEGY,
    ),
    StandardScaler(),
    # PCA(),  # PCA has quite some (positive) influence on the scores
)

In [None]:
# Combine both preprocessing pipelines: categorical columns go through pl_ppc_cat,
# numeric columns through pl_ppc_num, and every other column is dropped
pl_ppc_total = make_column_transformer(
    (pl_ppc_cat, cat_cols),
    (pl_ppc_num, num_cols),
    remainder='drop',
)

## Gridsearch

In [None]:
# Pipeline used inside the grid search; the 'clf' step is only a placeholder
# that each entry of the parameter grid replaces with its own estimator
pl_gs_total = Pipeline([('preprocess', pl_ppc_total),
                        ('clf', LinearRegression())])  # Placeholder Estimator

# Parameter grid holding every candidate estimator with its hyperparameters.
# Add new estimators (models) and their parameters here.
param_grid_total = [
    # `clf__normalize` is no longer tuned: the option was removed from
    # LinearRegression in scikit-learn 1.2, where it makes the whole search
    # fail with an invalid-parameter error. The numeric features are already
    # standardized by the StandardScaler in the preprocess step.
    {'clf': [LinearRegression()]},

    {'clf': [Ridge()],
     'clf__alpha': ALPHA},

    {'clf': [Lasso()],
     'clf__alpha': ALPHA},

    {'clf': [KNeighborsRegressor()],
     'clf__n_neighbors': NEIGHBORS},

    {'clf': [SVR()],
     'clf__kernel': KERNEL,
     'clf__C': C_REGULARIZATION},

    {'clf': [XGBRegressor()],
     'clf__gamma': GAMMA,
     'clf__n_estimators': N_ESTIMATORS},
]

# Create the grid search over all candidates
grid_search_total = GridSearchCV(pl_gs_total, param_grid_total, cv=CROSS_VALIDATE,
                                 scoring=MODEL_SCORING,
                                 return_train_score=True)

In [None]:
%%time
# Run the grid search on the training data
grid_search_total.fit(X_train, y_train)

### Evaluate

In [None]:
# Put the cross-validation results in a DataFrame ordered by rank_test_score and show the top 5
pd_grid_search = (
    pd.DataFrame(grid_search_total.cv_results_)
    .sort_values('rank_test_score', ascending=True)
)
pd_grid_search.head(5)

In [None]:
# Print the 'clf' step of the best estimator found by the grid search.
# NOTE(review): GridSearchCV picks best_estimator_ on the cross-validated score,
# so the "train score" wording in the message below may be misleading — confirm.
print(f"Het model met de beste train score is:\n{grid_search_total.best_estimator_['clf']}")
# Compute the RMSE of the best estimator (the helper predicts on X_test and compares with y_test)
print(f"Dit model heeft een RMSE van {rsme_from_gridsearch_best_estimator(grid_search_total)}")

### Save model

In [None]:
# Display the best estimator (the full pipeline) found by the grid search
grid_search_total.best_estimator_

In [None]:
# Persist the best pipeline under a timestamped name (YYYYmmddHHMM) in datapath,
# the same folder the load step reads 'best_model_' files from
suffix_datetime = datetime.now().strftime('%Y%m%d%H%M')
filename = f'{datapath}best_model_{suffix_datetime}.pickle'
# The context manager guarantees the file is flushed and closed, also when dumping fails
with open(filename, 'wb') as model_file:
    pickle.dump(grid_search_total.best_estimator_, model_file)

In [None]:
# Load a pickled model from datapath via get_latest_file (presumably the most recently saved one — confirm)
loaded_model = get_latest_file(filename_str_contains='best_model_', datapath=datapath, filetype='pickle')
# Open question: how should this score be interpreted?
# NOTE(review): for scikit-learn regressors `score` returns R^2 (1.0 is a perfect fit);
# presumably that applies to the loaded pipeline — confirm against the model type.
result = loaded_model.score(X_test, y_test)
print(result)

# Predictions of the loaded model on the test set (displayed as cell output)
loaded_model.predict(X_test)

In [None]:
# Sanity check: predictions of the in-memory best estimator, to compare by eye
# with the predictions of the model loaded from the pickle file
grid_search_total.best_estimator_.predict(X_test)

# OUDE CODE

In [None]:
# models_gs_dict = {'Linear Regression': 'grid_search_lr',
#                   'Ridge Regression': 'grid_search_rr',
#                   'Lasso': 'grid_search_lasso',
#                   'K Nearest Neighbor': 'grid_search_knn',
#                   'Support Vector Machines': 'grid_search_svm',
#                   'XGBoost': 'grid_search_xgb',
#                  }

In [None]:
# # nog aan te passen
# def append_gridsearch_scores(models_gs_dict):
#     for key,item in models_gs_dict.items():
#         gridsearch_rsme_scores.append((key,  rsme_from_gridsearch_best_estimator(item)))

In [None]:
# toevoegen van de beste scores, per model, aan een lijst
#gridsearch_rsme_scores = []
#gridsearch_rsme_scores.append(('Linear Regression',  rsme_from_gridsearch_best_estimator(grid_search_lr)))
#gridsearch_rsme_scores.append(('Ridge Regression',  rsme_from_gridsearch_best_estimator(grid_search_rr)))
#gridsearch_rsme_scores.append(('Lasso', rsme_from_gridsearch_best_estimator(grid_search_lasso)))
#gridsearch_rsme_scores.append(('K Nearest Neighbor', rsme_from_gridsearch_best_estimator(grid_search_knn)))
#gridsearch_rsme_scores.append(('Support Vector Machines', rsme_from_gridsearch_best_estimator(grid_search_svm)))
#gridsearch_rsme_scores.append(('XGBoost', rsme_from_gridsearch_best_estimator(grid_search_xgb)))

In [None]:
# # maken van een dataframe met daarin de gesorteerde scores per model
# #gridsearch_rsme_scores = []
# #append_gridsearch_scores(models_gs_dict)
# gridsearch_rsme_scores = pd.DataFrame(gridsearch_rsme_scores)
# gridsearch_rsme_scores.columns = ['Algorithm', 'RMSE'] 
# gridsearch_rsme_scores['RMSE'] = gridsearch_rsme_scores['RMSE'].map('{:,.10f}'.format)
# gridsearch_rsme_scores = gridsearch_rsme_scores.sort_values('RMSE', ascending=True)
# gridsearch_rsme_scores = gridsearch_rsme_scores.reset_index(drop=True)
# gridsearch_rsme_scores

In [None]:
# # and the winner is...
# print(f"Het algoritme met de laagste RMSE is:\n\n{gridsearch_rsme_scores.iloc[0,0]}, met een RMSE van {gridsearch_rsme_scores.iloc[0,1]}")

### Save model (deze moet nog aangepast worden naar Grid Search)

In [None]:
# # Temporary: select best model (needs function)
# BEST_SCORE = lasso_scores
# index = np.argmax(np.sqrt(-BEST_SCORE['train_neg_mean_squared_error']))
# model = BEST_SCORE['estimator'][index]

In [None]:
# suffix_datetime = datetime.strftime(datetime.now(), format='%Y%m%d%H%M')
# filename = f'../../data/best_model_{suffix_datetime}.pickle'

# pickle.dump(model, open(filename, 'wb'))

# joblib.dump(grid.best_estimator_,

#### Test load model (deze moet nog aangepast worden naar Grid Search)

In [None]:
# loaded_model = get_latest_file(filename_str_contains='best_model_', datapath=datapath, filetype='pickle')

In [None]:
# result = loaded_model.score(X_test, y_test)
# print(result)

In [None]:
# loaded_model.predict(X_test)

#### Extra code, nog opschonen

In [None]:
#grid_search.best_params_

In [None]:
#pd_grid_search = pd.DataFrame(grid_search.cv_results_)
#pd_grid_search.head(100)

In [None]:
#grid_search.best_score_

In [None]:
#grid_search.best_estimator_

In [None]:
#final_mse = np.sqrt(mean_squared_error(y_test, grid_search.best_estimator_.predict(X_test)))
#print(f"{final_mse:.10f}")

## Train pipelines maken

In [None]:
# # pipeline maken voor LinearRegression 
# pl_lr = make_pipeline(
#      pl_ppc_total
#     ,LinearRegression()
# )

In [None]:
# # pipeline maken voor RidgeRegression 
# pl_rr = make_pipeline(
#      pl_ppc_total
#     ,Ridge()
# )

In [None]:
# # pipeline maken voor Lasso
# pl_lasso = make_pipeline(
#       pl_ppc_total
#      ,Lasso(alpha=0.001)
# )

In [None]:
# # pipeline maken voor KNN
# pl_knn = make_pipeline(
#       pl_ppc_total
#      ,KNeighborsRegressor()
# )

In [None]:
# # pipeline maken voor SVR
# pl_svm = make_pipeline(
#       pl_ppc_total
#      ,SVR()
# )

In [None]:
# # pipeline maken voor XGB
# pl_xgb = make_pipeline(
#       pl_ppc_total
#      ,XGBRegressor()
# ) 

### Train

##### Linear Regression

In [None]:
# # scores voor LR berekenen
# lr_scores = cross_validate(
#     pl_lr, X_train, y_train,
#     cv = CROSS_VALIDATE,
#     scoring=([MODEL_SCORING]),
#     return_train_score=True,
#     return_estimator=True,
# )

##### Ridge Regression

In [None]:
# rr_scores = cross_validate(
#     pl_rr, X_train, y_train,
#     cv = CROSS_VALIDATE,
#     scoring = ([MODEL_SCORING]),
#     return_train_score=True,
#     return_estimator=True,
# )

##### Lasso

In [None]:
# lasso_scores = cross_validate(
#     pl_lasso, X_train, y_train,
#     cv = CROSS_VALIDATE,
#     scoring = ([MODEL_SCORING]),
#     return_train_score=True,
#     return_estimator=True,
# )

##### K Nearest Neighbor

In [None]:
# knn_scores = cross_validate(
#     pl_knn, X_train, y_train,
#     cv = CROSS_VALIDATE,
#     scoring = ([MODEL_SCORING]),
#     return_train_score=True,
#     return_estimator=True,
# )

##### Support Vector Machines

In [None]:
# svm_scores = cross_validate(
#     pl_svm, X_train, y_train,
#     cv = CROSS_VALIDATE,
#     scoring = ([MODEL_SCORING]),
#     return_train_score=True,
#     return_estimator=True,
# )

##### XGBoost

In [None]:
# xgb_scores = cross_validate(
#     pl_xgb, X_train, y_train,
#     cv = CROSS_VALIDATE,
#     scoring = ([MODEL_SCORING]),
#     return_train_score=True,
#     return_estimator=True,
# )

### Evaluate

#### Calculate scores

In [None]:
# # toevoegen van de scores van de best estimator, per gridsearch model, aan een lijst
# scores = []
# scores.append(('Linear Regression',  get_best_model_rmsle(lr_scores)))
# scores.append(('Ridge Regression',  get_best_model_rmsle(rr_scores)))
# scores.append(('Lasso', get_best_model_rmsle(lasso_scores)))
# scores.append(('K Nearest Neighbor', get_best_model_rmsle(knn_scores)))
# scores.append(('Support Vector Machines', get_best_model_rmsle(svm_scores)))
# scores.append(('XGBoost', get_best_model_rmsle(xgb_scores)))

In [None]:
# scores.append(('Lasso', get_best_model_rmsle(lasso_scores)))

In [None]:
# # maken van een dataframe met daarin de gesorteerde scores per model
# scores = pd.DataFrame(scores)
# scores.columns = ['Algorithm', 'RMSE'] 
# scores['RMSE'] = scores['RMSE'].map('{:,.10f}'.format)
# scores = scores.sort_values('RMSE', ascending=True)
# scores = scores.reset_index(drop=True)
# scores

In [None]:
# # and the winner is...
# print(f"Het algoritme met de laagste RMSE is:\n\n{scores.iloc[0,0]}, met een RMSE van {scores.iloc[0,1]}")

### Save model

In [None]:
# # Temporary: select best model (needs function)
# BEST_SCORE = lasso_scores
# index = np.argmax(np.sqrt(-BEST_SCORE['train_neg_mean_squared_error']))
# model = BEST_SCORE['estimator'][index]

In [None]:
# suffix_datetime = datetime.strftime(datetime.now(), format='%Y%m%d%H%M')
# filename = f'../../data/best_model_{suffix_datetime}.pickle'

# pickle.dump(model, open(filename, 'wb'))

#### Test load model

In [None]:
# loaded_model = get_latest_file(filename_str_contains='best_model_', datapath=datapath, filetype='pickle')

In [None]:
# result = loaded_model.score(X_test, y_test)
# print(result)

In [None]:
# loaded_model.predict(X_test)

In [None]:
# # pipeline maken voor Linear Regression
# pl_gs_lr = Pipeline(
#      [('preprocess', pl_ppc_total),
#      ('model_lr',LinearRegression()) 
#      ]
# )

# # pipeline maken voor RidgeRegression
# pl_gs_rr = Pipeline(
#      [('preprocess', pl_ppc_total),
#      ('model_rr',Ridge()) 
#      ]
# )

# # pipeline maken voor Lasso
# pl_gs_lasso = Pipeline(
#      [('preprocess', pl_ppc_total),
#        ('model_lasso',Lasso()) 
#      ]
# )

# # pipeline maken voor KNN
# pl_gs_knn = Pipeline(
#      [('preprocess', pl_ppc_total),
#        ('model_knn', KNeighborsRegressor()) 
#      ]
# )

# # pipeline maken voor SVM
# pl_gs_svm = Pipeline(
#      [('preprocess', pl_ppc_total),
#        ('model_svm', SVR()) 
#      ]
# )

# # pipeline maken voor XGB
# pl_gs_xgb = Pipeline(
#      [('preprocess', pl_ppc_total),
#        ('model_xgb',XGBRegressor()) 
#      ]
# )

In [None]:
# # Parameters Linear Regression
# param_grid_lr = [
#     {'model_lr__normalize': NORMALIZE},
#     ]

# # Parameters RidgeRegression
# param_grid_rr = [
#      {'model_rr__alpha': ALPHA},  
#     ]

# # Parameters Lasso
# param_grid_lasso = [
#      {'model_lasso__alpha': ALPHA},  
#     ]

# # Parameters KNneighbors
# param_grid_knn = [
#      {'model_knn__n_neighbors': NEIGHBORS },  
#     ]

# # Parameters SVM
# param_grid_svm = [
#      {'model_svm__kernel': KERNEL },  
#     ]

# # Parameters XGB
# param_grid_xgb = [
#      {'model_xgb__gamma': GAMMA},  
#     ]

In [None]:
# %%time
# # Gridsearch LR 
# grid_search_lr = GridSearchCV(pl_gs_lr, param_grid_lr, cv=CROSS_VALIDATE,
#                            scoring=MODEL_SCORING,
#                            return_train_score=True)
# grid_search_lr.fit(X_train, y_train)

In [None]:
# %%time
# # Gridsearch RR 
# grid_search_rr = GridSearchCV(pl_gs_rr, param_grid_rr, cv=CROSS_VALIDATE,
#                            scoring=MODEL_SCORING,
#                            return_train_score=True)
# grid_search_rr.fit(X_train, y_train)

In [None]:
# %%time
# # Gridsearch Lasso 
# grid_search_lasso = GridSearchCV(pl_gs_lasso, param_grid_lasso, cv=CROSS_VALIDATE,
#                            scoring=MODEL_SCORING,
#                            return_train_score=True)
# grid_search_lasso.fit(X_train, y_train)

In [None]:
# %%time
# # Gridsearch KNN 
# grid_search_knn = GridSearchCV(pl_gs_knn, param_grid_knn, cv=CROSS_VALIDATE,
#                            scoring=MODEL_SCORING,
#                            return_train_score=True)
# grid_search_knn.fit(X_train, y_train)

In [None]:
# %%time
# # Gridsearch SVM 
# grid_search_svm = GridSearchCV(pl_gs_svm, param_grid_svm, cv=CROSS_VALIDATE,
#                            scoring=MODEL_SCORING,
#                            return_train_score=True)
# grid_search_svm.fit(X_train, y_train)

In [None]:
# %%time
# # Gridsearch XGB
# grid_search_xgb = GridSearchCV(pl_gs_xgb, param_grid_xgb, cv=CROSS_VALIDATE,
#                            scoring=MODEL_SCORING,
#                            return_train_score=True)
# grid_search_xgb.fit(X_train, y_train)