# Try stacking for V75

In [7]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier,Pool,cv,utils 
from sklearn.impute import SimpleImputer

In [8]:
### return a CatBoost model with some default parameters
def get_model(d=6,l2=2,iterations=3000,use_best=True,verbose=False):
    model = CatBoostClassifier(iterations=iterations,use_best_model=use_best, 
        custom_metric=['Logloss', 'AUC','Recall', 'Precision', 'F1', 'Accuracy'],

        eval_metric='Accuracy', 
        depth=d,l2_leaf_reg=l2,
        auto_class_weights='Balanced',verbose=verbose, random_state=2021) 
    return model                

In [9]:
### Features som inte används vid träning
def remove_features(df,remove_mer=[]):
    #remove_mer=['h5_perf','h5_auto','h4_perf','h4_auto', 'h3_perf', 'h2_perf']
    df.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1,inplace=True) #
    if remove_mer:
        df.drop(remove_mer,axis=1,inplace=True)
    
    # df=check_unique(df.copy())
    # df=check_corr(df.copy())
    return df

In [10]:
 ## byt ut alla NaN till text för cat_features
def replace_NaN(X_train,X_test=None, cat_features=[]):
    # print('cat_features',cat_features)
    for c in cat_features:
        # print(c)
        X_train.loc[X_train[c].isna(),c] = 'missing'       ### byt ut None-värden till texten 'Missing'
        if X_test is not None:  ## om X_test är med
            X_test.loc [X_test[c].isna(),c] = 'missing'    ### byt ut None-värden till texten 'Missing'
    
    return X_train, X_test

In [11]:
### läs in data och returnera df, alla datum samt index till split-punkt
def load_data(proc=0.75):
    
    df = pd.read_csv('..\\all_data.csv')     
    alla_datum = list(df.datum.unique())
    split_ix = int(len(alla_datum)*proc)
    
    return df,alla_datum,split_ix

In [12]:
def remove_not_used_features(df):
    df.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1,inplace=True) 
    return df

In [13]:
df,alla_datum,split_ix = load_data() 
df = remove_not_used_features(df.copy())
CAT_FEATURES=['datum', 'bana', 'häst', 'kusk', 'kön',
        'h1_kusk', 'h1_bana',
        'h2_kusk', 'h2_bana', 
        'h3_kusk',  'h3_bana', 
        'h4_kusk', 'h4_bana', 
        'h5_kusk', 'h5_bana',]

NUM_FEATURES=[item for item in df.columns if item not in CAT_FEATURES and item !='plac']

PLAC_MEAN=df.plac.mean()
PLAC_MEAN

9.208773316093193

In [14]:
# den hittade inget, kanske skall testa igen längre fram
def remove_low_variance_features(df):
    from sklearn.feature_selection import VarianceThreshold
    print(df.shape)
    selection = VarianceThreshold(threshold=(0.1))
    X=selection.fit_transform(df)
    print(X.shape)
    return X

## Functions that are doing the transformations

In [15]:
# fill missing values in categorical features
def impute_cat_features(df, cat_features):
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df[cat_features]=imp1.fit_transform(df[cat_features])  # replae NaN's with 'missing'
    return df

In [16]:
# Set a smooth mean value to the features in df
def calc_smooth_mean(df, by, y, m=300, tot_mean=PLAC_MEAN):

    # Compute the number of values and the mean of each group
    agg = df.groupby(by)[y].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + m * tot_mean) / (counts + m)

    # Replace each value by the according smoothed mean
    return df[by].map(smooth)


In [17]:
# Handle h1-h5_bana
def transform_hx_bana(df,hx,the_map):
    from sklearn.impute import SimpleImputer
    df[hx] = df[hx].str.lower()
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df[hx]=imp1.fit_transform(df[[hx]])  # replae NaN's with 'missing'

    df[hx] = [item[0] for item in df[hx].str.split('-')]  # remove '-10' from 'solvalla-10' etc
    
    df[hx]=df[hx].map(the_map)  # transform column to numeric by mapping
    # after mapping we get new NaN's - now impute 0
    imp2 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
    df[hx] = imp2.fit_transform(df[[hx]])
    return df
    

In [18]:

# Handle bana and hx_bana  
def transf_bana(df):
    df['bana'] = df.bana.str.lower()
    the_map = df.bana.value_counts() 
    the_map['missing']=0    

    df=transform_hx_bana(df,'h1_bana',the_map)
    df=transform_hx_bana(df,'h2_bana',the_map)
    df=transform_hx_bana(df,'h3_bana',the_map)
    df=transform_hx_bana(df,'h4_bana',the_map)
    df=transform_hx_bana(df,'h5_bana',the_map)

    df['bana']=df.bana.map(the_map)  # transform column to numeric by mapping 
    if df[['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]].isna().sum().sum() != 0:
        print('bana NaNs not 0:',df[['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]].isna().sum())
    
    df.drop(['bana','h1_bana','h2_bana','h3_bana','h4_bana','h5_bana'],axis=1,inplace=True)
    return df


In [19]:
# Handle häst and kusk 
def transf_kusk_häst(df,pref='',m=50,):
    df[pref+'ekipage'] = df[pref+'kusk'].str.cat(df['häst'], sep =", ")  # concatenate 'häst' and 'kusk' into one column
    df[pref+'ekipage'] = calc_smooth_mean(df, by=pref+'ekipage', y='plac',m=50) # make numeric with Target encoding with smooth mean
    df.drop([pref+'kusk'],axis=1,inplace=True)
    return df

In [20]:
# Handle kön  
def transf_kön(df):
    from sklearn.preprocessing import OneHotEncoder
    df['kön'] = df['kön'].str.lower()
    ohe = OneHotEncoder(sparse=False)
    dftemp=pd.DataFrame(ohe.fit_transform(df[['kön']]),columns=['kön_h','kön_s','kön_v'] )  # replae kön with One Hot Encoding
    df=pd.concat([df,dftemp],axis=1)

    # check that kön is correct encoded
    if len(df.loc[(df.kön=='h') & (df.kön_h != 1),'kön']):
        assert False, 'Felaktigt kön h'
    if len(df.loc[(df.kön=='s') & (df.kön_s != 1),'kön']):
       assert False, 'Felaktigt kön s'
    if len(df.loc[(df.kön=='v') & (df.kön_v != 1),'kön']):
        assert False, 'Felaktigt kön v'
    df.drop(['kön'],axis=1,inplace=True)
    return df

In [21]:

def impute_all_numeric_NaNs(df):
    # all features must be numeric
    from sklearn.impute import SimpleImputer
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=-1)
    trdf=imp1.fit_transform(df)  # replae NaN's with 'missing'
    return pd.DataFrame(trdf,columns=df.columns)

## All the transformations in one function

In [22]:

def transf_all(df):
    
    trdf=transf_bana(df.copy())
    trdf=transf_kusk_häst(trdf)
    trdf=transf_kusk_häst(trdf,pref='h1_')
    trdf=transf_kusk_häst(trdf,pref='h2_')
    trdf=transf_kusk_häst(trdf,pref='h3_')
    trdf=transf_kusk_häst(trdf,pref='h4_')
    trdf=transf_kusk_häst(trdf,pref='h5_')
    trdf.drop(['häst'],axis=1,inplace=True)
    trdf=transf_kön(trdf)
    trdf['datum']=pd.to_datetime(trdf.datum).view(float)*10e210
    
    return impute_all_numeric_NaNs(trdf)

In [23]:

# transform all categoricals and impute all NaNs
def prepare_all(df):
    trdf = transf_all(df)
    
    y = (trdf.plac==1) * 1
    trdf = trdf.drop('plac',axis=1)
    
    # all features are now numeric
    trdf = impute_all_numeric_NaNs(trdf)
    if trdf.isna().sum().sum() != 0:
        print('still NaNs in data')
        assert False
    return trdf,y

## stacking prepare and run

In [24]:
# metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# for tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

### CatBoost

In [25]:
#catBoost preprocessing
def catB_preprocess(df):
        y = (df.plac==1) * 1
        df = df.drop('plac',axis=1)
        df = impute_cat_features(df,cat_features=CAT_FEATURES)

        return df,y


In [26]:

# clean the cat_features
df_catb, y = catB_preprocess(df.copy())
df_catb[CAT_FEATURES].isna().sum().sum()

0

In [27]:
trdf,y=prepare_all(df)
scorer = make_scorer(roc_auc_score)

In [28]:
# CatBoost model GridSearchCV
my_df_1=df_catb             # catboost with Nans abd cat_features
my_cats_1 = CAT_FEATURES
my_df_2 = trdf              # dataset common for all estimators
my_cats_2 = []

my_df = my_df_2
my_cats = my_cats_2
my_pool = Pool(my_df,y,cat_features=my_cats)
my_catb = CatBoostClassifier(cat_features=my_cats)

tscv = TimeSeriesSplit(n_splits=5)
params = {'iterations': [50,100,500,1000],
          'depth': [2,3,4, 5, 6],
          'loss_function': ['Logloss'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
          'eval_metric': ['AUC'],
        #   'use_best_model': ['True'],
          'logging_level':['Silent'],
          'random_seed': [2021],
         }
# clf.fit(df_catb,y)

catb_grid = RandomizedSearchCV(estimator=my_catb, param_distributions=params, scoring=scorer, cv=tscv)

# GridSearchCV  - compare with default
catb_grid.fit(my_df,y)


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=<catboost.core.CatBoostClassifier object at 0x0000021A7D9DF250>,
                   param_distributions={'depth': [2, 3, 4, 5, 6],
                                        'eval_metric': ['AUC'],
                                        'iterations': [50, 100, 500, 1000],
                                        'l2_leaf_reg': array([1.00000000e-20, 3.16227766e-20, 1.00000000e-19]),
                                        'leaf_estimation_iterations': [10],
                                        'logging_level': ['Silent'],
                                        'loss_function': ['Logloss'],
                                        'random_seed': [2021]},
                   scoring=make_scorer(roc_auc_score))

In [29]:
# get best estimator and params
best_catb = catb_grid.best_estimator_
print('best gridsearch',catb_grid.best_score_)
best_param = catb_grid.best_params_
best_param

best gridsearch 0.7878977269333003


{'random_seed': 2021,
 'loss_function': 'Logloss',
 'logging_level': 'Silent',
 'leaf_estimation_iterations': 10,
 'l2_leaf_reg': 3.162277660168379e-20,
 'iterations': 1000,
 'eval_metric': 'AUC',
 'depth': 4}

In [30]:
# print(best_catb.fit(my_df,y).best_score_)
best_catb.get_feature_importance(prettified=True).head(30)

Unnamed: 0,Feature Id,Importances
0,h5_ekipage,15.772651
1,h1_ekipage,14.77255
2,h4_ekipage,13.970035
3,h2_ekipage,13.833793
4,h3_ekipage,13.367222
5,ekipage,10.177373
6,streck,5.888591
7,senast,1.28631
8,h2_kmtid,0.797085
9,kr,0.671566


### XGBoost

In [63]:
# XGBoost model 
import xgboost as xgb
label = y
dtrain = xgb.DMatrix(trdf, label=label)
param = {'max_depth':2, 'eta':1 }
num_round = 10

# GridSearchCV
params = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.09,0.1,0.15], #so called `eta` value
              'max_depth': [7,8,9],
              'min_child_weight': [9,10,11],
              'use_label_encoder':[False],
            #   'silent': [1],
              'eval_metric': ['logloss'],
              'subsample': [0.5,0.9,1.0],
              'colsample_bytree': [0.7, 0.9, 1.0],
              'n_estimators': [7,8,9], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [2021],
              }

xgb_clf = xgb.XGBClassifier(num_round=num_round)
xgb_grid = GridSearchCV(estimator=xgb_clf, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)
xgb_grid.fit(trdf, y )

Parameters: { num_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.




GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
             estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None, gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight...
             n_jobs=3,
             param_grid={'colsample_bytree': [0.7, 0.9, 1.0],
                         'eval_metric': ['logloss'],
                         'learning_rate': [0.09, 0.1, 0.15],
                         'max_depth': [7, 8, 9],
                         'min_child_weight': [9, 10, 11], 'missing': [-999],
                         'n_estim

In [64]:
# get best estimator and params
best_xgb = xgb_grid.best_estimator_
print('best gridsearch', xgb_grid.best_score_)
best_param = xgb_grid.best_params_
best_param

best gridsearch 0.7553428398502139


{'colsample_bytree': 1.0,
 'eval_metric': 'logloss',
 'learning_rate': 0.15,
 'max_depth': 9,
 'min_child_weight': 9,
 'missing': -999,
 'n_estimators': 9,
 'nthread': 4,
 'objective': 'binary:logistic',
 'seed': 2021,
 'subsample': 1.0,
 'use_label_encoder': False}

### ExtraTree

In [66]:
# ExtraTree  model
tscv = TimeSeriesSplit()
from sklearn.tree import ExtraTreeClassifier
et = ExtraTreeClassifier(min_samples_split=2, random_state=2021,class_weight='balanced')

# GridSearchCV
params = {'class_weight': [ 'balanced'  ,None], 
          'max_depth': [None, 5, 10  ,15 ,20],
          'min_samples_leaf': [1, 2 ,3, 4,],
          'min_samples_split': [2,30, 30,  40  ,45],   
          'criterion': [ 'gini'  ,'entropy'],   
          'splitter': ['random',  'best']
         }

et_grid = GridSearchCV(estimator=et, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)
et_grid.fit(trdf, y)

GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
             estimator=ExtraTreeClassifier(class_weight='balanced',
                                           random_state=2021),
             n_jobs=3,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': [None, 5, 10, 15, 20],
                         'min_samples_leaf': [1, 2, 3, 4],
                         'min_samples_split': [2, 30, 30, 40, 45],
                         'splitter': ['random', 'best']},
             scoring=make_scorer(roc_auc_score))

In [67]:
# get best estimator and params
best_et = et_grid.best_estimator_
print('best gridsearch', et_grid.best_score_)
best_param = et_grid.best_params_
best_param

best gridsearch 0.7650589419071162


{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 5,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'splitter': 'best'}

### Ridge

In [None]:
# Ridge model
tscv = TimeSeriesSplit()
from sklearn.linear_model import RidgeClassifier
ridge = RidgeClassifier(normalize=False )

# GridSearchCV
params = {'class_weight': ['balanced',None], 
          'alpha': [0.5,1.0,2.0 ],      
         }

ridge_grid = GridSearchCV(estimator=ridge, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# GridSearchCV  
ridge_grid.fit(trdf, y)

In [319]:
# get best estimator and params
best_ridge = ridge_grid.best_estimator_
print('best gridsearch', ridge_grid.best_score_)
best_param = ridge_grid.best_params_
best_param

best gridsearch 0.18166847877031872


{'alpha': 1.0, 'class_weight': 'balanced'}

### KNN

In [68]:
# KNN model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=3, )

# GridSearchCV

tscv = TimeSeriesSplit()
params = {'n_neighbors': [10,15,20],
          
         }

knn_grid = GridSearchCV(estimator=knn, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# GridSearchCV  
knn_grid.fit(trdf, y)

GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
             estimator=KNeighborsClassifier(n_jobs=3), n_jobs=3,
             param_grid={'n_neighbors': [10, 15, 20]},
             scoring=make_scorer(roc_auc_score))

In [69]:
best_knn = knn_grid.best_estimator_
print('best gridsearch',knn_grid.best_score_)
best_param = knn_grid.best_params_
best_param

best gridsearch 0.5019462640497194


{'n_neighbors': 15}

### RandomForrest

In [70]:
# GridSearch
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

tscv = TimeSeriesSplit()
params = {'n_estimators': [5,10,100],
          'max_depth': [4, 5, 6, None],
          'class_weight': ['balanced'],
        #   'loss_function': ['Logloss'],
        #   'eval_metric': ['F1'],
        #   'logging_level':['Silent'],
          'random_state': [2021],
         }
# clf.fit(df_catb,y)

rf_grid = GridSearchCV(estimator=rf, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# GridSearchCV  
rf_grid.fit(trdf, y)

GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
             estimator=RandomForestClassifier(), n_jobs=3,
             param_grid={'class_weight': ['balanced'],
                         'max_depth': [4, 5, 6, None],
                         'n_estimators': [5, 10, 100], 'random_state': [2021]},
             scoring=make_scorer(roc_auc_score))

In [71]:
best_rf = rf_grid.best_estimator_
print('best gridsearch',rf_grid.best_score_)
best_param = rf_grid.best_params_
best_param

best gridsearch 0.8209536438465564


{'class_weight': 'balanced',
 'max_depth': 6,
 'n_estimators': 100,
 'random_state': 2021}

In [None]:

pd.DataFrame(best_rf.feature_importances_,index=trdf.columns, columns=['importance']).sort_values(by='importance',ascending=False)

### SVC

In [None]:
# GridSearchCV
# from sklearn.svm import SVC
# svc = SVC(C=1.0, gamma='scale', tol=0.001, cache_size=200, class_weight='balanced', random_state=2021)

# tscv = TimeSeriesSplit()
# params = {'C': [1,2,3],
#           'gamma': ['scale','auto'],
#           'class_weight': ['balanced'],
#           'random_state': [2021],
#          }

# svc_grid = GridSearchCV(estimator=svc, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# svc_grid.fit(trdf, y)

In [None]:
# # get best estimator and params
# best_svc = svc_grid.best_estimator_
# print('best gridsearch', svc_grid.best_score_)
# best_param = svc_grid.best_params_
# best_param

## Stack'em

In [75]:
#
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegressionCV
base_models = [('xgb',best_xgb,),
               ('rf',best_rf,),
               ('catb', best_catb,),
              ('knn', best_knn, ),
              ('et', best_et)              # ger mycket sämre res
               # ('ridge', best_ridge, ) ,   # saknar predict_proba - usless!
            #    ('svc', best_svc, ),
               ]
meta_model = LogisticRegressionCV(class_weight='balanced')


In [76]:
def evaluate_model(model, X, y, scoring=scorer):
    print('scorer =',scorer)
    tscv = TimeSeriesSplit(n_splits=5)
    scores = cross_val_score(model, X, y, scoring=scoring, cv=tscv, verbose=1, n_jobs=3, error_score='raise')
    return scores

def Stacking(model_item, X_tr, y_tr, X_final, n_fold):
    model=model_item[1]
    print(model_item[0], end=' ')
    tscv = TimeSeriesSplit(n_splits=n_fold)
    # valid_pred=np.empty((X_valid.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    for n, (train_indices, test_indices) in enumerate(tscv.split(X_tr)):
        if n==0:
            the_first_set_len = len(train_indices) # the first set that cannot be used i timeSeries stacking
            
        X_train, X_test = X_tr.iloc[train_indices], X_tr.iloc[test_indices]
        y_train, y_test = y_tr.iloc[train_indices], y_tr.iloc[test_indices]
        print(n,end=' ')
        model.fit(X=X_train,y=y_train)
        train_pred=np.append(train_pred,model.predict_proba(X_test)[:,1])
    print(f'- final fit (the_first_set_len={the_first_set_len})' )
    model.fit(X=X_tr,y=y_tr) # fit on all data (except the final data)   
    valid_pred = model.predict_proba(X_final)[:,1]
    return model,valid_pred.reshape(-1,1), train_pred, the_first_set_len


In [77]:
split_ix = int(len(trdf)*.8)
train_X = trdf[trdf.index <  split_ix]
valid_X = trdf[trdf.index >= split_ix]
train_y = y[y.index <  split_ix]
valid_y = y[y.index >=  split_ix]
# test 2 models
valid_pred=[None] * len(base_models)
train_pred=[None] * len(base_models)
model=[None] * len(base_models)
for n, model_item in enumerate(base_models):
    model[n],valid_pred[n] ,train_pred[n], the_first_set_len = Stacking(model_item,n_fold=5, X_tr=train_X, y_tr= train_y, X_final=valid_X)
    train_pred[n]=pd.DataFrame(train_pred[n],columns=[model_item[0]])
    valid_pred[n]=pd.DataFrame(valid_pred[n],columns=[model_item[0]])
    
    scores=evaluate_model(model[n],train_X,train_y)
    print(f'mean={np.mean(scores)}: {scores}')
train_y = train_y.iloc[the_first_set_len:]      # remove the first set that can't be used in timeseries stacking
train_pred=pd.concat(train_pred,axis=1)
valid_pred=pd.concat(valid_pred,axis=1)


Parameters: { num_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { num_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { num_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { num_round } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed d

[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:    0.8s finished


mean=0.7387945920838975: [0.68150788 0.74626719 0.73562024 0.74204201 0.78853565]
rf 0 1 2 3 4 - final fit (the_first_set_len=5570)
scorer = make_scorer(roc_auc_score)


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:    4.6s finished


mean=0.8128112934243298: [0.7856774  0.80706858 0.81982219 0.82257813 0.82891016]
catb 0 1 2 3 4 - final fit (the_first_set_len=5570)
scorer = make_scorer(roc_auc_score)


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   20.8s finished


mean=0.7945946250953815: [0.75069027 0.77673468 0.82354298 0.78790108 0.83410412]
knn 0 1 2 3 4 - final fit (the_first_set_len=5570)
scorer = make_scorer(roc_auc_score)


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:   12.0s finished


mean=0.5023032279714801: [0.50494645 0.5038894  0.50277849 0.49990179 0.5       ]
et 0 1 2 3 4 - final fit (the_first_set_len=5570)
scorer = make_scorer(roc_auc_score)


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:    0.0s finished


mean=0.7557201976910901: [0.742866   0.72839845 0.75693172 0.77421062 0.77619419]


## Final estimation with the meta model

In [78]:
import time
meta_model.fit(train_pred,train_y)
scores=evaluate_model(meta_model,valid_pred,valid_y)
time.sleep(0.2)
print('models', list(valid_pred.columns))
print(f'mean={np.mean(scores)}: {scores}')

scorer = make_scorer(roc_auc_score)


[Parallel(n_jobs=3)]: Using backend LokyBackend with 3 concurrent workers.
[Parallel(n_jobs=3)]: Done   5 out of   5 | elapsed:    0.5s finished


models ['xgb', 'rf', 'catb', 'knn', 'et']
mean=0.8871988720787722: [0.85833371 0.90382673 0.89240001 0.8987632  0.88267071]


## rf, catb, knn, et - 0.88803
## xgb, rf, catb, knn, et - 0.88720