# Try stacking for V75

In [38]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier,Pool,cv,utils 
from sklearn.impute import SimpleImputer

In [39]:
def get_model(d=6, l2=2, iterations=3000, use_best=True, verbose=False):
    """Return an unfitted CatBoostClassifier configured with this notebook's defaults.

    Parameters
    ----------
    d : tree depth, passed as ``depth``.
    l2 : L2 leaf regularisation, passed as ``l2_leaf_reg``.
    iterations : number of boosting iterations.
    use_best : passed as ``use_best_model``.
    verbose : passed through to CatBoost unchanged.

    The evaluation metric is fixed to Accuracy, class weights are 'Balanced'
    and the random state is fixed to 2021.
    """
    settings = dict(
        iterations=iterations,
        use_best_model=use_best,
        custom_metric=['Logloss', 'AUC', 'Recall', 'Precision', 'F1', 'Accuracy'],
        eval_metric='Accuracy',
        depth=d,
        l2_leaf_reg=l2,
        auto_class_weights='Balanced',
        verbose=verbose,
        random_state=2021,
    )
    return CatBoostClassifier(**settings)

In [40]:
def remove_features(df,remove_mer=[]):
    """Drop the columns that are not used for training and return the frame.

    The drop happens in place, so the caller's ``df`` is modified and the same
    object is returned.

    Parameters
    ----------
    df : DataFrame that contains all of the fixed columns listed below.
    remove_mer : optional list of additional column names to drop, e.g.
        ['h5_perf', 'h5_auto', 'h4_perf', 'h4_auto', 'h3_perf', 'h2_perf'].
    """
    always_unused = ['avd', 'startnr', 'vodds', 'podds', 'bins']
    always_unused += [f'h{i}_dat' for i in range(1, 6)]   # h1_dat .. h5_dat
    df.drop(columns=always_unused, inplace=True)

    # The extra columns are dropped in a second step, only when any were given.
    if remove_mer:
        df.drop(columns=remove_mer, inplace=True)

    return df

In [41]:
## replace every NaN in the cat_features columns with the text 'missing'
def replace_NaN(X_train,X_test=None, cat_features=[]):
    """Replace missing values in the categorical columns with the text 'missing'.

    Both frames are modified in place. ``X_test`` is optional; when it is None
    only ``X_train`` is processed and None is returned in its place.

    Returns the tuple ``(X_train, X_test)``.
    """
    frames = [X_train] if X_test is None else [X_train, X_test]
    for col in cat_features:
        for frame in frames:
            is_missing = frame[col].isna()
            frame.loc[is_missing, col] = 'missing'
    return X_train, X_test

In [42]:
def load_data(proc=0.75, path='../all_data.csv'):
    """Read the data set and compute the date-based split point.

    Parameters
    ----------
    proc : fraction of the unique dates that lie before the split point.
    path : location of the CSV file. The default uses a forward slash so the
        relative path resolves on Windows as well as on POSIX systems; the
        previous hard-coded ``'..\\all_data.csv'`` only worked on Windows.

    Returns
    -------
    (df, alla_datum, split_ix)
        df : the loaded DataFrame.
        alla_datum : list of the unique values of the ``datum`` column, in
            order of first appearance in the file.
        split_ix : ``int(len(alla_datum) * proc)``, an index into ``alla_datum``.
    """
    df = pd.read_csv(path)
    alla_datum = list(df.datum.unique())
    split_ix = int(len(alla_datum) * proc)

    return df, alla_datum, split_ix

In [43]:
def remove_not_used_features(df):
    """Drop, in place, the fixed set of columns that are never used as features.

    Returns the same (now modified) DataFrame object.
    """
    unused = (
        'avd', 'startnr', 'vodds', 'podds', 'bins',
        'h1_dat', 'h2_dat', 'h3_dat', 'h4_dat', 'h5_dat',
    )
    df.drop(columns=list(unused), inplace=True)
    return df

In [44]:
# Load the raw data, then drop the columns that are never used as features
# (the drop is applied to a copy of the loaded frame).
df,alla_datum,split_ix = load_data() 
df = remove_not_used_features(df.copy())
# Columns that are treated as categorical features.
CAT_FEATURES=['datum', 'bana', 'häst', 'kusk', 'kön',
        'h1_kusk', 'h1_bana',
        'h2_kusk', 'h2_bana', 
        'h3_kusk',  'h3_bana', 
        'h4_kusk', 'h4_bana', 
        'h5_kusk', 'h5_bana',]

# Every remaining column except the target 'plac' is counted as numeric.
NUM_FEATURES=[item for item in df.columns if item not in CAT_FEATURES and item !='plac']

# Mean of 'plac' over the whole data set; calc_smooth_mean uses it as the
# default value of its tot_mean parameter.
PLAC_MEAN=df.plac.mean()
PLAC_MEAN

9.208773316093193

In [45]:
# It found nothing to remove; maybe worth trying again later on.
def remove_low_variance_features(df):
    """Remove the features whose variance does not exceed 0.1.

    Prints the shape before and after the selection and returns the
    transformed data as produced by ``VarianceThreshold.fit_transform``.
    """
    from sklearn.feature_selection import VarianceThreshold
    print(df.shape)
    reduced = VarianceThreshold(threshold=0.1).fit_transform(df)
    print(reduced.shape)
    return reduced

## Functions that are doing the transformations

In [46]:
def impute_cat_features(df, cat_features):
    """Fill NaN in the categorical columns with the constant text 'missing'.

    The columns listed in ``cat_features`` are overwritten in ``df`` itself,
    and the same frame is returned.
    """
    filler = SimpleImputer(strategy='constant', fill_value='missing', missing_values=np.nan)
    filled = filler.fit_transform(df[cat_features])
    df[cat_features] = filled
    return df

In [47]:
def calc_smooth_mean(df, by, y, m=300, tot_mean=PLAC_MEAN):
    """Target-encode column ``by`` with a smoothed mean of column ``y``.

    For every group of ``by`` the encoded value is

        (count * group_mean + m * tot_mean) / (count + m)

    so groups with few rows are pulled towards ``tot_mean``.

    Parameters
    ----------
    df : DataFrame holding both columns.
    by : name of the column to encode.
    y : name of the target column.
    m : smoothing weight given to ``tot_mean``.
    tot_mean : the overall mean to shrink towards. The default is the value
        PLAC_MEAN had when this function was defined, not when it is called.

    Returns a Series aligned with ``df`` holding the smoothed mean of each
    row's group.
    """
    per_group = df.groupby(by)[y]
    n_rows = per_group.count()
    group_mean = per_group.mean()

    smoothed = (n_rows * group_mean + m * tot_mean) / (n_rows + m)

    return df[by].map(smoothed)


In [48]:
# Handle h1-h5_bana
def transform_hx_bana(df,hx,the_map):
    """Turn one track-history column into a numeric column via ``the_map``.

    Steps, all done with vectorised pandas operations:
      1. lower-case the text,
      2. replace missing values with 'missing',
      3. keep only the part before the first '-' ('solvalla-10' -> 'solvalla'),
      4. map the name through ``the_map``,
      5. replace names that have no entry in ``the_map`` with 0.

    Parameters
    ----------
    df : DataFrame; column ``hx`` is overwritten in place.
    hx : name of the column to transform.
    the_map : mapping (dict or Series) from track name to a number.

    Returns the same DataFrame, with ``df[hx]`` as a float column.
    """
    track = df[hx].str.lower().fillna('missing')
    track = track.str.split('-').str[0]     # remove '-10' from 'solvalla-10' etc

    # Names missing from the map come back as NaN; those become 0. The final
    # cast keeps the column float even when nothing had to be filled.
    df[hx] = track.map(the_map).fillna(0).astype(float)
    return df
    

In [49]:

# Handle bana and hx_bana
def transf_bana(df):
    """Encode 'bana' and 'h1_bana'..'h5_bana' by frequency, then drop them.

    'bana' is lower-cased and its value counts are used as the mapping
    (with an extra entry 'missing' -> 0). Each hx_bana column is transformed
    with transform_hx_bana using that mapping, and 'bana' itself is mapped too.
    A message is printed if any hx_bana column still contains NaN.

    NOTE(review): all six columns are dropped at the end, so none of the
    encoded values reach the returned frame; confirm this is intended.

    ``df`` is modified in place and returned.
    """
    df['bana'] = df.bana.str.lower()
    the_map = df.bana.value_counts()
    the_map['missing'] = 0

    hx_cols = ['h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']
    for hx in hx_cols:
        df = transform_hx_bana(df, hx, the_map)

    df['bana'] = df.bana.map(the_map)  # transform column to numeric by mapping

    nan_counts = df[hx_cols].isna().sum()
    if nan_counts.sum() != 0:
        print('bana NaNs not 0:', nan_counts)

    df.drop(['bana'] + hx_cols, axis=1, inplace=True)
    return df


In [50]:
# Handle häst and kusk
def transf_kusk_häst(df,pref='',m=50,):
    """Create a numeric '<pref>ekipage' feature from driver and horse.

    The '<pref>kusk' column is concatenated with 'häst' ("kusk, häst") and the
    resulting text is target-encoded against 'plac' with calc_smooth_mean.
    The '<pref>kusk' column is then dropped; 'häst' is kept.

    Parameters
    ----------
    df : DataFrame, modified in place.
    pref : column prefix, '' or 'h1_' .. 'h5_'.
    m : smoothing weight forwarded to calc_smooth_mean. It used to be ignored
        because the call hard-coded 50; the default is unchanged.

    Returns the same DataFrame.
    """
    ekipage_col = pref + 'ekipage'
    kusk_col = pref + 'kusk'

    # concatenate 'kusk' and 'häst' into one column
    df[ekipage_col] = df[kusk_col].str.cat(df['häst'], sep=", ")
    # make it numeric: target encoding with a smoothed mean
    df[ekipage_col] = calc_smooth_mean(df, by=ekipage_col, y='plac', m=m)
    df.drop([kusk_col], axis=1, inplace=True)
    return df

In [51]:
# Handle kön
def transf_kön(df):
    """Replace the 'kön' column with the indicator columns kön_h, kön_s, kön_v.

    The values are lower-cased first. Each indicator is a float column holding
    1.0 where 'kön' equals that code and 0.0 elsewhere. All three columns are
    always created, in that order, even if a code does not occur in ``df``.
    The result keeps the index of ``df``.

    Raises
    ------
    ValueError
        If 'kön' contains a missing value or anything other than h, s or v.

    Returns a new DataFrame; the input frame is left untouched.
    """
    known = ['h', 's', 'v']
    sex = df['kön'].str.lower()

    # Missing values are not in `known` either, so they are rejected as well.
    invalid = ~sex.isin(known)
    if invalid.any():
        offending = sorted(set(map(str, sex[invalid])))
        raise ValueError(f'Felaktigt kön: {offending}')

    # Build the indicators on df's own index so the concat cannot misalign.
    encoded = pd.DataFrame(
        {'kön_' + code: (sex == code).astype(float) for code in known},
        index=df.index,
    )
    return pd.concat([df.drop(columns=['kön']), encoded], axis=1)

In [52]:

def impute_all_numeric_NaNs(df):
    """Replace every NaN with -1 and return the result as a new DataFrame.

    All features must already be numeric. The returned frame keeps the column
    names of ``df`` but is rebuilt from the imputed array, so it gets a fresh
    default index.
    """
    from sklearn.impute import SimpleImputer
    filler = SimpleImputer(strategy='constant', fill_value=-1, missing_values=np.nan)
    return pd.DataFrame(filler.fit_transform(df), columns=df.columns)  # NaN -> -1

## All the transformations in one function

In [53]:

def transf_all(df):
    """Run every transformation and return an all-numeric frame without NaN.

    Works on a copy of ``df``. In order: track columns (transf_bana), the
    driver/horse 'ekipage' encodings for the current start and h1..h5
    (transf_kusk_häst), dropping 'häst', encoding 'kön' (transf_kön),
    converting 'datum' to a float, and finally replacing all remaining NaN
    with -1 (impute_all_numeric_NaNs). The target column 'plac' is still
    present in the result.
    """
    trdf = transf_bana(df.copy())

    # '' is the current start, h1_..h5_ are the five history slots.
    for pref in ('', 'h1_', 'h2_', 'h3_', 'h4_', 'h5_'):
        trdf = transf_kusk_häst(trdf, pref=pref)
    trdf.drop(['häst'], axis=1, inplace=True)

    trdf = transf_kön(trdf)

    # The 64-bit nanosecond timestamps are reinterpreted bit-for-bit as float64
    # (a view, not a numeric conversion) and then scaled. This yields the same
    # values as the former Series.view(float) call, but goes through NumPy
    # because Series.view is deprecated in recent pandas.
    # NOTE(review): confirm that the bit reinterpretation is really wanted,
    # rather than a plain numeric timestamp.
    stamps = pd.to_datetime(trdf.datum).to_numpy(dtype='datetime64[ns]')
    trdf['datum'] = stamps.view(float) * 10e210

    return impute_all_numeric_NaNs(trdf)

In [54]:

# transform all categoricals and impute all NaNs
def prepare_all(df):
    """Build the all-numeric feature frame and the binary target.

    Returns ``(features, target)``: ``target`` is 1 where 'plac' equals 1 and
    0 otherwise, and ``features`` is the transformed frame without 'plac'.
    Fails with an AssertionError (after printing a message) if any NaN is
    left in the features.
    """
    transformed = transf_all(df)

    target = (transformed.plac == 1) * 1
    features = transformed.drop('plac', axis=1)

    # all features are now numeric
    features = impute_all_numeric_NaNs(features)
    remaining_nans = features.isna().sum().sum()
    if remaining_nans != 0:
        print('still NaNs in data')
        assert False
    return features, target

## CatBoost

In [55]:
# CatBoost preprocessing
def catB_preprocess(df):
    """Split off the binary target and fill NaN in the categorical columns.

    Returns ``(features, target)``: ``target`` is 1 where 'plac' equals 1 and
    0 otherwise; ``features`` is ``df`` without 'plac', with the CAT_FEATURES
    columns imputed by impute_cat_features. NaN in other columns is left as is.
    """
    target = (df.plac == 1) * 1
    features = impute_cat_features(df.drop('plac', axis=1), cat_features=CAT_FEATURES)
    return features, target


In [56]:

# Build the CatBoost data set from a copy of df: the categorical columns are
# cleaned and the target y is split off.
df_catb, y = catB_preprocess(df.copy())
# Sanity check: number of NaN left in the categorical columns.
df_catb[CAT_FEATURES].isna().sum().sum()

0

In [57]:

# metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score

# for tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

In [58]:
# Build the all-numeric data set (no NaN, no categoricals).
# Note: this rebinds y, which the CatBoost preprocessing cell above also assigned.
trdf,y=prepare_all(df)

In [59]:
# CatBoost hyper-parameter search (RandomizedSearchCV with a time-series split)
my_df_1=df_catb             # CatBoost data set: NaNs allowed and categorical features kept
my_cats_1 = CAT_FEATURES
my_df_2 = trdf              # all-numeric data set common to all estimators
my_cats_2 = []

# Select which of the two data sets to use; here the all-numeric one.
my_df = my_df_2
my_cats = my_cats_2
my_pool = Pool(my_df,y,cat_features=my_cats)
my_catb = CatBoostClassifier(cat_features=my_cats)

tscv = TimeSeriesSplit(n_splits=5)
# NOTE(review): np.logspace(-20, -19, 3) gives l2_leaf_reg values between 1e-20
# and 1e-19, i.e. practically no regularisation; confirm the exponents.
params = {'iterations': [50,100,500],
          'depth': [4, 5, 6],
          'loss_function': ['Logloss'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
          'eval_metric': ['F1'],
        #   'use_best_model': ['True'],
          'logging_level':['Silent'],
          'random_seed': [2021],
         }
# clf.fit(df_catb,y)

# Candidates are ranked by the Matthews correlation coefficient.
scorer = make_scorer(matthews_corrcoef)
# NOTE(review): no random_state is passed to RandomizedSearchCV, so the sampled
# parameter combinations may differ between runs.
catb_grid = RandomizedSearchCV(estimator=my_catb, param_distributions=params, scoring=scorer, cv=tscv)

# Run the search - compare with default
catb_grid.fit(my_df,y)


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=<catboost.core.CatBoostClassifier object at 0x000001C05A073BE0>,
                   param_distributions={'depth': [4, 5, 6],
                                        'eval_metric': ['F1'],
                                        'iterations': [50, 100, 500],
                                        'l2_leaf_reg': array([1.00000000e-20, 3.16227766e-20, 1.00000000e-19]),
                                        'leaf_estimation_iterations': [10],
                                        'logging_level': ['Silent'],
                                        'loss_function': ['Logloss'],
                                        'random_seed': [2021]},
                   scoring=make_scorer(matthews_corrcoef))

In [60]:
# Get the best estimator and its parameters from the CatBoost search.
best_catb = catb_grid.best_estimator_
# best_score_ is the mean cross-validated value of the search's scorer, which
# is the Matthews correlation coefficient (not Logloss).
print('best MCC gridsearch',catb_grid.best_score_)
best_param = catb_grid.best_params_
best_param

best Logloss gridsearch 0.7099996522577294


{'random_seed': 2021,
 'loss_function': 'Logloss',
 'logging_level': 'Silent',
 'leaf_estimation_iterations': 10,
 'l2_leaf_reg': 1e-20,
 'iterations': 500,
 'eval_metric': 'F1',
 'depth': 4}

In [61]:
# Refit the best CatBoost estimator on the whole of my_df and print the best
# score it reports; then list the 30 most important features.
print(best_catb.fit(my_df,y).best_score_)
best_catb.get_feature_importance(prettified=True).head(30)

{'learn': {'Logloss': 0.10284111770116602, 'F1': 0.7587376993552766}}


Unnamed: 0,Feature Id,Importances
0,h5_ekipage,18.2312
1,h2_ekipage,15.147799
2,h3_ekipage,15.049699
3,h4_ekipage,12.663214
4,h1_ekipage,12.426701
5,ekipage,12.275516
6,streck,6.548204
7,senast,0.596055
8,spår,0.522682
9,h2_kmtid,0.493945


In [62]:

# CatBoost's own cross-validation on my_pool with a time-series split.
# NOTE(review): l2_leaf_reg and depth here differ from the best_params_ shown
# for the randomized search above (1e-20 and depth 4); confirm this is deliberate.
params = {
         'use_best_model': True,
         'eval_metric' : 'F1',
         "loss_function": "Logloss",
         'early_stopping_rounds': 100,
         # 'verbose': 50,
         'iterations': 2000,
         'logging_level': 'Silent',
         'leaf_estimation_iterations': 10,
         'l2_leaf_reg': 3.162277660168379e-20,
         'depth': 5,
}

# as_pandas=True: the per-iteration metrics come back as a DataFrame.
cv_score =cv(pool=my_pool, 
   params=params, 
   seed=2021, 
   stratified=True,
   as_pandas=True,
   type='TimeSeries')

In [63]:
# Summary statistics, over all boosting iterations, of the mean train/test F1 and Logloss.
cv_score[['test-F1-mean','train-F1-mean','test-Logloss-mean','train-Logloss-mean']].describe()

Unnamed: 0,test-F1-mean,train-F1-mean,test-Logloss-mean,train-Logloss-mean
count,701.0,701.0,701.0,701.0
mean,0.684884,0.739627,0.136203,0.114455
std,0.08796,0.130601,0.058253,0.066165
min,0.207294,0.184522,0.115739,0.067644
25%,0.685692,0.698208,0.116395,0.080476
50%,0.712692,0.776551,0.120257,0.098539
75%,0.729353,0.829813,0.127725,0.121771
max,0.73431,0.869643,0.652395,0.651956


## other models that need no NaN and no Categorical

In [64]:
# XGBoost model 
# cat?
# NaN's?

In [65]:
# ANN model - approximate nearest neighbours
# run all the preprocessing above
# GridSearchCV

### RandomForest

In [66]:
# Grid search for the RandomForest on the all-numeric data set
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

# Rebinds tscv; later cells that use tscv get this splitter.
tscv = TimeSeriesSplit()
# Rebinds params, which earlier held the CatBoost settings.
params = {'n_estimators': [5,10,100],
          'max_depth': [4, 5, 6, None],
          'class_weight': ['balanced'],
        #   'loss_function': ['Logloss'],
        #   'eval_metric': ['F1'],
        #   'logging_level':['Silent'],
          'random_state': [2021],
         }
# clf.fit(df_catb,y)

# Candidates are ranked by the Matthews correlation coefficient.
scorer = make_scorer(matthews_corrcoef)
rf_grid = GridSearchCV(estimator=rf, param_grid=params, scoring=scorer, cv=tscv)

# Run the grid search
rf_grid.fit(trdf, y)



GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
             estimator=RandomForestClassifier(),
             param_grid={'class_weight': ['balanced'],
                         'max_depth': [4, 5, 6, None],
                         'n_estimators': [5, 10, 100], 'random_state': [2021]},
             scoring=make_scorer(matthews_corrcoef))

In [67]:
# Get the best RandomForest estimator and its parameters.
best_rf = rf_grid.best_estimator_
# best_score_ is measured with the search's scorer (Matthews correlation coefficient).
print('best gridsearch',rf_grid.best_score_)
# Note: this overwrites best_param from the CatBoost search.
best_param = rf_grid.best_params_
best_param

best gridsearch 0.4250374092308687


{'class_weight': 'balanced',
 'max_depth': 6,
 'n_estimators': 100,
 'random_state': 2021}

In [68]:

# RandomForest feature importances, labelled with the column names, largest first.
pd.DataFrame(best_rf.feature_importances_,index=trdf.columns, columns=['importance']).sort_values(by='importance',ascending=False)

Unnamed: 0,importance
h2_ekipage,0.170345
h3_ekipage,0.142896
h1_ekipage,0.137229
streck,0.130473
h5_ekipage,0.128405
...,...
kön_v,0.000089
h4_auto,0.000078
h2_auto,0.000068
h3_auto,0.000029


In [69]:
# SVM model
# All the preprocessing above?
# GridSearchCV

## Stack'em

In [70]:
# Stacking: the tuned RandomForest and CatBoost models are the base estimators
# and a cross-validated logistic regression is the meta model.
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegressionCV
base_models = [('random_forest', best_rf),
               ('catboost', best_catb),
            #    ('knn', KNeighborsClassifier(n_neighbors=11))
               ]
meta_model = LogisticRegressionCV()
# passthrough=True: the meta model also receives the original features.
# cv=tscv uses the TimeSeriesSplit defined in the RandomForest grid-search cell.
stacking_model = StackingClassifier(estimators=base_models, 
                                    final_estimator=meta_model, 
                                    passthrough=True, 
                                    cv=tscv,
                                    verbose=2)

# NOTE(review): the fit is left commented out; presumably StackingClassifier
# cannot use a TimeSeriesSplit because its folds do not cover every row, hence
# the manual Stacking() function in the next cell. Confirm.
# stacking_model.fit(trdf,y)

In [75]:
def evaluate_model(model, X, y):
    """Cross-validate ``model`` with a 5-fold time-series split, scored by ROC AUC.

    Returns the array of per-fold scores from ``cross_val_score``. Errors
    raised while fitting or scoring are propagated (error_score='raise').

    TimeSeriesSplit takes no ``n_repeats`` or ``random_state`` arguments (its
    folds are deterministic), and scikit-learn's scorer name for the area
    under the ROC curve is 'roc_auc'; the earlier call failed on both.
    """
    tscv = TimeSeriesSplit(n_splits=5)
    scores = cross_val_score(model, X, y, scoring='roc_auc', cv=tscv, verbose=1, n_jobs=3, error_score='raise')
    return scores

def Stacking(model, X_tr, y_tr, X_final, n_fold):
    """Produce out-of-fold and hold-out predictions for one base model.

    ``X_tr`` is split with ``TimeSeriesSplit(n_splits=n_fold)``. For each fold
    the model is fitted on the training part and the predicted probability of
    class 1 for the test part is collected.

    Parameters
    ----------
    model : estimator with ``fit(X=..., y=...)`` and ``predict_proba``.
    X_tr, y_tr : training features / target; indexed positionally via ``.iloc``.
    X_final : hold-out features to predict after the last fold.
    n_fold : number of time-series splits.

    Returns
    -------
    (valid_pred, train_pred)
        valid_pred : class-1 probabilities for ``X_final`` as an (n, 1) array,
            predicted by the model as fitted on the last fold.
        train_pred : 1-D float array with the concatenated class-1
            probabilities of all test folds. Only rows that appear in a test
            fold are covered, so it can be shorter than ``X_tr``.
    """
    tscv = TimeSeriesSplit(n_splits=n_fold)
    fold_preds = []
    n_collected = 0
    for train_indices, test_indices in tscv.split(X_tr):
        X_train, X_test = X_tr.iloc[train_indices], X_tr.iloc[test_indices]
        y_train = y_tr.iloc[train_indices]
        print(f'test indices {len(test_indices)} X_test {len(X_test)} train_pred before {n_collected}')
        model.fit(X=X_train,y=y_train)
        # Predict once per fold and reuse the result for the log line below.
        fold_proba = model.predict_proba(X_test)
        fold_preds.append(fold_proba[:,1])
        n_collected += len(fold_proba)
        print('len train_pred',n_collected, 'one predict',len(fold_proba))
    # The empty float array keeps the result float64 and 1-D, as before.
    train_pred = np.concatenate([np.empty(0, float), *fold_preds])
    valid_pred = model.predict_proba(X_final)[:,1]
    return valid_pred.reshape(-1,1), train_pred


In [None]:
# tscv = TimeSeriesSplit(n_splits=5)
# totlen=0
# rf = base_models[0][1]
# # valid_pred=np.empty((X_valid.shape[0],1),float)
# train_pred=np.empty((0,1),float)
# for train_indices, test_indices in tscv.split(train_X):
#     X_train, X_test = train_X.iloc[train_indices], train_X.iloc[test_indices]
#     y_train, y_test = train_y.iloc[train_indices], train_y.iloc[test_indices]

#     rf.fit(X=X_train,y=y_train)
#     my_pred = rf.predict_proba(X_test)
#     # print(my_pred)
#     print(my_pred[:,1])
#     break
    
# # print(totlen,train_X.shape[0]-totlen)

In [76]:
# Hold out the last 20 % of the rows as validation data.
# Note: split_ix is rebound here; it previously held the date-based split index
# returned by load_data(). This split is by row position, not by date.
split_ix = int(len(trdf)*.8)
train_X = trdf[trdf.index <  split_ix]
valid_X = trdf[trdf.index >= split_ix]
train_y = y[y.index <  split_ix]
# No validation target is built in this cell.
# test 2 models

# First base model: out-of-fold and hold-out predictions, wrapped in DataFrames.
print(base_models[0][0])
model1 = base_models[0][1]
valid_pred1 ,train_pred1=Stacking(model=model1,n_fold=5, X_tr=train_X, y_tr= train_y, X_final=valid_X)
train_pred1=pd.DataFrame(train_pred1)
valid_pred1=pd.DataFrame(valid_pred1)

# Second base model, same procedure.
print(base_models[1][0])
model2 = base_models[1][1]
valid_pred2 ,train_pred2=Stacking(model=model2,n_fold=5, X_tr=train_X, y_tr= train_y, X_final=valid_X)
train_pred2=pd.DataFrame(train_pred2)
valid_pred2=pd.DataFrame(valid_pred2)


random_forest
test indices 5568 X_test 5568 train_pred before 0
len train_pred 5568 one predict 5568
test indices 5568 X_test 5568 train_pred before 5568
len train_pred 11136 one predict 5568
test indices 5568 X_test 5568 train_pred before 11136
len train_pred 16704 one predict 5568
test indices 5568 X_test 5568 train_pred before 16704
len train_pred 22272 one predict 5568
test indices 5568 X_test 5568 train_pred before 22272
len train_pred 27840 one predict 5568
catboost
test indices 5568 X_test 5568 train_pred before 0
len train_pred 5568 one predict 5568
test indices 5568 X_test 5568 train_pred before 5568
len train_pred 11136 one predict 5568
test indices 5568 X_test 5568 train_pred before 11136
len train_pred 16704 one predict 5568
test indices 5568 X_test 5568 train_pred before 16704
len train_pred 22272 one predict 5568
test indices 5568 X_test 5568 train_pred before 22272
len train_pred 27840 one predict 5568


In [218]:
# Compare shapes: the out-of-fold predictions have fewer rows than train_X
# (27840 vs 33410 in the recorded output below).
print(train_X.shape,train_y.shape,valid_X.shape)
train_pred1.shape,valid_pred1.shape #,train_pred2.shape,valid_pred2.shape

(33410, 63) (33410,) (8353, 63)


((27840, 1), (8353, 1))

In [None]:
## Testing the TimeSeries folds with a manual loop
# For every fold each base model is fitted on the training part, and its
# class-1 probability for the test part is stored in a column named after the
# model, next to the true target 'y'. The rows keep the index of the test
# fold, so predictions and targets stay aligned.
tscv = TimeSeriesSplit()
print(tscv)
test_pred=pd.DataFrame()
fold_frames = []

for i, (train_index, test_index) in enumerate(tscv.split(trdf), start=1):
    print(f"TRAIN1_{i}, start={train_index[:1]} end={train_index[-1:]}, TEST_{i},start={test_index[0:1]} end={test_index[-1:]}")
    X_train, X_test = trdf.iloc[train_index], trdf.iloc[test_index]
    # The split yields positions, so index the target positionally as well.
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]
    print(f'X_train.shape {X_train.shape}\tX_test.shape {X_test.shape} ')
    temp_pred = pd.DataFrame(index=y_test.index)
    for name,model in base_models:
        print(name)
        model.fit(X_train,y_train)
        temp_pred[name] = model.predict_proba(X_test)[:,1]
    temp_pred['y'] = y_test
    print(temp_pred.tail())
    fold_frames.append(temp_pred)

# Concatenate once after the loop instead of growing a frame fold by fold.
train_pred = pd.concat(fold_frames,axis=0)


In [100]:
# Inspect the folds: first/last position of each train and test part, and the
# last ten target values of each part.
# tscv.split(train,y.values)
for train_index, test_index in tscv.split(trdf):
    print(train_index[:1],train_index[-1:], test_index[:1],test_index[-1:],)
    print(y.iloc[train_index][-10:].values,y.iloc[test_index][-10:].values)
print()  
# Last ten target values of the whole data set, for comparison with the final fold.
y.tail(10).values

[0] [6962] [6963] [13922]
[0 0 0 0 0 0 0 0 1 0] [0 0 1 0 0 0 0 0 0 0]
[0] [13922] [13923] [20882]
[0 0 1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0]
[0] [20882] [20883] [27842]
[0 0 0 0 0 0 0 0 0 0] [0 0 0 1 0 0 0 0 0 0]
[0] [27842] [27843] [34802]
[0 0 0 1 0 0 0 0 0 0] [0 0 1 0 0 0 0 0 0 0]
[0] [34802] [34803] [41762]
[0 0 1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 1 0]

alt
[0] [6962] [6963] [13922]
[0 0 0 0 0 0 0 0 1 0] [0 0 1 0 0 0 0 0 0 0]
[0] [13922] [13923] [20882]
[0 0 1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 0 0]
[0] [20882] [20883] [27842]
[0 0 0 0 0 0 0 0 0 0] [0 0 0 1 0 0 0 0 0 0]
[0] [27842] [27843] [34802]
[0 0 0 1 0 0 0 0 0 0] [0 0 1 0 0 0 0 0 0 0]
[0] [34802] [34803] [41762]
[0 0 1 0 0 0 0 0 0 0] [0 0 0 0 0 0 0 0 1 0]



array([0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

## cv

In [None]:


# Cross-validate CatBoost on the data set that keeps the categorical features.
# df_catb is used rather than df because df still contains the target 'plac'
# and NaN in the categorical columns; CAT_FEATURES is the list defined above
# (the name cat_features does not exist in this notebook).
cv_pool = Pool(df_catb,y,cat_features=CAT_FEATURES)

# Training parameters only. The cross-validation options (seed, stratified,
# as_pandas, type) are arguments of cv() itself and are passed below.
params = {
         'use_best_model': True,
         'eval_metric' : 'Recall',
         "loss_function": "Logloss",
         'early_stopping_rounds': 100,
         'verbose': 50,
         'iterations': 2000,
}

cv_score =cv(pool=cv_pool, 
   params=params, 
   fold_count=5, 
   inverted=False,
   partition_random_seed=0,
   seed=2021, 
   shuffle=False, 
   stratified=True,
   as_pandas=True,
   type='TimeSeries')

In [None]:
# Show the per-iteration cross-validation metrics.
cv_score

Unnamed: 0,iterations,test-F1-mean,test-F1-std,train-F1-mean,train-F1-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.000000,0.000000,0.000000,0.000000,0.660234,0.003815,0.659794,0.004020
1,1,0.000000,0.000000,0.000000,0.000000,0.629065,0.003617,0.628580,0.003901
2,2,0.000000,0.000000,0.000741,0.001284,0.598386,0.009334,0.600561,0.006937
3,3,0.000000,0.000000,0.000371,0.000642,0.572443,0.008103,0.574281,0.005280
4,4,0.002953,0.005115,0.006238,0.009857,0.548483,0.007865,0.550048,0.005760
...,...,...,...,...,...,...,...,...,...
184,184,0.120758,0.017292,0.280924,0.003395,0.244545,0.004970,0.207284,0.006843
185,185,0.121312,0.017374,0.282253,0.004709,0.244574,0.004972,0.207173,0.006899
186,186,0.122547,0.015863,0.282494,0.003678,0.244520,0.004856,0.207063,0.006966
187,187,0.119446,0.016485,0.283570,0.004286,0.244538,0.004805,0.206988,0.007005


In [None]:
# Introspection aid: displays the signature of catboost's cv function.
cv

<function catboost.core.cv(pool=None, params=None, dtrain=None, iterations=None, num_boost_round=None, fold_count=None, nfold=None, inverted=False, partition_random_seed=0, seed=None, shuffle=True, logging_level=None, stratified=None, as_pandas=True, metric_period=None, verbose=None, verbose_eval=None, plot=False, early_stopping_rounds=None, save_snapshot=None, snapshot_file=None, snapshot_interval=None, metric_update_interval=0.5, folds=None, type='Classical', return_models=False, log_cout=<ipykernel.iostream.OutStream object at 0x000001B50082C1C0>, log_cerr=<ipykernel.iostream.OutStream object at 0x000001B50082CF70>)>

In [None]:
# Select the row(s) where the mean test Logloss is at its minimum.
cv_score[cv_score['test-Logloss-mean'].min() == cv_score['test-Logloss-mean']]