# kolla med cv mina val:
1. Hur långt tillbaka skall jag köra learn på
2. Finns det någon kodning (encoding) av vissa kategoriska features som är bättre för catboost
   1.  Häst
   2.  Kusk
   3.  Bana
   4.  Ekipage
3. En gång för alla kolla med färre features


## import och förbered catboost classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier, Pool, cv

In [2]:
"""
cross validation with catboost
"""
def catboost_cv(X, y, cat_features, fold_count=5):
    
    # Create a pool
    pool = Pool(data=X, label=y, cat_features=cat_features)
    params = {"iterations": 1000,
              "loss_function": "Logloss",
              "verbose": False}
  
    cv_results = cv(pool, params, fold_count=fold_count, 

                    early_stopping_rounds=100,
                    # loss_function='Logloss',
                    shuffle=False, 
                    type='TimeSeries',
                    verbose=False)
    return cv_results


In [3]:
def preproc_bana(X_):
    """Return a copy of ``X_`` with normalised track-name columns.

    ``bana`` and ``h1_bana`` .. ``h5_bana`` are lower-cased and every
    ``-<digits>`` fragment is removed from them. ``X_`` itself is not modified.
    """
    track_columns = ['bana'] + [f'h{n}_bana' for n in range(1, 6)]
    cleaned = X_.copy()

    for column in track_columns:
        lowered = cleaned[column].str.lower()
        # strip every "-<digits>" fragment from the track name
        cleaned[column] = lowered.replace(to_replace=r'-\d+', value='', regex=True)

    return cleaned


In [4]:
# Load the full data set, parse the race date and normalise the track names
raw = pd.read_csv('../all_data.csv')
raw['datum'] = pd.to_datetime(raw['datum'])
df = preproc_bana(raw)
for summary in (df.shape, df.columns):
    print(summary)

(43294, 79)
Index(['datum', 'avd', 'bana', 'häst', 'kusk', 'streck', 'vodds', 'podds',
       'kr', 'spår', 'dist', 'lopp_dist', 'start', 'ålder', 'kön', 'plac',
       'pris', 'h1_dat', 'h1_kusk', 'h1_bana', 'h1_spår', 'h1_plac', 'h1_pris',
       'h1_odds', 'h1_kmtid', 'h2_dat', 'h2_kusk', 'h2_bana', 'h2_spår',
       'h2_plac', 'h2_pris', 'h2_odds', 'h2_kmtid', 'h3_dat', 'h3_kusk',
       'h3_bana', 'h3_spår', 'h3_plac', 'h3_pris', 'h3_odds', 'h3_kmtid',
       'h4_dat', 'h4_kusk', 'h4_bana', 'h4_spår', 'h4_plac', 'h4_pris',
       'h4_odds', 'h4_kmtid', 'h5_dat', 'h5_kusk', 'h5_bana', 'h5_spår',
       'h5_plac', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_dist', 'h2_dist',
       'h3_dist', 'h4_dist', 'h5_dist', 'bins', 'h1_auto', 'h2_auto',
       'h3_auto', 'h4_auto', 'h5_auto', 'h1_perf', 'h2_perf', 'h3_perf',
       'h4_perf', 'h5_perf', 'senast', 'delta1', 'delta2', 'delta3', 'delta4',
       'startnr'],
      dtype='object')


In [5]:
# Columns left out of the feature matrix:
#   startnr            - has too many NaN
#   vodds, podds, bins - not known before v75
#   plac               - the placing; used for the target y below
#   avd, h1_dat..h5_dat are dropped as well
excluded = ['plac', 'avd', 'startnr', 'vodds', 'podds', 'bins']
excluded += [f'h{n}_dat' for n in range(1, 6)]
X = df.drop(columns=excluded)
# target: 1 where plac == 1, otherwise 0
y = (df['plac'] == 1) * 1

# NaN count per column; show the 20 worst columns that have any NaN
all_na = X.isna().sum()
has_na = all_na > 0
all_na[has_na].sort_values(ascending=False).head(20)


h5_perf    5351
h5_pris    5351
h4_perf    4935
h4_pris    4935
h3_perf    4508
h3_pris    4508
h2_pris    4038
h2_perf    4038
h1_perf    3441
h1_pris    3435
h3_spår    3132
h4_spår    3132
h5_spår    3132
h1_spår    3132
h2_spår    3132
h5_odds     693
h4_odds     462
h3_odds     320
h2_odds     256
h1_odds     252
dtype: int64

### Vilka är numeric resp cat_features

In [6]:
# Split the columns into numeric features and (object dtype) categorical features
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()
# Sanity check: columns that are neither numeric nor object (datum is a datetime
# after pd.to_datetime) end up in neither list, which makes this print False.
n_classified = len(num_features) + len(cat_features)
print('Rätt längd på summan:', n_classified == len(X.columns))

# NaN counts of the categorical columns before imputation
cat_na = X[cat_features].isna().sum()
print('Before:\n', cat_na[cat_na > 0].sort_values(ascending=False))

# Replace every NaN in a categorical column with the literal category 'missing'
X[cat_features] = X[cat_features].fillna('missing')


Rätt längd på summan: False
Before:
 h5_kusk    89
h5_bana    89
h4_kusk    28
h4_bana    28
h3_kusk     6
h3_bana     6
dtype: int64


## CV with 'everything'

In [7]:
# Scoreboard for all experiments: label -> best mean test Logloss
res_dict = dict()
# Baseline: every feature, every row
result = catboost_cv(X, y, cat_features=cat_features)

Training on fold [0/5]

bestTest = 0.2372793684
bestIteration = 133

Training on fold [1/5]

bestTest = 0.2393137609
bestIteration = 132

Training on fold [2/5]

bestTest = 0.2440632851
bestIteration = 121

Training on fold [3/5]

bestTest = 0.2360307455
bestIteration = 134

Training on fold [4/5]

bestTest = 0.2404628991
bestIteration = 131



In [8]:
# Record the baseline score and show the scoreboard, best (lowest Logloss) first
best_logloss = result['test-Logloss-mean'].min()
res_dict['all'] = best_logloss
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)


Unnamed: 0,0
all,0.239505


## CV with 30%, 40%, 50%, 70%, 90% latest rows


In [9]:
def make_cv_with_fraction_X(X_, y_, fraction):
    """Cross-validate on only the most recent ``fraction`` of the race dates.

    Args:
        X_: Feature frame with a ``datum`` column.
        y_: Labels aligned row by row (same index) with ``X_``.
        fraction: Share of the distinct ``datum`` values to keep, counted from
            the latest date backwards. Values above 1 keep everything.

    Returns:
        The result of ``catboost_cv`` on the selected rows, using the
        notebook-level ``cat_features`` list.

    Raises:
        ValueError: If ``fraction`` is not positive.
    """
    if fraction <= 0:
        raise ValueError(f'fraction must be positive, got {fraction!r}')

    # Sort explicitly so that "latest" does not depend on the row order of X_
    dates = X_['datum'].dropna().drop_duplicates().sort_values()
    # Keep at least one date: a count of 0 would turn the slice into [-0:],
    # which silently selects *all* dates instead of none.
    n_keep = min(len(dates), max(1, int(len(dates) * fraction)))
    cutoff = dates.iloc[-n_keep]
    print(fraction, pd.to_datetime(cutoff).strftime('%Y-%m-%d'))

    # One mask for both X_ and y_ so features and labels stay aligned
    is_recent = X_['datum'] >= cutoff
    return catboost_cv(X_[is_recent], y_[is_recent], cat_features)


In [10]:
# CV on the latest 30/40/50/70/90 % of the dates; store each score under its label
fraction_runs = [(0.3, '30%'), (0.4, '40%'), (0.5, '50%'), (0.7, '70%'), (0.9, '90%')]
for fraction, label in fraction_runs:
    result = make_cv_with_fraction_X(X, y, fraction)
    res_dict[label] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)


0.3 2019-09-01
Training on fold [0/5]

bestTest = 0.2472891894
bestIteration = 128

Training on fold [1/5]

bestTest = 0.2372006647
bestIteration = 155

Training on fold [2/5]

bestTest = 0.2327249384
bestIteration = 161

Training on fold [3/5]

bestTest = 0.2500466592
bestIteration = 95

Training on fold [4/5]

bestTest = 0.241016245
bestIteration = 239

0.4 2019-01-12
Training on fold [0/5]

bestTest = 0.2427666478
bestIteration = 147

Training on fold [1/5]

bestTest = 0.2434006552
bestIteration = 131

Training on fold [2/5]

bestTest = 0.2347228139
bestIteration = 200

Training on fold [3/5]

bestTest = 0.2397898964
bestIteration = 142

Training on fold [4/5]

bestTest = 0.2457398944
bestIteration = 128

0.5 2018-06-24
Training on fold [0/5]

bestTest = 0.2509421991
bestIteration = 116

Training on fold [1/5]

bestTest = 0.2410736482
bestIteration = 125

Training on fold [2/5]

bestTest = 0.2370131222
bestIteration = 119

Training on fold [3/5]

bestTest = 0.2378129146
bestIteratio

Unnamed: 0,0
all,0.239505
90%,0.241406
40%,0.241439
70%,0.241542
30%,0.242172
50%,0.242422


In [11]:
# CV without 'start' and the five hX_auto columns
start_and_auto = ['start'] + [f'h{n}_auto' for n in range(1, 6)]
result = catboost_cv(X.drop(columns=start_and_auto), y, cat_features)
res_dict['no start n auto'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)


Training on fold [0/5]

bestTest = 0.2378725907
bestIteration = 131

Training on fold [1/5]

bestTest = 0.2386508048
bestIteration = 109

Training on fold [2/5]

bestTest = 0.2446053541
bestIteration = 132

Training on fold [3/5]

bestTest = 0.2364895399
bestIteration = 115

Training on fold [4/5]

bestTest = 0.2404038882
bestIteration = 140



Unnamed: 0,0
all,0.239505
no start n auto,0.239784
90%,0.241406
40%,0.241439
70%,0.241542
30%,0.242172
50%,0.242422


In [12]:
# CV on the current-race columns only: no history (hX_*) columns and no lopp_dist
base_columns = ['datum', 'bana', 'häst', 'kusk', 'streck', 'kr', 'spår', 'dist',
                'ålder', 'kön', 'pris']
X_no_history = X[base_columns]
# categorical columns that remain once the history features are gone
new_features = ['bana', 'häst', 'kusk', 'kön']

result = catboost_cv(X_no_history, y, new_features)
res_dict['no hist n start'] = result['test-Logloss-mean'].min()


Training on fold [0/5]

bestTest = 0.2382168873
bestIteration = 110

Training on fold [1/5]

bestTest = 0.2387793173
bestIteration = 109

Training on fold [2/5]

bestTest = 0.2447294573
bestIteration = 109

Training on fold [3/5]

bestTest = 0.2363100011
bestIteration = 129

Training on fold [4/5]

bestTest = 0.2405405226
bestIteration = 109



In [13]:
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)

Unnamed: 0,0
all,0.239505
no start n auto,0.239784
no hist n start,0.239842
90%,0.241406
40%,0.241439
70%,0.241542
30%,0.242172
50%,0.242422


In [14]:
def remove_some(X, y, feature:list):
    """Cross-validate without the given columns and record the score.

    The best mean test Logloss is stored in the notebook-level ``res_dict``
    under ``'no <label>'`` and the sorted scoreboard is printed.

    Args:
        X: Feature frame.
        y: Labels for ``X``.
        feature: Column names to drop from ``X``. Both categorical and
            non-categorical columns are accepted; a single name may also be
            given as a plain string.
    """
    # A bare string would otherwise be iterated character by character
    feature = [feature] if isinstance(feature, str) else list(feature)

    # Filter instead of list.remove(): dropping a column that is not in
    # cat_features (e.g. a numeric one) no longer raises ValueError.
    new_features = [name for name in cat_features if name not in feature]

    result = catboost_cv(X.drop(feature, axis=1), y, new_features)

    # One column is labelled by its full name, several by their first two letters
    labels = feature if len(feature) == 1 else [name[:2] for name in feature]

    res_dict['no ' + ' '.join(labels)] = result['test-Logloss-mean'].min()
    print(pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60))

# Leave out one high-cardinality categorical column at a time
for single_column in ('bana', 'kusk', 'häst'):
    remove_some(X, y, [single_column])


Training on fold [0/5]

bestTest = 0.2386251994
bestIteration = 130

Training on fold [1/5]

bestTest = 0.2392541774
bestIteration = 132

Training on fold [2/5]

bestTest = 0.2448286936
bestIteration = 117

Training on fold [3/5]

bestTest = 0.2368272062
bestIteration = 168

Training on fold [4/5]

bestTest = 0.2405729058
bestIteration = 125

                        0
all              0.239505
no start n auto  0.239784
no hist n start  0.239842
no bana          0.240196
90%              0.241406
40%              0.241439
70%              0.241542
30%              0.242172
50%              0.242422
Training on fold [0/5]

bestTest = 0.2374022198
bestIteration = 130

Training on fold [1/5]

bestTest = 0.2388085151
bestIteration = 122

Training on fold [2/5]

bestTest = 0.2445352959
bestIteration = 110

Training on fold [3/5]

bestTest = 0.2361974877
bestIteration = 127

Training on fold [4/5]

bestTest = 0.2397108036
bestIteration = 145

                        0
all              0.23950

In [15]:
# Leave out several of the categorical columns at once
for column_group in (['bana', 'kusk', 'häst'], ['kusk', 'häst']):
    remove_some(X, y, column_group)


Training on fold [0/5]

bestTest = 0.2366533557
bestIteration = 241

Training on fold [1/5]

bestTest = 0.2377619477
bestIteration = 187

Training on fold [2/5]

bestTest = 0.2431305942
bestIteration = 223

Training on fold [3/5]

bestTest = 0.233712241
bestIteration = 372

Training on fold [4/5]

bestTest = 0.2384226374
bestIteration = 255

                        0
no ba ku hä      0.238094
no häst          0.238242
all              0.239505
no kusk          0.239505
no start n auto  0.239784
no hist n start  0.239842
no bana          0.240196
90%              0.241406
40%              0.241439
70%              0.241542
30%              0.242172
50%              0.242422
Training on fold [0/5]

bestTest = 0.2365185772
bestIteration = 211

Training on fold [1/5]

bestTest = 0.2374419162
bestIteration = 256

Training on fold [2/5]

bestTest = 0.2439742804
bestIteration = 132

Training on fold [3/5]

bestTest = 0.2340695225
bestIteration = 402

Training on fold [4/5]

bestTest = 0.23818

#### Ordinal encoding häst


In [16]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode häst and hand it to CatBoost as a numeric column.
# One fit_transform replaces the earlier fit_transform (result discarded)
# followed by a second transform of the same data.
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc[['häst']] = enc.fit_transform(X[['häst']])

# häst is numeric now, so it must no longer be declared categorical
new_features = [name for name in cat_features if name != 'häst']
result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal häst'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)




Training on fold [0/5]

bestTest = 0.236203067
bestIteration = 275

Training on fold [1/5]

bestTest = 0.2375840064
bestIteration = 164

Training on fold [2/5]

bestTest = 0.2430469286
bestIteration = 225

Training on fold [3/5]

bestTest = 0.234113543
bestIteration = 300

Training on fold [4/5]

bestTest = 0.2388065466
bestIteration = 378



Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
no häst,0.238242
no ku hä,0.238243
all,0.239505
no kusk,0.239505
no start n auto,0.239784
no hist n start,0.239842
no bana,0.240196
90%,0.241406


#### Ordinal encoding bana

In [17]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode bana and hand it to CatBoost as a numeric column.
# One fit_transform replaces the earlier fit_transform (result discarded)
# followed by a second transform of the same data.
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc[['bana']] = enc.fit_transform(X[['bana']])

# bana is numeric now, so it must no longer be declared categorical
new_features = [name for name in cat_features if name != 'bana']
result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal bana'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


Training on fold [0/5]

bestTest = 0.2378583835
bestIteration = 121

Training on fold [1/5]

bestTest = 0.2392121395
bestIteration = 110

Training on fold [2/5]

bestTest = 0.2444817545
bestIteration = 136

Training on fold [3/5]

bestTest = 0.2363176962
bestIteration = 108

Training on fold [4/5]

bestTest = 0.2401697296
bestIteration = 115



Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
no häst,0.238242
no ku hä,0.238243
all,0.239505
no kusk,0.239505
no start n auto,0.239784
ordinal bana,0.239795
no hist n start,0.239842
no bana,0.240196


#### Ordinal encoding kusk

In [18]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode kusk and hand it to CatBoost as a numeric column.
# One fit_transform replaces the earlier fit_transform (result discarded)
# followed by a second transform of the same data.
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc[['kusk']] = enc.fit_transform(X[['kusk']])

# kusk is numeric now, so it must no longer be declared categorical
new_features = [name for name in cat_features if name != 'kusk']
result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal kusk'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


Training on fold [0/5]

bestTest = 0.2376975244
bestIteration = 117

Training on fold [1/5]

bestTest = 0.2386602564
bestIteration = 113

Training on fold [2/5]

bestTest = 0.2450109993
bestIteration = 127

Training on fold [3/5]

bestTest = 0.2357797185
bestIteration = 152

Training on fold [4/5]

bestTest = 0.2401302598
bestIteration = 179



Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
no häst,0.238242
no ku hä,0.238243
all,0.239505
no kusk,0.239505
ordinal kusk,0.23964
no start n auto,0.239784
ordinal bana,0.239795
no hist n start,0.239842


#### Ordinal encoding häst, kusk, bana

In [19]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode häst, kusk and bana (each column gets its own code table).
# One fit_transform replaces the earlier fit_transform (result discarded)
# followed by a second transform of the same data.
encoded_columns = ['häst', 'kusk', 'bana']
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc[encoded_columns] = enc.fit_transform(X[encoded_columns])

# the encoded columns are numeric now, so they are no longer categorical
new_features = [name for name in cat_features if name not in encoded_columns]
result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal hä ku ba'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)

Training on fold [0/5]

bestTest = 0.2366815355
bestIteration = 191

Training on fold [1/5]

bestTest = 0.2370132156
bestIteration = 242

Training on fold [2/5]

bestTest = 0.2440540333
bestIteration = 140

Training on fold [3/5]

bestTest = 0.2341067694
bestIteration = 354

Training on fold [4/5]

bestTest = 0.2383968008
bestIteration = 324



Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
ordinal hä ku ba,0.238181
no häst,0.238242
no ku hä,0.238243
all,0.239505
no kusk,0.239505
ordinal kusk,0.23964
no start n auto,0.239784
ordinal bana,0.239795


#### Ordinal encoding häst, kusk

In [20]:
from sklearn.preprocessing import OrdinalEncoder

# Ordinal-encode häst and kusk (each column gets its own code table).
# One fit_transform replaces the earlier fit_transform (result discarded)
# followed by a second transform of the same data.
encoded_columns = ['häst', 'kusk']
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc[encoded_columns] = enc.fit_transform(X[encoded_columns])

# the encoded columns are numeric now, so they are no longer categorical
new_features = [name for name in cat_features if name not in encoded_columns]
result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal hä ku'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


Training on fold [0/5]

bestTest = 0.2366774972
bestIteration = 299

Training on fold [1/5]

bestTest = 0.2371612745
bestIteration = 230

Training on fold [2/5]

bestTest = 0.2436630245
bestIteration = 150

Training on fold [3/5]

bestTest = 0.2345690351
bestIteration = 383

Training on fold [4/5]

bestTest = 0.2382881223
bestIteration = 383



Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
ordinal hä ku ba,0.238181
ordinal hä ku,0.238198
no häst,0.238242
no ku hä,0.238243
all,0.239505
no kusk,0.239505
ordinal kusk,0.23964
no start n auto,0.239784


#### cv utan datum

In [21]:
# CV without the datum column
X_without_datum = X.drop(columns=['datum'])
result = catboost_cv(X_without_datum, y, cat_features)
res_dict['no datum'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)

Training on fold [0/5]

bestTest = 0.237863676
bestIteration = 132

Training on fold [1/5]

bestTest = 0.2387231492
bestIteration = 109

Training on fold [2/5]

bestTest = 0.2447837025
bestIteration = 116

Training on fold [3/5]

bestTest = 0.2363691115
bestIteration = 127

Training on fold [4/5]

bestTest = 0.240317961
bestIteration = 117



Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
ordinal hä ku ba,0.238181
ordinal hä ku,0.238198
no häst,0.238242
no ku hä,0.238243
all,0.239505
no kusk,0.239505
ordinal kusk,0.23964
no datum,0.239746


#### Function: Ordinal encoding feature + hx_feature

In [22]:
from sklearn.preprocessing import OrdinalEncoder
def enc_with_history(X_, feature:list, cat_features:list):
    """Ordinal-encode a column and its history columns with one shared encoder.

    The values of all columns in ``feature`` are stacked into a single column,
    one ``OrdinalEncoder`` is fitted on that stack, and every listed column is
    then transformed with it, so the same value gets the same code in each
    of the columns.

    Args:
        X_: Source frame; it is copied, not modified.
        feature: Column names to encode. ``feature[0]`` is the base column and
            the rest are its history columns; any number of them is accepted.
        cat_features: Current list of categorical column names.

    Returns:
        ``(X, new_features)``: the encoded copy of ``X_`` and ``cat_features``
        without the columns that were encoded.

    Raises:
        ValueError: If ``feature`` is empty.
    """
    if not feature:
        raise ValueError('feature must name at least one column')

    X = X_.copy()
    base = feature[0]
    print(f'len({base}) + {len(feature) - 1}*hist  =  ', len(X) * len(feature))

    # Give every column the base column's name, so the frame used for fit()
    # and the frames passed to transform() all carry the same column name.
    renamed = [X[[column]].rename(columns={column: base}) for column in feature]

    # Stack base + history values on top of each other: one shared vocabulary
    enc = OrdinalEncoder()
    enc.fit(pd.concat(renamed, axis=0, ignore_index=True))

    for column, values in zip(feature, renamed):
        X[column] = enc.transform(values)

    # The encoded columns are numeric now and must not be listed as categorical
    new_features = [name for name in cat_features if name not in feature]

    return X, new_features


#### Ordinal encoding bana, häst kusk+history

In [23]:

# kusk + h1..h5_kusk share one encoder; häst and bana are encoded separately
feature_list = ['kusk'] + [f'h{n}_kusk' for n in range(1, 6)]
X_enc, new_features = enc_with_history(X, feature_list, cat_features)

separately_encoded = ['häst', 'bana']
hästen_enc = OrdinalEncoder()
X_enc[separately_encoded] = hästen_enc.fit_transform(X_enc[separately_encoded])

# eyeball a few encoded rows
print(X_enc[separately_encoded + feature_list].sample(5))

# häst and bana are numeric now as well
for encoded_name in separately_encoded:
    new_features.remove(encoded_name)
result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal ba hä ku+hx'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


len(kusk) + 5*hist  =   259764
         häst  bana   kusk  h1_kusk  h2_kusk  h3_kusk  h4_kusk  h5_kusk
6312   3605.0  22.0  339.0    339.0    339.0   2548.0    339.0   1130.0
23333  1403.0  14.0  533.0    533.0   1119.0    211.0   1119.0   1119.0
25465  5145.0  18.0  376.0    376.0   1783.0   1783.0   1887.0   1184.0
38041  7691.0  19.0  217.0    217.0    272.0    272.0    217.0    217.0
34842  5919.0  22.0  217.0    217.0   2646.0    217.0    217.0    217.0
Training on fold [0/5]

bestTest = 0.2363866433
bestIteration = 255

Training on fold [1/5]

bestTest = 0.2377298852
bestIteration = 219

Training on fold [2/5]

bestTest = 0.2440413835
bestIteration = 179

Training on fold [3/5]

bestTest = 0.2337880053
bestIteration = 428

Training on fold [4/5]

bestTest = 0.2384987982
bestIteration = 221



Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
ordinal hä ku ba,0.238181
ordinal hä ku,0.238198
no häst,0.238242
no ku hä,0.238243
ordinal ba hä ku+hx,0.238262
all,0.239505
no kusk,0.239505
ordinal kusk,0.23964


#### Ordinal encoding bana+ hx_bana

In [24]:
# bana + h1..h5_bana share one encoder; häst and kusk stay categorical here
feature_list = ['bana'] + [f'h{n}_bana' for n in range(1, 6)]
X_enc, new_features = enc_with_history(X, feature_list, cat_features)

# eyeball a few rows: untouched häst/kusk next to the encoded track columns
preview_columns = ['häst', 'kusk'] + feature_list
print(X_enc[preview_columns].sample(5))

result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal ba+hx'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


len(bana) + 5*hist  =   259764
                    häst            kusk   bana  h1_bana  h2_bana  h3_bana  \
12940  SUGARMAKESMECRAZY  Erik Adielsson   35.0     44.0     35.0    117.0   
24173       I LOVE PARIS      Björn Goop  159.0     42.0    154.0    117.0   
17246     LICENCE HOLDER     Jenni Kaija  154.0    153.0    140.0     83.0   
27470    HERA DI QUATTRO   André Eklundh   42.0    154.0      4.0    154.0   
42595        SPEEDY FACE    Wilhelm Paal   60.0     60.0     80.0     47.0   

       h4_bana  h5_bana  
12940     48.0     46.0  
24173     10.0     42.0  
17246     73.0     59.0  
27470      4.0      4.0  
42595      4.0     60.0  
Training on fold [0/5]

bestTest = 0.2378184508
bestIteration = 117

Training on fold [1/5]

bestTest = 0.2394324964
bestIteration = 113

Training on fold [2/5]

bestTest = 0.2448858029
bestIteration = 105

Training on fold [3/5]

bestTest = 0.2360314638
bestIteration = 197

Training on fold [4/5]

bestTest = 0.2403288617
bestIteration = 136


Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
ordinal hä ku ba,0.238181
ordinal hä ku,0.238198
no häst,0.238242
no ku hä,0.238243
ordinal ba hä ku+hx,0.238262
all,0.239505
no kusk,0.239505
ordinal kusk,0.23964


#### Ordinal encoding häst kusk bana+history

In [25]:
# bana + h1..h5_bana share one encoder; häst and kusk are encoded separately
feature_list = ['bana'] + [f'h{n}_bana' for n in range(1, 6)]
X_enc, new_features = enc_with_history(X, feature_list, cat_features)

separately_encoded = ['häst', 'kusk']
hästen_enc = OrdinalEncoder()
X_enc[separately_encoded] = hästen_enc.fit_transform(X_enc[separately_encoded])

# eyeball a few encoded rows
print(X_enc[separately_encoded + feature_list].sample(5))

# häst and kusk are numeric now as well
for encoded_name in separately_encoded:
    new_features.remove(encoded_name)
result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal hä ku ba+hx'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


len(bana) + 5*hist  =   259764
          häst   kusk   bana  h1_bana  h2_bana  h3_bana  h4_bana  h5_bana
16705   5894.0  839.0  118.0    132.0    132.0    114.0    114.0     13.0
15586   4047.0   97.0   60.0     56.0     12.0     56.0    154.0     80.0
788     7442.0  409.0   44.0      7.0    104.0      7.0      7.0      7.0
21392  10182.0  204.0   80.0    154.0     60.0    142.0    142.0    142.0
4584    4553.0  439.0   60.0    154.0    117.0     42.0     40.0     12.0
Training on fold [0/5]

bestTest = 0.2368738348
bestIteration = 194

Training on fold [1/5]

bestTest = 0.2371998362
bestIteration = 169

Training on fold [2/5]

bestTest = 0.2437904914
bestIteration = 200

Training on fold [3/5]

bestTest = 0.2340522864
bestIteration = 435

Training on fold [4/5]

bestTest = 0.2382232008
bestIteration = 329



Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
ordinal hä ku ba,0.238181
ordinal hä ku,0.238198
no häst,0.238242
no ku hä,0.238243
ordinal hä ku ba+hx,0.238259
ordinal ba hä ku+hx,0.238262
all,0.239505
no kusk,0.239505


#### Ordinal encoding häst bana+history kusk+history

In [27]:
# Shared encoders for bana + history and for kusk + history; häst on its own
feature_list1 = ['bana'] + [f'h{n}_bana' for n in range(1, 6)]
feature_list2 = ['kusk'] + [f'h{n}_kusk' for n in range(1, 6)]

# Chain the two encodings: the second call continues from the first one's output
X_enc, new_features = enc_with_history(X, feature_list1, cat_features)
X_enc, new_features = enc_with_history(X_enc, feature_list2, new_features)

hästen_enc = OrdinalEncoder()
X_enc[['häst']] = hästen_enc.fit_transform(X_enc[['häst']])

# eyeball a few encoded rows
preview_columns = ['häst'] + feature_list1 + feature_list2
print(X_enc[preview_columns].sample(5))

# häst is numeric now as well
new_features.remove('häst')

result = catboost_cv(X_enc, y, new_features, fold_count=5)
res_dict['ordinal hä ku+hx ba+hx'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


len(bana) + 5*hist  =   259764
len(kusk) + 5*hist  =   259764
         häst   bana  h1_bana  h2_bana  h3_bana  h4_bana  h5_bana    kusk  \
20876  9103.0  159.0     35.0    117.0    117.0     80.0     35.0  1254.0   
13680  8843.0  104.0     86.0    117.0     42.0     86.0     12.0  2548.0   
33067  1031.0   35.0    104.0     44.0     44.0     44.0     35.0    90.0   
13300  5888.0   80.0    117.0    117.0    117.0     35.0    117.0  1316.0   
7600   3266.0  159.0     60.0    117.0     62.0     47.0     60.0  2031.0   

       h1_kusk  h2_kusk  h3_kusk  h4_kusk  h5_kusk  
20876   1254.0   1254.0   1254.0   2548.0   1254.0  
13680   2456.0   2548.0    217.0    771.0   2456.0  
33067     90.0     90.0     90.0   1682.0    506.0  
13300   1316.0   1316.0   1316.0   1316.0     58.0  
7600    2031.0   2031.0   1119.0   2031.0   2031.0  
Training on fold [0/5]


Custom logger is already specified. Specify more than one logger at same time is not thread safe.


bestTest = 0.2360776262
bestIteration = 217

Training on fold [1/5]

bestTest = 0.2378594089
bestIteration = 154

Training on fold [2/5]

bestTest = 0.2439321022
bestIteration = 172

Training on fold [3/5]

bestTest = 0.2339173614
bestIteration = 272

Training on fold [4/5]

bestTest = 0.2389054786
bestIteration = 197



Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
ordinal hä ku ba,0.238181
ordinal hä ku,0.238198
no häst,0.238242
no ku hä,0.238243
ordinal hä ku ba+hx,0.238259
ordinal ba hä ku+hx,0.238262
ordinal hä ku+hx ba+hx,0.238283
all,0.239505


In [28]:
# Final scoreboard, best (lowest Logloss) first
scoreboard = pd.DataFrame.from_dict(res_dict, orient='index')
scoreboard.sort_values(by=0).head(60)


Unnamed: 0,0
ordinal häst,0.238079
no ba ku hä,0.238094
ordinal hä ku ba,0.238181
ordinal hä ku,0.238198
no häst,0.238242
no ku hä,0.238243
ordinal hä ku ba+hx,0.238259
ordinal ba hä ku+hx,0.238262
ordinal hä ku+hx ba+hx,0.238283
all,0.239505


#### Feature importance med CatboostClassifier

In [29]:
# Classifier used for the feature-importance check
model = CatBoostClassifier(
    iterations=150,
    cat_features=cat_features,
    loss_function='Logloss',
    verbose=False,
)

In [30]:
# Fit on everything except 'start' and the five hX_auto columns
unused_columns = ['start'] + [f'h{n}_auto' for n in range(1, 6)]
model.fit(X.drop(columns=unused_columns), y)


<catboost.core.CatBoostClassifier at 0x1ce38bf5ab0>

In [31]:
# Best score reported by the fitted model, then its feature-importance table
best_score = model.get_best_score()
print(best_score)
importance = model.get_feature_importance(prettified=True)
importance.head(60)

{'learn': {'Logloss': 0.1877026859429181}}


Unnamed: 0,Feature Id,Importances
0,streck,38.84852
1,häst,4.631281
2,h1_perf,2.48688
3,h2_odds,2.082272
4,delta1,1.977997
5,h3_odds,1.932298
6,kr,1.91093
7,spår,1.835316
8,datum,1.776212
9,h5_kusk,1.605583
