# kolla med cv mina val:
1. Hur långt tillbaka skall jag köra learn på
2. Finns det någon avkodning av vissa categories som är bättre för catboost
   1.  Häst
   2.  Kusk
   3.  Bana
   4.  Ekipage
3. En gång för alla kolla med färre features


## import och förbered catboost classifier

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from catboost import CatBoostClassifier, Pool, cv

In [12]:
"""
cross validation with catboost
"""
def catboost_cv(X, y, cat_features, fold_count=5):
    """Cross-validate a CatBoost Logloss model and return CatBoost's result table.

    Parameters
    ----------
    X : feature table handed to ``Pool`` as ``data``.
    y : labels handed to ``Pool`` as ``label``.
    cat_features : names of the columns CatBoost should treat as categorical.
    fold_count : number of folds passed on to ``cv`` (default 5).

    Returns
    -------
    Whatever ``catboost.cv`` returns; callers in this notebook read its
    ``'test-Logloss-mean'`` column.
    """
    training_params = dict(iterations=1000, loss_function="Logloss", verbose=False)

    # shuffle=False together with type='TimeSeries': the row order of X is kept
    # as-is (presumably the rows are expected to be in chronological order).
    return cv(
        Pool(data=X, label=y, cat_features=cat_features),
        training_params,
        fold_count=fold_count,
        early_stopping_rounds=100,
        shuffle=False,
        type='TimeSeries',
        verbose=False,
    )


In [5]:
def preproc_bana(X_):
    """Return a copy of ``X_`` with normalised track-name columns.

    The columns ``bana`` and ``h1_bana`` .. ``h5_bana`` are lower-cased and every
    ``-<digits>`` fragment is removed from them. ``X_`` itself is not modified.
    """
    track_columns = ['bana'] + [f'h{k}_bana' for k in range(1, 6)]
    X = X_.copy()

    for column in track_columns:
        lowered = X[column].str.lower()
        # drop every "-<digits>" fragment from the track name
        X[column] = lowered.replace(to_replace=r'-\d+', value='', regex=True)

    return X


In [None]:
# Read the complete data set from the CSV file one directory up.
df = pd.read_csv('../all_data.csv')
# Parse the datum column into pandas datetimes.
df['datum'] = pd.to_datetime(df['datum'])
# Normalise bana / hX_bana (lower case, "-<digits>" removed) via preproc_bana.
df = preproc_bana(df)
# Quick overview of what was loaded.
print(df.shape)
print(df.columns)

In [15]:
# Feature matrix: everything except the target (plac) and columns that are not used
# as features. startnr is dropped because it has too many NaN; vodds, podds and
# bins are dropped because they are not known before v75.
X = df.drop(
    columns=[
        'plac', 'avd', 'startnr', 'vodds', 'podds', 'bins',
        'h1_dat', 'h2_dat', 'h3_dat', 'h4_dat', 'h5_dat',
    ]
)
# Binary target: 1 where plac == 1, otherwise 0.
y = df['plac'].eq(1) * 1

# Missing values per feature; display the 20 columns with the most NaN.
all_na = X.isna().sum()
all_na.loc[all_na > 0].sort_values(ascending=False).head(20)


h5_perf    5372
h5_pris    5372
h4_perf    4950
h4_pris    4950
h3_perf    4521
h3_pris    4521
h2_pris    4050
h2_perf    4050
h1_perf    3456
h1_pris    3450
h3_spår    3147
h4_spår    3147
h5_spår    3147
h1_spår    3147
h2_spår    3147
h5_odds     695
h4_odds     464
h3_odds     321
h2_odds     258
h1_odds     252
dtype: int64

### Vilka är numeric resp cat_features

In [16]:
# Split the column names by dtype: numeric columns vs. object (string) columns.
num_features = X.select_dtypes(include=[np.number]).columns.tolist()
cat_features = X.select_dtypes(include=['object']).columns.tolist()
# Sanity check: numeric + categorical + the single datum column should cover all columns.
print(f'Rätt längd på summan (med datum)? {1+len(num_features)+len(cat_features)} == {len(X.columns)}')
# Categorical columns that still contain NaN, most affected first.
print(
    'NaN before:\n',
    X[cat_features].isna().sum().loc[lambda counts: counts > 0].sort_values(ascending=False),
)

# Replace every NaN in the categorical columns with the literal category 'missing'.
X[cat_features] = X[cat_features].fillna('missing')
print('\nNaN after:', X[cat_features].isna().sum().sum())


Rätt längd på summan (med datum)? 68 == 68
NaN before:
 h5_kusk    89
h5_bana    89
h4_kusk    28
h4_bana    28
h3_kusk     6
h3_bana     6
dtype: int64

NaN after: 0


## CV with 'everything'

In [17]:
# res_dict collects one score per experiment (label -> best mean test Logloss);
# it is filled by the cells below.
res_dict = {}
# Baseline: cross-validate with every feature in X.
result = catboost_cv(X, y, cat_features)

Training on fold [0/5]

bestTest = 0.2380287779
bestIteration = 164

Training on fold [1/5]

bestTest = 0.23992888
bestIteration = 100

Training on fold [2/5]

bestTest = 0.244946823
bestIteration = 140

Training on fold [3/5]

bestTest = 0.2359198961
bestIteration = 199

Training on fold [4/5]

bestTest = 0.239452743
bestIteration = 122



In [18]:
# Store the lowest mean test Logloss of the baseline run under the label 'all'.
res_dict['all']=result['test-Logloss-mean'].min()
# Show all collected scores, best (lowest) first.
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)


Unnamed: 0,0
all,0.239883


## CV with only the latest 30%, 40%, 50%, 70% and 90% of the race dates


In [19]:
def make_cv_with_fraction_X(X_, y_, fraction):
    """Cross-validate on only the most recent ``fraction`` of the distinct dates.

    Parameters
    ----------
    X_ : feature DataFrame with a ``datum`` column.
    y_ : labels, indexed like ``X_``.
    fraction : share of the distinct ``datum`` values to keep, counted from the
        latest date backwards (e.g. 0.3 keeps the latest 30 % of the dates).

    Returns
    -------
    The result of ``catboost_cv`` on the selected rows. The categorical columns
    are taken from the notebook-level ``cat_features`` list.

    Raises
    ------
    ValueError
        If ``X_`` contains no ``datum`` values at all.
    """
    # Sort the distinct dates explicitly: unique values come back in order of
    # appearance, so "the last N dates" is only correct on pre-sorted data.
    alla_datum = X_['datum'].dropna().drop_duplicates().sort_values().to_numpy()
    if len(alla_datum) == 0:
        raise ValueError('X_ has no datum values to select from')

    # Keep at least one date: a count of 0 would turn the slice [-0:] into
    # "all dates", silently ignoring a small fraction.
    n_datum = min(len(alla_datum), max(1, int(len(alla_datum) * fraction)))
    first_datum = alla_datum[-n_datum]
    print(fraction, pd.to_datetime(first_datum).strftime('%Y-%m-%d'))

    # Keep only rows on or after the first selected date; the row order is untouched.
    keep = X_['datum'] >= first_datum
    return catboost_cv(X_[keep], y_[keep], cat_features)


In [20]:
# Repeat the cross-validation on only the most recent share of the race dates and
# record the best mean test Logloss of each run under its percentage label.
for label, fraction in [('30%', 0.3), ('40%', 0.4), ('50%', 0.5), ('70%', 0.7), ('90%', 0.9)]:
    result = make_cv_with_fraction_X(X, y, fraction)
    res_dict[label] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)


0.3 2019-09-08
Training on fold [0/5]

bestTest = 0.2442034325
bestIteration = 205

Training on fold [1/5]

bestTest = 0.2352766134
bestIteration = 172

Training on fold [2/5]

bestTest = 0.2381395018
bestIteration = 214

Training on fold [3/5]

bestTest = 0.2424352941
bestIteration = 164

Training on fold [4/5]

bestTest = 0.2403729544
bestIteration = 236

0.4 2019-01-26
Training on fold [0/5]

bestTest = 0.2383191991
bestIteration = 115

Training on fold [1/5]

bestTest = 0.2452785723
bestIteration = 110

Training on fold [2/5]

bestTest = 0.2293667962
bestIteration = 177

Training on fold [3/5]

bestTest = 0.2467057256
bestIteration = 137

Training on fold [4/5]

bestTest = 0.2377057033
bestIteration = 160

0.5 2018-06-30
Training on fold [0/5]

bestTest = 0.2503477142
bestIteration = 134

Training on fold [1/5]

bestTest = 0.241554618
bestIteration = 163

Training on fold [2/5]

bestTest = 0.2361269687
bestIteration = 127

Training on fold [3/5]

bestTest = 0.2393890038
bestIterati

Unnamed: 0,0
40%,0.239839
all,0.239883
30%,0.240214
70%,0.240818
90%,0.240902
50%,0.241682


In [21]:
# Experiment: cross-validate without the start column and the five hX_auto columns.
result = catboost_cv(X.drop(['start','h1_auto', 'h2_auto', 'h3_auto', 'h4_auto', 'h5_auto'],axis=1), y, cat_features)
res_dict['no start n auto'] = result['test-Logloss-mean'].min()
# Show all collected scores, best (lowest) first.
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)


Training on fold [0/5]

bestTest = 0.2367801418
bestIteration = 116

Training on fold [1/5]

bestTest = 0.2398924295
bestIteration = 110

Training on fold [2/5]

bestTest = 0.2444457459
bestIteration = 123

Training on fold [3/5]

bestTest = 0.2358003369
bestIteration = 126

Training on fold [4/5]

bestTest = 0.2396659724
bestIteration = 150



Unnamed: 0,0
no start n auto,0.239469
40%,0.239839
all,0.239883
30%,0.240214
70%,0.240818
90%,0.240902
50%,0.241682


In [22]:
# Experiment: keep only the columns listed below (no history columns at all and
# no lopp_dist).
X_no_history = X[['datum', 'bana', 'häst', 'kusk', 'streck', 'kr', 'spår', 'dist',
                  'ålder', 'kön', 'pris']]
new_features = ['bana', 'häst', 'kusk', 'kön'] # categorical columns left once the history features are removed

result = catboost_cv(X_no_history, y, new_features)
res_dict['no hist n start'] = result['test-Logloss-mean'].min()
# The score table is displayed by the next cell.


Training on fold [0/5]

bestTest = 0.2393184817
bestIteration = 105

Training on fold [1/5]

bestTest = 0.2394258268
bestIteration = 103

Training on fold [2/5]

bestTest = 0.2448267215
bestIteration = 108

Training on fold [3/5]

bestTest = 0.2358675383
bestIteration = 112

Training on fold [4/5]

bestTest = 0.2395962938
bestIteration = 119



In [23]:
# Show all collected scores, best (lowest) first.
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)

Unnamed: 0,0
no start n auto,0.239469
40%,0.239839
all,0.239883
no hist n start,0.239947
30%,0.240214
70%,0.240818
90%,0.240902
50%,0.241682


In [24]:
def remove_some(X, y, feature:list):
    """Cross-validate without the given categorical columns and record the score.

    ``feature`` lists the columns to drop from ``X``; each one must be present in
    the notebook-level ``cat_features`` list (``list.remove`` raises ``ValueError``
    otherwise). The best mean test Logloss is stored in the notebook-level
    ``res_dict`` under ``'no <name>'`` for a single column, or ``'no '`` followed
    by the first two letters of every column otherwise, and the score table is
    printed.
    """
    remaining_cat = list(cat_features)
    for dropped in feature:
        remaining_cat.remove(dropped)

    cv_table = catboost_cv(X.drop(columns=feature), y, remaining_cat)

    # one column: use its full name; several columns: abbreviate each to two letters
    name_parts = feature if len(feature) == 1 else [name[:2] for name in feature]

    res_dict['no ' + ' '.join(name_parts)] = cv_table['test-Logloss-mean'].min()
    summary = pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)
    print(summary.head(60))

# Drop one categorical column at a time: track, driver, horse.
remove_some(X, y, ['bana'])
remove_some(X, y, ['kusk'])
remove_some(X, y, ['häst'])


Training on fold [0/5]

bestTest = 0.2371301776
bestIteration = 154

Training on fold [1/5]

bestTest = 0.2400510117
bestIteration = 117

Training on fold [2/5]

bestTest = 0.2447782827
bestIteration = 130

Training on fold [3/5]

bestTest = 0.2357595829
bestIteration = 154

Training on fold [4/5]

bestTest = 0.2391986078
bestIteration = 128

                        0
no start n auto  0.239469
no bana          0.239501
40%              0.239839
all              0.239883
no hist n start  0.239947
30%              0.240214
70%              0.240818
90%              0.240902
50%              0.241682
Training on fold [0/5]

bestTest = 0.2373646825
bestIteration = 148

Training on fold [1/5]

bestTest = 0.2399660352
bestIteration = 117

Training on fold [2/5]

bestTest = 0.2450255224
bestIteration = 122

Training on fold [3/5]

bestTest = 0.2357887437
bestIteration = 145

Training on fold [4/5]

bestTest = 0.2392444836
bestIteration = 142

                        0
no start n auto  0.23946

In [25]:
# Drop several categorical columns together: all three, then only driver + horse.
remove_some(X, y, ['bana', 'kusk', 'häst'])
remove_some(X, y, ['kusk', 'häst'])


Training on fold [0/5]

bestTest = 0.2361367233
bestIteration = 235

Training on fold [1/5]

bestTest = 0.2386417869
bestIteration = 187

Training on fold [2/5]

bestTest = 0.2434960129
bestIteration = 198

Training on fold [3/5]

bestTest = 0.2333568147
bestIteration = 666

Training on fold [4/5]

bestTest = 0.2374694756
bestIteration = 304

                        0
no ba ku hä      0.238023
no häst          0.238109
no start n auto  0.239469
no bana          0.239501
no kusk          0.239623
40%              0.239839
all              0.239883
no hist n start  0.239947
30%              0.240214
70%              0.240818
90%              0.240902
50%              0.241682
Training on fold [0/5]

bestTest = 0.235713314
bestIteration = 245

Training on fold [1/5]

bestTest = 0.2387362026
bestIteration = 178

Training on fold [2/5]

bestTest = 0.2440711011
bestIteration = 162

Training on fold [3/5]

bestTest = 0.2332189506
bestIteration = 324

Training on fold [4/5]

bestTest = 0.23760

#### Ordinal encoding häst


In [26]:
from sklearn.preprocessing import OrdinalEncoder

# Experiment: feed häst to CatBoost as an ordinal code instead of a categorical column.
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc['häst'] = enc.fit_transform(X[['häst']]).ravel()

# häst is numeric now, so it is no longer declared as a categorical feature
new_features = list(cat_features)
new_features.remove('häst')

result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal häst'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0)




Training on fold [0/5]

bestTest = 0.2362187659
bestIteration = 247

Training on fold [1/5]

bestTest = 0.2381703335
bestIteration = 214

Training on fold [2/5]

bestTest = 0.2435865623
bestIteration = 205

Training on fold [3/5]

bestTest = 0.2344190463
bestIteration = 302

Training on fold [4/5]

bestTest = 0.2373776084
bestIteration = 259



Unnamed: 0,0
no ba ku hä,0.238023
no ku hä,0.238032
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469
no bana,0.239501
no kusk,0.239623
40%,0.239839
all,0.239883
no hist n start,0.239947


#### Ordinal encoding bana

In [27]:
from sklearn.preprocessing import OrdinalEncoder

# Experiment: feed bana to CatBoost as an ordinal code instead of a categorical column.
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc['bana'] = enc.fit_transform(X[['bana']]).ravel()

# bana is numeric now, so it is no longer declared as a categorical feature
new_features = list(cat_features)
new_features.remove('bana')

result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal bana'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


Training on fold [0/5]

bestTest = 0.2370389074
bestIteration = 189

Training on fold [1/5]

bestTest = 0.2402572491
bestIteration = 120

Training on fold [2/5]

bestTest = 0.2452569497
bestIteration = 112

Training on fold [3/5]

bestTest = 0.2353023173
bestIteration = 140

Training on fold [4/5]

bestTest = 0.2394716293
bestIteration = 116



Unnamed: 0,0
no ba ku hä,0.238023
no ku hä,0.238032
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469
no bana,0.239501
no kusk,0.239623
ordinal bana,0.239668
40%,0.239839
all,0.239883


#### Ordinal encoding kusk

In [28]:
from sklearn.preprocessing import OrdinalEncoder

# Experiment: feed kusk to CatBoost as an ordinal code instead of a categorical column.
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc['kusk'] = enc.fit_transform(X[['kusk']]).ravel()

# kusk is numeric now, so it is no longer declared as a categorical feature
new_features = list(cat_features)
new_features.remove('kusk')

result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal kusk'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


Training on fold [0/5]

bestTest = 0.2372388068
bestIteration = 137

Training on fold [1/5]

bestTest = 0.2397508443
bestIteration = 113

Training on fold [2/5]

bestTest = 0.2451080026
bestIteration = 146

Training on fold [3/5]

bestTest = 0.2351098487
bestIteration = 176

Training on fold [4/5]

bestTest = 0.2393094596
bestIteration = 133



Unnamed: 0,0
no ba ku hä,0.238023
no ku hä,0.238032
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469
no bana,0.239501
ordinal kusk,0.239513
no kusk,0.239623
ordinal bana,0.239668
40%,0.239839


#### Ordinal encoding häst, kusk, bana

In [29]:
from sklearn.preprocessing import OrdinalEncoder

# Experiment: ordinal codes for häst, kusk and bana (each column gets its own code table).
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc[['häst', 'kusk', 'bana']] = enc.fit_transform(X[['häst', 'kusk', 'bana']])

# the three encoded columns are numeric now and leave the categorical list
new_features = list(cat_features)
for encoded in ('häst', 'kusk', 'bana'):
    new_features.remove(encoded)

result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal hä ku ba'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)

Training on fold [0/5]

bestTest = 0.2357451736
bestIteration = 294

Training on fold [1/5]

bestTest = 0.2384243412
bestIteration = 176

Training on fold [2/5]

bestTest = 0.2436997983
bestIteration = 211

Training on fold [3/5]

bestTest = 0.2335643763
bestIteration = 428

Training on fold [4/5]

bestTest = 0.2378945014
bestIteration = 388



Unnamed: 0,0
no ba ku hä,0.238023
no ku hä,0.238032
ordinal hä ku ba,0.238052
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469
no bana,0.239501
ordinal kusk,0.239513
no kusk,0.239623
ordinal bana,0.239668


#### Ordinal encoding häst, kusk

In [30]:
from sklearn.preprocessing import OrdinalEncoder

# Experiment: ordinal codes for häst and kusk (each column gets its own code table).
enc = OrdinalEncoder()
X_enc = X.copy()
X_enc[['häst', 'kusk']] = enc.fit_transform(X[['häst', 'kusk']])

# the two encoded columns are numeric now and leave the categorical list
new_features = list(cat_features)
for encoded in ('häst', 'kusk'):
    new_features.remove(encoded)

result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal hä ku'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


Training on fold [0/5]

bestTest = 0.2358295616
bestIteration = 245

Training on fold [1/5]

bestTest = 0.2384715178
bestIteration = 148

Training on fold [2/5]

bestTest = 0.2435704078
bestIteration = 265

Training on fold [3/5]

bestTest = 0.2334404063
bestIteration = 496

Training on fold [4/5]

bestTest = 0.2374391597
bestIteration = 269



Unnamed: 0,0
ordinal hä ku,0.237897
no ba ku hä,0.238023
no ku hä,0.238032
ordinal hä ku ba,0.238052
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469
no bana,0.239501
ordinal kusk,0.239513
no kusk,0.239623


#### cv utan datum

In [31]:
# Experiment: cross-validate without the datum column.
result = catboost_cv(X.drop(['datum'], axis=1), y, cat_features)
res_dict['no datum'] = result['test-Logloss-mean'].min()
# Show the (up to 60) best scores collected so far, lowest first.
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)

Training on fold [0/5]

bestTest = 0.2378763161
bestIteration = 128

Training on fold [1/5]

bestTest = 0.2402611312
bestIteration = 97

Training on fold [2/5]

bestTest = 0.2451000355
bestIteration = 101

Training on fold [3/5]

bestTest = 0.2352896069
bestIteration = 123

Training on fold [4/5]

bestTest = 0.2394867741
bestIteration = 115



Unnamed: 0,0
ordinal hä ku,0.237897
no ba ku hä,0.238023
no ku hä,0.238032
ordinal hä ku ba,0.238052
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469
no bana,0.239501
ordinal kusk,0.239513
no kusk,0.239623


#### Function: Ordinal encoding feature + hx_feature

In [32]:
from sklearn.preprocessing import OrdinalEncoder
def enc_with_history(X_, feature:list, cat_features:list):
    """Ordinal-encode a column and its history columns with one shared code table.

    All columns named in ``feature`` are stacked on top of each other to fit a
    single ``OrdinalEncoder``, so the same value (e.g. the same driver) gets the
    same code in every one of them.

    Parameters
    ----------
    X_ : DataFrame holding all columns named in ``feature``; it is not modified.
    feature : column names, the current column first followed by its history
        columns (e.g. ``['kusk', 'h1_kusk', ..., 'h5_kusk']``). Any number of
        history columns is accepted.
    cat_features : current list of categorical column names; it is not modified.

    Returns
    -------
    (X, new_features) : a copy of ``X_`` with the ``feature`` columns replaced by
        their codes, and a copy of ``cat_features`` without those columns.

    Raises
    ------
    ValueError
        If ``feature`` is empty, or names a column missing from ``cat_features``.
    """
    if len(feature) == 0:
        raise ValueError('feature must name at least one column')

    X = X_.copy()
    base = feature[0]
    # total number of values the shared encoder is fitted on
    print(f'len({base}) + {len(feature) - 1}*hist  =  ', len(X) * len(feature))

    # Give every column the name of the first one: the encoder is fitted on a
    # frame with that single column name and is handed identically named frames
    # when transforming.
    same_named = [X[[col]].rename(columns={col: base}) for col in feature]

    enc = OrdinalEncoder()
    enc.fit(pd.concat(same_named, axis=0, ignore_index=True))

    for col, values in zip(feature, same_named):
        # transform returns an (n, 1) array; take its only column
        X[col] = enc.transform(values)[:, 0]

    # the encoded columns are numeric now and leave the categorical list
    new_features = cat_features.copy()
    for f in feature:
        new_features.remove(f)

    return X, new_features


#### Ordinal encoding bana, häst kusk+history

In [33]:

# Experiment: shared ordinal codes for kusk + its history columns, and separate
# ordinal codes for häst and bana.
feature_list = ['kusk', 'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk']

X_enc, new_features = enc_with_history(X, feature_list, cat_features)

# Despite its name this encoder handles both häst and bana.
hästen_enc = OrdinalEncoder()
X_enc[['häst','bana']]=hästen_enc.fit_transform(X_enc[['häst','bana']])

# Spot-check five random rows of the encoded columns.
print(X_enc[['häst','bana','kusk', 'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk']].sample(5))

# häst and bana are numeric now, so they leave the categorical list as well.
new_features.remove('häst')
new_features.remove('bana')
result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal ba hä ku+hx'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


len(kusk) + 5*hist  =   261306
         häst  bana    kusk  h1_kusk  h2_kusk  h3_kusk  h4_kusk  h5_kusk
40028  5677.0   0.0  1120.0   1120.0   1120.0   1120.0    534.0   2017.0
43280  9572.0  17.0  2295.0   2295.0   1494.0   2594.0   2295.0   1494.0
6013   3268.0  10.0   548.0    548.0    548.0   2552.0    548.0    548.0
39680   933.0  27.0  1685.0   1685.0   1685.0   1685.0   1685.0    272.0
3058   3064.0   0.0  2570.0   2570.0   2570.0   2570.0   2570.0   2570.0
Training on fold [0/5]

bestTest = 0.236020944
bestIteration = 217

Training on fold [1/5]

bestTest = 0.2380887028
bestIteration = 268

Training on fold [2/5]

bestTest = 0.2439888781
bestIteration = 215

Training on fold [3/5]

bestTest = 0.2333378535
bestIteration = 191

Training on fold [4/5]

bestTest = 0.2376759691
bestIteration = 243



Unnamed: 0,0
ordinal ba hä ku+hx,0.237891
ordinal hä ku,0.237897
no ba ku hä,0.238023
no ku hä,0.238032
ordinal hä ku ba,0.238052
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469
no bana,0.239501
ordinal kusk,0.239513


#### Ordinal encoding bana+ hx_bana

In [34]:
# Experiment: shared ordinal codes for bana + its history columns only.
feature_list = ['bana', 'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']

X_enc, new_features = enc_with_history(X, feature_list, cat_features)

# häst and kusk are deliberately left as categorical columns in this run.

# Spot-check five random rows.
print(X_enc[['häst', 'kusk'] + feature_list].sample(5))

result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal ba+hx'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


len(bana) + 5*hist  =   261306
                  häst                 kusk   bana  h1_bana  h2_bana  h3_bana  \
36565     IES ELISABET    Carl Johan Jepson   60.0    104.0     47.0     47.0   
37582       RALLY HANS    Rikard N Skoglund   44.0    117.0    159.0     44.0   
25072  SEVEN AND SEVEN    Carl Johan Jepson  154.0     60.0     12.0    117.0   
5988   ORDER BY KEEPER        Per Linderoth   44.0    104.0     12.0    117.0   
31964   JEPPAS PICASSO  Susanne H Osterling    7.0      7.0    132.0      7.0   

       h4_bana  h5_bana  
36565    117.0    154.0  
37582    117.0    117.0  
25072    154.0     60.0  
5988      60.0    104.0  
31964      7.0      7.0  
Training on fold [0/5]

bestTest = 0.2365071165
bestIteration = 134

Training on fold [1/5]

bestTest = 0.2397251704
bestIteration = 101

Training on fold [2/5]

bestTest = 0.24512046
bestIteration = 133

Training on fold [3/5]

bestTest = 0.2356584679
bestIteration = 128

Training on fold [4/5]

bestTest = 0.2397732297
best

Unnamed: 0,0
ordinal ba hä ku+hx,0.237891
ordinal hä ku,0.237897
no ba ku hä,0.238023
no ku hä,0.238032
ordinal hä ku ba,0.238052
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469
no bana,0.239501
ordinal kusk,0.239513


#### Ordinal encoding häst kusk bana+history

In [35]:
# Experiment: shared ordinal codes for bana + its history columns, and separate
# ordinal codes for häst and kusk.
feature_list = ['bana', 'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']

X_enc, new_features = enc_with_history(X, feature_list, cat_features)

# Despite its name this encoder handles both häst and kusk.
hästen_enc = OrdinalEncoder()
X_enc[['häst', 'kusk']] = hästen_enc.fit_transform(X_enc[['häst', 'kusk']])

# Spot-check five random rows of the encoded columns.
print(X_enc[['häst', 'kusk'] + feature_list].sample(5))

# häst and kusk are numeric now, so they leave the categorical list as well.
new_features.remove('häst')
new_features.remove('kusk')
result = catboost_cv(X_enc, y, new_features)
res_dict['ordinal hä ku ba+hx'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


len(bana) + 5*hist  =   261306
         häst   kusk  bana  h1_bana  h2_bana  h3_bana  h4_bana  h5_bana
40929  7595.0  495.0  42.0     13.0     80.0     44.0    159.0    117.0
21762  5633.0  833.0  42.0      4.0     60.0    154.0    117.0    159.0
38129  9786.0    1.0  60.0    154.0    154.0    154.0     60.0    154.0
28074  6717.0  178.0  24.0    146.0    146.0    146.0      8.0      8.0
20507  4592.0  423.0  60.0     47.0    117.0     62.0     60.0    117.0
Training on fold [0/5]

bestTest = 0.2361011641
bestIteration = 222

Training on fold [1/5]

bestTest = 0.2381958672
bestIteration = 171

Training on fold [2/5]

bestTest = 0.2435744657
bestIteration = 214

Training on fold [3/5]

bestTest = 0.2331721135
bestIteration = 275

Training on fold [4/5]

bestTest = 0.2374124201
bestIteration = 242



Unnamed: 0,0
ordinal hä ku ba+hx,0.237765
ordinal ba hä ku+hx,0.237891
ordinal hä ku,0.237897
no ba ku hä,0.238023
no ku hä,0.238032
ordinal hä ku ba,0.238052
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469
no bana,0.239501


#### Ordinal encoding häst bana+history kusk+history

In [36]:
# Experiment: shared ordinal codes for bana + history and for kusk + history,
# plus separate ordinal codes for häst.
feature_list1 = ['bana', 'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']
feature_list2 = ['kusk', 'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk']


# Encode the two column groups one after the other; the second call continues
# from the frame and categorical list returned by the first.
X_enc, new_features = enc_with_history(X, feature_list1, cat_features)
X_enc, new_features = enc_with_history(X_enc, feature_list2, new_features)

hästen_enc = OrdinalEncoder()
X_enc[['häst']] = hästen_enc.fit_transform(X_enc[['häst']])

# Spot-check five random rows of the encoded columns.
print(X_enc[['häst'] + feature_list1+feature_list2].sample(5))

# häst is numeric now, so it leaves the categorical list as well.
new_features.remove('häst')

# The trailing 5 is fold_count, i.e. the same value as the default.
result = catboost_cv(X_enc, y, new_features,5)
res_dict['ordinal hä ku+hx ba+hx'] = result['test-Logloss-mean'].min()
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


len(bana) + 5*hist  =   261306
len(kusk) + 5*hist  =   261306
          häst   bana  h1_bana  h2_bana  h3_bana  h4_bana  h5_bana    kusk  \
12944   4158.0   44.0    117.0    154.0     42.0     80.0    117.0   395.0   
27057   7996.0  117.0     80.0     47.0     47.0    154.0     60.0  2035.0   
27758   6219.0    7.0    104.0     35.0    117.0    117.0     35.0  2650.0   
28174    935.0  117.0     80.0      4.0     42.0    154.0    154.0  2640.0   
39861  10184.0  104.0     80.0     35.0      7.0     35.0      7.0   534.0   

       h1_kusk  h2_kusk  h3_kusk  h4_kusk  h5_kusk  
12944    395.0    395.0    395.0    395.0    395.0  
27057    817.0   2035.0   2035.0   2035.0   2035.0  
27758   2650.0    398.0   2650.0   2650.0   2650.0  
28174   1113.0   1566.0   1113.0   2395.0   1039.0  
39861    534.0    534.0   1180.0   1180.0   1757.0  
Training on fold [0/5]

bestTest = 0.2357807752
bestIteration = 181

Training on fold [1/5]

bestTest = 0.2380040392
bestIteration = 153

Training on f

Unnamed: 0,0
ordinal hä ku ba+hx,0.237765
ordinal ba hä ku+hx,0.237891
ordinal hä ku,0.237897
ordinal hä ku+hx ba+hx,0.237958
no ba ku hä,0.238023
no ku hä,0.238032
ordinal hä ku ba,0.238052
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469


In [37]:
# Final overview: the (up to 60) best scores of all experiments, lowest first.
pd.DataFrame.from_dict(res_dict, orient='index').sort_values(by=0).head(60)


Unnamed: 0,0
ordinal hä ku ba+hx,0.237765
ordinal ba hä ku+hx,0.237891
ordinal hä ku,0.237897
ordinal hä ku+hx ba+hx,0.237958
no ba ku hä,0.238023
no ku hä,0.238032
ordinal hä ku ba,0.238052
ordinal häst,0.238075
no häst,0.238109
no start n auto,0.239469


#### Feature importance med CatboostClassifier

In [38]:
# Classifier used only to inspect feature importances: 150 iterations, Logloss,
# with the notebook's categorical columns declared as such.
model=CatBoostClassifier(iterations=150,cat_features=cat_features,loss_function='Logloss',verbose=False)

In [39]:
# Fit on all rows, without the start column and the five hX_auto columns
# (the same column set as the 'no start n auto' experiment).
model.fit(X.drop(['start','h1_auto', 'h2_auto', 'h3_auto',
          'h4_auto', 'h5_auto'], axis=1), y)


<catboost.core.CatBoostClassifier at 0x1d1ae0a6340>

In [40]:
# Best score recorded during fitting (training data only, see the 'learn' key
# in the output).
print(model.get_best_score())
# Feature importances as a table.
importance = model.get_feature_importance(prettified=True)
# Show the first (up to 60) rows of the importance table.
importance.head(60)

{'learn': {'Logloss': 0.1884515958390656}}


Unnamed: 0,Feature Id,Importances
0,streck,39.813205
1,häst,5.0712
2,spår,2.406556
3,h5_odds,2.287268
4,delta4,2.055578
5,h1_perf,1.876698
6,h3_odds,1.819091
7,delta1,1.721481
8,h1_kmtid,1.676074
9,senast,1.650721


# Kolla en sorterad lista med proba och kelly

In [154]:
# moduler
import pandas as pd
import numpy as np
from catboost import CatBoostClassifier, Pool, cv
import flaml
import pickle
from IPython.display import display
# Load the two pickled models that are compared below.
# NOTE: pickle.load can execute arbitrary code, so only load model files you trust.
# The relative paths use forward slashes so they resolve on Windows and on
# other platforms alike (backslash-separated paths only work on Windows).
with open('../modeller/FLAML_model.sav', 'rb') as f:
    model = pickle.load(f)
with open('../modeller/FLAML2_model.sav', 'rb') as f:
    model2 = pickle.load(f)

pd.set_option('display.width', 100)


In [3]:
def remove_features(df_, remove_mer=None):
    """Return a copy of ``df_`` without the columns that are never used as features.

    Parameters
    ----------
    df_ : DataFrame containing at least the always-dropped columns ``avd``,
        ``startnr``, ``vodds``, ``podds``, ``bins`` and ``h1_dat`` .. ``h5_dat``.
        It is not modified.
    remove_mer : optional extra column name, or list-like of column names, to
        drop as well. ``None`` (the default) or an empty collection drops
        nothing extra.

    Returns
    -------
    A new DataFrame without the dropped columns.

    Raises
    ------
    KeyError
        If any column to drop is missing from ``df_``.
    """
    always_dropped = ['avd', 'startnr', 'vodds', 'podds', 'bins', 'h1_dat',
                      'h2_dat', 'h3_dat', 'h4_dat', 'h5_dat']
    df = df_.drop(columns=always_dropped)

    if remove_mer is None:
        return df

    # Normalise to a plain list so that any list-like (list, tuple, Index,
    # array) works; truth-testing an Index or array directly would raise.
    if pd.api.types.is_list_like(remove_mer):
        extra = list(remove_mer)
    else:
        extra = [remove_mer] if remove_mer else []

    if extra:
        df = df.drop(columns=extra)

    return df


def load_enc():
    """Load and return the pickled encoder stored in ``../modeller/encoder.sav``.

    The path is relative to the current working directory and uses forward
    slashes only, so it resolves on Windows and on other platforms alike.
    """
    # NOTE: pickle.load can execute arbitrary code, so only load files you trust.
    with open('../modeller/encoder.sav', 'rb') as file:
        return pickle.load(file)
def ordinal_enc(df_, feature):
    """Return a copy of ``df_`` with column ``feature`` encoded by the saved encoder.

    The encoder comes from ``load_enc()`` and is only applied, never re-fitted.
    ``df_`` itself is not modified.
    """
    enc = load_enc()
    df = df_.copy()
    # transform once and store the result (an earlier, discarded call was redundant)
    df[[feature]] = enc.transform(df[[feature]])
    return df

In [4]:
# skapa data
df = pd.read_csv('..\\all_data.csv')
X=df.copy()
X.drop('plac', axis=1, inplace=True)
X = ordinal_enc(X, 'häst')
y=(df.plac==1)*1   # plac 1 eller 0


In [160]:
# Predict with both models on the same feature set (X without the columns that
# remove_features drops).
proba = model.predict_proba(remove_features(X))
proba2 = model2.predict_proba(remove_features(X))
# Keep the second predict_proba column of each model
# (presumably the probability of class 1 — confirm against the models' classes_).
X['proba'] = proba[:,1]
X['proba2'] = proba2[:, 1]


In [161]:
def kelly(proba, streck, odds, max_odds=40):
    """Compute a Kelly-style stake fraction per horse.

    Parameters
    ----------
    proba : estimated probability of winning (Series or array).
    streck : betting share in percent; only used when ``odds`` is None, in which
        case the odds are taken as ``100 / streck``.
    odds : odds to use directly, or None to derive them from ``streck``.
    max_odds : odds above this value are replaced by 1 (default 40, the value
        that used to be hard-coded). With odds of 1 the result becomes
        ``2 * proba - 1``, i.e. strongly negative for unlikely winners.

    Returns
    -------
    ``(o * proba - (1 - proba)) / o`` element-wise, where ``o`` is the capped
    odds. Neither ``streck`` nor ``odds`` is modified.

    NOTE(review): the expression treats ``o`` as *net* odds, while the
    commented-out formula in the calling cell, ``(proba*vodds - 1) / (vodds - 1)``,
    treats the odds as decimal odds. Confirm which one is intended.
    """
    if odds is None:
        # the division already creates a new object, so streck stays untouched
        o = 100 / streck
    else:
        o = odds.copy()

    # cap the long shots: every odds value above max_odds is set to 1
    o[o > max_odds] = 1
    return (o * proba - (1 - proba)) / o
            

# Load the saved scrape; the forward-slash path also works outside Windows.
# The file is expected to already contain a proba column (it is read below
# before anything assigns it).
Xtst = pd.read_csv('../sparad_scrape.csv')

# Alternative Kelly formula based on decimal odds, kept for reference:
#   f = (proba*vodds - 1) / (vodds - 1)
# Kelly value from the stored proba; streck+1 keeps the derived odds finite at streck == 0.
Xtst['kellys'] = kelly(Xtst.proba, Xtst.streck+1, None)
# NOTE(review): at this point Xtst also holds proba and kellys, and remove_features
# does not drop them — confirm that model2 ignores columns it was not trained on.
proba2 = model2.predict_proba(remove_features(Xtst))
Xtst['proba2'] = proba2[:, 1]
Xtst['kelly2'] = kelly(Xtst.proba2, Xtst.streck+1, None)


In [162]:
# Compare both models side by side: probability and Kelly value per horse.
Xtst[['avd', 'startnr', 'proba','kellys','proba2','kelly2']]

Unnamed: 0,avd,startnr,proba,kellys,proba2,kelly2
0,1,2,0.407517,0.152750,0.301184,0.000693
1,1,6,0.134074,-0.030452,0.326964,0.199087
2,1,5,0.122549,-0.044167,0.259322,0.118593
3,1,11,0.054934,-0.011221,0.093266,0.029795
4,1,8,0.052131,-0.014220,0.047735,-0.018923
...,...,...,...,...,...,...
78,7,1,0.036045,0.007126,0.049132,0.020606
79,7,11,0.034413,-0.931174,0.041891,-0.916219
80,7,7,0.033703,-0.932594,0.050503,-0.898995
81,7,8,0.033485,-0.933031,0.036666,-0.926667


## plocka hästar efter proba och kelly till en df
1. Plocka hästar upp till 300:-
2. Minst en häst per avd

In [163]:
def compute_total_insats(df):
    """Return the stake of a system that contains every row of ``df``.

    The stake is the product of the number of rows per ``avd``, divided by two.
    An empty frame (or one without an ``avd`` column) costs 0.0.
    """
    if df.empty or 'avd' not in df.columns:
        return 0.0
    return df.groupby('avd').avd.count().prod() / 2


def fill_up_dfSpel(df_, max_insats=300, metod='proba', tolerance=30):
    """Pick the best-ranked rows until the system stake would exceed the budget.

    Parameters
    ----------
    df_ : DataFrame with an ``avd`` column and the ranking column(s); not modified.
    max_insats : target stake.
    metod : column name, or list of column names, to rank by (highest first).
    tolerance : how far the stake may exceed ``max_insats`` (default 30, the
        value that used to be hard-coded).

    Returns
    -------
    The top-ranked rows, in ranking order, whose stake is at most
    ``max_insats + tolerance``. Index labels and column dtypes of ``df_`` are
    kept. The final stake is printed.
    """
    # A scalar ``ascending`` works for one or several ranking columns alike.
    ranked = df_.sort_values(by=metod, ascending=False)

    # Grow the selection one row at a time (slicing instead of DataFrame.append,
    # which no longer exists in pandas 2) and stop at the first row that would
    # push the stake over the limit.
    n_keep = 0
    for n_rows in range(1, len(ranked) + 1):
        if compute_total_insats(ranked.iloc[:n_rows]) > max_insats + tolerance:
            break
        n_keep = n_rows

    dfSpel = ranked.iloc[:n_keep].copy()
    print(compute_total_insats(dfSpel))
    return dfSpel


In [164]:
# Select horses ranked by kelly2 for a target stake of 300.
dfSpel = fill_up_dfSpel(Xtst, max_insats=300, metod=['kelly2'])
# List the selected start numbers per avd.
list(dfSpel[['avd','startnr']].groupby('avd'))


180.0


[(1.0,
     avd  startnr
  1  1.0      6.0
  2  1.0      5.0),
 (2.0,
      avd  startnr
  11  2.0      4.0
  12  2.0      3.0
  13  2.0      1.0
  16  2.0      6.0),
 (3.0,
      avd  startnr
  25  3.0      9.0
  28  3.0      5.0
  24  3.0      1.0
  22  3.0      2.0
  23  3.0     12.0),
 (4.0,
      avd  startnr
  35  4.0     15.0),
 (5.0,
      avd  startnr
  49  5.0      4.0),
 (6.0,
      avd  startnr
  59  6.0      1.0
  60  6.0      6.0
  64  6.0      7.0),
 (7.0,
      avd  startnr
  71  7.0      4.0
  72  7.0      5.0
  73  7.0      3.0)]