# Testa olika settings och hyperparametrar


In [1]:
import pandas as pd
import numpy as np
import fixa_features as ff
import fixa_mer_features as ff2

from catboost import CatBoostClassifier, Pool
from catboost.utils import get_confusion_matrix 

# Load the complete data set and discard the 'tid' column.
df = pd.read_csv('mer_komplett.csv').drop(columns='tid')

# Correct possible errors via the project helper (hist=False for this run).
df = ff2.fixa_mer_features(df, hist=False)
df['lopp_dist'] = df['lopp_dist'].astype('float')

# Binary target: 1 where plac == 1, otherwise 0.
# The author notes that CatBoost works better with 0/1 than with False/True.
df['plac'] = (df['plac'] == 1) * 1

print(df['plac'].value_counts())
print(df['start'].value_counts())


(37824, 62)
fixa features start (37824, 62)
efter fixa_features (37824, 63)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 37824 entries, 0 to 37823
Data columns (total 63 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   datum      37824 non-null  object 
 1   avd        37824 non-null  int64  
 2   häst       37824 non-null  object 
 3   bana       37824 non-null  object 
 4   dist       37824 non-null  int64  
 5   h1_bana    37824 non-null  object 
 6   h1_dat     37824 non-null  object 
 7   h1_dist    37824 non-null  float64
 8   h1_kmtid   37820 non-null  object 
 9   h1_kusk    37824 non-null  object 
 10  h1_odds    37771 non-null  object 
 11  h1_plac    37817 non-null  object 
 12  h1_pris    34738 non-null  float64
 13  h1_spår    35008 non-null  float64
 14  h2_bana    37824 non-null  object 
 15  h2_dat     37824 non-null  object 
 16  h2_dist    37824 non-null  float64
 17  h2_kmtid   37821 non-null  object 
 18  h2_kusk   

In [23]:
# Choose the cut-off date for the train/test split
# (rows with datum <= mitt_datum become train, later rows become test).
# mitt_datum = df.iloc[int(df.shape[0]/2)].datum  # somewhere in the middle
# mitt_datum = '2020-10-10'  # alternative date used during initial testing
mitt_datum = '2019-03-01'  # alternative date used during initial testing
mitt_datum

'2019-03-01'

In [3]:
def get_rows_with_dates(df,n):
    """Return the rows of ``df`` that belong to its first ``n`` distinct dates.

    Dates are taken from ``df.datum`` in order of first appearance. The rows
    are grouped date by date in that order and the result gets a fresh
    0..k-1 index.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datum`` column.
    n : int
        Number of leading distinct dates to select.

    Returns
    -------
    pandas.DataFrame
        The selected rows; an empty frame with the same columns as ``df``
        when no date is selected.
    """
    dates = df.datum.unique()[:n]

    # Collect the per-date slices and concatenate once, instead of growing a
    # frame inside the loop (which re-copies every earlier row each time).
    parts = [df[df.datum == d] for d in dates]
    if parts:
        rows = pd.concat(parts, ignore_index=True)
    else:
        # Nothing selected: empty frame with df's columns.
        rows = df.iloc[:0]

    print('rows',rows.shape)
    return rows

In [4]:
def move_rows_from_A_to_B(A,B,n):
    """Move the rows of the first ``n`` dates in ``A`` to the end of ``B``.

    Parameters
    ----------
    A : pandas.DataFrame
        Source frame with a ``datum`` column.
    B : pandas.DataFrame
        Destination frame; the moved rows are appended after its rows.
    n : int
        Number of leading distinct dates to move.

    Returns
    -------
    tuple
        ``(a, b)``: ``A`` without the moved rows (original index labels
        kept) and ``B`` followed by the moved rows (index renumbered).
    """
    # Select the rows for the n first dates in A
    move = get_rows_with_dates(A,n)

    # Remove exactly the rows whose date was moved. Cutting positionally
    # (A.iloc[len(move):]) is only right when those rows sit contiguously
    # at the top of A.
    a = A[~A.datum.isin(move.datum.unique())]

    # Add to the end of B
    b = pd.concat([B,move],ignore_index=True)
    return a,b

In [5]:
# split med datum
def initsplit_my_data(df,mitt_datum):
    """Split ``df`` by date into a train part and a test part.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``datum`` column comparable to ``mitt_datum``.
    mitt_datum
        Cut-off value; rows with ``datum <= mitt_datum`` go to train,
        rows with ``datum > mitt_datum`` go to test.

    Returns
    -------
    tuple
        ``(train, test_all)``.
    """
    train = df[df.datum <= mitt_datum]
    test_all = df[df.datum > mitt_datum]

    # Sanity output: rows for which neither comparison holds end up in
    # neither part, in which case this prints False.
    print('Check that this is True',test_all.shape[0]+train.shape[0] == df.shape[0])

    return train, test_all

In [7]:
def catfit(trainPool, testPool,auto=None,use_best=False,early=200,iterations=2000):
    """Fit a CatBoostClassifier on ``trainPool`` with ``testPool`` as eval set.

    Fixed settings: ``random_seed=2021``, ``eval_metric='AUC'``,
    ``loss_function='Logloss'``, progress printed with ``verbose=100``.

    Parameters
    ----------
    trainPool
        Training data passed to ``model.fit``.
    testPool
        Evaluation data passed as ``eval_set``.
    auto
        Forwarded as ``auto_class_weights``.
    use_best
        Truthy to fit with ``use_best_model=True``, falsy for ``False``.
    early
        Forwarded as ``early_stopping_rounds``.
    iterations
        Forwarded as ``iterations``.

    Returns
    -------
    CatBoostClassifier
        The fitted model.
    """
    model = CatBoostClassifier(
        random_seed=2021,
        iterations=iterations,
        auto_class_weights=auto,
        # scale_pos_weight = 10,  (alternative to auto_class_weights tried earlier)
        eval_metric='AUC',
        loss_function='Logloss',
        early_stopping_rounds=early,
    )

    # One fit call; the flag is the only thing the former two branches
    # differed in.
    model.fit(
        trainPool,
        use_best_model=bool(use_best),
        eval_set=testPool,
        verbose=100,
    )

    return model



##  Finn bästa feature-komb (utan odds).
### Vodds, bins och podds är osäkra features eftersom de ändrar sig efter att v75 börjat.
### streck borde vara med

In [8]:


# All candidate columns; 'plac' is the column used as label when the Pools are built.
all_features = [
    'häst', 'kusk', 'bana', 'kön',
    'lopp_dist', 'dist', 'kr', 'streck',
    'bins', 'podds', 'vodds', 'pris',
    'spår', 'start', 'ålder', 'plac',
]
cat_features = ['häst', 'kusk', 'bana', 'kön']

# Features used for training: the categorical ones followed by the numeric ones.
sel_feat = [
    *cat_features,
    'lopp_dist', 'kr', 'pris', 'spår', 'start', 'streck', 'ålder',
]

# Label stored with the results of this run. Labels used for earlier runs:
#   'early, AUC, Balanced|'   'early, AUC, SqrtBal|'   'early, AUC, None|'
#   '2000, AUC, Balanced|'    '2000, AUC, Bal use_b|'  'early, F1, Balanced|'
#   'early, AUC, scale|'
text = 'allt utom odds|'

# DataFrame that stores the results of every run.
# The commented-out line creates an empty table with the expected columns;
# presumably it is only needed before 'testResults.csv' exists.
# dfRes = pd.DataFrame(columns=[ 'text', 'F1', 'Recall', 'Precision', 'Accuracy', 'AUC','b_iter','model'])
dfRes = pd.read_csv('testResults.csv')


In [24]:
# --- Date-based split, then wrap each part in a CatBoost Pool ---

train, test = initsplit_my_data(df, mitt_datum)

trainPool = Pool(
    data=train[sel_feat],
    label=train['plac'],
    cat_features=cat_features,
)
testPool = Pool(
    data=test[sel_feat],
    label=test['plac'],
    cat_features=cat_features,
)

# Show the dates on either side of the split boundary and the features in use.
print(f"train last {train['datum'].iloc[-1]} test first {test['datum'].iloc[0]}")
print(sel_feat)

Check that this is True True
train last 2019-02-23 test first 2019-03-02
['häst', 'kusk', 'bana', 'kön', 'lopp_dist', 'kr', 'pris', 'spår', 'start', 'streck', 'ålder']


- Kör med all under testningarna  
- Parametrar: auto_class_weights=Balanced/SqrtBalanced/None, eval_metric='AUC'/'F1'  
- Testa med olika iterations med och utan early 
- Kolla framför allt på F1  

In [None]:

# Settings for this training run (kept as notebook-level names).
use_best, early, iterations = True, 200, 2000

model = catfit(
    trainPool,
    testPool,
    auto='Balanced',
    use_best=use_best,
    early=early,
    iterations=iterations,
)


In [None]:
### Measure on the test pool and append one row to dfRes ###
# Named `metrics` rather than `eval` so the builtin eval() is not rebound in the kernel.
metrics = model.eval_metrics(testPool,['Accuracy','F1','Precision','Recall','AUC'])
best_iter = model.get_best_iteration()

# NOTE(review): eval_metrics presumably yields one value per iteration, so
# np.mean averages over all iterations instead of reporting the final model.
# Confirm that is intended. Left unchanged so new rows stay comparable with
# the rows already stored in testResults.csv.
# Row layout: text, F1, Recall, Precision, Accuracy, AUC, best iteration, model.
dfRes.loc[len(dfRes)] = (
    [text]
    + [round(np.mean(metrics[m]),3) for m in ('F1','Recall','Precision','Accuracy','AUC')]
    + [best_iter, model]
)

model.feature_names_

In [None]:
# Set pandas' text display width to 100 characters.
pd.set_option('display.width', 100)

# Print the label, the metrics and the best iteration of every stored run.
print(dfRes.loc[:, ['text', 'F1', 'Recall', 'Precision', 'Accuracy', 'AUC', 'b_iter']])


In [None]:

# Print the feature importance, a blank line, then the confusion matrix on the test pool.
print(model.get_feature_importance(prettified=True), end='\n\n')
print(get_confusion_matrix(model, data=testPool))

In [None]:
# Show the stored runs ranked by F1, then Recall, highest first.
# Display only: without inplace=True, dfRes itself keeps its order.
dfRes.sort_values(by=['F1','Recall'],ascending=False)

In [None]:
# Number of rows in the test part for each value of 'häst'.
test.häst.value_counts()

In [None]:
# Persist the results table (without the index) and the current model to disk.
# NOTE(review): the model file name mentions 'podds' although the feature list
# selected earlier in this notebook has no odds columns; confirm the name.
dfRes.to_csv('testResults.csv',index=False)
model.save_model('testmodell_podds')

In [None]:
all_features = [
    'häst', 'kusk', 'bana', 'kön',
    'lopp_dist', 'dist', 'kr', 'streck',
    'bins', 'podds', 'vodds', 'pris',
    'spår', 'start', 'ålder', 'plac',
]
cat_features = ['häst', 'kusk']

# Earlier, wider selection kept for reference:
# sel_feat = cat_features + ['lopp_dist','kr','bins','podds','vodds','pris','spår','start','streck','ålder']
# Current selection: only the categorical features (as a separate list object).
sel_feat = list(cat_features)


def fit(train,test,cat_features):
    """Build Pools from ``train``/``test`` and fit a model with ``catfit``.

    Parameters
    ----------
    train, test : pandas.DataFrame
        Frames holding the feature columns plus the label column ``plac``.
    cat_features : list
        Names of the categorical feature columns.

    Returns
    -------
    The model returned by ``catfit`` (Balanced class weights, best model
    kept, 200 rounds early stopping, at most 2000 iterations).
    """
    # The label column must not also be a feature: passing the whole frame
    # as data would hand the model the answer (target leakage).
    trainPool = Pool(data=train.drop(columns='plac'), label=train.plac, cat_features=cat_features)
    testPool = Pool(data=test.drop(columns='plac'), label=test.plac, cat_features=cat_features)

    model = catfit(trainPool, testPool, auto='Balanced',use_best=True,early=200,iterations=2000)

    return model



In [None]:
# Feature importance and confusion matrix for the current model on the test pool.
print(model.get_feature_importance(prettified=True))
print(get_confusion_matrix(model,testPool))

# Named `metrics` rather than `eval` so the builtin eval() is not rebound in the kernel.
metrics = model.eval_metrics(testPool,['Accuracy','F1','Precision','Recall','AUC'])
# Printed in the order F1, Recall, Precision, Accuracy (each averaged over the returned values).
print(*(round(np.mean(metrics[m]),3) for m in ('F1','Recall','Precision','Accuracy')))
# np.mean(model.eval_metrics(testPool,['AUC'])['AUC'])

In [198]:
def ettVarv(d,lr,add_features,iterations=1000,fold_count=52,early_stopping_rounds=200):
    """Run one CatBoost cross-validation round for a depth / learning rate.

    The data comes from the notebook-level frame ``test``: the columns
    'häst', 'kusk', 'bana', 'kön' (categorical) plus ``add_features``, with
    ``test.plac`` as label.
    NOTE(review): the CV runs on ``test`` rather than ``train`` - confirm
    this is intended.

    Parameters
    ----------
    d
        Tree depth (``depth``).
    lr
        Learning rate (``learning_rate``).
    add_features : list
        Extra feature columns appended to the categorical ones.
    iterations : int, default 1000
        Maximum number of boosting iterations.
    fold_count : int, default 52
        Number of folds passed to ``cv``.
    early_stopping_rounds : int, default 200
        Patience passed to ``cv``.

    Returns
    -------
    The result of ``catboost.cv`` (the caller reads columns such as
    'test-Accuracy-mean' from it).
    """
    # `cv` is not imported at the top of the notebook, so it is imported here.
    from catboost import Pool, cv

    cat_features=['häst','kusk','bana','kön']
    sel_features=cat_features + add_features

    cv_dataset = Pool(data=test[sel_features],
                    label=test.plac,
                    cat_features=cat_features)

    params = {"iterations": iterations,
            "depth": d,
            "learning_rate": lr,
            "auto_class_weights": "Balanced",
            "eval_metric": "Accuracy",
            "loss_function": "Logloss",
            "verbose": 50}

    # Time-series folds without shuffling keep the row order of the data.
    scores = cv(cv_dataset,
                params,
                shuffle=False,
                early_stopping_rounds=early_stopping_rounds,
                type='TimeSeries',
                fold_count=fold_count,
                )
    return scores

def addScores(dfcv,score,d,lr,add_features):
    """Append the best-accuracy row of a CV result to ``dfcv``.

    Parameters
    ----------
    dfcv : pandas.DataFrame
        Results table with six columns, one of them named 'accuracy'.
        It is modified in place and also returned.
    score : pandas.DataFrame
        CV result with the columns 'test-Accuracy-mean',
        'test-Logloss-mean' and 'iterations'.
    d, lr
        Depth and learning rate to record.
    add_features
        Feature list to record; a list is copied so that later changes to
        the caller's list do not alter the stored row.

    Returns
    -------
    pandas.DataFrame
        ``dfcv``, sorted by 'accuracy' in descending order.
    """
    # Use the `score` argument (not a notebook-level `scores`): first row
    # with the highest mean test accuracy.
    best = score['test-Accuracy-mean'].idxmax()
    rad = score.loc[best,['test-Accuracy-mean','test-Logloss-mean','iterations']].tolist()

    feats = list(add_features) if isinstance(add_features, list) else add_features
    dfcv.loc[len(dfcv)] = rad + [d,lr,feats]
    dfcv.sort_values(by='accuracy',ascending=False,inplace=True)

    return dfcv
             

In [202]:
add_features = ['lopp_dist','streck']

# The results table must exist before the first addScores call. Create it on
# a fresh kernel, otherwise keep accumulating into the existing one.
try:
    dfcv
except NameError:
    dfcv = pd.DataFrame(columns = ['accuracy', 'logloss', 'iter', 'depth', 'learn_r','feat'])

# Grid over tree depth and learning rate: one cross-validation round per combination.
for d in [1,2,3,4,5,6]:
    for lr in [0.01,0.03,0.06,0.08]:
        print(f'djup={d} lr={lr}')
        scores = ettVarv(d,lr,add_features)
        dfcv = addScores(dfcv,scores,d,lr,add_features)
        print(dfcv)




0:	learn: 0.7481755	test: 0.7398496	best: 0.7398496 (0)	total: 741ms	remaining: 12m 20s
50:	learn: 0.7553151	test: 0.7395696	best: 0.7423474 (16)	total: 1m	remaining: 18m 45s
100:	learn: 0.7552079	test: 0.7393950	best: 0.7423474 (16)	total: 2m 1s	remaining: 17m 58s
150:	learn: 0.7554792	test: 0.7393684	best: 0.7423474 (16)	total: 3m 2s	remaining: 17m 3s
200:	learn: 0.7561780	test: 0.7405365	best: 0.7423474 (16)	total: 4m 2s	remaining: 16m 5s
Stopped by overfitting detector  (200 iterations wait)
0:	learn: 0.7481755	test: 0.7398496	best: 0.7398496 (0)	total: 713ms	remaining: 11m 51s
50:	learn: 0.7547215	test: 0.7388854	best: 0.7413387 (12)	total: 1m	remaining: 18m 47s
100:	learn: 0.7572635	test: 0.7388189	best: 0.7413387 (12)	total: 2m 1s	remaining: 17m 59s
150:	learn: 0.7586988	test: 0.7375212	best: 0.7413387 (12)	total: 3m 2s	remaining: 17m 4s
200:	learn: 0.7598501	test: 0.7367705	best: 0.7413387 (12)	total: 4m 3s	remaining: 16m 8s
Stopped by overfitting detector  (200 iterations wait

In [200]:
# dfcv = pd.DataFrame(columns = ['accuracy', 'logloss', 'iter', 'depth', 'learn_r','feat'])
# dfcv = addScores(dfcv,scores,d,lr,add_features)

In [203]:
# Display the accumulated cross-validation results.
dfcv

Unnamed: 0,accuracy,logloss,iter,depth,learn_r,feat
6,0.744034,0.662642,3.0,2,0.03,"[lopp_dist, streck]"
3,0.743115,0.548918,28.0,1,0.06,"[lopp_dist, streck]"
5,0.743051,0.674944,6.0,2,0.01,"[lopp_dist, streck]"
17,0.742526,0.673803,6.0,5,0.01,"[lopp_dist, streck]"
1,0.742347,0.654794,16.0,1,0.01,"[lopp_dist, streck]"
19,0.742296,0.570665,13.0,5,0.06,"[lopp_dist, streck]"
11,0.742246,0.626195,4.0,3,0.06,"[lopp_dist, streck]"
15,0.742182,0.565165,15.0,4,0.06,"[lopp_dist, streck]"
0,0.742048,0.589511,34.0,1,0.02,"[lopp_dist, streck]"
7,0.741841,0.62788,4.0,2,0.06,"[lopp_dist, streck]"


In [261]:
def addFeatScore(dfcv,accuracy,logloss,iterations,d,lr,add_features):
    """Append one result row to ``dfcv`` and re-sort it by accuracy.

    Parameters
    ----------
    dfcv : pandas.DataFrame
        Results table with six columns, one of them named 'accuracy'.
        It is modified in place and also returned.
    accuracy, logloss, iterations, d, lr
        Values stored in the new row, in this order.
    add_features
        Feature list stored last in the row; a list is copied so that later
        changes to the caller's list do not alter the stored row.

    Returns
    -------
    pandas.DataFrame
        ``dfcv``, sorted by 'accuracy' in descending order.
    """
    feats = list(add_features) if isinstance(add_features, list) else add_features
    dfcv.loc[len(dfcv)] = [accuracy,logloss,iterations,d,lr,feats]
    dfcv.sort_values(by='accuracy',ascending=False,inplace=True)

    return dfcv
             

In [308]:
# Train one model with a fixed depth / learning rate and a chosen feature set.
d = 2
lr = 0.03
from catboost import CatBoostClassifier

model = CatBoostClassifier(
    learning_rate=lr,
    depth=d,
    iterations=1000,
    auto_class_weights="Balanced",
    eval_metric='Accuracy',
    loss_function='Logloss',
    verbose=100,
    early_stopping_rounds=200,
)

# Categorical features plus the extra features under evaluation.
cat_feat = ['häst', 'kusk']
add_features = ['streck']
sel_feat = [*cat_feat, *add_features]

trainPool = Pool(data=train[sel_feat], label=train['plac'], cat_features=cat_feat)
testPool = Pool(data=test[sel_feat], label=test['plac'], cat_features=cat_feat)

# `score` holds whatever fit() returns (presumably the fitted model itself, not a metric).
score = model.fit(
    trainPool,
    eval_set=testPool,
    use_best_model=True,
    verbose=100,
)
     
    

0:	learn: 0.7379974	test: 0.7317111	best: 0.7317111 (0)	total: 28.3ms	remaining: 28.3s
100:	learn: 0.7487231	test: 0.7416212	best: 0.7422004 (84)	total: 4.79s	remaining: 42.6s
200:	learn: 0.7501086	test: 0.7426632	best: 0.7433806 (191)	total: 9.44s	remaining: 37.5s
300:	learn: 0.7511740	test: 0.7374985	best: 0.7433806 (191)	total: 14s	remaining: 32.4s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.743380622
bestIteration = 191

Shrink model to first 192 iterations.


In [309]:
# Best validation scores of the model fitted above.
validation_best = model.get_best_score()['validation']
accuracy = validation_best['Accuracy']
Logloss = validation_best['Logloss']

# Earlier variant, based on a CV result instead of a single fit:
# dfcv=addScores(dfcv,score,d,lr,add_features)

# Record this run; addFeatScore works on a copy, which then replaces dfcv.
dfcv = addFeatScore(
    dfcv.copy(),
    accuracy,
    Logloss,
    model.best_iteration_,
    d,
    lr,
    add_features,
)
dfcv

Unnamed: 0,accuracy,logloss,iter,depth,learn_r,feat
30,0.744343,0.530295,206.0,2,0.03,"[streck, spår]"
6,0.744034,0.662642,3.0,2,0.03,"[lopp_dist, streck]"
29,0.743928,0.530184,214.0,2,0.03,"[streck, spår, ålder]"
28,0.743915,0.53068,209.0,2,0.03,"[kr, streck, spår, ålder]"
32,0.743913,0.532516,116.0,2,0.03,[streck]
25,0.743895,0.530435,223.0,2,0.03,"[lopp_dist, kr, streck, pris, spår, start, ålder]"
26,0.74363,0.531666,127.0,2,0.03,"[kr, streck, pris, spår, start, ålder]"
27,0.743467,0.530939,168.0,2,0.03,"[kr, streck, pris, spår, ålder]"
33,0.743381,0.533167,191.0,2,0.03,[streck]
3,0.743115,0.548918,28.0,1,0.06,"[lopp_dist, streck]"


In [310]:
feat_imp=model.get_feature_importance(prettified=True)
print(feat_imp)

# Candidate to drop ("bort"): first column of the last row of the table, i.e.
# the feature id listed last. Purely positional access avoids indexing a
# string-labelled row with the integer 0, which pandas has deprecated.
bort=feat_imp.iloc[-1, 0]
# add_features.remove(bort)
add_features

  Feature Id  Importances
0     streck    91.910658
1       häst     4.870891
2       kusk     3.218451


['streck']

In [251]:
# Display the accumulated cross-validation results.
dfcv

Unnamed: 0,accuracy,logloss,iter,depth,learn_r,feat
6,0.744034,0.662642,3.0,2,0.03,"[lopp_dist, streck]"
3,0.743115,0.548918,28.0,1,0.06,"[lopp_dist, streck]"
5,0.743051,0.674944,6.0,2,0.01,"[lopp_dist, streck]"
17,0.742526,0.673803,6.0,5,0.01,"[lopp_dist, streck]"
1,0.742347,0.654794,16.0,1,0.01,"[lopp_dist, streck]"
19,0.742296,0.570665,13.0,5,0.06,"[lopp_dist, streck]"
11,0.742246,0.626195,4.0,3,0.06,"[lopp_dist, streck]"
15,0.742182,0.565165,15.0,4,0.06,"[lopp_dist, streck]"
0,0.742048,0.589511,34.0,1,0.02,"[lopp_dist, streck]"
7,0.741841,0.62788,4.0,2,0.06,"[lopp_dist, streck]"
