# Stacking development

In [1]:
!pip install -q catboost 

import pandas as pd
import numpy as np
import logging
import sys

import catboost
from catboost import CatBoostClassifier
from catboost import Pool

import sklearn
from sklearn import svm  
from sklearn.ensemble import RandomForestClassifier as RF
from sklearn.neighbors import KNeighborsClassifier as KNN
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import precision_score

sys.path.append('C:\\Users/peter/Google Drive/Colab Notebooks/Småprojekt/')
import fixa_features as ff
import class_model as mcl

import time

print("sklearn version", sklearn.__version__)
print("numpy version  ", np.__version__)

sklearn version 0.21.3
numpy version   1.16.5


# Data

In [2]:
def read_and_fix(): #df = read and fix data
    filnamn='C:\\Users\\peter\\Documents\\MyProjects\\PyProj\\Trav\\travlopp\\komplett.csv'
    df=pd.read_csv(filnamn)
    df = ff.fix_features(df, True)
    df['vann']=df.plac==1.0
    df['vann'] *= 1
    df.drop('plac',axis=1,inplace=True)
    #df['avd'] = 1
    #df['vann'] = False
    df.reset_index(inplace=True,drop=True)
    A = df.sample(frac=0.55)
    B = df.drop(A.index)

    return A,B

# def bins(df):
#   bins=pd.DataFrame({
#    'fr':[1.0, 2.1,  2.9,   3.6,  4.1,  4.8,
#          5.6, 6.3,  7.1,   7.9,  8.7,  9.7,
#         10.7, 11.9, 13.2, 14.8, 16.5, 18.1,
#         20.2, 22.5, 25.2, 28.0, 31.5, 35.3,
#         39.6, 44.7, 50.0, 56.7, 64.3, 75.4],
                           
#     'to':[2.1, 2.9,   3.6,  4.1,  4.8, 5.6, 
#           6.3, 7.1,   7.9,  8.7,  9.7,10.7, 
#           11.9, 13.2, 14.8, 16.5, 18.1,20.2, 
#           22.5, 25.2, 28.0, 31.5, 35.3,39.6, 
#           44.7, 50.0, 56.7, 64.3, 75.4, 2000 ],

#     'bin':[0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
#           10,11,12,13,14,15,16,17,18,19,
#           20,21,22,23,24,25,26,27,28,29]
#        })

  
#   val = bins.loc[:,'fr':'to'].apply(tuple,1).tolist()
#   indx = pd.IntervalIndex.from_tuples(val, closed='right')
#   df['bins'] = bins.loc[indx.get_indexer(df['vodds']),'bin'].values

#   return df


# class

In [3]:
# create pool for CatBoost
def create_pool(X_train,y_train,X_test,y_test,cat_features):
    
    train_pool = Pool(
        cat_features=cat_features,
        data=X_train, 
        label=y_train,
    )       
    if X_test is None:
        test_pool = None 
    else:
        test_pool = Pool(
            cat_features=cat_features,
            data=X_test,
            label=y_test
        )
    return train_pool, test_pool         

# Kelly algorithm

In [4]:
# ((odds-1) * percentage_estimate -  (1-percent_estimate))   / (odds-1)  X   100
def kelly(pred,odds):
    return ( ((odds-1) * pred - (1-pred)) / (odds-1) * 100)


# Modeller för första lagret i stacking

In [5]:
## CatBoost
def Cat_model(A,B,the_odds,path,häst_suf='h'): #häst_suf 'h' om häst är med, 'u' om inte
    params={
        'med': {'colsample_bylevel': 0.683, 'depth': 2, 'l2_leaf_reg': 1.189,       #target: -0.2467: 50-50; vinst=29
                   'leaf_estimation_iterations': 1, 'learning_rate': 0.0786, 
                   'min_data_in_leaf': 54, 'subsample': 0.9164},
        'utan':  {'colsample_bylevel': 0.2316, 'depth': 1.1, 'l2_leaf_reg': 19.79,  #target: -0.2532:  61-39; vinst=1665
                  'leaf_estimation_iterations': 2.3, 'learning_rate': 0.1748, 'min_data_in_leaf': 5.5, 'subsample': 0.8097}}
    params[the_odds]['depth']=int(round(params[the_odds]['depth']))
    params[the_odds]['leaf_estimation_iterations']=int(round(params[the_odds]['leaf_estimation_iterations']))
    params[the_odds]['min_data_in_leaf']=int(round(params[the_odds]['min_data_in_leaf']))

    aa = A.copy()
    bb = B.copy()

    if häst_suf=='u':
        aa.drop('häst',axis=1,inplace=True)
        bb.drop('häst',axis=1,inplace=True)

    A_X_train, A_X_test, A_y_train, A_y_test = train_test_split(aa.drop(['datum','vann'],axis=1), aa.vann, 
        test_size=0.2, random_state=202006)
  
    file_name='Cat1'+häst_suf+'_stacking'
    if the_odds=='utan':
        file_name='Cat2'+häst_suf+'_stacking'      
        A_X_train.drop('vodds',axis=1,inplace=True)
        A_X_test.drop('vodds',axis=1,inplace=True)
        bb.drop('vodds',axis=1,inplace=True)

    # preparera data till pool
    cat_features = ['start','spår','h1_spår','h2_spår','h3_spår','h4_spår','h5_spår','häst']
    if häst_suf=='u':
        cat_features = ['start','spår','h1_spår','h2_spår','h3_spår','h4_spår','h5_spår']

    A_y_train = A_y_train*1
    A_y_test =A_y_test*1
    train_pool, test_pool = create_pool(A_X_train,A_y_train,A_X_test,A_y_test,cat_features)
    
    cb = CatBoostClassifier(  # CatB-modellen
      iterations=4000,                 
      **params[the_odds],
      scale_pos_weight = 1,
      eval_metric = 'Logloss',
      early_stopping_rounds=100,
    )

    # fit av A
    cb.fit(train_pool, 
      eval_set = test_pool,
      use_best_model=True,
      verbose =False
    )
    
    spara(cb,path+file_name)

    B_X_pool,_=create_pool(bb.drop(['datum','vann'], axis=1),bb.vann, None,None,cat_features)
    
    # predict av B
    return cb,cb.predict_proba(B_X_pool)[:,1]

## Ramdom Forest
def rf_model(A,B,the_odds,path):
    params= {'med':{'n_estimators': 219, 'min_samples_leaf': 9,'min_samples_split': 10, 'max_depth': None,}, #Logloss: -0.9068
        'utan':{'max_depth': 100, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 200}, #Logloss: -0.9061
    }
    aa=A.copy()
    bb=B.copy()
    file_name='rf1_stacking'
    if the_odds=='utan':
        file_name='rf2_stacking'
        aa.drop('vodds',axis=1,inplace=True)
        bb.drop('vodds',axis=1,inplace=True)

    rf = RF(**params[the_odds]) # rf-modellen

    # fit av A
    rf.fit(aa.drop(['datum','häst','vann'],axis=1), aa.vann)

    spara(rf, PATH+file_name)
    
    # predict av B
    return rf,rf.predict_proba(bb.drop(['datum','häst','vann'],axis=1))[:,1]

## Ramdom Forest med 30 bins
def rf30_model(A,B,the_odds,path):
    params= {'med':{'n_estimators': 219, 'min_samples_leaf': 9,'min_samples_split': 10, 'max_depth': None,}, #Logloss: -0.9068
        'utan':{'max_depth': 100, 'min_samples_leaf': 2, 'min_samples_split': 3, 'n_estimators': 200}, #Logloss: -0.9061
    }
    aa=A.copy()
    bb=B.copy()
    file_name='rf1_30_stacking'
    if the_odds=='utan':
        file_name='rf2_30_stacking'
        aa.drop('vodds',axis=1,inplace=True)
        bb.drop('vodds',axis=1,inplace=True)

    rf30 = RF(**params[the_odds]) # rf30-modellen

    # fit av A
    rf30.fit(aa.drop(['datum','häst','vann'],axis=1), aa.vann)

    spara(rf30, PATH+file_name)
    
    # predict av B
    return rf30,rf30.predict_proba(bb.drop(['datum','häst','vann'],axis=1))[:,1]

## knn
def knn_model(A,B,the_odds,path):
    params= {'med':{'leaf_size': 1, 'n_neighbors': 8, 'p': 2},       #target': 0.3273015873015873, precision
            'med3':{'leaf_size': 43, 'n_neighbors': 25, 'p': 1},    #target': 0.9058454402452014, accuracy
            'med2':{'leaf_size': 22, 'n_neighbors': 23, 'p': 2},    #'target': 0.9057963965963539 accuracy
            'utan':{'leaf_size': 14, 'n_neighbors': 16, 'p': 1.0}  #target': 0.31, precision. (44-56) men vinst totalt ändå!
    }
    aa=A.copy()
    bb=B.copy()

    file_name='knn1_stacking'
    if the_odds=='utan':
        file_name='knn2_stacking'
        aa.drop('vodds',axis=1,inplace=True)
        bb.drop('vodds',axis=1,inplace=True)

    knn = KNN(**params[the_odds]) # knn-modellen
    
    # fit av A
    knn.fit(aa.drop(['datum','häst','vann'],axis=1), aa.vann)

    spara(knn, PATH+file_name)

    # predict av B
    return knn,knn.predict_proba(bb.drop(['datum','häst','vann'],axis=1))[:,1]


# spara/ladda Modell

In [6]:
def spara(modelname, path):
    import pickle    
    pickle.dump(modelname, open(path, 'wb'))
    print(modelname,'\nsparad på', path)

def ladda(modelname, path):
    import pickle    
    loaded_model = pickle.load(open(path+modelname, 'rb'))
    return loaded_model


# Träna första lagret
### Fit av A och predict av B ger C

In [7]:
PATH = F"C:/Users/peter/Documents/MyProjects/PyProj/Trav/Modeller/"
A,B = read_and_fix()

C=B[['vann','vodds','start','spår']]

#### catb
print('startar CatB')
#print('kan ta bort Cat1 och Cat2 som är samma som Cat1h och Cat2h ')
#Cat1,C['Cat1'] = Cat_model(A,B, 'med',PATH)*C.vodds
#Cat2,C['Cat2'] = Cat_model(A,B, 'utan',PATH)*C.vodds

Cat1h,C['Cat1h'] = Cat_model(A,B, 'med',PATH,häst_suf='h')
#C.Cat1h *= C.vodds
C.Cat1h = kelly(C.Cat1h, C.vodds)
Cat2h,C['Cat2h'] = Cat_model(A,B, 'utan',PATH,häst_suf='h')
#C.Cat2h *= C.vodds
C.Cat2h = kelly(C.Cat2h, C.vodds)
Cat2u,C['Cat2u'] = Cat_model(A,B, 'utan',PATH,häst_suf='u')
#C.Cat2u *= C.vodds
C.Cat2u = kelly(C.Cat2u, C.vodds)

##### rf
print('startar rf')
rf1,C['rf1'] = rf_model(A,B, 'med',PATH)
#C.rf1 *= C.vodds
C.rf1 = kelly(C.rf1, C.vodds)
rf2,C['rf2'] = rf_model(A,B, 'utan',PATH)
#C.rf2 *= C.vodds
C.rf2 = kelly(C.rf2, C.vodds)

##### rf30
print('startar rf30')
rf130,C['rf130'] = rf30_model(A,B, 'med',PATH)
#C.rf130 *= C.vodds
C.rf130 = kelly(C.rf130, C.vodds)

rf230,C['rf230'] = rf30_model(A,B, 'utan',PATH)
#C.rf230 *= C.vodds
C.rf230 = kelly(C.rf230, C.vodds)

##### knn
print('startar knn')
knn1,C['knn1'] = knn_model(A,B, 'med',PATH)
#C.knn1 *= C.vodds
C.knn1 = kelly(C.knn1, C.vodds)

knn2,C['knn2'] = knn_model(A,B, 'utan',PATH)
#C.knn2 *= C.vodds
C.knn2 = kelly(C.knn2, C.vodds)

C.sample(10)
  

fixa features start (27449, 28)
startar CatB
<catboost.core.CatBoostClassifier object at 0x0000014ADF727AC8> 
sparad på C:/Users/peter/Documents/MyProjects/PyProj/Trav/Modeller/Cat1h_stacking
<catboost.core.CatBoostClassifier object at 0x0000014ADF728A88> 
sparad på C:/Users/peter/Documents/MyProjects/PyProj/Trav/Modeller/Cat2h_stacking
<catboost.core.CatBoostClassifier object at 0x0000014ADF71DA08> 
sparad på C:/Users/peter/Documents/MyProjects/PyProj/Trav/Modeller/Cat2u_stacking
startar rf
RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=9, min_samples_split=10,
                       min_weight_fraction_leaf=0.0, n_estimators=219,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False) 
sparad på 

Unnamed: 0,vann,vodds,start,spår,Cat1h,Cat2h,Cat2u,rf1,rf2,rf130,rf230,knn1,knn2
10959,0,3.63,0,1,1.880681,3.84877,9.507993,10.987117,7.478707,16.385285,1.395023,-38.022814,-12.143536
18842,0,35.12,0,8,-0.445476,-0.484681,-0.587746,-0.893158,-0.949414,-0.732906,-0.228898,9.935522,3.502345
22009,0,4.37,0,4,1.078971,-2.853741,7.346626,7.857213,11.676848,8.91925,21.059658,2.744807,-13.464392
10201,0,10.21,1,3,-4.741249,-3.908146,-2.882637,-4.633272,-8.492798,-5.426209,-5.767544,16.856678,9.928067
6099,0,54.71,0,9,0.217508,-0.019949,0.210488,-0.543694,0.302714,-0.572286,0.396087,-1.861851,4.504515
18400,0,44.83,0,10,-0.407635,-0.619284,-0.29242,-0.213544,1.140019,0.261384,1.588109,10.50365,4.111054
11460,0,32.2,0,7,-0.450262,-0.425971,-0.103524,-1.999795,-1.485043,-1.775494,-2.689103,-3.205128,-3.205128
1257,0,33.08,0,10,-0.952069,-0.854123,-0.577426,-0.774312,0.405964,-1.369881,0.277068,9.772444,9.772444
3786,0,8.1,0,9,-4.425097,-6.884797,-6.566575,-7.215518,-5.856841,-7.779465,-7.115845,-14.084507,-14.084507
11812,0,3.04,0,1,-6.241117,-7.163519,-9.159525,-2.259883,-14.668814,-11.538709,-12.822035,-49.019608,-49.019608


# Träna sista lagret i stacking
### fit av C och evaluera mot OMGÅNGAR

In [8]:
## Sista lagret
print(C.shape)

if True:  #Träna
    print('Träna upp en ny final_model')
    ## SVC
    #{‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’, ‘precomputed’}, default=’rbf’
    params={'kernel':'linear'}
    final_model = svm.SVC(**params, probability=True, random_state = 202006, verbose=True)
    print("starta fit")
    final_model.fit(C.drop(['vann'],axis=1), C.vann) 
    
    ## rf
    #final_model2 = RF(n_estimators=100)
    #final_model2.fit(C.drop(['vann','vodds'],axis=1), C.vann)
    
    spara(final_model, PATH+'final_model_L_stacking')
else:
    print('Ladda befintlig final_model')
    final_model = ladda('final_model_L_stacking',PATH)    

## rf
final_model2 = RF(n_estimators=500)
final_model2.fit(C.drop(['vann','vodds'],axis=1), C.vann)
        
  

(10136, 13)
Träna upp en ny final_model
starta fit
[LibSVM]SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=True, random_state=202006,
    shrinking=True, tol=0.001, verbose=True) 
sparad på C:/Users/peter/Documents/MyProjects/PyProj/Trav/Modeller/final_model_L_stacking


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=500,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

## evaluera mot OMGÅNGAR som först läses in till df2

In [12]:
def read_omgångar(omgångar):
    avd=[1,2,3,4,5,6,7]
    path='C:\\Users/peter/Documents/MyProjects/PyProj/Trav/'
    stravd = ''.join(str(x) for x in avd)

    df2=pd.DataFrame()
    for key,omg in omgångar.items():
        omg_sav = omg.replace('/V75/','-')
        filnamn=path+omg_sav+stravd+'test.csv'
        tmp = pd.read_csv(filnamn)
        # tmp.spår = tmp.spår.astype('int')
        # tmp.h1_spår = tmp.h1_spår.astype('int')
        # tmp.h2_spår = tmp.h2_spår.astype('int')
        # tmp.h3_spår = tmp.h3_spår.astype('int')
        # tmp.h4_spår = tmp.h4_spår.astype('int')
        # tmp.h5_spår = tmp.h5_spår.astype('int')
        # tmp=ff.bins(tmp)
        
        # spara_data(tmp,path,omg_sav,avd)
        if sum(tmp.vann) == 0:
            print(omg,'saknar vinnare')
            continue
    
        df2 = pd.concat([df2,tmp])
        
    
    df2.reset_index(inplace=True,drop=True)

    return(df2)
    
def spara_data(df,path, omgsav, avd):
    stravd = ''.join(str(x) for x in avd)
    filnamn=path+omgsav+stravd+'test.csv'
    df.to_csv(filnamn, index=False)

## Evaluera stacken

In [11]:

OMGÅNGAR = {
    1: '2020-04-25/V75/aby',
    2: '2020-05-02/V75/orebro',
    3: '2020-04-18/V75/umaker',
    4: '2020-05-09/V75/aby', 
    5: '2020-05-16/V75/mantorp', 
    6: '2020-05-23/V75/gavle',
    7: '2020-05-30/V75/solvalla',
    8: '2020-06-06/V75/ostersund',
    9: '2020-06-13/V75/boden',
    10: '2020-06-14/V75/bjerke',   
    11: '2020-07-04/V75/halmstad',   
    12: '2020-06-21/V75/kalmar',   
    13: '2020-07-11/V75/arjang',  
    14: '2020-07-19/V75/axevalla',  
     0: '2020-07-25/V75/bollnas',  
} ##### Sätt alltid senaste omgång till 0 och döp om när ny 0:a kommer
  
df2 = read_omgångar(OMGÅNGAR)
# läs in alla MODELLER behövs ju inte nu men sedan

## df3 blir första lagrets predict av df2

In [13]:
####TESTA ATT TA MED start, spår, dist MED ELLER UTAN vodds
# Första lagret

df2.vann *= 1
df3=df2[['vann','vodds','start','spår']]
#df3['Cat1_res'] = Cat1.predict_proba(df2.drop(['datum','avd','vann'],axis=1))[:,1]
#df3['Cat2_res'] = Cat2.predict_proba(df2.drop(['datum','avd','vodds','vann'],axis=1))[:,1]
cath_features = ['start','spår','h1_spår','h2_spår','h3_spår','h4_spår','h5_spår','häst']

h1_pool, _ = create_pool(df2.drop(['datum','avd','vann'],axis=1),df2.vann,None,None,cath_features)    
#df3['Cat1h'] = Cat1h.predict_proba(h1_pool)[:,1]*df3.vodds
df3['Cat1h'] = kelly(Cat1h.predict_proba(h1_pool)[:,1], df3.vodds)

h2_pool, _ = create_pool(df2.drop(['datum','avd','vann','vodds'],axis=1),df2.vann,None,None,cath_features)    
#df3['Cat2h']  = Cat2h.predict_proba(h2_pool)[:,1]*df3.vodds
df3['Cat2h'] = kelly(Cat2h.predict_proba(h2_pool)[:,1], df3.vodds)

catu_features = ['start','spår','h1_spår','h2_spår','h3_spår','h4_spår','h5_spår']
u2_pool, _ = create_pool(df2.drop(['datum','avd','vann','vodds'],axis=1),df2.vann,None,None,cath_features)    
#df3['Cat2u'] = Cat2u.predict_proba(u2_pool)[:,1]*df3.vodds
df3['Cat2u'] = kelly(Cat2u.predict_proba(u2_pool)[:,1], df3.vodds)

#df3['rf1'] = rf1.predict_proba(df2.drop(['datum','avd','häst','vann'],axis=1))[:,1]*df3.vodds
df3['rf1'] = kelly(rf1.predict_proba(df2.drop(['datum','avd','häst','vann'],axis=1))[:,1], df3.vodds)
#df3['rf2'] = rf2.predict_proba(df2.drop(['datum','avd','häst','vodds','vann'],axis=1))[:,1]*df3.vodds
df3['rf2'] = kelly(rf2.predict_proba(df2.drop(['datum','avd','häst','vodds','vann'],axis=1))[:,1], df3.vodds)

#df3['rf130'] = rf130.predict_proba(df2.drop(['datum','avd','häst','vann'],axis=1))[:,1]*df3.vodds
df3['rf130'] = kelly( rf130.predict_proba(df2.drop(['datum','avd','häst','vann'],axis=1))[:,1], df3.vodds)
#df3['rf230'] = rf230.predict_proba(df2.drop(['datum','avd','häst','vodds','vann'],axis=1))[:,1]*df3.vodds
df3['rf230'] = kelly( rf230.predict_proba(df2.drop(['datum','avd','häst','vodds','vann'],axis=1))[:,1], df3.vodds)

#df3['knn1'] = knn1.predict_proba(df2.drop(['datum','avd','häst','vann'],axis=1))[:,1]*df3.vodds
df3['knn1'] = kelly( knn1.predict_proba(df2.drop( ['datum','avd','häst','vann'],axis=1 ))[:,1], df3.vodds)
#df3['knn2'] = (knn2.predict_proba(df2.drop(['datum','avd','häst','vodds','vann'],axis=1))[:,1])*df3.vodds
df3['knn2'] = kelly( knn2.predict_proba(df2.drop( ['datum','avd','häst','vodds','vann'],axis=1 ))[:,1], df3.vodds)

df3.sample(50)


Unnamed: 0,vann,vodds,start,spår,Cat1h,Cat2h,Cat2u,rf1,rf2,rf130,rf230,knn1,knn2
1001,0,81.95,0,6,0.896496,0.990535,1.010093,19.517843,40.365159,18.719788,39.710143,-1.23533,5.091878
176,0,33.97,0,5,0.219513,0.588267,0.091911,21.71333,41.038105,22.99267,38.793456,-3.03306,3.406506
395,0,30.53,1,4,-0.560924,-0.554594,0.145062,10.805884,28.930475,10.223029,30.647222,-3.386387,3.075262
939,0,25.47,1,1,-0.417202,0.070097,0.46382,4.55575,37.603159,5.891285,35.943349,8.924193,2.418778
469,0,45.27,0,9,0.283205,0.245433,-0.262797,30.856918,48.546747,33.509854,46.62635,-2.258866,4.132313
579,0,45.52,1,7,0.129458,-0.033205,0.152462,12.493259,33.916102,10.919148,30.988088,-2.246181,4.144205
109,0,12.71,1,5,-1.247967,-1.005842,-1.736514,-0.521454,24.935746,-0.075036,23.507716,-8.53971,-1.755978
301,0,7.12,1,3,-5.914667,-5.555384,-5.446327,12.517122,22.653377,11.192101,20.776473,-1.797386,-9.068627
653,0,154.31,1,7,1.420762,1.001659,1.24761,6.224115,30.214424,6.300536,33.284318,-0.652273,5.638494
193,0,53.98,0,12,-0.059857,-0.37023,-0.255918,20.509145,38.503613,22.158651,38.225646,23.584371,4.480464


## Sista lagret - gör predict av df3 och räkna ut vinst

In [14]:
model=final_model
droppa = ['vann']
# model=final_model2
# droppa = ['vann','vodds']

pred=model.predict_proba(df3.drop(droppa,axis=1))[:,1]
#print(pred)
#spela=pred>0.5
#spela=((df3.vodds*pred)>1.5) & ((df3.vodds*pred)<1.68)
spela = pred>=0.5

utd = df3[spela].vann*df3[spela].vodds
tot_utd = utd.sum()
vinst = tot_utd - sum(spela)
print('antal hästar =',len(spela),'antal satsade =',sum(spela),'vinst =',round(vinst,1))
print('per satsad:', round(vinst/sum(spela),2) )
print()
print("Accuracy: %.4g" % model.score(df3.drop(droppa,axis=1), df3.vann))

predZ = model.predict(df3.drop(droppa,axis=1))

#print("Accuracy: %.4g" % accuracy_score(df3.vann, predZ))
print('Precision: %4g' % precision_score(df3.vann,predZ))
print(metrics.confusion_matrix(df3.vann,predZ))


antal hästar = 1115 antal satsade = 10 vinst = 4.5
per satsad: 0.45

Accuracy: 0.9085
Precision:    0
[[1013    0]
 [ 102    0]]


# Kelly criterion
## ((odds-1) * percentage_estimate -  (1-percent_estimate))   / (odds-1)  X   100

In [17]:
df4=df3[['vodds','vann']]
mod_res=['Cat1h','Cat2h','Cat2u','rf1','rf2','rf130','rf230','knn1','knn2']

for res in mod_res:
    df4[res] = df3[res] / df3.vodds

    df4['kelly'] = ((df4.vodds-1) * df4[res] - (1-df4[res])) / (df4.vodds-1) * 100
    max=-9999
    #print(df4.kelly.describe())
    for thresh in range(5,40):
        utdelning = sum(df4[(df4.vann==1)&(df4.kelly>=thresh)].vodds)
        insats = sum(df4.kelly>thresh)
        if insats == 0:
            break
        max = utdelning-insats if utdelning-insats > max else max
        if utdelning-insats >0:
            print('vinst =',utdelning-insats, 'om thresh = ',thresh )

    print(res,max)

Cat1h -6.849999999999998
Cat2h -11.350000000000001
Cat2u -4.840000000000003
rf1 -150.23999999999995
rf2 -339.6699999999996
rf130 -141.70999999999998
rf230 -343.2199999999997
vinst = 1.6299999999999955 om thresh =  20
vinst = 7.569999999999993 om thresh =  24
vinst = 10.569999999999993 om thresh =  25
vinst = 12.569999999999993 om thresh =  26
vinst = 6.170000000000016 om thresh =  31
vinst = 3.8500000000000085 om thresh =  34
vinst = 14.850000000000009 om thresh =  35
vinst = 19.85000000000001 om thresh =  36
vinst = 33.85000000000001 om thresh =  37
knn1 33.85000000000001
vinst = 0.07000000000000028 om thresh =  31
vinst = 0.07000000000000028 om thresh =  32
vinst = 2.0700000000000003 om thresh =  33
vinst = 2.0700000000000003 om thresh =  34
vinst = 2.0700000000000003 om thresh =  35
vinst = 3.0700000000000003 om thresh =  36
knn2 3.0700000000000003


In [14]:
odds=1.58
est = 0.7
((odds-1) * est - (1-est)) /(odds-1) *100

18.275862068965512