# Learn v75 med walkthrough-metoden

In [1]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier,Pool,cv,utils

import sys
# Make the local scraping module importable.
# A raw string is used because the previous literal contained the invalid escape
# sequence "\p", which newer Python versions warn about; the resulting path is identical.
# NOTE(review): this is a machine-specific absolute path — consider making it configurable.
sys.path.append(r'C:\Users\peter\Documents\MyProjects\PyProj\Trav\spel')
import V75_scraping as vs

In [2]:
### Return a model with its parameters already set
def get_model(d=6,l2=2,iterations=3000,use_best=True,verbose=False):
    """Build an unfitted CatBoostClassifier with the notebook's standard settings.

    Parameters
    ----------
    d : forwarded to the classifier as ``depth``.
    l2 : forwarded as ``l2_leaf_reg``.
    iterations : forwarded as ``iterations``.
    use_best : forwarded as ``use_best_model``.
    verbose : forwarded as ``verbose``.

    Returns
    -------
    The newly constructed (not yet fitted) CatBoostClassifier.
    """
    tracked_metrics = ['Logloss', 'AUC','Recall', 'Precision', 'F1', 'Accuracy']
    settings = dict(
        iterations=iterations,
        use_best_model=use_best,
        custom_metric=tracked_metrics,
        eval_metric='Accuracy',
        depth=d,
        l2_leaf_reg=l2,
        auto_class_weights='Balanced',
        verbose=verbose,
        random_state=2021,      # fixed seed
    )
    return CatBoostClassifier(**settings)

In [3]:
### Features that are not used for training
def remove_features(df,remove_mer=None):
    """Drop the columns that must not be used as training features.

    The frame is modified IN PLACE and the same object is returned; callers
    that need to keep the original should pass ``df.copy()``.

    Parameters
    ----------
    df : pandas.DataFrame containing at least the always-dropped columns
        listed below (``DataFrame.drop`` raises ``KeyError`` otherwise).
    remove_mer : optional extra column(s) to drop. May be ``None`` (default),
        a single column name, or any iterable of names (list, tuple,
        ``pandas.Index``, numpy array, ...),
        e.g. ``['h5_perf','h5_auto','h4_perf','h4_auto','h3_perf','h2_perf']``.

    Returns
    -------
    The same ``df`` object, with the columns removed.
    """
    always_dropped = ['avd','startnr','vodds','podds','bins',
                      'h1_dat','h2_dat','h3_dat','h4_dat','h5_dat']
    df.drop(always_dropped,axis=1,inplace=True)

    # Normalise to a plain list: avoids a mutable default argument and the
    # ambiguous truth value of array-likes such as a pandas Index.
    if remove_mer is None:
        extra = []
    elif isinstance(remove_mer, str):
        extra = [remove_mer]
    else:
        extra = list(remove_mer)

    if extra:
        df.drop(extra,axis=1,inplace=True)

    return df

In [4]:
 ## byt ut alla NaN till text för cat_features
def replace_NaN(X_train,X_test=None, cat_features=[]):
    """Replace missing values in the categorical columns with the text 'None'.

    Both frames are modified in place. For each column in ``cat_features`` the
    training frame is handled first, then the test frame (when one is given).

    Parameters
    ----------
    X_train : DataFrame whose ``cat_features`` columns are filled.
    X_test : optional second DataFrame treated the same way.
    cat_features : names of the columns to process.

    Returns
    -------
    (X_train, X_test) -- the same objects that were passed in.
    """
    frames = [X_train] if X_test is None else [X_train, X_test]
    for col in cat_features:
        for frame in frames:
            missing = frame[col].isna()
            frame.loc[missing, col] = 'None'   # the literal string, not the None object
    return X_train, X_test

In [5]:
def scrape_nya_lopp():
    """Scrape new races, merge them into 'all_data.csv' and return the result.

    Steps: call ``vs.v75_scraping(resultat=True, history=True)``, append its
    first return value to the rows read from 'all_data.csv', drop duplicate
    (datum, avd, häst) rows, sort by (datum, avd), rewrite 'all_data.csv'
    and print the frame shapes plus the first and last date.

    Returns
    -------
    (df, nya_lopp) -- the merged frame and the newly scraped rows.
    """
    # The second return value of the scraper is not used here.
    nya_lopp, _strukna = vs.v75_scraping(resultat=True,history=True)

    historik = pd.read_csv('all_data.csv')
    df = pd.concat([historik, nya_lopp])
    print('shape med nya lopp',df.shape)

    # remove duplicates, order chronologically, renumber the rows
    df = (
        df.drop_duplicates(['datum','avd','häst'])
          .sort_values(by=['datum','avd'])
          .reset_index(drop=True)
    )
    print('shape med dubletter bort',df.shape)

    df.to_csv('all_data.csv', index=False)

    forsta = df.datum.head(1).to_list()[0]
    print("första datum i df =",forsta)
    sista = df.datum.tail(1).to_list()[0]
    print("sista  datum i df =",sista)

    return df,nya_lopp

In [6]:
### work out which dates to use ###
def get_alla_datum(proc=0.75, total_omlärning = False):
    """Load the data and decide which dates the walk-forward training covers.

    Parameters
    ----------
    proc : fraction of the unique dates placed before the split index
        (only used when ``total_omlärning`` is True).
    total_omlärning : when True, read 'all_data.csv' as-is and use all of its
        dates; when False (the normal case), scrape the rounds listed in
        'omg_att_spela_link.csv', add them to the data and use only those dates.

    Returns
    -------
    (df, nya_lopp, alla_datum, split_ix); ``nya_lopp`` is None and ``split_ix``
    is ``int(len(alla_datum)*proc)`` for a total re-training, otherwise
    ``split_ix`` is 0.
    """
    if not total_omlärning:
        # normally only one or a few weeks from "omg_att_spela_link.csv" are added
        df, nya_lopp = scrape_nya_lopp()
        links = pd.read_csv('omg_att_spela_link.csv').Link
        # position of 'spel' in the FIRST url; the same offset is applied to every row
        pos = links.str.find('spel')[0]
        # the 10 characters starting 5 positions after that offset are taken as the date
        alla_datum = links.str.slice(start=pos+5,stop=pos+15).to_list()
        print(f'datum att lära från {alla_datum}')
        return df,nya_lopp,alla_datum,0

    df = pd.read_csv('all_data.csv')
    alla_datum = df.datum.unique()
    return df,None,alla_datum,int(len(alla_datum)*proc)


## Walkthrough-funktionen  här

In [7]:

### Run a walk-forward ("walkthrough") training pass, one date at a time.

# Every walk-forward step is fitted without test data and with a fixed iterations=100.
def walkthrough(classic_test=False, verbose=False):
    """Train the model date by date and save a model file per date.

    The data and the dates come from ``get_alla_datum()``. Unused feature
    columns are removed, missing values in the object-dtype (categorical)
    columns are replaced, and ``plac`` is turned into a 0/1 target
    (1 when ``plac == 1``).

    For each date the model is fitted on all rows with an earlier ``datum``
    and saved as 'modeller/model_<datum>'. Finally the model is fitted on
    all rows and saved as 'modeller/model_senaste'.

    Parameters
    ----------
    classic_test : when True, do a single classic train/test fit instead:
        train on rows before the first date, evaluate on rows from that date
        onwards, and return immediately.
    verbose : forwarded to ``model.fit`` inside the loop.

    Returns
    -------
    ``(model, cat_features)`` when ``classic_test`` is True and there is at
    least one date; otherwise ``(df, nya_lopp, model, cat_features)``.
    """
    df, nya_lopp, alla_datum, split_ix = get_alla_datum()

    model=get_model(use_best=False,iterations=100)
    df=remove_features(df.copy())
    cat_features = list(df.loc[:,df.dtypes=='O'].columns)
    df,_ = replace_NaN(df.copy(), cat_features=cat_features)
    print(f'cat_features {cat_features}\n')

    df['plac']=(df.plac==1)*1

    walk_datum = alla_datum[split_ix:]
    datum = None    # stays None when there are no dates, so the final print cannot raise NameError
    for nr,datum in enumerate(walk_datum):
        print(f'walk-iter {nr+1} av {len(walk_datum)} ',end=': ')

        # train only on rows strictly before the current date
        train_rows = df.loc[df.datum<datum,:]
        train_pool = Pool(train_rows.drop('plac',axis=1),train_rows.plac,cat_features=cat_features)

        if classic_test:    ### classic train/test without walk-forward
            test_rows = df.loc[df.datum>=datum,:]
            test_pool = Pool(test_rows.drop('plac',axis=1),test_rows.plac,cat_features=cat_features)
            model.fit(train_pool,use_best_model=True, verbose=verbose,eval_set=test_pool)
        else:
            # no evaluation set is used here, so none is built
            model.fit(train_pool,use_best_model=False, verbose=verbose)

        print('best iteration',model.get_best_iteration(), '\tbest score', round(model.get_best_score()['learn']['Accuracy'],3) )

        if classic_test:    ### classic train/test without walk-forward
            return model,cat_features

        model.save_model('modeller/model_'+datum)

    # final fit on every available row
    X_train = df.drop('plac',axis=1)
    y_train = df.plac
    model.fit(X_train,y=y_train,cat_features=cat_features)
    print('spara model_senaste',datum)
    model.save_model('modeller/model_senaste')

    return df,nya_lopp, model,cat_features

### Här körs hela walkthrough

In [8]:
# Run the whole walk-forward training; this scrapes the rounds listed in
# 'omg_att_spela_link.csv' first, which accounts for most of the output below.
df, nya_lopp, model, cat_features = walkthrough(classic_test=False, verbose=False)


omgång 1: https://www.atg.se/spel/2021-09-18/V75/
klickade på ANPASSA
anpassa klar - break
ant resultat 7
ant lopp 7




Ant priser 7
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 1 FÄRJESTAD 1640 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 2 FÄRJESTAD 1640 VOLTSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 10 10 10 10




AVD 3 FÄRJESTAD 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 13 13 13 13




AVD 4 FÄRJESTAD 3140 VOLTSTART 



.



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 5 FÄRJESTAD 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 10 10 10 10




AVD 6 FÄRJESTAD 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 7 FÄRJESTAD 2640 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



..

det tog 97.962 sekunder
utdelning: 79915, 661, 57
startar Fixa mer
tog bort 4 strukna från 81 till 77
rensade totalt bort 4 hästar i städa_och_rensa. Från 81 till 77
shape med nya lopp (41922, 79)
shape med dubletter bort (41922, 79)
första datum i df = 2014-12-28
sista  datum i df = 2021-09-18
datum att lära från ['2021-09-18']
cat_features ['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_kusk', 'h1_bana', 'h2_kusk', 'h2_bana', 'h3_kusk', 'h3_bana', 'h4_kusk', 'h4_bana', 'h5_kusk', 'h5_bana']

walk-iter 1 av 1 : best iteration None 	best score 0.753
spara model_senaste 2021-09-18


## Kör allt ovanför walkthrough
### Se till att "omg_att_spela_link.csv" är ifylld

In [9]:
from catboost.utils import eval_metric

# Evaluate the model returned by walkthrough() on the newly scraped races.
# NOTE(review): per walkthrough(), the final model is fitted on all rows, which
# include these races -- so this is not an out-of-sample score.

# Rebuild the training column layout and categorical feature list from disk.
df = pd.read_csv('all_data.csv')
print(df.columns)
dfval=remove_features(df.copy())
cat_features = list(dfval.loc[:,dfval.dtypes=='O'].columns)
dfval,_ = replace_NaN(dfval.copy(), cat_features=cat_features)

# Same columns, in the same order, as the training frame.
validation = nya_lopp[dfval.columns].copy()
# Give the categorical columns the same missing-value treatment as in training.
validation,_ = replace_NaN(validation, cat_features=cat_features)
y=(validation.plac==1)*1      # 1 for a winner, 0 otherwise
validation.drop('plac',axis=1,inplace=True)

val_pool=Pool(validation,y,cat_features=cat_features)
yhat=model.predict(val_pool)
print(utils.get_confusion_matrix(model,val_pool))
# eval_metric expects (label, approx, metric): true labels first, predictions second.
eval_metric(y.to_numpy(),yhat,'Accuracy')



Index(['datum', 'avd', 'bana', 'häst', 'kusk', 'streck', 'vodds', 'podds',
       'kr', 'spår', 'dist', 'lopp_dist', 'start', 'ålder', 'kön', 'plac',
       'pris', 'h1_dat', 'h1_kusk', 'h1_bana', 'h1_spår', 'h1_plac', 'h1_pris',
       'h1_odds', 'h1_kmtid', 'h2_dat', 'h2_kusk', 'h2_bana', 'h2_spår',
       'h2_plac', 'h2_pris', 'h2_odds', 'h2_kmtid', 'h3_dat', 'h3_kusk',
       'h3_bana', 'h3_spår', 'h3_plac', 'h3_pris', 'h3_odds', 'h3_kmtid',
       'h4_dat', 'h4_kusk', 'h4_bana', 'h4_spår', 'h4_plac', 'h4_pris',
       'h4_odds', 'h4_kmtid', 'h5_dat', 'h5_kusk', 'h5_bana', 'h5_spår',
       'h5_plac', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_dist', 'h2_dist',
       'h3_dist', 'h4_dist', 'h5_dist', 'bins', 'h1_auto', 'h2_auto',
       'h3_auto', 'h4_auto', 'h5_auto', 'h1_perf', 'h2_perf', 'h3_perf',
       'h4_perf', 'h5_perf', 'senast', 'delta1', 'delta2', 'delta3', 'delta4',
       'startnr'],
      dtype='object')
[[46. 24.]
 [ 2.  5.]]


[0.6623376623376623]

In [10]:
# Largest 'datum' value in the validation frame (sanity check of which round was validated).
validation.datum.max()

'2021-09-18'

In [11]:
# Feature importances of the walk-forward model; at most the first 30 rows are shown.
model.get_feature_importance(prettified=True).head(30)

Unnamed: 0,Feature Id,Importances
0,streck,65.337353
1,datum,6.980384
2,häst,4.772097
3,h2_bana,1.455736
4,bana,1.405156
5,h4_bana,1.175271
6,h3_odds,0.905305
7,h1_bana,0.836294
8,h1_pris,0.802259
9,kr,0.759869


In [12]:
# Parameters the model was constructed with (see get_model).
model.get_params()

{'iterations': 100,
 'depth': 6,
 'l2_leaf_reg': 2,
 'use_best_model': False,
 'verbose': False,
 'auto_class_weights': 'Balanced',
 'custom_metric': ['Logloss', 'AUC', 'Recall', 'Precision', 'F1', 'Accuracy'],
 'eval_metric': 'Accuracy',
 'random_state': 2021}

# Kör cross validation

## init  - kör först allt t.o.m 'replace_NaN()' ovan

In [52]:
# Reload the most recently saved model, then rebuild the feature frame `df`
# and the 0/1 target `y` from 'all_data.csv'. `dforg` keeps the raw rows.
model = get_model().load_model('modeller/model_senaste')
dforg = pd.read_csv('all_data.csv')

df = remove_features(dforg.copy())
# object-dtype columns are the categorical features
cat_features = [col for col, dtype in df.dtypes.items() if dtype == 'O']
df, _ = replace_NaN(df.copy(), cat_features=cat_features)

y = (df.plac == 1) * 1          # 1 when plac == 1, otherwise 0
df = df.drop(columns='plac')


In [11]:
# Summary of the object-dtype (categorical) columns only.
df.select_dtypes(include='object').info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41922 entries, 0 to 41921
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   datum    41922 non-null  object
 1   bana     41922 non-null  object
 2   häst     41922 non-null  object
 3   kusk     41922 non-null  object
 4   kön      41922 non-null  object
 5   h1_kusk  41922 non-null  object
 6   h1_bana  41922 non-null  object
 7   h2_kusk  41922 non-null  object
 8   h2_bana  41922 non-null  object
 9   h3_kusk  41922 non-null  object
 10  h3_bana  41922 non-null  object
 11  h4_kusk  41922 non-null  object
 12  h4_bana  41922 non-null  object
 13  h5_kusk  41922 non-null  object
 14  h5_bana  41922 non-null  object
dtypes: object(15)
memory usage: 4.8+ MB


## cv

In [12]:

# Pool holding every row of df with target y; cat_features names the categorical columns.
cv_pool = Pool(df,y,cat_features=cat_features)

# Training parameters passed to catboost's cv(). The log below reports AUC as the
# "test" metric, and the run stops after 100 iterations without improvement.
params = {
         'use_best_model': True,
         'eval_metric' : 'AUC',
         "loss_function": "Logloss",
         'early_stopping_rounds': 100,
         'verbose': 50,
}

# 5-fold cross-validation with at most 2000 iterations, no shuffling, type='TimeSeries'.
# NOTE(review): dtrain, num_boost_round and nfold are passed as None; presumably they
# are aliases of pool, iterations and fold_count -- confirm against the catboost docs.
# NOTE(review): both partition_random_seed=0 and seed=2021 are given -- confirm which
# one takes effect (with shuffle=False it may not matter).
# NOTE(review): type='TimeSeries' presumably relies on the rows being in chronological
# order -- confirm df is still sorted by datum at this point.
cv_score =cv(pool=cv_pool, 
   params=params, 
   dtrain=None, 
   iterations=2000, 
   num_boost_round=None,
   fold_count=5, 
   nfold=None,
   inverted=False,
   partition_random_seed=0,
   seed=2021, 
   shuffle=False, 
   logging_level=None, 
   stratified=True,
   as_pandas=True,
   type='TimeSeries')

0:	test: 0.6963294	best: 0.6963294 (0)
50:	test: 0.8098065	best: 0.8098065 (50)
100:	test: 0.8121092	best: 0.8122300 (88)
150:	test: 0.8106810	best: 0.8122300 (88)
Stopped by overfitting detector  (100 iterations wait)


In [13]:
# Per-iteration cross-validation results (mean and std of each metric).
cv_score

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.696329,0.087224,0.659487,0.003077,0.658914,0.002954
1,1,0.739789,0.039380,0.628632,0.002891,0.628028,0.002733
2,2,0.756218,0.024839,0.599821,0.001683,0.599206,0.001507
3,3,0.760643,0.032663,0.573624,0.004392,0.572955,0.004249
4,4,0.770705,0.023209,0.549600,0.005298,0.547927,0.003432
...,...,...,...,...,...,...,...
184,184,0.809656,0.005030,0.240083,0.001577,0.205599,0.005008
185,185,0.809714,0.005035,0.240056,0.001589,0.205494,0.005031
186,186,0.809768,0.005085,0.240049,0.001601,0.205365,0.005120
187,187,0.809768,0.005102,0.240036,0.001599,0.205267,0.005187


In [14]:
from IPython.display import display

# Latest date in the data, then the CV row(s) with the lowest mean test Logloss
# and the row(s) with the highest mean test AUC (ties give several rows).
print(df.datum.max())
logloss_mean = cv_score['test-Logloss-mean']
auc_mean = cv_score['test-AUC-mean']
display(cv_score[logloss_mean == logloss_mean.min()])
display(cv_score[auc_mean == auc_mean.max()])

2021-09-18


Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
157,157,0.810658,0.005165,0.239583,0.001686,0.208877,0.003641


Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
88,88,0.81223,0.006032,0.242407,0.003284,0.221168,0.001837


In [71]:
# Hold-out experiment: fit a fresh model with the 'streck' feature removed.
# NOTE: this cell mutates the shared `df` in place, so later cells see the changed frame.
from sklearn.model_selection import train_test_split

# Copy these columns back from the raw frame so the drop lines below can be toggled freely.
# NOTE(review): this overwrites the replace_NaN-processed 'datum'/'häst'/'kusk' values
# with the raw CSV values -- confirm those columns contain no missing values.
df[['datum','avd','streck','häst','kusk']] = dforg[['datum','avd','streck','häst','kusk']]

# df.drop('datum',axis=1,inplace=True)
# Net effect of the two drops: 'avd' is removed again and 'streck' is excluded.
df.drop('avd',axis=1,inplace=True)
df.drop(['streck'],axis=1,inplace=True)
# 'ekipage' ("kusk, häst") is created and dropped straight away, so it does not reach the model.
df['ekipage'] = dforg['kusk'].str.cat(dforg['häst'], sep =", ")
df.drop('ekipage',axis=1,inplace=True)
# df.drop(['häst','kusk'],axis=1,inplace=True)
# Recompute the categorical columns for the modified frame.
cat_features = list(df.loc[:,df.dtypes=='O'].columns)
# Unshuffled split, so row order is kept; presumably the test part is the tail of the frame -- confirm.
X_train,X_test,y_train,y_test = train_test_split(df,y,shuffle=False,)
cb=get_model(use_best=True)
# Fit with the hold-out part as eval set; progress is logged every 100 iterations and
# training stops after 200 iterations without improvement.
cb.fit(X_train,y_train,eval_set= (X_test,y_test),early_stopping_rounds=200, cat_features=cat_features,verbose=100)

0:	learn: 0.6429472	test: 0.6002399	best: 0.6002399 (0)	total: 84.9ms	remaining: 4m 14s
100:	learn: 0.7160471	test: 0.6407389	best: 0.6437970 (21)	total: 11.2s	remaining: 5m 20s
200:	learn: 0.7368167	test: 0.6419145	best: 0.6477709 (150)	total: 22.3s	remaining: 5m 10s
300:	learn: 0.7563230	test: 0.6375103	best: 0.6489617 (238)	total: 32.5s	remaining: 4m 51s
400:	learn: 0.7794626	test: 0.6309864	best: 0.6489617 (238)	total: 42.5s	remaining: 4m 35s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.648961667
bestIteration = 238

Shrink model to first 239 iterations.


<catboost.core.CatBoostClassifier at 0x2d32dfceac0>

In [72]:
# Feature importances of the hold-out model `cb`; at most the first 40 rows are shown.
cb.get_feature_importance(prettified=True).head(40)

Unnamed: 0,Feature Id,Importances
0,häst,9.63099
1,kr,7.602864
2,datum,7.324576
3,pris,6.632094
4,kusk,5.873598
5,spår,5.64844
6,h1_odds,5.567155
7,h2_odds,4.12414
8,dist,4.008371
9,ålder,3.388386


In [73]:
# Best metric values recorded for `cb`, grouped under 'learn' and 'validation'.
cb.best_score_

{'learn': {'Accuracy:use_weights=false': 0.7707133997010274,
  'Precision:use_weights=false': 0.2467298150654037,
  'Recall:use_weights=false': 0.806036069193964,
  'Logloss:use_weights=true': 0.4792279785597136,
  'Accuracy:use_weights=true': 0.7863734171958425,
  'Logloss:use_weights=false': 0.4847853602671395,
  'Logloss': 0.4792279785597136,
  'Precision:use_weights=true': 0.7759249239558584,
  'Recall:use_weights=true': 0.806036069193964,
  'F1:use_weights=false': 0.37772982304704367,
  'F1:use_weights=true': 0.790492885598547,
  'Accuracy': 0.7863734171958425},
 'validation': {'Accuracy:use_weights=false': 0.7566071939700411,
  'Precision:use_weights=false': 0.17205138186064617,
  'Recall:use_weights=false': 0.8541202672605791,
  'Logloss:use_weights=true': 0.6288930796085968,
  'Accuracy:use_weights=true': 0.6489616670491436,
  'AUC': 0.7026075313861987,
  'Logloss:use_weights=false': 0.5194062316339202,
  'Logloss': 0.6288930796085968,
  'Precision:use_weights=true': 0.68719686