# Learn v75 med walkthrough-metoden

In [1]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier,Pool,cv,utils

import sys
sys.path.append('C:\\Users\peter\\Documents\\MyProjects\\PyProj\\Trav\\spel')
import V75_scraping as vs

In [2]:
### Return a CatBoost model with its parameters set
def get_model(d=6,l2=2,iterations=3000,use_best=True,verbose=False):
    """Build an unfitted CatBoostClassifier with this notebook's standard settings.

    Parameters
    ----------
    d : tree depth, forwarded as ``depth``.
    l2 : forwarded as ``l2_leaf_reg``.
    iterations : number of boosting iterations.
    use_best : forwarded as ``use_best_model``.
    verbose : forwarded as ``verbose``.

    Returns
    -------
    The configured (not yet fitted) CatBoostClassifier.
    """
    settings = {
        # values supplied by the caller
        'iterations': iterations,
        'use_best_model': use_best,
        'depth': d,
        'l2_leaf_reg': l2,
        'verbose': verbose,
        # values fixed for every model built in this notebook
        'custom_metric': ['Logloss', 'AUC','Recall', 'Precision', 'F1', 'Accuracy'],
        'eval_metric': 'Accuracy',
        'auto_class_weights': 'Balanced',
        'random_state': 2021,
    }
    return CatBoostClassifier(**settings)

In [3]:
### Features that are not used for training
def remove_features(df,remove_mer=[]):
    """Drop the columns that must not be used as training features.

    The frame is modified IN PLACE and the same object is returned, so pass
    ``df.copy()`` when the caller still needs the original columns.

    Parameters
    ----------
    df : DataFrame containing at least the always-dropped columns below.
    remove_mer : optional extra column labels to drop as well.

    Returns
    -------
    The same DataFrame object, with the columns removed.
    """
    always_dropped = [
        'avd', 'startnr', 'vodds', 'podds', 'bins',
        'h1_dat', 'h2_dat', 'h3_dat', 'h4_dat', 'h5_dat',
    ]
    df.drop(columns=always_dropped, inplace=True)

    # Caller-supplied extras, e.g.
    # ['h5_perf','h5_auto','h4_perf','h4_auto', 'h3_perf', 'h2_perf']
    if remove_mer:
        df.drop(columns=remove_mer, inplace=True)

    return df

In [4]:
 ## byt ut alla NaN till text för cat_features
def replace_NaN(X_train,X_test=None, cat_features=[]):
    """Replace missing values in the categorical columns with the text 'None'.

    Both frames are modified IN PLACE; only the columns listed in
    ``cat_features`` are touched.

    Parameters
    ----------
    X_train : DataFrame to clean.
    X_test : optional second DataFrame cleaned the same way.
    cat_features : column labels to process.

    Returns
    -------
    (X_train, X_test) -- the same objects that were passed in
    (``X_test`` is None when it was not given).
    """
    has_test = X_test is not None
    for col in cat_features:
        train_missing = X_train[col].isna()
        X_train.loc[train_missing, col] = 'None'
        if has_test:
            test_missing = X_test[col].isna()
            X_test.loc[test_missing, col] = 'None'

    return X_train,X_test

In [5]:
def scrape_nya_lopp():
    """Scrape new races, merge them into 'all_data.csv' and return the result.

    Steps: scrape via ``vs.v75_scraping``, append the scraped rows to the
    contents of 'all_data.csv', drop duplicates on (datum, avd, häst), sort by
    (datum, avd), and write the merged frame back to 'all_data.csv'.
    Because the file's rows come first in the concatenation, an already stored
    row wins over a newly scraped duplicate.

    Returns
    -------
    (df, nya_lopp) -- the merged frame and the newly scraped rows.
    """
    # The second return value (scratched horses) is not used here.
    nya_lopp, _strukna = vs.v75_scraping(resultat=True,history=True)

    stored = pd.read_csv('all_data.csv')
    df = pd.concat([stored, nya_lopp])
    print('shape med nya lopp',df.shape)

    # Remove duplicates, order chronologically and renumber the rows.
    df = (
        df.drop_duplicates(['datum','avd','häst'])
          .sort_values(by=['datum','avd'])
          .reset_index(drop=True)
    )
    print('shape med dubletter bort',df.shape)

    df.to_csv('all_data.csv', index=False)

    print("första datum i df =",df.datum.iloc[0])
    print("sista  datum i df =",df.datum.iloc[-1])

    return df,nya_lopp

In [20]:
### Work out which dates to train on ###
def get_alla_datum(proc=0.75, total_omlärning = False):
    """Load the data and decide which dates the walkthrough should step over.

    Parameters
    ----------
    proc : fraction of the unique dates placed before the split index
        (only used when ``total_omlärning`` is True).
    total_omlärning : True for a full retrain from 'all_data.csv';
        False (the normal case) to scrape the rounds listed in
        'omg_att_spela_link.csv' and learn only from those dates.

    Returns
    -------
    (df, nya_lopp, alla_datum, split_ix) -- ``nya_lopp`` is None for a full
    retrain; ``split_ix`` is 0 in the normal case.
    """
    if not total_omlärning:
        # Normal case: add one or more weeks taken from "omg_att_spela_link.csv".
        df, nya_lopp = scrape_nya_lopp()
        links = pd.read_csv('omg_att_spela_link.csv').Link
        # Position of 'spel' in the FIRST url; the same offset is applied to every row.
        startix = links.str.find('spel')[0]
        # The 10 characters starting 5 past that position are taken as the date.
        alla_datum = links.str.slice(start=startix+5,stop=startix+15).to_list()
        split_ix = 0
        print(f'datum att lära från {alla_datum}')
        return df,nya_lopp,alla_datum,split_ix

    # Full retrain: every unique date in the stored data, split at `proc`.
    df = pd.read_csv('all_data.csv')
    nya_lopp = None
    alla_datum = df.datum.unique()
    split_ix = int(len(alla_datum)*proc)
    return df,nya_lopp,alla_datum,split_ix


## Walkthrough-funktionen här

In [19]:

### Run a walkthrough learn here, one date at a time going forward

# All steps now run without test data, using a fixed iterations=100
def walkthrough(classic_test=False, verbose=False):
    """Train the model date by date and save one model file per date.

    For every date returned by ``get_alla_datum()`` (from ``split_ix`` on) a
    model is fitted on all rows strictly before that date and saved as
    'modeller/model_<datum>'. Afterwards the model is refitted on ALL rows and
    saved as 'modeller/model_senaste'.

    Parameters
    ----------
    classic_test : when True, do a single classic train/test split at the first
        date (rows on or after it form the eval set) and return immediately.
    verbose : forwarded to ``model.fit`` inside the loop.

    Returns
    -------
    classic_test=True and at least one date : ``(model, cat_features)``
    otherwise                                : ``(df, nya_lopp, model, cat_features)``
    The two shapes differ; this is kept as-is for existing callers.
    """
    import os   # local import: only needed to make sure the model directory exists

    df, nya_lopp, alla_datum, split_ix = get_alla_datum()

    model=get_model(use_best=False,iterations=100)
    df=remove_features(df.copy())
    # Every object-dtype column is treated as categorical.
    # NOTE(review): this includes 'datum' itself -- confirm that is intended.
    cat_features = list(df.loc[:,df.dtypes=='O'].columns)
    df,_ = replace_NaN(df.copy(), cat_features=cat_features)
    print(f'cat_features {cat_features}\n')

    # Binary target: 1 for a winner (plac == 1), otherwise 0.
    df['plac']=(df.plac==1)*1

    walk_datum = alla_datum[split_ix:]
    datum = None    # stays None when there is nothing to walk over (previously a NameError below)
    os.makedirs('modeller', exist_ok=True)

    for nr,datum in enumerate(walk_datum):
        print(f'walk-iter {nr+1} av {len(walk_datum)} ',end=': ')

        # Train only on rows strictly before the current date.
        X_train = df.loc[df.datum<datum,:].copy()
        y_train = X_train.pop('plac')
        train_pool = Pool(X_train,y_train,cat_features=cat_features)

        if classic_test:    ### classic train/test without walkthrough
            X_test = df.loc[df.datum>=datum,:].copy()
            y_test = X_test.pop('plac')
            test_pool = Pool(X_test,y_test,cat_features=cat_features)
            model.fit(train_pool,use_best_model=True, verbose=verbose,eval_set=test_pool)
        else:
            # Walk-forward step: no eval set is used, so none is built.
            model.fit(train_pool,use_best_model=False, verbose=verbose)

        print('best iteration',model.get_best_iteration(), '\tbest score', round(model.get_best_score()['learn']['Accuracy'],3) )

        if classic_test:    ### classic train/test without walkthrough
            return model,cat_features

        model.save_model('modeller/model_'+datum)

    # Final model: refit on every row, including the dates walked over above.
    X_train = df.drop('plac',axis=1)
    y_train = df.plac
    model.fit(X_train,y=y_train,cat_features=cat_features)
    print('spara model_senaste',datum)
    model.save_model('modeller/model_senaste')

    return df,nya_lopp, model,cat_features

### Här körs hela walkthrough

In [21]:
# Run the walk-forward training; with classic_test=False the call returns four values.
df, nya_lopp, model, cat_features = walkthrough(classic_test=False, verbose=False)


omgång 1: https://www.atg.se/spel/2021-09-04/V75/
klickade på ANPASSA
anpassa klar - break
ant resultat 7
ant lopp 7




Ant priser 7
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 1 JÄGERSRO 2140 VOLTSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 2 JÄGERSRO 2640 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 3 JÄGERSRO 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 13 13 13 13




AVD 4 JÄGERSRO 2140 VOLTSTART 



.



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 5 JÄGERSRO 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 15 15 15 15




AVD 6 JÄGERSRO 2140 VOLTSTART 



.



.



.



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 10 10 10 10




AVD 7 JÄGERSRO 2140 AUTOSTART 



.



.



.



.



.



.



.



.



..

det tog 123.453 sekunder
utdelning: 1666051, 4498, 269
startar Fixa mer
tog bort 2 strukna från 86 till 84
rensade totalt bort 2 hästar i städa_och_rensa. Från 86 till 84
shape med nya lopp (41847, 79)
shape med dubletter bort (41763, 79)
första datum i df = 2014-12-28
sista  datum i df = 2021-09-04
datum att lära från ['2021-09-04']
cat_features ['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_kusk', 'h1_bana', 'h2_kusk', 'h2_bana', 'h3_kusk', 'h3_bana', 'h4_kusk', 'h4_bana', 'h5_kusk', 'h5_bana']

walk-iter 1 av 1 : best iteration None 	best score 0.751
spara model_senaste 2021-09-04


## Kör allt ovanför walkthrough
### Se till att "omg_att_spela_link.csv" är ifylld

In [22]:
from catboost.utils import eval_metric

# Rebuild the training-time column layout from the stored data.
df = pd.read_csv('all_data.csv')
print(df.columns)
dfval=remove_features(df.copy())
cat_features = list(dfval.loc[:,dfval.dtypes=='O'].columns)
dfval,_ = replace_NaN(dfval.copy(), cat_features=cat_features)

# Same columns, in the same order, as the frame the model was trained on.
# An explicit copy avoids modifying a slice of another frame below.
validation = nya_lopp[dfval.columns].copy()
# Apply the same NaN -> 'None' encoding that the training data received.
validation,_ = replace_NaN(validation, cat_features=cat_features)

# Binary target: 1 for a winner (plac == 1), otherwise 0.
y = (validation.plac==1)*1
validation.drop('plac',axis=1,inplace=True)

# NOTE(review): `model` as returned by walkthrough() was refitted on all rows,
# which include nya_lopp, so the numbers below are in-sample. For an
# out-of-sample figure load the per-date model saved before the final refit.
val_pool=Pool(validation,y,cat_features=cat_features)
yhat=model.predict(val_pool)
print(utils.get_confusion_matrix(model,val_pool))
# eval_metric expects (label, approx, metric): true labels first, predictions second.
eval_metric(y,yhat,'Accuracy')



Index(['datum', 'avd', 'bana', 'häst', 'kusk', 'streck', 'vodds', 'podds',
       'kr', 'spår', 'dist', 'lopp_dist', 'start', 'ålder', 'kön', 'plac',
       'pris', 'h1_dat', 'h1_kusk', 'h1_bana', 'h1_spår', 'h1_plac', 'h1_pris',
       'h1_odds', 'h1_kmtid', 'h2_dat', 'h2_kusk', 'h2_bana', 'h2_spår',
       'h2_plac', 'h2_pris', 'h2_odds', 'h2_kmtid', 'h3_dat', 'h3_kusk',
       'h3_bana', 'h3_spår', 'h3_plac', 'h3_pris', 'h3_odds', 'h3_kmtid',
       'h4_dat', 'h4_kusk', 'h4_bana', 'h4_spår', 'h4_plac', 'h4_pris',
       'h4_odds', 'h4_kmtid', 'h5_dat', 'h5_kusk', 'h5_bana', 'h5_spår',
       'h5_plac', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_dist', 'h2_dist',
       'h3_dist', 'h4_dist', 'h5_dist', 'bins', 'h1_auto', 'h2_auto',
       'h3_auto', 'h4_auto', 'h5_auto', 'h1_perf', 'h2_perf', 'h3_perf',
       'h4_perf', 'h5_perf', 'senast', 'delta1', 'delta2', 'delta3', 'delta4',
       'startnr'],
      dtype='object')
[[60. 17.]
 [ 2.  5.]]


[0.7738095238095238]

In [23]:
# Largest 'datum' value present in the validation frame.
validation.datum.max()

'2021-09-04'

In [24]:
# First 30 rows (at most) of the model's prettified feature-importance table.
model.get_feature_importance(prettified=True).head(30)

Unnamed: 0,Feature Id,Importances
0,streck,70.303008
1,datum,6.005589
2,häst,3.454758
3,h2_bana,1.208777
4,h3_bana,0.953858
5,h4_spår,0.862934
6,delta1,0.831052
7,h3_odds,0.813842
8,h5_bana,0.689608
9,h4_bana,0.653921


In [25]:
# Show the parameters the current model was configured with.
model.get_params()

{'iterations': 100,
 'depth': 6,
 'l2_leaf_reg': 2,
 'use_best_model': False,
 'verbose': False,
 'auto_class_weights': 'Balanced',
 'custom_metric': ['Logloss', 'AUC', 'Recall', 'Precision', 'F1', 'Accuracy'],
 'eval_metric': 'Accuracy',
 'random_state': 2021}

# Kör cross validation

## init - kör först allt t.o.m. 'replace_NaN()' ovan

In [26]:
# Load the most recently saved model.
model = get_model().load_model('modeller/model_senaste')

# Prepare features and target the same way as for training:
# drop unused columns, treat object columns as categorical, encode NaN as 'None'.
df = remove_features(pd.read_csv('all_data.csv'))
cat_features = list(df.dtypes[df.dtypes=='O'].index)
df,_ = replace_NaN(df, cat_features=cat_features)

# Binary target (1 = winner); afterwards `df` holds the features only.
y = (df.plac==1)*1
df = df.drop(columns='plac')


In [27]:
# Summarise only the object-dtype columns (the ones used as cat_features).
df[df.columns[(df.dtypes=='object').values.tolist()]].info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41763 entries, 0 to 41762
Data columns (total 15 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   datum    41763 non-null  object
 1   bana     41763 non-null  object
 2   häst     41763 non-null  object
 3   kusk     41763 non-null  object
 4   kön      41763 non-null  object
 5   h1_kusk  41763 non-null  object
 6   h1_bana  41763 non-null  object
 7   h2_kusk  41763 non-null  object
 8   h2_bana  41763 non-null  object
 9   h3_kusk  41763 non-null  object
 10  h3_bana  41763 non-null  object
 11  h4_kusk  41763 non-null  object
 12  h4_bana  41763 non-null  object
 13  h5_kusk  41763 non-null  object
 14  h5_bana  41763 non-null  object
dtypes: object(15)
memory usage: 4.8+ MB


## cv

In [28]:

# Pool over all rows; object-dtype columns are passed as categorical features.
cv_pool = Pool(df,y,cat_features=cat_features)

# NOTE(review): these params do not set depth, l2_leaf_reg or
# auto_class_weights the way get_model() does, so the CV scores may not
# describe the saved model -- confirm this is intended.
params = dict([
    ('use_best_model', True),
    ('eval_metric', 'Recall'),
    ('loss_function', "Logloss"),
    ('early_stopping_rounds', 100),
    ('verbose', 50),
])

cv_score = cv(
    # data and model parameters
    pool=cv_pool,
    params=params,
    dtrain=None,
    # training length
    iterations=2000,
    num_boost_round=None,
    # fold layout
    fold_count=5,
    nfold=None,
    inverted=False,
    type='TimeSeries',
    stratified=True,
    shuffle=False,
    # seeds
    partition_random_seed=0,
    seed=2021,
    # output
    logging_level=None,
    as_pandas=True,
)

0:	learn: 0.0000000	test: 0.0000000	best: 0.0000000 (0)
50:	learn: 0.0668059	test: 0.0526109	best: 0.0526109 (50)
100:	learn: 0.1230172	test: 0.0702883	best: 0.0706272 (83)
150:	learn: 0.1540520	test: 0.0672860	best: 0.0712867 (106)
200:	learn: 0.1765135	test: 0.0649411	best: 0.0712867 (106)
Stopped by overfitting detector  (100 iterations wait)


In [29]:
# Display the cross-validation result table.
cv_score

Unnamed: 0,iterations,test-Recall-mean,test-Recall-std,train-Recall-mean,train-Recall-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.000000,0.000000,0.000000,0.000000,0.657185,0.002140,0.656769,0.002359
1,1,0.000334,0.000747,0.000167,0.000373,0.626521,0.002456,0.626004,0.002656
2,2,0.000334,0.000747,0.000566,0.000878,0.598239,0.002116,0.597526,0.002375
3,3,0.000000,0.000000,0.000067,0.000149,0.572133,0.001460,0.571280,0.001804
4,4,0.000000,0.000000,0.000067,0.000149,0.548211,0.002310,0.547297,0.002508
...,...,...,...,...,...,...,...,...,...
202,202,0.064941,0.020933,0.177975,0.004107,0.240884,0.001840,0.204677,0.007735
203,203,0.064941,0.020933,0.178475,0.003726,0.240893,0.001845,0.204595,0.007754
204,204,0.064943,0.020399,0.178792,0.003615,0.240915,0.001839,0.204499,0.007840
205,205,0.064942,0.020532,0.178719,0.003696,0.240935,0.001830,0.204394,0.007895


In [30]:
# Latest date in the data that was cross-validated.
print(df.datum.max())
# Row(s) of the CV table where the mean test Logloss is at its minimum.
cv_score[cv_score['test-Logloss-mean'].eq(cv_score['test-Logloss-mean'].min())]

2021-09-04


Unnamed: 0,iterations,test-Recall-mean,test-Recall-std,train-Recall-mean,train-Recall-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
145,145,0.066952,0.020748,0.152459,0.003419,0.240227,0.00202,0.211076,0.005064
