# Learn v75 med walkthrough-metoden

In [66]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier,Pool,cv,utils

import sys
sys.path.append('C:\\Users\peter\\Documents\\MyProjects\\PyProj\\Trav\\spel')
import V75_scraping as vs

In [67]:
# df skall innehålla datum,avd,vodds
def proba_order_score(df_, y,proba):
    kassa=1000
    df = df_.copy()
    df['proba'] = proba[:,1]
    df['f'] = (df.proba*df.vodds - 1) / (df.vodds-1)  # kelly formel
    df['spela'] = df.f >0
    df['insats'] = df.spela * df.f * kassa

    df.sort_values(['datum','avd','proba'],ascending=[True,True,False],inplace=True)
    proba_order=df.groupby(['datum','avd']).proba.cumcount()

    df['prob_order']=proba_order+1
    df['y'] = y
    
    print('log(proba)',np.log(df.loc[df.y==1].proba).mean())
    return df, df.loc[df.y==1].prob_order.mean()   # mean prob_order för vinnarhäst

In [68]:
def create_ekipage(df_):
    df=df_.copy()
    prefs = ['','h1_','h2_','h3_','h4_','h5_',]
    for pr in prefs:
        df[pr+'ekipage'] = df[pr+'kusk'].str.cat(df['häst'], sep =", ")
        df.drop([pr+'kusk'],axis=1, inplace=True)
        
    return df.drop(['häst'], axis=1)

In [69]:
### returnera en modell med parametrar satta
def get_model(d=6,l2=2,iterations=3000,use_best=True,verbose=False):
    model = CatBoostClassifier(iterations=iterations,use_best_model=use_best, 
        custom_metric=['Logloss', 'AUC','Recall', 'Precision', 'F1', 'Accuracy'],

        eval_metric='Accuracy', 
        depth=d,l2_leaf_reg=l2,
        auto_class_weights='Balanced',verbose=verbose, random_state=2021) 
    return model                

In [70]:
### Features som inte används vid träning
def remove_features(df_,remove_mer=[]):
    # df = df_.copy()
    #remove_mer=['h5_perf','h5_auto','h4_perf','h4_auto', 'h3_perf', 'h2_perf']
    df = df_.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1) #
    if remove_mer:
        df = df.drop(remove_mer,axis=1)
    
    # df=check_unique(df.copy())
    # df=check_corr(df.copy())
    return df

In [71]:
 ## byt ut alla NaN till text för cat_features
def replace_NaN(X_train,X_test=None, cat_features=[]):
    # print('cat_features',cat_features)
    for c in cat_features:
        # print(c)
        X_train.loc[X_train[c].isna(),c] = 'None'       ### byt ut None-värden till texten 'None
        if X_test is not None:  ## om X_test är med
            X_test.loc [X_test[c].isna(),c] = 'None'    ### byt ut None-värden till texten 'None

    return X_train,X_test

In [72]:
def scrape_nya_lopp():
    nya_lopp,strukna = vs.v75_scraping(resultat=True,history=True)

    df=pd.concat([pd.read_csv('all_data.csv'), nya_lopp])
    print('shape med nya lopp',df.shape)
    #ta bort dubletter
    df.drop_duplicates(['datum','avd','häst'],inplace=True)
    df.sort_values(by=['datum','avd'],inplace=True)
    df.reset_index(drop=True,inplace=True)
    print('shape med dubletter bort',df.shape)

    df.to_csv('all_data.csv', index=False)

    print("första datum i df =",df.datum.head(1).to_list()[0])
    print("sista  datum i df =",df.datum.tail(1).to_list()[0])

    return df,nya_lopp

In [73]:
### beräkna vilka datum att använda ###
def get_alla_datum(test_from_proc=0.75, train_from_proc=0, total_omlärning = False):
    if total_omlärning:
        nya_lopp=None
        df = pd.read_csv('all_data.csv')     
        alla_datum = df.datum.unique()
        split_ix = int(len(alla_datum)*test_from_proc)
    else:
        # normalt adderar vi bara 1 eller flera veckor från "omg_att_spela_link.csv"
        df, nya_lopp = scrape_nya_lopp()  # scrape från 'omg_att_spela_link.csv' och addera till df
        omg_df = pd.read_csv('omg_att_spela_link.csv')     
        startix=omg_df.Link.str.find('spel')[0]    # index till 'spel' i url
        alla_datum = omg_df.Link.str.slice(start=startix+5,stop=startix+15).to_list() # en datum 
        split_ix=0
        print(f'datum att lära från {alla_datum}')

    return df,nya_lopp,alla_datum,split_ix


## Walkthrough-funktionen  här

In [74]:

### Kör en walkthrough learn här, en datum i taget framåt

# Jag har ändrat till att alla steg kör utan test-datam ed fast iterations=100
def walkthrough(classic_test=False, verbose=False):
    
    df, nya_lopp, alla_datum, split_ix = get_alla_datum()

    l2_leaf_regs=2
    model=get_model(use_best=False,iterations=100)
    df=remove_features(df.copy())
    cat_features = list(df.loc[:,df.dtypes=='O'].columns)
    df,_ = replace_NaN(df.copy(), cat_features=cat_features)    
    print(f'cat_features {cat_features}\n')

    df['plac']=(df.plac==1)*1
        
    for nr,datum in enumerate(alla_datum[split_ix:]):
        print(f'walk-iter {nr+1} av {len(alla_datum[split_ix:])} ',end=': ')

        X_train = df.loc[df.datum<datum,:].copy()
        y_train = X_train.plac; X_train.drop(['plac'],axis=1,inplace=True)

        if classic_test:    ### klassisk train/test utan walkthrough
            X_test  = df.loc[df.datum>=datum,:].copy()
            y_test  = X_test.plac;  X_test.drop(['plac'],axis=1,inplace=True)
            train_pool = Pool(X_train,y_train,cat_features=cat_features)
            test_pool = Pool(X_test,y_test,cat_features=cat_features)
            model.fit(train_pool,use_best_model=True, verbose=verbose,eval_set=test_pool)
        else:
            X_test  = df.loc[df.datum==datum,:].copy()
            y_test  = X_test.plac;  X_test.drop(['plac'],axis=1,inplace=True)
            train_pool = Pool(X_train,y_train,cat_features=cat_features)
            test_pool = Pool(X_test,y_test,cat_features=cat_features)
            model.fit(train_pool,use_best_model=False, verbose=verbose)

        print('best iteration',model.get_best_iteration(), '\tbest score', round(model.get_best_score()['learn']['Accuracy'],3) )
        ##['validation']['Logloss'],3),'\t', round(model.get_best_score()['validation']['Accuracy:use_weights=true'],3))
        
        if classic_test:    ### klassisk train/test utan walkthrough
            return model,cat_features
    
        model.save_model('modeller/model_'+datum)

    X_train =df.copy().drop('plac',axis=1)
    y_train = df.plac 
    model.fit(X_train,y=y_train,cat_features=cat_features)
    print(f'spara model_senaste',datum)
    model.save_model('modeller/model_senaste')

    return df,nya_lopp, model,cat_features

### Här körs hela walkthrough

In [75]:
df, nya_lopp, model, cat_features = walkthrough(classic_test=False, verbose=False)

omgång 1: https://www.atg.se/spel/2021-10-09/V75/
klickade på ANPASSA
anpassa klar - break
ant resultat 7
ant lopp 7




Ant priser 7
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 1 ÅBY 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 2 ÅBY 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 10 10 10 10




AVD 3 ÅBY 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 4 ÅBY 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 11 11 11 11




AVD 5 ÅBY 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 6 ÅBY 2140 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



.



.
ant names,vodds,podds,rader,streck 12 12 12 12




AVD 7 ÅBY 2640 AUTOSTART 



.



.



.



.



.



.



.



.



.



.



..

det tog 114.871 sekunder
utdelning: 3123, 48, 0
startar Fixa mer
tog bort 1 strukna från 81 till 80
rensade totalt bort 1 hästar i städa_och_rensa. Från 81 till 80
shape med nya lopp (42166, 79)
shape med dubletter bort (42086, 79)
första datum i df = 2014-12-28
sista  datum i df = 2021-10-09
datum att lära från ['2021-10-09']
cat_features ['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_kusk', 'h1_bana', 'h2_kusk', 'h2_bana', 'h3_kusk', 'h3_bana', 'h4_kusk', 'h4_bana', 'h5_kusk', 'h5_bana']

walk-iter 1 av 1 : best iteration None 	best score 0.75
spara model_senaste 2021-10-09


## Kör allt ovanför walkthrough
### Se till att "omg_att_spela_link.csv" är ifylld

# Kör cross validation

## init  - kör först allt t.o.m 'replace_NaN()' ovan

In [76]:
model = get_model().load_model('modeller/model_senaste')
dforg = pd.read_csv('all_data.csv')     
# print(df.columns)
df=remove_features(dforg.copy())
# df['avd']=dforg.avd
cat_features = list(df.loc[:,df.dtypes=='O'].columns)
df,_ = replace_NaN(df.copy(), cat_features=cat_features)    
y=df.plac
y=(y==1)*1
df.drop('plac',axis=1,inplace=True)

## cv

In [77]:

cv_pool = Pool(df,y,cat_features=cat_features)

params = {
         'use_best_model': True,
         'eval_metric' : 'AUC',
         "loss_function": "Logloss",
         'early_stopping_rounds': 100,
         'verbose': 50,
}

cv_score =cv(pool=cv_pool, 
   params=params, 
   dtrain=None, 
   iterations=2000, 
   num_boost_round=None,
   fold_count=5, 
   nfold=None,
   inverted=False,
   partition_random_seed=0,
   seed=2021, 
   shuffle=False, 
   logging_level=None, 
   stratified=True,
   as_pandas=True,
   type='TimeSeries')

0:	test: 0.7231466	best: 0.7231466 (0)
50:	test: 0.8103076	best: 0.8103076 (50)
100:	test: 0.8119351	best: 0.8132971 (88)
150:	test: 0.8106453	best: 0.8132971 (88)
Stopped by overfitting detector  (100 iterations wait)


In [78]:
cv_score

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.723147,0.017090,0.658532,0.000493,0.658296,0.000421
1,1,0.736509,0.027711,0.628294,0.002363,0.627893,0.002414
2,2,0.741092,0.029567,0.601071,0.003482,0.600593,0.003592
3,3,0.759872,0.023102,0.573875,0.001936,0.573239,0.001878
4,4,0.764229,0.015500,0.550024,0.002753,0.549194,0.002708
...,...,...,...,...,...,...,...
184,184,0.810010,0.005546,0.240050,0.002584,0.207421,0.005733
185,185,0.810002,0.005532,0.240062,0.002586,0.207283,0.005798
186,186,0.809982,0.005533,0.240053,0.002593,0.207177,0.005900
187,187,0.810068,0.005653,0.240024,0.002598,0.207101,0.005908


In [79]:
from IPython.display import display
print(df.datum.max())
display(cv_score[cv_score['test-Logloss-mean'].min() == cv_score['test-Logloss-mean']])
display(cv_score[cv_score['test-AUC-mean'].max() == cv_score['test-AUC-mean']])

2021-10-09


Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
135,135,0.810895,0.006198,0.239324,0.002668,0.212978,0.003651


Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
88,88,0.813297,0.0054,0.241036,0.002668,0.221929,0.001061


In [80]:
from sklearn.model_selection import train_test_split

df[['datum','avd','streck','häst','kusk']] = dforg[['datum','avd','streck','häst','kusk']]

# df.drop('datum',axis=1,inplace=True)
df.drop('avd',axis=1,inplace=True)
df.drop(['streck'],axis=1,inplace=True)
# df.drop(['häst','kusk'],axis=1,inplace=True)
cat_features = list(df.loc[:,df.dtypes=='O'].columns)
X_train,X_test,y_train,y_test = train_test_split(df,y,shuffle=False,)
cb=get_model(use_best=True)
cb.fit(X_train,y_train,eval_set= (X_test,y_test),early_stopping_rounds=200, cat_features=cat_features,verbose=100)

0:	learn: 0.6365261	test: 0.6259963	best: 0.6259963 (0)	total: 136ms	remaining: 6m 47s
100:	learn: 0.7092908	test: 0.6361242	best: 0.6466054 (40)	total: 17.9s	remaining: 8m 34s
200:	learn: 0.7325313	test: 0.6352651	best: 0.6466054 (40)	total: 35.3s	remaining: 8m 11s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.6466054152
bestIteration = 40

Shrink model to first 41 iterations.


<catboost.core.CatBoostClassifier at 0x1d39dd0b580>

In [81]:
X_test[['datum','avd','vodds']] = dforg[['datum','avd','vodds']]
predict_prob = cb.predict_proba(X_test)

_,prob_score = proba_order_score(X_test ,y_test, predict_prob)

print('cb med ekipage',prob_score, cb.best_score_['validation']['AUC'])

log(proba) -0.6784942768795404
cb med ekipage 4.005537098560354 0.7065429865701615


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


# FLAML (med och utan ekipage och streck)

In [82]:
def split_data(df_,train_from_proc=0,test_proc=0.25):
    # train_from_proc = where to start both train and test
    # test_proc = how much of the data is test
    df=df_.copy()
    alla_datum = df.datum.unique()
    train_from_datum = alla_datum[ int(len(alla_datum)*train_from_proc)]
    print(train_from_datum)
    X_test=None
    y_test=None
    test_from_datum=alla_datum[-1]
    if test_proc:
        selected_data = alla_datum[ alla_datum >= train_from_datum ]
        test_from_datum = selected_data[ int(len(selected_data)*(1-test_proc)) ]
        print(test_from_datum)
        X_test  = df[df.datum >= test_from_datum]
        y_test  = (X_test.plac==1)*1
        X_test  = X_test.drop('plac',axis=1)
        
    
    X_train = df[(df.datum >= train_from_datum) & (df.datum < test_from_datum) ]
    y_train = (X_train.plac==1)*1
    
    return X_train.drop('plac', axis=1), X_test, y_train, y_test


In [83]:
# prepare all data för flaml
dforg = pd.read_csv('all_data.csv')  

X_train, X_test, y_train, y_test= split_data(dforg,train_from_proc=0.3,test_proc=0.25)
X_train = remove_features(X_train)
X_test  = remove_features( X_test)

# X_train = X_train.drop('streck', axis=1)
# X_test  = X_test.drop ('streck', axis=1)
# X_train.drop('datum', axis=1, inplace=True)
# X_test.drop( 'datum', axis=1, inplace=True)
cat_features = list(X_train.select_dtypes('object').columns)
# X_train, X_test = replace_NaN(X_train.copy(),X_test=X_test.copy(), cat_features=cat_features) 
# X_train.fillna(-1)
# X_test.fillna(-1)
print(cat_features)
X_train.shape,X_test.shape

2017-02-04
2020-05-17
['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_kusk', 'h1_bana', 'h2_kusk', 'h2_bana', 'h3_kusk', 'h3_bana', 'h4_kusk', 'h4_bana', 'h5_kusk', 'h5_bana']


((22081, 68), (7392, 68))

In [84]:
from flaml import AutoML 
cat_features = list(X_train.select_dtypes('object').columns)
starting_points={'lgbm': {'n_estimators': 38,
  'num_leaves': 4,
  'min_child_samples': 2,
  'learning_rate': 0.19098448074739216,
  'log_max_bin': 7,
  'colsample_bytree': 0.8827412174089042,
  'reg_alpha': 0.004577823970660193,
  'reg_lambda': 0.03815584533462228},
 'rf': {'n_estimators': 33,
  'max_features': 0.3251674877768946,
  'max_leaves': 89,
  'criterion': 'entropy'},
 'catboost': {'early_stopping_rounds': 10,
  'learning_rate': 0.007511731949060241},
 'xgboost': {'n_estimators': 575,
  'max_leaves': 46,
  'min_child_weight': 1.032235057697502,
  'learning_rate': 0.013318439439138472,
  'subsample': 0.7908401179782586,
  'colsample_bylevel': 0.6924750037579576,
  'colsample_bytree': 0.7174828796230647,
  'reg_alpha': 0.15461500385937774,
  'reg_lambda': 0.6619886587472544},
 'extra_tree': {'n_estimators': 47,
  'max_features': 0.7934349565988307,
  'max_leaves': 213,
  'criterion': 'entropy'}}
flml_raw_parms={'task': 'classification','split_type':'time', 'metric':'roc_auc', 'starting_points': starting_points,'verbose':False,
        'time_budget':700, 'max_iter':50000000,'n_jobs':5, 'X_val': X_test, 'y_val':y_test,'early_stop':True, 'ensemble':True}

automl_raw = AutoML()
automl_raw.fit(X_train,y_train, **flml_raw_parms)

In [85]:
flm_raw_train_pred= automl_raw.predict_proba(X_train)
flm_raw_test_pred = automl_raw.predict_proba(X_test)

X_test_raw = X_test.copy()
X_test_raw[['datum','avd','vodds']] = dforg[['datum','avd','vodds']]
_,prob_score = proba_order_score(X_test_raw,y_test, flm_raw_test_pred)

print('timeserie, datum,häst, kusk', prob_score, 1-automl_raw.best_loss)
# X_test.columns

log(proba) -1.7477169036865234
timeserie, datum,häst, kusk 3.0392464678178963 0.8231996300199748


timeserie  0.3 0.25, datum, häst, kusk 3.720565149136578  0.7213763318649257 ... 1.9827526807785034 .....   best    
timeserie  0.4 0.25, datum, häst, kusk 3.7362637362637363 0.7214144007762124  
timeserie, 0.2 0.25, datum, häst, kusk 3.760989010989011  0.72561915325073230    
timeserie, 0.1 0.25  datum, häst, kusk 3.8180708180708183 0.726597977829505    
timeserie, 0.5 0.25, datum, häst, kusk 3.936263736263736  0.7216626969090024  
timeserie, 0.3 0.25, datum, häst, kusk streck, NaN 3.0706436420722136  0.8230307821948237   
timeserie, 0.3 0.25, datum, häst, kusk,streck  3.0549450549450547 0.8232840226857013 ... -1.7710182666778564 .......... best   
timeserie, 0.3 0.25, datum, häst, kusk streck, NaN, fillna, 3.0549450549450547 0.8237003593459333   
timeserie, 0.3 0.25, datum, häst, kusk, streck 3.06436420722135   0.8232840226857013       
timeserie, 0.4 0.25, datum, häst, kusk, streck 3.1483516483516483 0.8169106155467452  
timeserie, 0.2 0.25, datum, häst, kusk, streck 3.0824175824175826 0.8220287891340522

## Final FLML model

In [90]:

with_streck = False

# prepare all data för flaml
dforg = pd.read_csv('all_data.csv')  

X_train, _, y_train, _ = split_data(dforg,train_from_proc=0.3,test_proc=None)
X_train = remove_features(X_train)

if not with_streck:
    print('not streck')
    X_train.drop('streck', axis=1, inplace=True)
else:
    print('with streck')    
    
# df = create_ekipage(df)
# df.drop('datum', axis=1, inplace=True)
# X_train, _ = replace_NaN(X_train.copy(), cat_features=cat_features) 

cat_features = list(X_train.select_dtypes('object').columns)
print(cat_features)

2017-02-04
not streck
['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_kusk', 'h1_bana', 'h2_kusk', 'h2_bana', 'h3_kusk', 'h3_bana', 'h4_kusk', 'h4_bana', 'h5_kusk', 'h5_bana']


In [91]:
automl = AutoML()
flml_parms={'task': 'classification','split_type':'time', 'metric':'roc_auc','starting_points': starting_points, 'verbose':False,
        'time_budget':1700, 'max_iter':400000000,'n_jobs':5, 'early_stop':True, 'ensemble':True}

automl.fit(X_train, y_train, **flml_parms)
1-automl.best_loss

0.7209304071839201

In [92]:
import pickle
if with_streck:
    filename = 'modeller\\FLAML_model.sav'
else:
    filename = 'modeller\\FLAML2_model.sav'

print(filename)        
pickle.dump(automl, open(filename, 'wb'))

modeller\FLAML2_model.sav


In [93]:
'streck' in X_train.columns


False