# Learn v75 med walkthrough-metoden

In [28]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier,Pool,cv,utils
import pickle
import sys
sys.path.append('C:\\Users\peter\\Documents\\MyProjects\\PyProj\\Trav\\spel')
import V75_scraping as vs


In [29]:

def proba_order_score(df_, y, proba):  # df skall innehålla datum,avd,vodds
    kassa=1000
    df = df_.copy()
    df['proba'] = proba[:,1]
    df['f'] = (df.proba*df.vodds - 1) / (df.vodds-1)  # kelly formel
    df['spela'] = df.f >0
    df['insats'] = df.spela * df.f * kassa

    df.sort_values(['datum','avd','proba'],ascending=[True,True,False],inplace=True)
    proba_order=df.groupby(['datum','avd']).proba.cumcount()

    df['prob_order']=proba_order+1
    df['y'] = y

    print('log(proba)',np.log(df.loc[df.y==1].proba).mean())
    return df, df.loc[df.y==1].prob_order.mean()   # mean prob_order för vinnarhäst


In [30]:
def create_ekipage(df_):
    df=df_.copy()
    prefs = ['','h1_','h2_','h3_','h4_','h5_',]
    for pr in prefs:
        df[pr+'ekipage'] = df[pr+'kusk'].str.cat(df['häst'], sep =", ")
        df.drop([pr+'kusk'],axis=1, inplace=True)
        
    return df.drop(['häst'], axis=1)

In [31]:
### returnera en modell med parametrar satta
def get_model(d=6,l2=2,iterations=3000,use_best=True,verbose=False):
    model = CatBoostClassifier(iterations=iterations,use_best_model=use_best, 
        custom_metric=['Logloss', 'AUC','Recall', 'Precision', 'F1', 'Accuracy'],

        eval_metric='Accuracy', 
        depth=d,l2_leaf_reg=l2,
        auto_class_weights='Balanced',verbose=verbose, random_state=2021) 
    return model                

In [32]:
### Features som inte används vid träning
def remove_features(df_,remove_mer=[]):
    # df = df_.copy()
    #remove_mer=['h5_perf','h5_auto','h4_perf','h4_auto', 'h3_perf', 'h2_perf']
    df = df_.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1) #
    if remove_mer:
        df = df.drop(remove_mer,axis=1)
    
    # df=check_unique(df.copy())
    # df=check_corr(df.copy())
    return df

In [33]:
 ## byt ut alla NaN till text för cat_features
def replace_NaN(X_train,X_test=None, cat_features=[]):
    # print('cat_features',cat_features)
    for c in cat_features:
        # print(c)
        X_train.loc[X_train[c].isna(),c] = 'None'       ### byt ut None-värden till texten 'None
        if X_test is not None:  ## om X_test är med
            X_test.loc [X_test[c].isna(),c] = 'None'    ### byt ut None-värden till texten 'None

    return X_train,X_test

In [34]:
def scrape_nya_lopp():
    """scrape nya lopp och lägg in i all_data.csv"""
    nya_lopp,strukna = vs.v75_scraping(resultat=True,history=True)

    df=pd.concat([pd.read_csv('all_data.csv'), nya_lopp])
    print('shape med nya lopp',df.shape)
    #ta bort dubletter
    df.drop_duplicates(['datum','avd','häst'],inplace=True)
    df.sort_values(by=['datum','avd'],inplace=True)
    df.reset_index(drop=True,inplace=True)
    print('shape med dubletter bort',df.shape)

    df.to_csv('all_data.csv', index=False)

    print("första datum i df =",df.datum.head(1).to_list()[0])
    print("sista  datum i df =",df.datum.tail(1).to_list()[0])

    return df,nya_lopp

In [35]:
### beräkna vilka datum att använda ###
def get_alla_datum(test_from_proc=0.75, train_from_proc=0, total_omlärning = False):
    if total_omlärning:
        nya_lopp=None
        df = pd.read_csv('all_data.csv')     
        datum_att_lära = df.datum.unique()
        split_ix = int(len(datum_att_lära)*test_from_proc)
    else:
        # normalt adderar vi bara 1 eller flera veckor från "omg_att_spela_link.csv"
        df, nya_lopp = scrape_nya_lopp()  # scrape från 'omg_att_spela_link.csv' och addera till df
        omg_df = pd.read_csv('omg_att_spela_link.csv')     
        startix=omg_df.Link.str.find('spel')[0]    # index till 'spel' i url
        datum_att_lära = omg_df.Link.str.slice(start=startix+5,stop=startix+15).to_list() # en datum 
        split_ix=0
        print(f'datum att lära: {datum_att_lära}')

    return df,nya_lopp,datum_att_lära,split_ix


## Walkthrough-funktionen  här

In [36]:

### Kör en walkthrough learn här, en datum i taget framåt

# Jag har ändrat till att alla steg kör utan test-datam ed fast iterations=100
def walkthrough(classic_test=False, verbose=False):
    
    df, nya_lopp, alla_datum, split_ix = get_alla_datum(0.8)

    l2_leaf_regs=2
    model=get_model(use_best=False,iterations=100)
    df=remove_features(df.copy())
    cat_features = list(df.loc[:,df.dtypes=='O'].columns)
    df,_ = replace_NaN(df.copy(), cat_features=cat_features)    
    print(f'cat_features {cat_features}\n')

    df['plac']=(df.plac==1)*1
        
    for nr,datum in enumerate(alla_datum[split_ix:]):
        print(f'walk-iter {nr+1} av {len(alla_datum[split_ix:])} ',end=': ')

        X_train = df.loc[df.datum<datum,:].copy()
        y_train = X_train.plac; X_train.drop(['plac'],axis=1,inplace=True)

        if classic_test:    ### klassisk train/test utan walkthrough
            X_test  = df.loc[df.datum>=datum,:].copy()
            y_test  = X_test.plac;  X_test.drop(['plac'],axis=1,inplace=True)
            train_pool = Pool(X_train,y_train,cat_features=cat_features)
            test_pool = Pool(X_test,y_test,cat_features=cat_features)
            model.fit(train_pool,use_best_model=True, verbose=verbose,eval_set=test_pool)
        else:
            X_test  = df.loc[df.datum==datum,:].copy()
            y_test  = X_test.plac;  X_test.drop(['plac'],axis=1,inplace=True)
            train_pool = Pool(X_train,y_train,cat_features=cat_features)
            test_pool = Pool(X_test,y_test,cat_features=cat_features)
            model.fit(train_pool,use_best_model=False, verbose=verbose)

        print('best iteration',model.get_best_iteration(), '\tbest score', round(model.get_best_score()['learn']['Accuracy'],3) )
        ##['validation']['Logloss'],3),'\t', round(model.get_best_score()['validation']['Accuracy:use_weights=true'],3))
        
        if classic_test:    ### klassisk train/test utan walkthrough
            return model,cat_features
    
        model.save_model('modeller/model_'+datum)

    X_train =df.copy().drop('plac',axis=1)
    y_train = df.plac 
    model.fit(X_train,y=y_train,cat_features=cat_features)
    print(f'spara model_senaste',datum)
    model.save_model('modeller/model_senaste')

    return df,nya_lopp, model,cat_features

### Här körs hela walkthrough

In [37]:
df, nya_lopp, model, cat_features = walkthrough(classic_test=False, verbose=False)

omgång 1: https://www.atg.se/spel/2022-03-05/V75/




klickade på ANPASSA
hoppar över voods click (verkar vara förifyllt
anpassa klar - break
ant resultat 7
ant lopp 7
EUR: False NOK: False
priser ['Pris: 110.000-55.000-32.000-19.000-12.500-10.000-6.500-5.000 kr (8 priser). Lägst 2.500 kr till alla tävlande.', 'Pris: 110.000-55.000-32.000-19.000-12.500-10.000-6.500-5.000 kr (8 priser). Lägst 2.500 kr till alla tävlande.', 'Pris: 150.000-75.000-40.000-25.000-15.000-11.500-7.500-5.000 kr (8 priser). Lägst 2.500 kr till alla tävlande.', 'Pris: 110.000-55.000-32.000-19.000-12.500-10.000-6.500-5.000 kr (8 priser). Lägst 2.500 kr till alla tävlande.', 'Pris: 125.000-62.500-34.000-21.000-13.500-10.500-7.000-5.000 kr (8 priser). Lägst 2.500 kr till alla tävlande.', 'Pris: 110.000-55.000-32.000-19.000-12.500-10.000-6.500-5.000 kr (8 priser). Lägst 2.500 kr till alla tävlande.', 'Pris: 110.000-55.000-32.000-19.000-12.500-10.000-6.500-5.000 kr (8 priser). Lägst 2.500 kr till alla tävlande.']
Ant priser 7
pris: 110.000
ant names,vodds,podds,rader,str

## Kör allt ovanför walkthrough
### Se till att "omg_att_spela_link.csv" är ifylld

## init  - kör först allt t.o.m 'replace_NaN()' ovan

In [39]:
model = get_model().load_model('modeller/model_senaste')
dforg = pd.read_csv('all_data.csv')     
# print(df.columns)
df=remove_features(dforg.copy())
# df['avd']=dforg.avd
cat_features = list(df.loc[:,df.dtypes=='O'].columns)
df,_ = replace_NaN(df.copy(), cat_features=cat_features)    
y=df.plac
y=(y==1)*1
df.drop('plac',axis=1,inplace=True)

## cv

In [40]:

cv_pool = Pool(df,y,cat_features=cat_features)

params = {
         'use_best_model': True,
         'eval_metric' : 'AUC',
         "loss_function": "Logloss",
         'early_stopping_rounds': 100,
         'verbose': 50,
}

cv_score =cv(pool=cv_pool, 
   params=params, 
   dtrain=None, 
   iterations=2000, 
   num_boost_round=None,
   fold_count=5, 
   nfold=None,
   inverted=False,
   partition_random_seed=0,
   seed=2021, 
   shuffle=False, 
   logging_level=None, 
   stratified=True,
   as_pandas=True,
   type='TimeSeries')

Training on fold [0/5]
0:	test: 0.6870743	best: 0.6870743 (0)	total: 41.7ms	remaining: 1m 23s
50:	test: 0.8164949	best: 0.8164949 (50)	total: 2.87s	remaining: 1m 49s
100:	test: 0.8170444	best: 0.8177909 (87)	total: 5.83s	remaining: 1m 49s
150:	test: 0.8141373	best: 0.8177909 (87)	total: 8.43s	remaining: 1m 43s

bestTest = 0.8177908766
bestIteration = 87

Training on fold [1/5]
0:	test: 0.6960316	best: 0.6960316 (0)	total: 52.1ms	remaining: 1m 44s
50:	test: 0.8038476	best: 0.8062768 (38)	total: 3.38s	remaining: 2m 9s
100:	test: 0.8120015	best: 0.8120770 (87)	total: 6.95s	remaining: 2m 10s
150:	test: 0.8101797	best: 0.8122038 (109)	total: 10.5s	remaining: 2m 8s
200:	test: 0.8084889	best: 0.8122038 (109)	total: 13.9s	remaining: 2m 4s

bestTest = 0.8122037661
bestIteration = 109

Training on fold [2/5]
0:	test: 0.6279293	best: 0.6279293 (0)	total: 92.6ms	remaining: 3m 5s
50:	test: 0.8064775	best: 0.8074158 (43)	total: 3.83s	remaining: 2m 26s
100:	test: 0.8030975	best: 0.8074158 (43)	total:

In [41]:
cv_score

Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.683002,0.034021,0.659567,0.000790,0.659518,0.000781
1,1,0.754246,0.033698,0.626065,0.002684,0.625836,0.002686
2,2,0.757761,0.033026,0.598072,0.002733,0.597654,0.002721
3,3,0.759813,0.032176,0.572086,0.003137,0.571593,0.003109
4,4,0.774361,0.019781,0.546906,0.003834,0.546298,0.003783
...,...,...,...,...,...,...,...
205,205,0.809739,0.005448,0.239894,0.002817,0.207548,0.007370
206,206,0.809727,0.005452,0.239898,0.002819,0.207537,0.007378
207,207,0.809705,0.005460,0.239903,0.002821,0.207520,0.007388
208,208,0.809698,0.005463,0.239898,0.002819,0.207489,0.007408


In [42]:
from IPython.display import display
print(df.datum.min(),df.datum.max())
display(cv_score[cv_score['test-Logloss-mean'].min() == cv_score['test-Logloss-mean']])
display(cv_score[cv_score['test-AUC-mean'].max() == cv_score['test-AUC-mean']])

2014-12-28 2022-03-05


Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
195,195,0.809818,0.005412,0.239852,0.002811,0.207833,0.007241


Unnamed: 0,iterations,test-AUC-mean,test-AUC-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
86,86,0.812328,0.005129,0.24331,0.002539,0.222798,0.002185


In [43]:
from sklearn.model_selection import train_test_split

df[['datum','avd','streck','häst','kusk']] = dforg[['datum','avd','streck','häst','kusk']]

# df.drop('datum',axis=1,inplace=True)
df.drop('avd',axis=1,inplace=True)
df.drop(['streck'],axis=1,inplace=True)
# df.drop(['häst','kusk'],axis=1,inplace=True)
cat_features = list(df.loc[:,df.dtypes=='O'].columns)
X_train,X_test,y_train,y_test = train_test_split(df,y,shuffle=False,)
print("test:",X_test.datum.min(),X_test.datum.max())
print("train:",X_train.datum.min(),X_train.datum.max())
cb=get_model(use_best=True)
cb.fit(X_train,y_train,eval_set= (X_test,y_test),early_stopping_rounds=200, cat_features=cat_features,verbose=100)

test: 2020-01-18 2022-03-05
train: 2014-12-28 2020-01-18
0:	learn: 0.6306328	test: 0.6201398	best: 0.6201398 (0)	total: 77.8ms	remaining: 3m 53s
100:	learn: 0.7093068	test: 0.6405142	best: 0.6427984 (69)	total: 9.83s	remaining: 4m 42s
200:	learn: 0.7312751	test: 0.6419910	best: 0.6497777 (151)	total: 19.7s	remaining: 4m 34s
300:	learn: 0.7512802	test: 0.6399053	best: 0.6497777 (151)	total: 29.4s	remaining: 4m 23s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.6497777346
bestIteration = 151

Shrink model to first 152 iterations.


<catboost.core.CatBoostClassifier at 0x258d3d13190>

In [44]:
X_test[['datum','avd','vodds']] = dforg[['datum','avd','vodds']]
predict_prob = cb.predict_proba(X_test)

_,prob_score = proba_order_score(X_test ,y_test, predict_prob)

print('cb med ekipage',prob_score, cb.best_score_['validation']['AUC'])

log(proba) -0.6808309810850233
cb med ekipage 4.102972399150743 0.705252395847655


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


# FLAML (med och utan ekipage och streck)
För att köra enbart FLAML initiera först med allt innan plus walkthrough  

In [45]:
def ordinal_enc(df_, features):
    df = df_.copy()
    from sklearn.preprocessing import OrdinalEncoder
    enc = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=100000)
    enc.fit(df[[features]])
    df[features] = enc.transform(df[[features]])
    return df,enc

# df,enc = ordinal_enc(dforg,'häst')
# import pickle
# pickle.dump(enc, open('modeller/encoder.sav', 'wb'))


In [46]:
def split_data(df_,train_from_proc=0,test_proc=0.25):
    # train_from_proc = where to start both train and test
    # test_proc = how much of the data is test
    df=df_.copy()
    alla_datum = df.datum.unique()
    train_from_datum = alla_datum[ int(len(alla_datum)*train_from_proc)]
    X_test=None
    y_test=None
    
    if test_proc:
        selected_data = alla_datum[ alla_datum >= train_from_datum ]
        test_from_datum = selected_data[ int(len(selected_data)*(1-test_proc)) ]
        X_test  = df[df.datum >= test_from_datum]
        y_test  = (X_test.plac==1)*1
        X_test  = X_test.drop('plac',axis=1)
        print(f'test from {X_test.datum.min()} to {X_test.datum.max()} (incl)')
    
        X_train = df[(df.datum >= train_from_datum) & (df.datum < test_from_datum) ]
    else:
        print('Only train data - No test')
        X_train = df[(df.datum >= train_from_datum)]
    
    y_train = (X_train.plac==1)*1
    
    print(f'train from {X_train.datum.min()} to {X_train.datum.max()} (incl)')
    
    return X_train.drop('plac', axis=1), X_test, y_train, y_test


In [47]:
# prepare all data för flaml
dforg = pd.read_csv('all_data.csv')  

### enc is the encoder that we will save for use during v75_spel.py ###
### It will be used and finally saved later in this code ###
df,env = ordinal_enc(dforg,'häst')

X_train, X_test, y_train, y_test= split_data(df,train_from_proc=0,test_proc=0.25)
X_train = remove_features(X_train)
X_test  = remove_features( X_test)

# X_train = X_train.drop('streck', axis=1)
# X_test  = X_test.drop ('streck', axis=1)
# X_train.drop('datum', axis=1, inplace=True)
# X_test.drop( 'datum', axis=1, inplace=True)
cat_features = list(X_train.select_dtypes('object').columns)
# X_train, X_test = replace_NaN(X_train.copy(),X_test=X_test.copy(), cat_features=cat_features) 
# X_train.fillna(-1)
# X_test.fillna(-1)
print(cat_features)
X_train.shape,X_test.shape

test from 2020-01-11 to 2022-03-05 (incl)
train from 2014-12-28 to 2020-01-04 (incl)
['datum', 'bana', 'kusk', 'kön', 'h1_kusk', 'h1_bana', 'h2_kusk', 'h2_bana', 'h3_kusk', 'h3_bana', 'h4_kusk', 'h4_bana', 'h5_kusk', 'h5_bana']


((32861, 68), (11117, 68))

In [48]:
from flaml import AutoML 
cat_features = list(X_train.select_dtypes('object').columns)
starting_points={'lgbm': {'n_estimators': 38,
  'num_leaves': 4,
  'min_child_samples': 2,
  'learning_rate': 0.19098448074739216,
  'log_max_bin': 7,
  'colsample_bytree': 0.8827412174089042,
  'reg_alpha': 0.004577823970660193,
  'reg_lambda': 0.03815584533462228},
 'rf': {'n_estimators': 33,
  'max_features': 0.3251674877768946,
  'max_leaves': 89,
  'criterion': 'entropy'},
 'catboost': {'early_stopping_rounds': 50,
  'learning_rate': 0.007511731949060241},
 'xgboost': {'n_estimators': 575,
  'max_leaves': 46,
  'min_child_weight': 1.032235057697502,
  'learning_rate': 0.013318439439138472,
  'subsample': 0.7908401179782586,
  'colsample_bylevel': 0.6924750037579576,
  'colsample_bytree': 0.7174828796230647,
  'reg_alpha': 0.15461500385937774,
  'reg_lambda': 0.6619886587472544},
 'extra_tree': {'n_estimators': 47,
  'max_features': 0.7934349565988307,
  'max_leaves': 213,
  'criterion': 'entropy'}}
flml_raw_parms={'task': 'classification','split_type':'time', 'metric':'roc_auc', 'starting_points': starting_points,'verbose':False,
        'time_budget':1200, 'max_iter':50000000,'n_jobs':5, 'X_val': X_test, 'y_val':y_test,'early_stop':True, 'ensemble':True}

automl_raw = AutoML()
automl_raw.fit(X_train,y_train, **flml_raw_parms)

In [49]:
flm_raw_train_pred= automl_raw.predict_proba(X_train)
flm_raw_test_pred = automl_raw.predict_proba(X_test)

X_test_raw = X_test.copy()
X_test_raw[['datum','avd','vodds']] = dforg[['datum','avd','vodds']]
_,prob_score = proba_order_score(X_test_raw,y_test, flm_raw_test_pred)

print('timeserie, datum,häst, kusk', prob_score, 1-automl_raw.best_loss)
# X_test.columns

log(proba) -1.8577109580524114
timeserie, datum,häst, kusk 3.1081932773109244 0.8200770273677597


## Final FLML models

In [50]:
def save_best_config(best_config):
    # save best_config
    import pickle
    with open('best_config_per_estimator.sav', "wb") as f:
        pickle.dump(best_config, f)

In [51]:
def run_flaml(X_train, y_train, df_perf, save=True):
    # read best_config
    with open('best_config_per_estimator.sav', "rb") as f:
        best_config = pickle.load(f)
        
    from_date = X_train.datum.min()
    to_date = X_train.datum.max()
    automl = [None,None]
    for with_streck in [True, False]:
        if with_streck: 
            X_tr = X_train.copy()    
            filename = 'modeller\\FLAML_model.sav'
        else:
            X_tr = X_train.drop('streck', axis=1).copy()
            filename = 'modeller\\FLAML2_model.sav'
            
        print('with_streck = ',with_streck)   
    
        automl[with_streck] = AutoML()
        
        with open('best_config_per_estimator.sav', "rb") as f:
            best_config = pickle.load(f)
            
        flml_parms={'task': 'classification','split_type':'time', 'metric':'roc_auc','starting_points': best_config[with_streck], 'verbose':False,
        'time_budget':2600,'n_jobs':5, 'early_stop':True, 'ensemble':True}

        automl[with_streck].fit(X_tr, y_train, **flml_parms)
        perf = 1-automl[with_streck].best_loss
        print(perf, 'for streck in columns', with_streck)
        df_perf.loc[len(df_perf)] = [from_date, to_date, with_streck, perf]
        
        # save_model
        if save:
            print('save model in',filename)   
            with open(filename,"wb") as f:
                pickle.dump(automl[with_streck], f, pickle.HIGHEST_PROTOCOL)
                
            print('save encoder enc in encoder.sav')
            with open('encoder.sav',"wb") as f:
                pickle.dump(enc, f, pickle.HIGHEST_PROTOCOL)
             
    save_best_config(best_config)  

    # remove duplicates
    df_perf.drop_duplicates(subset=['learn_from','learn_to','streck'], keep='last', inplace=True)

    print('\n','\n')
    display(df_perf.tail(30).sort_values(by=['perf'], ascending=False))
    print('Med streck max:',df_perf.loc[df_perf.streck == True].perf.max())
    print('Ej  streck max:',df_perf.loc[df_perf.streck == False].perf.max())

    df_perf.to_csv('perf_flaml.csv', index=False)
    return automl

In [52]:
# prepare all data för flaml
dforg = pd.read_csv('all_data.csv')  
df,enc = ordinal_enc(dforg, 'häst')
X_train, _, y_train, _ = split_data(df,train_from_proc=0,test_proc=None)
X_train = remove_features(X_train)

df_perf = pd.read_csv('perf_flaml.csv')
automl = run_flaml(X_train, y_train, df_perf)


Only train data - No test
train from 2014-12-28 to 2022-03-05 (incl)
with_streck =  True


2022-03-07 10:09:09.434 INFO    flaml.searcher.blendsearch: No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'. More info can be found at https://microsoft.github.io/FLAML/docs/FAQ#about-low_cost_partial_config-in-tune


0.817578749158179 for streck in columns True
save model in modeller\FLAML_model.sav
save encoder enc in encoder.sav
with_streck =  False
0.7274465110751244 for streck in columns False
save model in modeller\FLAML2_model.sav
save encoder enc in encoder.sav

 



Unnamed: 0,learn_from,learn_to,streck,perf
2,2017-02-26,2022-01-01,True,0.818728
0,2017-02-26,2021-12-31,True,0.818714
20,2014-12-28,2022-02-26,True,0.817911
18,2014-12-28,2022-02-19,True,0.817896
16,2014-12-28,2022-02-12,True,0.817811
22,2014-12-28,2022-03-05,True,0.817579
14,2014-12-28,2022-02-05,True,0.817489
12,2014-12-28,2022-01-29,True,0.817124
10,2014-12-28,2022-01-22,True,0.816984
4,2014-12-28,2022-01-01,True,0.816678


Med streck max: 0.8187283877462516
Ej  streck max: 0.7287040948165976


# remove garbage

In [53]:
def remove_garbage():
    import subprocess
    subprocess.call([r'C:/Users/peter/Documents/MyProjects/PyProj/Trav/spel/remove_dirt.bat'])
remove_garbage()    

# Tester

In [54]:
# Kolla autmoml-grejer från dokumntationen
#### How much time is needed to find the best model​
#### If you want to get a sense of how much time is needed to find the best model, you can use max_iter = 2 to perform two trials first.
#### You will see the time to finish the first and cheapest trial in seconds. 
#### The estimated necessary time budget in, and the estimated sufficient time budget inseconds. 
def set_time_budget():
  import time
  start_time = time.time()

  automl = AutoML()

  with open('best_config_per_estimator.sav', "rb") as f:
      best_config = pickle.load(f)
      
  flml_parms = {'task': 'classification', 'split_type': 'time', 'metric': 'roc_auc', 'starting_points': best_config[0], 
                'time_budget':2600, 'n_jobs': 5, 'early_stop': True, 'ensemble': True}
              # 'time_budget': 1700, 'max_iter': 400000000, 'n_jobs': 5, 'early_stop': True, 'ensemble': True,'verbose': True,}

  automl.fit(X_train, y_train, log_file_name='flaml_log.json', **flml_parms)

  return automl, time.time() - start_time

tid=None
# automl, tid = set_time_budget()  # tid in sekunder
print('tid', tid)

tid None


In [55]:
print('best_estimator:', automl.best_estimator)
print('best eest. config:',automl.best_config)
print('\nbest per est.:',automl.best_config_per_estimator)
print('\nbest_config train time:',automl.best_config_train_time)
print('best iteration:',automl.best_iteration)
print('best loss:',automl.best_loss)
print('time to find best mod.',automl.time_to_find_best_model)
print('\nhistory:',automl.config_history)

AttributeError: 'list' object has no attribute 'best_estimator'

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from flaml.data import get_output_from_log
time_history, best_valid_loss_history, valid_loss_history, config_history, metric_history = get_output_from_log(
    filename="flaml_log.json", time_budget=120)

plt.title("Learning Curve")
plt.xlabel("Wall Clock Time (s)")
plt.ylabel("Validation Accuracy")
plt.step(time_history, 1 - np.array(best_valid_loss_history), where="post")
plt.show()


## Hur skall vi köra den ny modellen 
1. preprocessa datat (nya kolumner)
    - proba och eller Kelly, ant hästar i loppet, favoriter, bara solklara favoriter
2. bestäm cat_features
3. Kör catboost eller flaml?
   - om flaml träna upp den
   - 

In [None]:
remove_garbage()