# Testar logiken i jupyter innan jag går över till python med streamlit

## Läs in fejkad web scraping data

## Ladda in två flaml-modeller samt ett par av de nya modellerna

## Skapa en egen data prepare för varje modell och kör dem 
### och spara en predict per modell

## Fejka knappen scrape

## Fejka valet av modell och vad som skall triggas

## Visualisera det tillhörande datat

In [2]:
# moduler
import pandas as pd
import numpy as np
import pickle
from catboost import CatBoostClassifier, Pool, cv
from IPython.display import display
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 200)


In [3]:
def remove_features(df_, remove_mer=[]):
    df = df_.copy()
    df.drop(['startnr', 'vodds', 'podds', 'bins', 'h1_dat',
             'h2_dat', 'h3_dat', 'h4_dat', 'h5_dat'], axis=1, inplace=True)
    if remove_mer:
        df.drop(remove_mer, axis=1, inplace=True)

    return df

In [4]:
def prepare_for_catboost(X):
    # get numerical features and cat_features
    num_features = list(X.drop('avd', axis=1).select_dtypes(include=[np.number]).columns)
    cat_features = list(X.drop('avd', axis=1).select_dtypes(include=['object']).columns)
    print(f'Rätt längd på summan (med datum)? {1+len(num_features)+len(cat_features)} == {len(X.columns)}')
    # check cat_features isna
    print('NaN before:', X[cat_features].isna().sum()[X[cat_features].isna().sum() > 0].sort_values(ascending=False).sum())

    # impute 'missing' for all NaN in cat_features
    X[cat_features] = X[cat_features].fillna('missing')
    print('NaN after:', X[cat_features].isna().sum().sum())
    return X, cat_features

In [5]:
def kelly(proba, streck, odds):  # proba = prob winning, streck i % = streck
    with open('rf_streck_odds.pkl', 'rb') as f:
        rf = pickle.load(f)

    if odds is None:
        o = rf.predict(streck.copy())
    else:
        o = rf.predict(streck.copy())

    # for each values > 40 in odds set to 1
    o[o > 40] = 1
    return (o*proba - (1-proba))/o


In [6]:
# antal hästar per avdeling
def lägg_in_antal_hästar(df_):
    df = df_.copy()
    df['ant_per_lopp'] = None
    df['ant_per_lopp'] = df.groupby(['datum', 'avd'])['avd'].transform('count')
    return df

# mest streck per avdeling
def mest_streck(X_, i, datum, avd):
    X = X_.copy()
    X.sort_values(by=['datum', 'avd', 'streck'], ascending=[
                  True, True, False], inplace=True)
    return X.loc[(X.datum == datum) & (X.avd == avd), 'streck'].iloc[i]

# mest streck per avd som fetures (n bästa)
def lägg_in_motståndare(X_, ant_motståndare):
    X = X_.copy()

    # set X['motståndare1'] to largest streck in every avd
    grouped = X.groupby(['datum', 'avd'])['streck']
    X['motståndare1'] = grouped.transform(max)

    for i in range(2, ant_motståndare+1):
        # set X['motståndare'+str(i)] to ith largest streck in every avd
        X['motståndare' + str(i)] = grouped.transform(lambda x: x.nlargest(i).min())

    return X

# diff streck per avd som fetures (n största diffarna)
def lägg_in_diff_motståndare(X_, motståndare):
    X = X_.copy()

    # set X['motståndare1'] to largest streck in every avd
    grouped = X.groupby(['datum', 'avd'])['streck']
    X['diff1'] = grouped.transform(max) - X.streck

    for i in range(2, motståndare+1):
        # set X['motståndare'+str(i)] to ith largest streck in every avd
        X['diff' + str(i)] = grouped.transform(lambda x: x.nlargest(i).min()) - X.streck

    return X

In [7]:
def compute_total_insats(df):
    insats = 0
    # group by avd
    summa = df.groupby('avd').avd.count().prod() / 2
    return summa

# för en omgång (ett datum) ta ut största diff för streck per avd men bara om diff >= 25 om only_clear
def lista_med_favoriter(df_, ant, only_clear):
    df = df_.copy()
    min_diff = 25 if only_clear else 0
    # sortera på avd,streck
    df = df.sort_values(['avd', 'streck'], ascending=[False, False])
    diff_list = []
    for avd in range(1, 8):
        diff = df.loc[df.avd == avd].streck.iloc[0] - \
            df.loc[df.avd == avd].streck.iloc[1]
        if diff >= min_diff:
            diff_list.append((avd, diff))

     # sortera på diff
    diff_list = sorted(diff_list, key=lambda x: x[1], reverse=True)
    return diff_list[:ant]

# temp is a list of tuples (avd, diff). check if avd is in the list
def check_avd(avd, temp):
    for t in temp:
        if t[0] == avd:
            return True
    return False


In [8]:
class Typ():
    def __init__(self, name, ant_hästar, proba, kelly, motst_ant, motst_diff,  ant_favoriter, only_clear, streck):
        assert (motst_diff==False and motst_ant==0) or (motst_ant>0)
        assert (ant_favoriter==0 and only_clear==False) or (ant_favoriter>0)
        self.name = name                # string för filnamn mm
        
        # inkludera features eller ej
        self.ant_hästar = ant_hästar    # int feature med antal hästar per avdelning
        self.motst_ant = motst_ant      # int inkludera n features med bästa motståndare (streck) 
        self.motst_diff = motst_diff    # bool ovanstående med diff istf fasta värden
        self.streck = streck            # bool inkludera feature med streck
        
        # urval av rader
        self.proba = proba              # bool för prioritering vid urval av rader
        self.kelly = kelly              # bool för prioritering vid urval av rader
        self.ant_favoriter = ant_favoriter # int för hur många favoriter (avd med en häst) som ska användas
        self.only_clear = only_clear    # bool för att bara avvända klara favoriter
    
    def load_model(self):
        with open(self.name+'.model', 'rb') as f:
            model = pickle.load(f)
        return model
    
    def save_model(self,model):
        with open(self.name+'.model', 'wb') as f:
            pickle.dump(model, f)
            
    def prepare_data(self, X_):
        X = X_.copy()
        if self.ant_hästar:
            X = lägg_in_antal_hästar(X)
        if self.motst_diff:
            X = lägg_in_diff_motståndare(X, self.motst_ant)
        elif self.motst_ant>0:
            X = lägg_in_motståndare(X, self.motst_ant)
        # Behåll streck ända tills learn och predict (används för prioritera rader)
        return X

    def learn(self, X_, y, iterations=1000, save=True, verbose=False):
        cbc = CatBoostClassifier(iterations=iterations, loss_function='Logloss', eval_metric='AUC', verbose=verbose)

        X = self.prepare_data(X_)
        if not self.streck:
            X.drop('streck', axis=1, inplace=True)
        # X.drop('avd', axis=1, inplace=True)
        X = remove_features(X, remove_mer=['avd','datum'])
        # print(X.columns)
        cat_features = X.select_dtypes(include=['object']).columns.tolist()
        cbc.fit(X, y, cat_features, use_best_model=False)
        # print('best iter',cbc.best_iteration_)
        print('best score',cbc.best_score_)
        if save:
            self.save_model(cbc)

    def predict(self, X_):
        X = self.prepare_data(X_)
        X = remove_features(X, remove_mer=['avd','datum'])
        model = self.load_model()
        if not self.streck:
            X.drop('streck', axis=1, inplace=True)
            
        return model.predict_proba(X)[:,1]
    
    def spela(self,X_, max_insats=300, margin=1.2):
        print(f'Max insats={max_insats} Margin={margin}')
        X = X_.copy()
        X['proba'] = self.predict(X)
        X['kelly'] = kelly(X.proba, X[['streck']], None)

        dfSpel = pd.DataFrame()
        if self.proba:
            X = X.sort_values(by='proba', ascending=False)
            if self.kelly:
                X2 = X.sort_values(by='kelly', ascending=False)
        else:
            X = X.sort_values(by='kelly', ascending=False) # must be kelly
            
        # se till att vi har minst en häst för alla avd. Välj den bästa per avd
        for avd in range(1, 8):
            dfSpel = dfSpel.append(X[X.avd == avd].iloc[0])
        
        favorit_list = lista_med_favoriter(X, self.ant_favoriter, self.only_clear)
        curr_insats = 0
        for cnt_rows, (_, row) in enumerate(X.iterrows()): 
            if check_avd(row.avd, favorit_list):  # avd med en favorit - inga fler hästar
                continue
            
            dfSpel = dfSpel.append(row)
            
            curr_insats = compute_total_insats(dfSpel)
            if curr_insats > max_insats*margin:   # överstiger x% av max insats?
                dfSpel = dfSpel.iloc[:-1, :]    # ta bort sista hästen
                curr_insats = compute_total_insats(dfSpel)
                break
            
            if self.kelly & self.proba:
                row2 = X2.iloc[cnt_rows]
                dfSpel = dfSpel.append(row2) # Addera en häst med bästa kelly
                # remove duplicates in dfSpel and keep the first
            
            dfSpel = dfSpel.drop_duplicates(subset=['avd', 'häst'], keep='first')
            curr_insats = compute_total_insats(dfSpel) # kolla igen
            if curr_insats > max_insats*margin:   # överstiger x% av max insats?
                dfSpel = dfSpel.iloc[:-1, :]    # ta bort sista hästen
                curr_insats = compute_total_insats(dfSpel)
                break
            
        return dfSpel,curr_insats

In [9]:
# skapa modeller
#           name, ant_hästar, proba, kelly, motst_ant, motst_diff,  ant_favoriter, only_clear, streck
typ6 = Typ('typ6', True,       True, False,    0,       False,         0,          False,      True)
typ1 = Typ('typ1', False,      True, False,    2,       True,          2,          True,      False)
typ9 = Typ('typ9', True,       True, True,     2,       True,          2,          True,      True)
typ16 =Typ('typ16',True,       True, True,     2,       True,          2,          False,      True)


In [10]:
# läs in data
df = pd.read_csv('..\\all_data.csv')
# Följande datum saknar avd==5 och kan inte användas
saknas = ['2015-08-15', '2016-08-13', '2017-08-12']
df = df[~df.datum.isin(saknas)]
X = df.copy()
X.drop('plac', axis=1, inplace=True)
# X = ordinal_enc(X, 'häst')
y = (df.plac == 1)*1   # plac 1 eller 0

for f in ['häst', 'bana', 'kusk', 'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk', 'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']:
    X[f] = X[f].str.lower()

X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

X, cat_features = prepare_for_catboost(X)
print('cat_features:', cat_features)
X.tail()


Rätt längd på summan (med datum)? 78 == 78
NaN before: 369
NaN after: 0
cat_features: ['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_dat', 'h1_kusk', 'h1_bana', 'h2_dat', 'h2_kusk', 'h2_bana', 'h3_dat', 'h3_kusk', 'h3_bana', 'h4_dat', 'h4_kusk', 'h4_bana', 'h5_dat', 'h5_kusk', 'h5_bana']


Unnamed: 0,datum,avd,bana,häst,kusk,streck,vodds,podds,kr,spår,dist,lopp_dist,start,ålder,kön,pris,h1_dat,h1_kusk,h1_bana,h1_spår,h1_plac,h1_pris,h1_odds,h1_kmtid,h2_dat,h2_kusk,h2_bana,h2_spår,h2_plac,h2_pris,h2_odds,h2_kmtid,h3_dat,h3_kusk,h3_bana,h3_spår,h3_plac,h3_pris,h3_odds,h3_kmtid,h4_dat,h4_kusk,h4_bana,h4_spår,h4_plac,h4_pris,h4_odds,h4_kmtid,h5_dat,h5_kusk,h5_bana,h5_spår,h5_plac,h5_pris,h5_odds,h5_kmtid,h1_dist,h2_dist,h3_dist,h4_dist,h5_dist,bins,h1_auto,h2_auto,h3_auto,h4_auto,h5_auto,h1_perf,h2_perf,h3_perf,h4_perf,h5_perf,senast,delta1,delta2,delta3,delta4,startnr
43777,2022-03-05,7,kalmar,ready häggenäs,adrian kolgjini,1.0,34.14,6.12,11040.0,8.0,2140.0,2140.0,0,5,v,1.0,2022-02-17 00:00:00,adrian kolgjini,åby-8,12.0,2.0,35.0,8.72,13.0,2022-02-08 00:00:00,peter ingves,jägersro-7,12.0,2.0,35.0,2.16,14.9,2022-01-05 00:00:00,michael nimczyk,jägersro-9,12.0,5.0,110.0,14.11,15.3,2021-12-23 00:00:00,thomas uhrberg,halmstad-6,12.0,5.0,100.0,10.39,13.0,2021-12-09 00:00:00,thomas uhrberg,åby-9,12.0,5.0,40.0,4.42,14.5,1640.0,2160.0,2640.0,1640.0,2140.0,22,1,0,1,1,1,3935.030968,3935.030968,1556.570345,1484.131591,938.647235,16.0,9.0,34.0,13.0,14.0,8.0
43778,2022-03-05,7,kalmar,bear a perfection,kevin oscarsson,1.0,40.34,6.53,10702.0,9.0,2140.0,2140.0,0,6,v,1.0,2022-02-08 00:00:00,håkan b johansson,jägersro-9,6.0,15.0,35.0,10.26,14.6,2022-01-23 00:00:00,kevin oscarsson,halmstad-5,6.0,1.0,40.0,15.81,15.6,2022-01-14 00:00:00,kevin oscarsson,kalmar-9,6.0,4.0,40.0,12.05,14.3,2021-12-14 00:00:00,kevin oscarsson,axevalla-8,6.0,4.0,30.0,4.54,15.0,2021-12-03 00:00:00,markus niklasson,kalmar-7,6.0,4.0,30.0,25.85,16.1,2140.0,2640.0,2140.0,2100.0,2640.0,24,1,1,1,1,1,5.91608,6935.717077,1547.567662,1340.232909,1340.232909,25.0,16.0,9.0,31.0,11.0,9.0
43779,2022-03-05,7,kalmar,nice and quick,gustav johansson,0.0,93.24,12.05,13504.0,10.0,2140.0,2140.0,0,6,v,0.0,2022-02-17 00:00:00,gustav johansson,åby-8,2.0,4.0,35.0,4.18,13.1,2022-01-23 00:00:00,gustav johansson,halmstad-3,2.0,5.0,35.0,2.91,14.9,2022-01-08 00:00:00,gustav johansson,mantorp-9,2.0,4.0,110.0,21.44,12.9,2021-12-10 00:00:00,gustav johansson,kalmar-7,2.0,4.0,30.0,2.76,15.5,2021-11-20 00:00:00,gustav johansson,jägersro-9,2.0,4.0,110.0,24.86,12.7,1640.0,1640.0,1640.0,1640.0,1640.0,29,1,1,1,1,1,1447.616994,878.02409,2566.350636,1340.232909,2566.350636,16.0,25.0,15.0,29.0,20.0,10.0
43780,2022-03-05,7,kalmar,graces candy,örjan kihlström,5.0,13.27,3.2,18057.0,11.0,2140.0,2140.0,0,5,s,5.0,2022-02-19 00:00:00,örjan kihlström,bergsåker-5,5.0,15.0,110.0,5.51,14.7,2022-01-29 00:00:00,jörgen westholm,östersund-7,5.0,2.0,110.0,11.15,13.1,2022-01-17 00:00:00,jörgen westholm,romme-8,5.0,20.0,40.0,6.75,30.0,2022-01-09 00:00:00,jörgen westholm,romme-10,5.0,20.0,35.0,5.82,15.1,2022-01-05 00:00:00,örjan kihlström,solvalla-2,5.0,20.0,40.0,2.56,14.8,2140.0,1640.0,2140.0,2140.0,2160.0,14,0,1,1,1,0,10.488088,6976.0643,0.519151,0.485621,0.519151,14.0,21.0,12.0,8.0,4.0,11.0
43781,2022-03-05,7,kalmar,granit de la roque,markus niklasson,0.0,109.97,16.66,11667.0,12.0,2140.0,2140.0,0,6,v,0.0,2022-02-12 00:00:00,jonathan carre,åby-1,3.0,3.0,35.0,1.94,15.8,2022-01-30 00:00:00,jonathan carre,axevalla-6,3.0,1.0,35.0,2.28,16.2,2022-01-23 00:00:00,jim oscarsson,halmstad-3,3.0,1.0,35.0,19.89,14.4,2022-01-14 00:00:00,jonathan carre,kalmar-3,3.0,1.0,30.0,2.33,15.9,2022-01-08 00:00:00,jim oscarsson,mantorp-3,3.0,1.0,30.0,5.52,15.2,2140.0,2140.0,1640.0,2140.0,2140.0,29,1,1,1,1,1,2386.716929,6487.769258,6487.769258,6006.507182,6006.507182,21.0,13.0,7.0,9.0,6.0,12.0


In [11]:
# fejkad omgång
df_omg = df.loc[df.datum==df.datum.unique()[-1]].copy()
df_omg.reset_index(inplace=True, drop=True)
X_omg = df_omg.drop('plac', axis=1)
y_omg = (df_omg.plac == 1)*1   # plac 1 eller 0
X_omg, _ = prepare_for_catboost(X_omg)
print(X_omg.datum.iloc[-1])


Rätt längd på summan (med datum)? 78 == 78
NaN before: 0
NaN after: 0
2022-03-05


In [12]:
# Kör fejkad omgång
# prepare_for_catboost
# X,_ = prepare_for_catboost(df_omg)
for f in ['häst','bana', 'kusk', 'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk', 'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']:
    X_omg[f] = X_omg[f].str.lower()

# en rad för varje modell
rader = pd.DataFrame()
for typ in [typ6,typ1,typ9,typ16]: # modeller
    rad, insats=typ.spela(X_omg)
    rad.sort_values(by=['avd','häst'], inplace=True)
    rader[['proba'+typ.name, 'kelly'+typ.name]] = rad[['proba','kelly']].copy()
    print(typ.name, insats)
    #print(rad[['avd','startnr','häst','proba','kelly']].sort_values(by=['avd','startnr'], ascending=True))


Max insats=300 Margin=1.2
typ6 288.0
Max insats=300 Margin=1.2
typ1 336.0
Max insats=300 Margin=1.2
typ9 216.0
Max insats=300 Margin=1.2
typ16 216.0


In [13]:
# för stacking ta med alla hästar per typ och proba plus kelly
def build_stack_df(df_):
    df = df_.copy()
    for typ in [typ6,typ1,typ9,typ16]:
        nr = typ.name[3:]
        df['proba'+nr] = typ.predict(df)
        df['kelly'+nr] = kelly(df['proba'+nr], df[['streck']], None)
    col = list(df.columns[1:2])+list(df.columns[3:4])+list(df.columns[-8:])
    return df.loc[:,col]
df_stack=build_stack_df(X)  


In [14]:
from sklearn.ensemble import RandomForestClassifier
rf_stacker = RandomForestClassifier(n_estimators=100, max_depth=5, random_state=0)
cols = list(df_stack.columns)[-8:]
rf_stacker.fit(df_stack[cols], y)
# loop med följande:
#   1. ta fram datum
#   2. X_train = X[X.datum<datum], y_train = y[X.datum<datum]
#   3. X_test = X[X.datum==datum], y_test = y[X.datum==datum]
#   4. X.validate = X[X.datum>datum], y.validate = y[X.datum>datum]
#   5. for typ in [typ6,typ1,typ9,typ16]:
#       5.1. typ.learn(X_train, y_train,X_test, y_test)
#   6. stack_df = build_stack_df(X_validate)


RandomForestClassifier(max_depth=5, random_state=0)

## Only for Learning

In [23]:
# only for Learn!
df = pd.read_csv('..\\all_data.csv')
# Följande datum saknar avd==5 och kan inte användas
saknas = ['2015-08-15', '2016-08-13', '2017-08-12']
df = df[~df.datum.isin(saknas)]
X = df.copy()
X.drop('plac', axis=1, inplace=True)
# X = ordinal_enc(X, 'häst')
y = (df.plac == 1)*1   # plac 1 eller 0

for f in ['häst', 'bana', 'kusk', 'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk', 'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']:
    X[f] = X[f].str.lower()

X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

X, cat_features = prepare_for_catboost(X)
print('cat_features:', cat_features)
X.head()


Rätt längd på summan (med datum)? 78 == 78
NaN before: 369
NaN after: 0
cat_features: ['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_dat', 'h1_kusk', 'h1_bana', 'h2_dat', 'h2_kusk', 'h2_bana', 'h3_dat', 'h3_kusk', 'h3_bana', 'h4_dat', 'h4_kusk', 'h4_bana', 'h5_dat', 'h5_kusk', 'h5_bana']


Unnamed: 0,datum,avd,bana,häst,kusk,streck,vodds,podds,kr,spår,dist,lopp_dist,start,ålder,kön,pris,h1_dat,h1_kusk,h1_bana,h1_spår,h1_plac,h1_pris,h1_odds,h1_kmtid,h2_dat,h2_kusk,h2_bana,h2_spår,h2_plac,h2_pris,h2_odds,h2_kmtid,h3_dat,h3_kusk,h3_bana,h3_spår,h3_plac,h3_pris,h3_odds,h3_kmtid,h4_dat,h4_kusk,h4_bana,h4_spår,h4_plac,h4_pris,h4_odds,h4_kmtid,h5_dat,h5_kusk,h5_bana,h5_spår,h5_plac,h5_pris,h5_odds,h5_kmtid,h1_dist,h2_dist,h3_dist,h4_dist,h5_dist,bins,h1_auto,h2_auto,h3_auto,h4_auto,h5_auto,h1_perf,h2_perf,h3_perf,h4_perf,h5_perf,senast,delta1,delta2,delta3,delta4,startnr
0,2014-12-28,1,örebro,allaballakaitoz,carl-erik lindblom,5.0,13.39,2.985,21018.0,6.0,2100.0,2100.0,0,6,v,125000.0,2014-12-07 00:00:00,carl-erik lindblom,eskilstuna,3.0,2.0,35.0,3.92,16.8,2014-11-18 00:00:00,carl-erik lindblom,eskilstuna,3.0,1.0,30.0,3.7,14.9,2014-11-01 00:00:00,carl-erik lindblom,eskilstuna,3.0,15.0,125.0,52.42,14.3,2014-10-22 00:00:00,carl-erik lindblom,solvalla,3.0,15.0,70.0,5.2,13.9,2014-10-04 00:00:00,per fromell,örebro,3.0,15.0,25.0,2.2,12.3,2140.0,2140.0,2640.0,2140.0,1609.0,14,1,1,1,1,1,3935.030968,6006.507182,11.18034,8.3666,5.0,21.0,19.0,17.0,10.0,18.0,
1,2014-12-28,1,örebro,aristocat boko,ulf ohlsson,7.0,8.21,2.555,23466.0,12.0,2100.0,2100.0,0,7,v,125000.0,2014-12-20 00:00:00,örjan kihlström,åby,7.0,4.0,125.0,12.45,30.0,2014-12-13 00:00:00,örjan kihlström,mantorp,7.0,5.0,100.0,7.55,30.0,2014-11-19 00:00:00,ulf ohlsson,solvalla,7.0,6.0,70.0,15.58,13.6,2014-11-05 00:00:00,ulf ohlsson,bergsåker,7.0,6.0,70.0,2.55,15.1,2014-10-25 00:00:00,ulf ohlsson,jägersro,7.0,6.0,100.0,7.2,15.6,1640.0,3180.0,2140.0,2640.0,3180.0,9,1,0,1,1,0,2735.73897,1484.131591,753.137355,753.137355,900.171313,8.0,7.0,24.0,14.0,11.0,
2,2014-12-28,1,örebro,art on line,johnny takter,23.0,2.92,1.565,20696.0,2.0,2100.0,2100.0,0,7,v,125000.0,2014-12-19 00:00:00,iina aho,axevalla,9.0,2.0,20.0,1.95,13.2,2014-12-13 00:00:00,mikko aho,mantorp,9.0,20.0,125.0,35.1,30.0,2014-11-24 00:00:00,iina aho,halmstad,9.0,1.0,20.0,3.72,12.4,2014-10-14 00:00:00,mikko aho,jägersro,9.0,1.0,30.0,5.8,14.3,2014-09-29 00:00:00,mikko aho,halmstad,9.0,1.0,40.0,3.0,15.0,1640.0,2140.0,1640.0,2140.0,3180.0,2,1,1,1,1,0,2974.603812,0.917738,4904.292577,6006.507182,6935.717077,9.0,6.0,19.0,41.0,15.0,
3,2014-12-28,1,örebro,bear dancer,erik adielsson,48.0,2.35,1.49,27477.0,1.0,2100.0,2100.0,0,4,v,125000.0,2014-12-10 00:00:00,tino ärling,solvalla,5.0,1.0,100.0,1.9,14.3,2014-11-15 00:00:00,johan untersteiner,solvalla,5.0,15.0,200.0,4.62,12.0,2014-11-01 00:00:00,björn goop,eskilstuna,5.0,1.0,110.0,8.39,13.3,2014-10-19 00:00:00,björn goop,solvalla,5.0,1.0,100.0,16.6,13.1,2014-10-11 00:00:00,ulf ohlsson,åby,5.0,1.0,110.0,3.3,13.7,2140.0,2140.0,2140.0,2140.0,2140.0,1,1,1,1,1,1,10966.331584,14.142136,11501.585598,10966.331584,11501.585598,18.0,25.0,14.0,13.0,8.0,
4,2014-12-28,1,örebro,by air,torbjörn jansson,5.0,14.86,3.12,30589.0,5.0,2100.0,2100.0,0,7,v,125000.0,2014-12-12 00:00:00,robert bergh,bergsåker,8.0,15.0,30.0,2.42,17.6,2014-04-26 00:00:00,robert bergh,åby,8.0,15.0,125.0,10.4,13.6,2014-04-12 00:00:00,robert bergh,eskilstuna,8.0,4.0,125.0,12.2,15.3,2014-02-22 00:00:00,robert bergh,gävle,8.0,4.0,125.0,6.9,13.8,2014-02-08 00:00:00,robert bergh,solvalla,8.0,4.0,220.0,4.3,15.0,2640.0,2140.0,2640.0,1640.0,2140.0,15,1,1,1,1,1,5.477226,11.18034,2735.73897,2735.73897,3629.367876,16.0,230.0,14.0,49.0,14.0,


In [16]:
def model_streck_to_odds(X_):
    X = X_.copy()
    # import modules for linear regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error as mae
    # import random forest module
    from sklearn.ensemble import RandomForestRegressor

    X_odds = X.loc[X.vodds <= 40]  # remove outliers
    ix_break = int(len(X_odds.datum.unique())*0.75)
    test_start = X_odds.datum.unique()[ix_break]

    X_train, X_test = X_odds[X_odds.datum <
                             test_start], X_odds[X_odds.datum >= test_start]
    y_train, y_test = X_train['vodds'], X_test['vodds']
    X_train = X_train[['streck']].astype(float)
    X_test = X_test[['streck']].astype(float)

    # make a model of RF
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=0)
    rf.fit(X_train, y_train)
    y_predrf = rf.predict(X_test)
    # make a model and fit it
    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    y_predlr = linreg.predict(X_test)

    # print the coefficients
    print('Coefficients:', linreg.coef_)
    # print the mean absolute error
    print("LR Mean absolute error: %.2f" % mae(y_test, y_predlr))
    print("RF Mean absolute error: %.2f" % mae(y_test, y_predrf))

    return linreg, rf


linreg, rf = model_streck_to_odds(X)   # used in next cell
# spara rf
import pickle
with open('rf_streck_odds.pkl', 'wb') as f:
    pickle.dump(rf, f)

Coefficients: [-0.52626587]
LR Mean absolute error: 6.08
RF Mean absolute error: 2.59


In [17]:
# Bara första gången. Initierar Typ-klassen
def learn(X_train, y_train, X_test=None, y_test=None, iterations=1000, cat_features=cat_features, verbose=False):
    cbc = CatBoostClassifier(iterations=iterations, loss_function='Logloss', eval_metric='AUC', verbose=verbose)
    X_train = remove_features(X_train, remove_mer=['avd','datum'])
    cat_features = X_train.select_dtypes(include=['object']).columns.tolist()
    train_pool = Pool(X_train, label=y_train, cat_features=cat_features)
    if X_test is not None:
        X_test = remove_features(X_test, remove_mer=['avd', 'datum'])
        test_pool = Pool(X_test, label=y_test, cat_features=cat_features)
        cbc.fit(train_pool, eval_set=test_pool, early_stopping_rounds=100, use_best_model=True, verbose=verbose)
    else:
        cbc.fit(train_pool, use_best_model=True, verbose=verbose)
    return cbc

def beräkna_datum(X,fract=0.75):
    ix_break = int(len(X.datum.unique())*fract)
    test_start = X.datum.unique()[ix_break]
    return test_start
    
Xlearn, cat_features= prepare_for_catboost(X)  
# print(Xlearn.columns)
for typ in [typ6, typ1, typ9, typ16]:
    print(typ.name)
    Xtyp = typ.prepare_data(Xlearn)                                 ###########

    if not typ.streck:                                                ################
        Xtyp.drop('streck', axis=1, inplace=True)
        
    if True: # använda X_test    
        test_start = beräkna_datum(Xtyp)    
        X_train, X_test = Xtyp[Xtyp.datum < test_start], Xtyp[Xtyp.datum >= test_start]
        y_train, y_test = y[X_train.index], y[X_test.index]
        # print('innan learn',X_train.columns)
        typ_model = learn(X_train, y_train, X_test, y_test)  ##########
        print('best iteration',typ_model.best_iteration_)                             ##########
        print('best score',    typ_model.best_score_)                                 ##########
    # save model
    typ.save_model(typ_model)                                                       ##########                          



Rätt längd på summan (med datum)? 78 == 78
NaN before: 0
NaN after: 0
typ6
best iteration 33
best score {'learn': {'Logloss': 0.2212477609543931}, 'validation': {'Logloss': 0.2373986628628492, 'AUC': 0.815795512346553}}
typ1
best iteration 37
best score {'learn': {'Logloss': 0.22502165546361602}, 'validation': {'Logloss': 0.2426145813448483, 'AUC': 0.8006791783996938}}
typ9
best iteration 37
best score {'learn': {'Logloss': 0.2198722552496039}, 'validation': {'Logloss': 0.2377904795430033, 'AUC': 0.8142228934300292}}
typ16
best iteration 37
best score {'learn': {'Logloss': 0.2198722552496039}, 'validation': {'Logloss': 0.2377904795430033, 'AUC': 0.8142228934300292}}


In [86]:
X,cat_features = prepare_for_catboost(X)
typ6.learn(X,y, iterations=33) # best iter = 25 {'Logloss': 0.23245952928761984, 'AUC': 0.8262112132692319}
typ1.learn(X,y, iterations=39) # best iter = 39 {'Logloss': 0.23278308932319106, 'AUC': 0.826883367187688}
typ9.learn(X,y, iterations=37) # best iter = 37 {'Logloss': 0.23312091900160384, 'AUC': 0.8257515762557716}
typ16.learn(X,y,iterations=37) # best iter = 37 {'Logloss': 0.23312091900160384, 'AUC': 0.8257515762557716}


Rätt längd på summan (med datum)? 78 == 78
NaN before: 0
NaN after: 0
best score {'learn': {'Logloss': 0.2173589112255604}}
best score {'learn': {'Logloss': 0.21833813002706381}}
best score {'learn': {'Logloss': 0.21336846341715388}}
best score {'learn': {'Logloss': 0.21336846341715388}}


In [25]:
# stack predict for all models
def stack_predict(X_, models):
    X = X_.copy()
    for model in models:
        nr = model.name[3:]
        X['proba'+nr] = model.predict(X)
        X['kelly'+nr] = kelly(X['proba'+nr], X[['streck']], None)
    cols=X.columns[-8]    
    return X
df_stack = stack_predict(X, [typ6, typ1, typ9, typ16])
# df_stack


In [26]:
df_stack.index

RangeIndex(start=0, stop=43782, step=1)

In [144]:
# from sklearn.model_selection import TimeSeriesSplit
# tscv = TimeSeriesSplit(n_splits=5)

# for train_index, test_index in tscv.split(df_stack):
#    print("TRAIN:", train_index[-1], "TEST:", test_index[0])
#    X_train = df_stack.loc[train_index]
#    X_test = df_stack.loc[test_index]
#    y_train, y_test = y.loc[train_index], y.loc[test_index]



## The complete learning process with all steps in stacking

In [145]:
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()  # The meta model
    
# fit my models on split date for timeseries   
print('START fitting and predicting TimeseriesSplit') 
cross_val_predict=pd.DataFrame()
for id_train, id_test in TimeSeriesSplit(n_splits=5).split(df_stack):  
    for typ in [typ6, typ1, typ9, typ16]:
        typ.learn(df_stack.loc[id_train],y.loc[id_train], iterations=25)
    df_pred = stack_predict(df_stack.loc[id_test], [typ6, typ1, typ9, typ16])
    df_pred['y']=y.loc[id_test]
    cross_val_predict = pd.concat([cross_val_predict, df_pred.iloc[:,-9:]])
       
print('\nFitting my models with all data')
# final fit with all the available data
for typ in [typ6, typ1, typ9, typ16]:
    typ.learn(df_stack, y, iterations=20)

print('\nFitting meta_model on predicted above')
# fit a rf meta_model on cross_val_predict
meta_model = RandomForestClassifier(max_depth=None, n_estimators=100, oob_score=True, verbose=1, n_jobs=10, random_state=2022)
meta_model.fit(cross_val_predict.iloc[:, :-1], cross_val_predict.iloc[:, -1])
print('OOB_score', meta_model.oob_score_)   # 0.9305314451043094
# pickle save stacking
pickle.dump(meta_model, open('..\\modeller\\meta_model.pkl', 'wb'))

START fitting and predicting TimeseriesSplit
best score {'learn': {'Logloss': 0.14365767513623312}}
best score {'learn': {'Logloss': 0.13620219733765712}}
best score {'learn': {'Logloss': 0.15050587361523762}}
best score {'learn': {'Logloss': 0.15050587361523762}}
best score {'learn': {'Logloss': 0.17097780937973994}}
best score {'learn': {'Logloss': 0.1670394102737669}}
best score {'learn': {'Logloss': 0.17326330978505575}}
best score {'learn': {'Logloss': 0.17326330978505575}}
best score {'learn': {'Logloss': 0.18067681849575123}}
best score {'learn': {'Logloss': 0.17834256150111963}}
best score {'learn': {'Logloss': 0.17907324552042933}}
best score {'learn': {'Logloss': 0.17907324552042933}}
best score {'learn': {'Logloss': 0.1851407846044013}}
best score {'learn': {'Logloss': 0.1836453132382496}}
best score {'learn': {'Logloss': 0.18408986130700936}}
best score {'learn': {'Logloss': 0.18408986130700936}}
best score {'learn': {'Logloss': 0.18695329875711095}}
best score {'learn': {'

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.3s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    1.0s finished


OOB_score 0.9261066191585583


In [143]:
# make prediction on unseen data
def unseen_predictions(X_, models, meta_model):
    X = X_.copy()
    for model in models:
        nr = model.name[3:]
        X['proba'+nr] = model.predict(X)
        X['kelly'+nr] = kelly(X['proba'+nr], X[['streck']], None)
        
    return(meta_model.predict_proba(X.iloc[:, -8:]))

# a small test:
unseen_predictions(df_stack.iloc[-80:,:], [typ6, typ1, typ9, typ16], meta_model)[:,1],y.iloc[-80:].values



[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.0s finished


(array([0.99, 0.06, 0.08, 0.  , 0.01, 0.01, 0.04, 0.  , 0.  , 0.  , 0.01,
        0.  , 0.03, 0.04, 0.11, 0.08, 0.1 , 0.  , 0.  , 0.02, 0.  , 0.02,
        1.  , 0.  , 0.  , 0.  , 0.72, 0.  , 0.02, 0.  , 0.03, 0.31, 0.96,
        0.  , 0.03, 0.  , 0.02, 0.  , 0.  , 0.02, 0.15, 0.  , 0.02, 0.02,
        0.  , 0.03, 0.09, 0.01, 0.  , 0.02, 0.  , 0.02, 0.59, 0.  , 0.96,
        0.  , 0.  , 0.5 , 0.08, 0.  , 0.  , 0.  , 0.07, 0.  , 0.28, 0.  ,
        0.02, 0.  , 0.  , 0.  , 0.  , 0.  , 0.02, 0.08, 0.9 , 0.  , 0.01,
        0.  , 0.  , 0.  ]),
 array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0,
        1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0]))