# Testar logiken i jupyter innan jag går över till python med streamlit

Beskrivning:  

- Läser in fejkad web scraping data.  
- gör predict proba och kelly för varje modell.  
- Använder sedan meta_modellen 

In [278]:
# moduler
import pandas as pd
import numpy as np
import pickle
from catboost import CatBoostClassifier, Pool, cv
from IPython.display import display
pd.set_option('display.width', 200)
pd.set_option('display.max_columns', 200)


In [279]:
def remove_features(df_, remove_mer=None):
    """Return a copy of ``df_`` without the columns that are never used as features.

    The columns 'startnr', 'vodds', 'podds', 'bins' and 'h1_dat'..'h5_dat'
    are always dropped.

    Args:
        df_: Input DataFrame. It is not modified.
        remove_mer: Optional extra column label(s) to drop as well.

    Returns:
        A new DataFrame without the dropped columns.

    Raises:
        KeyError: If a column that should be dropped is missing.
    """
    always_dropped = ['startnr', 'vodds', 'podds', 'bins', 'h1_dat',
                      'h2_dat', 'h3_dat', 'h4_dat', 'h5_dat']
    # drop() returns a new frame, so the caller's DataFrame stays untouched
    df = df_.drop(columns=always_dropped)
    # None (instead of a shared mutable list) means "nothing extra to drop"
    if remove_mer:
        df = df.drop(columns=remove_mer)

    return df

In [280]:
# Replace NaN in the categorical features of X and return (X, cat_features).
# The returned X still has all its columns: features that are not used must
# be removed by the caller before the model is called.
def prepare_for_catboost(X_, features=None):
    """Impute 'missing' for NaN in the categorical (object dtype) columns.

    Args:
        X_: DataFrame that contains at least 'avd', 'datum' and the columns
            removed by ``remove_features``. It is not modified.
        features: Optional sequence of column names. When given and non-empty,
            only these columns are considered when looking for categorical
            features.

    Returns:
        Tuple ``(X, cat_features)``: a copy of ``X_`` with NaN in the
        categorical columns replaced by 'missing', and the list of those
        column names.
    """
    X = X_.copy()
    candidates = remove_features(X, remove_mer=['avd', 'datum'])

    # None replaces the former mutable [] default; both mean "use all columns"
    if features is not None and len(features) > 0:
        candidates = candidates[features]
    cat_features = list(candidates.select_dtypes(include=['object']).columns)

    # report how many categorical values are missing before imputing
    nan_per_column = X[cat_features].isna().sum()
    print('NaN in cat before:',
          nan_per_column[nan_per_column > 0].sort_values(ascending=False).sum())

    # impute 'missing' for all NaN in cat_features
    X[cat_features] = X[cat_features].fillna('missing')
    print('NaN in cat after:', X[cat_features].isna().sum().sum())
    return X, cat_features


#### Funktioner för att prioritera mellan hästar

In [281]:
# Compute a Kelly value; the odds are derived from streck unless given explicitly
def kelly(proba, streck, odds):  # proba = prob winning, streck in % = streck
    """Return the Kelly value ``(o*proba - (1-proba))/o`` for every horse.

    Args:
        proba: Probability of winning (scalar, array or Series).
        streck: Input for the pickled streck-to-odds regressor; only used
            when ``odds`` is None.
        odds: Explicit odds, or None to predict the odds from ``streck`` with
            the model stored in 'rf_streck_odds.pkl'.

    Returns:
        The Kelly value(s), with the shape that results from broadcasting
        the odds against ``proba``.
    """
    if odds is None:
        # NOTE: unpickling executes code - only load a trusted, locally
        # produced model file here.
        with open('rf_streck_odds.pkl', 'rb') as f:
            rf = pickle.load(f)
        o = rf.predict(streck.copy())
    else:
        # Previously this branch repeated the prediction and ignored the
        # supplied odds.
        o = np.asarray(odds, dtype=float)

    # all odds > 40 are replaced by 1 (np.where leaves the caller's data intact)
    o = np.where(o > 40, 1, o)
    return (o*proba - (1-proba))/o

In [282]:
# For one round (one date): the lead in streck of the top horse per avd.
# With only_clear=True only leads >= 25 are kept.
def lista_med_favoriter(df_, ant, only_clear):
    """Return up to ``ant`` tuples ``(avd, diff)`` ordered by largest diff.

    ``diff`` is the highest streck in the avd minus the second highest.
    Avd 1 through 7 are examined; each must contain at least two rows.
    """
    threshold = 25 if only_clear else 0
    # sort_values returns a new frame, so df_ itself is left alone
    ordered = df_.sort_values(['avd', 'streck'], ascending=[False, False])

    candidates = []
    for avd in range(1, 8):
        streck = ordered.loc[ordered.avd == avd].streck
        lead = streck.iloc[0] - streck.iloc[1]
        if lead >= threshold:
            candidates.append((avd, lead))

    # largest lead first
    candidates.sort(key=lambda pair: pair[1], reverse=True)
    return candidates[:ant]

# temp is a list of tuples (avd, diff); tell whether avd occurs in it
def check_avd(avd, temp):
    """Return True if any tuple in ``temp`` has ``avd`` as its first item."""
    return any(entry[0] == avd for entry in temp)


In [283]:
def compute_total_insats(df):
    """Return the product of the number of rows per avd, divided by two."""
    rows_per_avd = df.groupby('avd').avd.count()
    return rows_per_avd.prod() / 2

#### Funktioner som modifierar data beroende på modell

In [284]:
# number of horses per avd
def lägg_in_antal_hästar(df_):
    """Return a copy of ``df_`` with column 'ant_per_lopp' added.

    'ant_per_lopp' holds the number of rows that share the row's
    (datum, avd) combination.
    """
    df = df_.copy()
    race_sizes = df.groupby(['datum', 'avd'])['avd'].transform('count')
    df['ant_per_lopp'] = race_sizes
    return df

# highest streck per avd
def mest_streck(X_, i, datum, avd):
    """Return the streck at rank ``i`` (0 = highest) for one (datum, avd).

    Raises IndexError when the selected race has ``i`` or fewer rows.
    """
    # sort_values without inplace leaves X_ untouched
    ordered = X_.sort_values(by=['datum', 'avd', 'streck'],
                             ascending=[True, True, False])
    in_race = (ordered.datum == datum) & (ordered.avd == avd)
    return ordered.loc[in_race, 'streck'].iloc[i]

# the n highest streck values per avd as features
def lägg_in_motståndare(X_, ant_motståndare):
    """Return a copy of ``X_`` with columns 'motståndare1'..'motståndareN'.

    'motståndare<i>' holds the i-th highest streck within the row's
    (datum, avd) group. When a group has fewer than ``i`` rows, its lowest
    streck is used.

    Args:
        X_: DataFrame with columns 'datum', 'avd' and 'streck'.
        ant_motståndare: Number N of opponent columns to add.
    """
    X = X_.copy()

    grouped = X.groupby(['datum', 'avd'])['streck']
    # 'max' as a string: passing the builtin max to transform is deprecated
    X['motståndare1'] = grouped.transform('max')

    for i in range(2, ant_motståndare+1):
        # i-th highest streck in every avd; i is bound as a default argument
        # so the lambda does not depend on the loop variable
        X['motståndare' + str(i)] = grouped.transform(
            lambda x, i=i: x.nlargest(i).min())

    return X

# like lägg_in_motståndare, but with differences instead of actual values
def lägg_in_diff_motståndare(X_, motståndare):
    """Return a copy of ``X_`` with columns 'diff1'..'diffN'.

    'diff<i>' is the i-th highest streck within the row's (datum, avd)
    group minus the row's own streck. When a group has fewer than ``i``
    rows, its lowest streck is used.

    Args:
        X_: DataFrame with columns 'datum', 'avd' and 'streck'.
        motståndare: Number N of diff columns to add.
    """
    X = X_.copy()

    grouped = X.groupby(['datum', 'avd'])['streck']
    # 'max' as a string: passing the builtin max to transform is deprecated
    X['diff1'] = grouped.transform('max') - X.streck

    for i in range(2, motståndare+1):
        # i-th highest streck in every avd minus the own streck; i is bound
        # as a default argument so the lambda does not depend on the loop variable
        X['diff' + str(i)] = grouped.transform(
            lambda x, i=i: x.nlargest(i).min()) - X.streck

    return X

#### class Typ

In [285]:
class Typ():
    """Settings for one base model plus helpers to train it and predict with it.

    The model itself is not kept on the instance: it is pickled to and
    loaded from '../modeller/<name>.model'.
    """

    def __init__(self, name, ant_hästar, proba, kelly, motst_ant, motst_diff,  ant_favoriter, only_clear, streck):
        # diff features need at least one opponent feature ...
        assert (motst_diff == False and motst_ant == 0) or (motst_ant > 0)
        # ... and only_clear is meaningless without favourites
        assert (ant_favoriter == 0 and only_clear == False) or (ant_favoriter > 0)

        # string used for file names etc.
        self.name = name

        # --- which features to include ---
        # truthy: add a feature with the number of horses per avd
        self.ant_hästar = ant_hästar
        # int: add n features with the strongest opponents (by streck)
        self.motst_ant = motst_ant
        # bool: use differences instead of the actual values for the above
        self.motst_diff = motst_diff
        # bool: keep streck as a feature
        self.streck = streck

        # --- selection of rows ---
        # bool: prioritise on proba when selecting rows
        self.proba = proba
        # bool: prioritise on kelly when selecting rows
        self.kelly = kelly
        # int: how many favourites (avd with a single horse) to use
        self.ant_favoriter = ant_favoriter
        # bool: only use clear favourites
        self.only_clear = only_clear

    def _model_path(self):
        """Return the file path where this type's model is stored."""
        return '../modeller/'+self.name+'.model'

    def load_model(self):
        """Unpickle and return the stored model (the file must be trusted)."""
        with open(self._model_path(), 'rb') as f:
            return pickle.load(f)

    def save_model(self, model):
        """Pickle ``model`` to this type's model file."""
        with open(self._model_path(), 'wb') as f:
            pickle.dump(model, f)

    def prepare_for_model(self, X_):
        """Return a copy of ``X_`` with the extra features of this type added.

        ``X_`` must contain datum and avd. streck is kept here; it is only
        dropped in learn and predict because it is used to prioritise rows.
        """
        X = X_.copy()
        print(self.name)

        if self.ant_hästar:
            print('Lägg in ant_hästar')
            X = lägg_in_antal_hästar(X)

        if self.motst_diff:
            print('Lägg in diff motståndare')
            X = lägg_in_diff_motståndare(X, self.motst_ant)
        elif self.motst_ant > 0:
            print('Lägg in motståndare')
            X = lägg_in_motståndare(X, self.motst_ant)

        return X

    def learn(self, X_, y, features, iterations=1000, save=True, verbose=False):
        """Fit a CatBoostClassifier on ``X_`` / ``y`` and return it.

        ``X_`` must contain datum and avd. ``features`` is accepted but not
        used by this method. The fitted model is pickled when ``save`` is true.
        """
        cbc = CatBoostClassifier(
            iterations=iterations, loss_function='Logloss', eval_metric='AUC', verbose=verbose)

        X = self.prepare_for_model(X_)
        if not self.streck:
            X = X.drop('streck', axis=1)

        X, cat_features = prepare_for_catboost(X)
        X = remove_features(X, remove_mer=['datum', 'avd'])

        cbc.fit(X, y, cat_features, use_best_model=False)
        print('best score', cbc.best_score_)

        if save:
            self.save_model(cbc)
        return cbc

    def predict(self, X_):
        """Return the stored model's probability of class 1 for every row.

        ``X_`` must contain datum and avd. The model is loaded from disk on
        every call, and the columns are put in the order the model expects.
        """
        X = self.prepare_for_model(X_)
        model = self.load_model()
        if not self.streck:
            print('drop streck')
            X = X.drop('streck', axis=1)

        expected = model.feature_names_
        X, cat_features = prepare_for_catboost(X, expected)

        # what remains must be exactly the features of the model
        X = remove_features(X, remove_mer=['datum', 'avd'])

        assert len(X.columns) == len(model.feature_names_), f'len(X.columns)  != len(model.feature_names_) in predict {self.name}'
        assert set(X.columns) == set(model.feature_names_), 'features in model and in X not equal'
        X = X[expected]

        print('predict '+self.name)
        print(model.get_feature_importance(prettified=True)[:3])

        probabilities = model.predict_proba(X)
        return probabilities[:, 1]


In [286]:
# Create the model configurations (one Typ instance per base model).
# NOTE(review): typ9 and typ16 differ only in only_clear, which the Typ methods
# shown in this file never read - confirm that two separate models are intended.
#           name, ant_hästar, proba, kelly, motst_ant, motst_diff,  ant_favoriter, only_clear, streck
typ6 = Typ('typ6', True,       True, False,     0,      False,          0,            False,    True)
typ1 = Typ('typ1', False,      True, False,     2,      True,           2,            True,     False)
typ9 = Typ('typ9', True,       True, True,      2,      True,           2,            True,     True)
typ16 = Typ('typ16',True,      True, True,      2,      True,           2,            False,    True)

# all base models, in the order their columns appear in the stacked data
typer = [typ6, typ1, typ9, typ16]


## Learning-fasen

Gör en scrape på senaste veckan (behövs inte i denna test)

Läs in all_data.csv 
Baka ihop senaste veckan med all_data.csv

In [287]:
# read the data used for learning
def läs_in_data_för_learning(path='../all_data.csv'):
    """Read the training data and split it into features and target.

    Args:
        path: CSV file to read. The default uses '/' as separator so that it
            resolves on Windows as well as on POSIX systems.

    Returns:
        Tuple ``(X, y)`` with a fresh 0..n-1 index: ``X`` is the data without
        'plac' and with the text columns lower-cased; ``y`` is 1 where
        plac == 1 and 0 otherwise.
    """
    df = pd.read_csv(path)
    # The following dates lack avd==5 and cannot be used
    saknas = ['2015-08-15', '2016-08-13', '2017-08-12']
    df = df[~df.datum.isin(saknas)].reset_index(drop=True)

    X = df.drop(columns='plac')
    y = (df.plac == 1)*1   # plac 1 or 0

    text_columns = ['häst', 'bana', 'kusk',
                    'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk',
                    'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']
    for f in text_columns:
        X[f] = X[f].str.lower()

    return X, y


In [288]:
def skapa_stack_learning(X_, y, features, iterations=1000, random_state=2022, verbose=False, save=True):
    """Train every model in ``typer`` and build the stacked training data.

    For each type, columns 'proba<nr>' and 'kelly<nr>' are added, where
    <nr> is the type name without its first three characters.

    Args:
        X_: Training data; must contain datum, avd and streck.
        y: Target values, passed on to ``Typ.learn``.
        features: Passed on to ``Typ.learn``.
        iterations: Number of boosting iterations per model.
        random_state: Accepted for compatibility; not used here.
        verbose: Passed on to ``Typ.learn``.
        save: Passed on to ``Typ.learn``.

    Returns:
        DataFrame that holds only the stack columns.

    NOTE(review): ``Typ.predict`` loads the model from disk, so with
    save=False the predictions come from a previously saved model rather
    than from the model that was just fitted.
    NOTE(review): the models predict on the very rows they were trained on.
    """
    X = X_.copy()
    stacked_data = pd.DataFrame()

    # (an unused CatBoostClassifier used to be created here on every call)
    for typ in typer:
        nr = typ.name[3:]
        typ.learn(X, y, features, iterations=iterations, save=save, verbose=verbose)
        stacked_data['proba'+nr] = typ.predict(X)
        stacked_data['kelly'+nr] = kelly(stacked_data['proba' + nr], X[['streck']], None)

    return stacked_data   # stack info only

# fit meta_model
def learn_meta_model(X, y, model_path='../modeller/meta.model'):
    """Fit the RandomForest meta model on the stacked predictions and save it.

    Args:
        X: Stacked data (proba and kelly columns of all models).
        y: Target values.
        model_path: Where the fitted model is pickled. The default uses '/'
            as separator so that it resolves on Windows as well as on POSIX.

    Returns:
        The fitted RandomForestClassifier.
    """
    from sklearn.ensemble import RandomForestClassifier

    print('\nFitting meta_model on X with all models predictions')

    meta_model = RandomForestClassifier(max_depth=None, n_estimators=100, oob_score=True, verbose=1, n_jobs=10, random_state=2022)
    meta_model.fit(X, y)

    print('OOB_score', meta_model.oob_score_)   # 0.9305314451043094
    # save the meta model; the with-block guarantees the file is closed
    with open(model_path, 'wb') as f:
        pickle.dump(meta_model, f)

    return meta_model


In [289]:
# read the feature list from a file (not plac)
def read_feature_list(file='../FEATURES.txt'):
    """Return the lines of ``file`` as a list, without line endings."""
    with open(file, 'r') as handle:
        content = handle.read()
    return content.splitlines()

Kör learning-skiten här

In [290]:
# Learning phase: read the feature list and the data, train all base models
# and then fit the meta model on their stacked predictions.
FEATURES = read_feature_list("../FEATURES.txt")

X_train, y_train = läs_in_data_för_learning()
# the data must contain exactly the columns listed in FEATURES
assert X_train.shape[1] == len(FEATURES), f'X_train.shape[1] {X_train.shape[1]} != len(FEATURES) {len(FEATURES)}'
assert set(X_train.columns) == set(FEATURES), f'set(X_train.columns) {set(X_train.columns)} != set(FEATURES) {set(FEATURES)}'
X_train = X_train[FEATURES]  # to get the columns in the right order
X_stacked = skapa_stack_learning(X_train, y_train, FEATURES, iterations=100,random_state=2022, verbose=False, save=True)
# display(X_stacked)
meta_model = learn_meta_model(X_stacked, y_train)


typ6
Lägg in ant_hästar
NaN in cat before: 246
NaN in cat after: 0
best score {'learn': {'Logloss': 0.19120583769208682}}
typ6
Lägg in ant_hästar
NaN in cat before: 246
NaN in cat after: 0
predict typ6
  Feature Id  Importances
0     streck    36.951123
1       häst     4.760482
2     senast     3.254997
3    h2_odds     2.871381
4         kr     2.248639
typ1
Lägg in diff motståndare
NaN in cat before: 246
NaN in cat after: 0
best score {'learn': {'Logloss': 0.19606831883977294}}
typ1
Lägg in diff motståndare
drop streck
NaN in cat before: 246
NaN in cat after: 0
predict typ1
  Feature Id  Importances
0      diff2    17.334921
1      diff1    16.038588
2       häst     5.518587
3         kr     2.706512
4       pris     2.592018
typ9
Lägg in ant_hästar
Lägg in diff motståndare
NaN in cat before: 246
NaN in cat after: 0
best score {'learn': {'Logloss': 0.18883593142791752}}
typ9
Lägg in ant_hästar
Lägg in diff motståndare
NaN in cat before: 246
NaN in cat after: 0
predict typ9
  Featur

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.4s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    1.3s finished


OOB_score 0.9459237643625752


## Spela-fasen

In [291]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier

Fejkad Scrape-funktion

In [292]:
def scrape(path='../sparad_scrape.csv'):
    """Fake scrape: read a saved scrape from CSV and lower-case its text columns.

    Args:
        path: CSV file with the saved scrape; the default is the file that
            was previously hard-coded.

    Returns:
        DataFrame with the horse, track and driver columns lower-cased.
    """
    df = pd.read_csv(path)
    text_columns = ['häst', 'bana', 'kusk',
                    'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk',
                    'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']
    for f in text_columns:
        df[f] = df[f].str.lower()
    return df

In [294]:
# Alternativ metod
# Ta fram rader för varje typ enligt test-resultaten innan
# låt meta_model välja mellan typerna - hur? Hur maximer insatsen?

Funktion som bygger stack-data från modellerna

In [295]:
# for stacking: include all horses, with proba plus kelly per type
def build_stack_df(X_):
    """Return datum, avd, startnr and häst plus 'proba<nr>' / 'kelly<nr>' per type.

    <nr> is the type name without its first three characters; the types are
    taken from the module-level list ``typer``.
    """
    X = X_.copy()
    id_columns = ['datum','avd', 'startnr','häst']
    stacked_data = X[id_columns].copy()

    for typ in typer:
        suffix = typ.name[3:]
        proba_col = 'proba'+suffix
        kelly_col = 'kelly'+suffix
        stacked_data[proba_col] = typ.predict(X)
        stacked_data[kelly_col] = kelly(stacked_data[proba_col], X[['streck']], None)

    return stacked_data


Funktion där meta_model gör predict_proba

In [296]:
def meta_predict(X_, model_path='../modeller/meta.model'):
    """Add the meta model's probability of class 1 as column 'meta_predict'.

    Args:
        X_: Stacked data whose first four columns are datum, avd, startnr
            and häst and whose last eight columns are the input of the
            meta model.
        model_path: Pickled meta model. The default uses '/' as separator
            (the former mixed '../modeller\\meta.model' does not resolve on
            POSIX systems). Only load trusted files: unpickling executes code.

    Returns:
        DataFrame with the four id columns, the eight stack columns and
        'meta_predict'.

    Raises:
        AssertionError: If the first four columns are not the id columns.
    """
    # X_ also contains datum, avd, startnr and häst
    extra = ['datum', 'avd', 'startnr', 'häst']
    assert list(X_.columns[:4]) == extra, 'meta_model måste ha datum, avd och startnr, häst för att kunna välja'
    X = X_.copy()
    with open(model_path, 'rb') as f:
        meta_model = pickle.load(f)

    # the last 8 columns are proba and kelly for each of the four models
    X['meta_predict'] = meta_model.predict_proba(X.iloc[:, -8:])[:, 1]
    # after the assignment the last 9 columns are the stack plus meta_predict
    my_columns = extra + list(X.columns)[-9:]

    return X[my_columns]


Funktion som väljer rad

In [297]:
def comp_cost(antal_rader):
    """Return ``(antal_rader, cost)`` with cost = antal_rader squared, halved."""
    squared = antal_rader**2
    return antal_rader, squared/2

def välj_rad(X_, max_insats=320):
    """Mark the horses to play in a new boolean column 'välj'.

    First the horse(s) with the highest meta_predict in every avd are
    selected. Then the rows are walked in order of falling meta_predict and
    selected one by one for as long as ``(count + 1)**2 / 2`` stays within
    ``max_insats``, where count starts at 1.

    NOTE(review): the cost formula counts the rows walked, not the rows that
    end up selected - confirm that this approximation is intended.

    Args:
        X_: DataFrame with columns 'avd' and 'meta_predict'. Not modified.
        max_insats: Upper limit for the computed cost (previously fixed to 320).

    Returns:
        A copy of ``X_`` with column 'välj', sorted with the selected rows
        first and then by avd.
    """
    veckans_rad = X_.copy()
    veckans_rad['välj'] = False

    # the top-rated horse(s) of every avd are always included
    best_per_avd = veckans_rad.groupby('avd')['meta_predict'].transform('max')
    veckans_rad.loc[veckans_rad.meta_predict == best_per_avd, 'välj'] = True

    veckans_rad = veckans_rad.sort_values(by=['meta_predict'], ascending=False)

    antal_rader = 1
    for i in veckans_rad.index:
        # stop as soon as one more row would exceed the budget
        if (antal_rader + 1)**2 / 2 > max_insats:
            break
        antal_rader += 1
        veckans_rad.loc[i, 'välj'] = True

    veckans_rad.sort_values(by=['välj', 'avd'], ascending=[False, True], inplace=True)

    return veckans_rad


Kör hela välj-rad-skiten här

In [298]:
# Play phase: read the (fake) scrape, let every base model predict, let the
# meta model rate the horses and finally select the rows to play.
X = scrape()
print(X.datum.unique())
df_stack = build_stack_df(X)
df_meta = meta_predict(df_stack)
df_meta.reset_index(drop=True, inplace=True)
veckans_rad = välj_rad(df_meta)

# show the selected horses and the cost formula applied to their count
display(veckans_rad[veckans_rad.välj])
print('kostnad', veckans_rad.välj.sum()**2/2)


['2022-03-12']
typ6
Lägg in ant_hästar
NaN in cat before: 0
NaN in cat after: 0
predict typ6
  Feature Id  Importances
0     streck    36.951123
1       häst     4.760482
2     senast     3.254997
3    h2_odds     2.871381
4         kr     2.248639
typ1
Lägg in diff motståndare
drop streck
NaN in cat before: 0
NaN in cat after: 0
predict typ1
  Feature Id  Importances
0      diff2    17.334921
1      diff1    16.038588
2       häst     5.518587
3         kr     2.706512
4       pris     2.592018
typ9
Lägg in ant_hästar
Lägg in diff motståndare
NaN in cat before: 0
NaN in cat after: 0
predict typ9
  Feature Id  Importances
0     streck    24.800184
1      diff2    10.990879
2       häst     4.192724
3    h5_odds     3.833646
4    h1_perf     3.010518
typ16
Lägg in ant_hästar
Lägg in diff motståndare
NaN in cat before: 0
NaN in cat after: 0
predict typ16
  Feature Id  Importances
0     streck    24.800184
1      diff2    10.990879
2       häst     4.192724
3    h5_odds     3.833646
4    

[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 100 out of 100 | elapsed:    0.0s finished


Unnamed: 0,datum,avd,startnr,häst,proba6,kelly6,proba1,kelly1,proba9,kelly9,proba16,kelly16,meta_predict,välj
5,2022-03-12,1,6,dear friend,0.478491,0.225132,0.49155,0.244536,0.417039,0.133826,0.417039,0.133826,0.86,True
1,2022-03-12,1,2,chablis ribb,0.012974,-0.087249,0.043265,-0.053881,0.149861,0.063538,0.149861,0.063538,0.12,True
13,2022-03-12,2,5,borups tornado,0.501085,0.397525,0.304825,0.160527,0.163491,-0.010143,0.163491,-0.010143,0.86,True
19,2022-03-12,2,11,alert vendil,0.054399,0.000438,0.054385,0.000423,0.065319,0.011981,0.065319,0.011981,0.03,True
23,2022-03-12,3,3,tycoon conway hall,0.123378,0.091575,0.136737,0.105418,0.047318,0.012755,0.047318,0.012755,0.75,True
30,2022-03-12,3,10,stoletheshow,0.309036,0.040401,0.291468,0.016003,0.254073,-0.035931,0.254073,-0.035931,0.05,True
27,2022-03-12,3,7,upstate face,0.064489,-0.092339,0.022308,-0.141592,0.335138,0.223681,0.335138,0.223681,0.05,True
28,2022-03-12,3,8,niky flax,0.004543,-0.023738,0.038771,0.011463,0.010983,-0.017114,0.010983,-0.017114,0.04,True
29,2022-03-12,3,9,pacific face,0.026313,-0.001349,0.001928,-0.026427,0.004366,-0.02392,0.004366,-0.02392,0.03,True
41,2022-03-12,4,12,listas tinge ling,0.120025,0.050303,0.110526,0.04005,0.113913,0.043707,0.113913,0.043707,0.76,True


kostnad 288.0


# En massa gammal - kanske reusable

Läs in all data

In [None]:
# only for Learn!
# Old cell: the same steps as läs_in_data_för_learning, followed by
# prepare_for_catboost.
df = pd.read_csv('..\\all_data.csv')
# The following dates lack avd==5 and cannot be used
saknas = ['2015-08-15', '2016-08-13', '2017-08-12']
df = df[~df.datum.isin(saknas)]
X = df.copy()
X.drop('plac', axis=1, inplace=True)
# X = ordinal_enc(X, 'häst')
y = (df.plac == 1)*1   # plac 1 or 0

# lower-case the horse, track and driver columns
for f in ['häst', 'bana', 'kusk', 'h1_kusk', 'h2_kusk', 'h3_kusk', 'h4_kusk', 'h5_kusk', 'h1_bana', 'h2_bana', 'h3_bana', 'h4_bana', 'h5_bana']:
    X[f] = X[f].str.lower()

X.reset_index(drop=True, inplace=True)
y.reset_index(drop=True, inplace=True)

# replace NaN in the categorical columns with 'missing'
X, cat_features = prepare_for_catboost(X)
print('cat_features:', cat_features)
X.head()


modell för streck_to_odds - skall vara fix och inte ändras

In [None]:
def model_streck_to_odds(X_):
    """Fit a linear and a random-forest regression from streck to vodds.

    Rows with vodds > 40 are treated as outliers and left out. The dates are
    split so that the first 75 % of the unique dates (in order of appearance)
    are used for training and the rest for testing; the mean absolute error
    of both models on the test part is printed.

    Returns:
        Tuple ``(linreg, rf)`` with the two fitted models.
    """
    # import modules for linear regression
    from sklearn.linear_model import LinearRegression
    from sklearn.model_selection import train_test_split
    from sklearn.metrics import mean_absolute_error as mae
    # import random forest module
    from sklearn.ensemble import RandomForestRegressor

    data = X_.copy()
    usable = data.loc[data.vodds <= 40]  # remove outliers

    # first date that belongs to the test part
    dates = usable.datum.unique()
    test_start = dates[int(len(dates)*0.75)]

    train_part = usable[usable.datum < test_start]
    test_part = usable[usable.datum >= test_start]
    y_train = train_part['vodds']
    y_test = test_part['vodds']
    streck_train = train_part[['streck']].astype(float)
    streck_test = test_part[['streck']].astype(float)

    # random forest model
    rf = RandomForestRegressor(n_estimators=100, max_depth=6, random_state=0)
    rf.fit(streck_train, y_train)
    rf_predictions = rf.predict(streck_test)

    # linear model
    linreg = LinearRegression()
    linreg.fit(streck_train, y_train)
    lr_predictions = linreg.predict(streck_test)

    # print the coefficients
    print('Coefficients:', linreg.coef_)
    # print the mean absolute error
    print("LR Mean absolute error: %.2f" % mae(y_test, lr_predictions))
    print("RF Mean absolute error: %.2f" % mae(y_test, rf_predictions))

    return linreg, rf


# Fit the streck-to-odds models on X and keep both.
linreg, rf = model_streck_to_odds(X)   # used in next cell
# save rf; kelly() reads this file
import pickle
with open('rf_streck_odds.pkl', 'wb') as f:
    pickle.dump(rf, f)

Engångsgrej för att initiera typ-instanserna med learn

In [None]:
# Only the first time. Initialises the models of the Typ class
def learn(X_train, y_train, X_test=None, y_test=None, iterations=1000, cat_features=cat_features, verbose=False):
    """Fit and return a CatBoostClassifier, optionally with an evaluation set.

    The ``cat_features`` argument is overwritten: the categorical features
    are always the object-dtype columns of the cleaned training data.
    With a test set, training stops early after 100 rounds without
    improvement.
    """
    cbc = CatBoostClassifier(iterations=iterations, loss_function='Logloss', eval_metric='AUC', verbose=verbose)

    X_train = remove_features(X_train, remove_mer=['avd','datum'])
    cat_features = X_train.select_dtypes(include=['object']).columns.tolist()
    train_pool = Pool(X_train, label=y_train, cat_features=cat_features)

    if X_test is None:
        cbc.fit(train_pool, use_best_model=True, verbose=verbose)
        return cbc

    X_test = remove_features(X_test, remove_mer=['avd', 'datum'])
    test_pool = Pool(X_test, label=y_test, cat_features=cat_features)
    cbc.fit(train_pool, eval_set=test_pool, early_stopping_rounds=100, use_best_model=True, verbose=verbose)
    return cbc

def beräkna_datum(X,fract=0.75):
    """Return the unique datum found ``fract`` of the way through the unique dates.

    The dates are taken in order of appearance in ``X``.
    """
    dates = X.datum.unique()
    return dates[int(len(dates)*fract)]

# Disabled one-off script: trains every type with a date-based train/test
# split and saves the resulting model.
if False:    
    Xlearn, cat_features= prepare_for_catboost(X)  
    # print(Xlearn.columns)
    for typ in [typ6, typ1, typ9, typ16]:
        print(typ.name)
        # add the extra features of this type
        Xtyp = typ.prepare_for_model(Xlearn)

        # streck is only a feature for the types that ask for it
        if not typ.streck:
            Xtyp.drop('streck', axis=1, inplace=True)
            
        if True: # use X_test
            test_start = beräkna_datum(Xtyp)    
            X_train, X_test = Xtyp[Xtyp.datum < test_start], Xtyp[Xtyp.datum >= test_start]
            y_train, y_test = y[X_train.index], y[X_test.index]
            # print('innan learn',X_train.columns)
            typ_model = learn(X_train, y_train, X_test, y_test)
            print('best iteration',typ_model.best_iteration_)
            print('best score',    typ_model.best_score_)
        # save model
        typ.save_model(typ_model)


Skapa typ6 till typ16

In [None]:
# Old cell: train each type with a fixed number of iterations.
# NOTE(review): Typ.learn as defined in this file requires a positional
# `features` argument that these calls do not pass - they look like they
# predate that signature; verify before running.
X,cat_features = prepare_for_catboost(X)
typ6.learn(X,y, iterations=33) # best iter = 25 {'Logloss': 0.23245952928761984, 'AUC': 0.8262112132692319}
typ1.learn(X,y, iterations=39) # best iter = 39 {'Logloss': 0.23278308932319106, 'AUC': 0.826883367187688}
typ9.learn(X,y, iterations=37) # best iter = 37 {'Logloss': 0.23312091900160384, 'AUC': 0.8257515762557716}
typ16.learn(X,y,iterations=37) # best iter = 37 {'Logloss': 0.23312091900160384, 'AUC': 0.8257515762557716}


Skapa stack predict med alla typer

In [None]:
# stack predict for all models
def stack_predict(X_, models):
    """Return a copy of ``X_`` with 'proba<nr>' and 'kelly<nr>' added per type.

    The types are taken from the module-level list ``typer``; the ``models``
    argument is not used.
    """
    X = X_.copy()
    for typ in typer:
        suffix = typ.name[3:]
        proba_col = 'proba'+suffix
        X[proba_col] = typ.predict(X)
        X['kelly'+suffix] = kelly(X[proba_col], X[['streck']], None)
    return X


learn med TimeSeriesSplit

In [None]:
# from sklearn.model_selection import TimeSeriesSplit
# tscv = TimeSeriesSplit(n_splits=5)

# for train_index, test_index in tscv.split(df_stack):
#    print("TRAIN:", train_index[-1], "TEST:", test_index[0])
#    X_train = df_stack.loc[train_index]
#    X_test = df_stack.loc[test_index]
#    y_train, y_test = y.loc[train_index], y.loc[test_index]



## The complete learning process with all steps in stacking

In [None]:
# Old cell: the complete stacking procedure. The base models are fitted on
# time-ordered splits, their predictions on the held-out parts are collected,
# and the meta model is fitted on those predictions.
# NOTE(review): the typ.learn calls below omit the `features` argument that
# Typ.learn requires in this file - verify before running.
from sklearn.model_selection import TimeSeriesSplit
from sklearn.ensemble import RandomForestRegressor
rf = RandomForestRegressor()  # not used below; the meta model is the RandomForestClassifier further down
    
# fit my models on split date for timeseries   
print('START fitting and predicting TimeseriesSplit') 
cross_val_predict=pd.DataFrame()
for id_train, id_test in TimeSeriesSplit(n_splits=5).split(df_stack):  
    for typ in [typ6, typ1, typ9, typ16]:
        typ.learn(df_stack.loc[id_train],y.loc[id_train], iterations=25)
    df_pred = stack_predict(df_stack.loc[id_test], [typ6, typ1, typ9, typ16])
    df_pred['y']=y.loc[id_test]
    # keep the 8 stack columns plus y
    cross_val_predict = pd.concat([cross_val_predict, df_pred.iloc[:,-9:]])
       
print('\nFitting my models with all data')
# final fit with all the available data
for typ in [typ6, typ1, typ9, typ16]:
    typ.learn(df_stack, y, iterations=20)

print('\nFitting meta_model on predicted above')
# fit a rf meta_model on cross_val_predict
meta_model = RandomForestClassifier(max_depth=None, n_estimators=100, oob_score=True, verbose=1, n_jobs=10, random_state=2022)
meta_model.fit(cross_val_predict.iloc[:, :-1], cross_val_predict.iloc[:, -1])
print('OOB_score', meta_model.oob_score_)   # 0.9305314451043094
# pickle save stacking
# NOTE(review): the file handle is never closed explicitly, and the file name
# differs from the 'meta.model' that meta_predict loads.
pickle.dump(meta_model, open('..\\modeller\\meta_model.pkl', 'wb'))

In [None]:
# make prediction on unseen data
def unseen_predictions(X_, models, meta_model):
    """Return ``meta_model.predict_proba`` on the stacked predictions of ``models``.

    For every model, 'proba<nr>' and 'kelly<nr>' are appended to a copy of
    ``X_``; the last eight columns are then handed to the meta model.
    """
    X = X_.copy()
    for model in models:
        suffix = model.name[3:]
        proba_col = 'proba'+suffix
        X[proba_col] = model.predict(X)
        X['kelly'+suffix] = kelly(X[proba_col], X[['streck']], None)

    stack_columns = X.iloc[:, -8:]
    return meta_model.predict_proba(stack_columns)

# a small test: meta probabilities of class 1 for the last 80 rows, shown
# next to the actual outcomes
unseen_predictions(df_stack.iloc[-80:,:], [typ6, typ1, typ9, typ16], meta_model)[:,1],y.iloc[-80:].values

