# Try stacking for V75

In [1]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier,Pool,cv,utils 
from sklearn.impute import SimpleImputer

In [2]:
### return a CatBoost model with some default parameters
def get_model(d=6,l2=2,iterations=3000,use_best=True,verbose=False):
    model = CatBoostClassifier(iterations=iterations,use_best_model=use_best, 
        custom_metric=['Logloss', 'AUC','Recall', 'Precision', 'F1', 'Accuracy'],

        eval_metric='Accuracy', 
        depth=d,l2_leaf_reg=l2,
        auto_class_weights='Balanced',verbose=verbose, random_state=2021) 
    return model                

In [3]:
### Features som inte används vid träning
def remove_features(df,remove_mer=[]):
    #remove_mer=['h5_perf','h5_auto','h4_perf','h4_auto', 'h3_perf', 'h2_perf']
    df.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1,inplace=True) #
    if remove_mer:
        df.drop(remove_mer,axis=1,inplace=True)
    
    # df=check_unique(df.copy())
    # df=check_corr(df.copy())
    return df

In [4]:
 ## byt ut alla NaN till text för cat_features
def replace_NaN(X_train,X_test=None, cat_features=[]):
    # print('cat_features',cat_features)
    for c in cat_features:
        # print(c)
        X_train.loc[X_train[c].isna(),c] = 'missing'       ### byt ut None-värden till texten 'Missing'
        if X_test is not None:  ## om X_test är med
            X_test.loc [X_test[c].isna(),c] = 'missing'    ### byt ut None-värden till texten 'Missing'

    return X_train,X_test

In [5]:
### läs in data och returnera df, alla datum samt index till split-punkt
def load_data(proc=0.75):
    
    df = pd.read_csv('..\\all_data.csv')     
    alla_datum = list(df.datum.unique())
    split_ix = int(len(alla_datum)*proc)
    
    return df,alla_datum,split_ix

In [6]:
def remove_not_used_features(df):
    df.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1,inplace=True) 
    return df

In [7]:
df,alla_datum,_ = load_data() 
df = remove_not_used_features(df.copy())
CAT_FEATURES=['datum', 'bana', 'häst', 'kusk', 'kön',
        'h1_kusk', 'h1_bana',
        'h2_kusk', 'h2_bana', 
        'h3_kusk',  'h3_bana', 
        'h4_kusk', 'h4_bana', 
        'h5_kusk', 'h5_bana',]

NUM_FEATURES=[item for item in df.columns if item not in CAT_FEATURES and item !='plac']

PLAC_MEAN=df.plac.mean()
PLAC_MEAN

9.210009837088222

In [8]:
# den hittade inget, kanske skall testa igen längre fram
def remove_low_variance_features(df):
    from sklearn.feature_selection import VarianceThreshold
    print(df.shape)
    selection = VarianceThreshold(threshold=(0.1))
    X=selection.fit_transform(df)
    print(X.shape)
    return X

## Functions that are doing the transformations

In [9]:
# fill missing values in categorical features
def impute_cat_features(df, cat_features):
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df[cat_features]=imp1.fit_transform(df[cat_features])  # replae NaN's with 'missing'
    return df

In [10]:
# Set a smooth mean value to the features in df
def calc_smooth_mean(df, by, y, m=300, tot_mean=PLAC_MEAN):

    # Compute the number of values and the mean of each group
    agg = df.groupby(by)[y].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + m * tot_mean) / (counts + m)

    # Replace each value by the according smoothed mean
    return df[by].map(smooth)


In [11]:
# Handle h1-h5_bana
def transform_hx_bana(df,hx,the_map):
    from sklearn.impute import SimpleImputer
    df[hx] = df[hx].str.lower()
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df[hx]=imp1.fit_transform(df[[hx]])  # replae NaN's with 'missing'

    df[hx] = [item[0] for item in df[hx].str.split('-')]  # remove '-10' from 'solvalla-10' etc
    
    df[hx]=df[hx].map(the_map)  # transform column to numeric by mapping
    # after mapping we get new NaN's - now impute 0
    imp2 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
    df[hx] = imp2.fit_transform(df[[hx]])
    return df
    

In [12]:

# Handle bana and hx_bana  
def transf_bana(df):
    df['bana'] = df.bana.str.lower()
    the_map = df.bana.value_counts() 
    the_map['missing']=0    

    df=transform_hx_bana(df,'h1_bana',the_map)
    df=transform_hx_bana(df,'h2_bana',the_map)
    df=transform_hx_bana(df,'h3_bana',the_map)
    df=transform_hx_bana(df,'h4_bana',the_map)
    df=transform_hx_bana(df,'h5_bana',the_map)

    df['bana']=df.bana.map(the_map)  # transform column to numeric by mapping 
    if df[['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]].isna().sum().sum() != 0:
        print('bana NaNs not 0:',df[['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]].isna().sum())
    
    df.drop(['bana','h1_bana','h2_bana','h3_bana','h4_bana','h5_bana'],axis=1,inplace=True)
    return df


In [13]:
# Handle häst and kusk 
def transf_kusk_häst(df,pref='',m=50,):
    df[pref+'ekipage'] = df[pref+'kusk'].str.cat(df['häst'], sep =", ")  # concatenate 'häst' and 'kusk' into one column
    df[pref+'ekipage'] = calc_smooth_mean(df, by=pref+'ekipage', y='plac',m=50) # make numeric with Target encoding with smooth mean
    df.drop([pref+'kusk'],axis=1,inplace=True)
    return df

In [14]:
# Handle kön  
def transf_kön(df):
    from sklearn.preprocessing import OneHotEncoder
    df['kön'] = df['kön'].str.lower()
    ohe = OneHotEncoder(sparse=False)
    dftemp=pd.DataFrame(ohe.fit_transform(df[['kön']]),columns=['kön_h','kön_s','kön_v'] )  # replae kön with One Hot Encoding
    df=pd.concat([df,dftemp],axis=1)

    # check that kön is correct encoded
    if len(df.loc[(df.kön=='h') & (df.kön_h != 1),'kön']):
        print('Felaktigt kön','h')
        error()
    if len(df.loc[(df.kön=='s') & (df.kön_s != 1),'kön']):
        print('Felaktigt kön','s')
        error()
    if len(df.loc[(df.kön=='v') & (df.kön_v != 1),'kön']):
        print('Felaktigt kön','v')
        error()
    df.drop(['kön'],axis=1,inplace=True)
    return df

In [15]:

def impute_all_numeric_NaNs(df):
    # all features must be numeric
    from sklearn.impute import SimpleImputer
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=-1)
    trdf=imp1.fit_transform(df)  # replae NaN's with 'missing'
    return pd.DataFrame(trdf,columns=df.columns)

## All the transformations in one function

In [16]:

def transf_all(df):
    
    trdf=transf_bana(df.copy())
    trdf=transf_kusk_häst(trdf)
    trdf=transf_kusk_häst(trdf,pref='h1_')
    trdf=transf_kusk_häst(trdf,pref='h2_')
    trdf=transf_kusk_häst(trdf,pref='h3_')
    trdf=transf_kusk_häst(trdf,pref='h4_')
    trdf=transf_kusk_häst(trdf,pref='h5_')
    trdf.drop(['häst'],axis=1,inplace=True)
    trdf=transf_kön(trdf)
    trdf['datum']=pd.to_datetime(trdf.datum).view(float)*10e210
    
    return impute_all_numeric_NaNs(trdf)

In [265]:

# transform all categoricals and impute all NaNs
def prepare_all(df):
    trdf = transf_all(df)
    
    y = (trdf.plac==1) * 1
    trdf = trdf.drop('plac',axis=1)
    
    # all features are now numeric
    trdf = impute_all_numeric_NaNs(trdf)
    if trdf.isna().sum().sum() != 0:
        print('still NaNs in data')
        error()
    return trdf,y

## CatBoost

In [18]:
#catBoost preprocessing
def catB_preprocess(df):
        y = (df.plac==1) * 1
        df = df.drop('plac',axis=1)
        df = impute_cat_features(df,cat_features=CAT_FEATURES)

        return df,y


In [None]:

# clean the cat_features
df_catb, y = catB_preprocess(df.copy())
df_catb[CAT_FEATURES].isna().sum().sum()

In [59]:

# metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score

# for tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import TimeSeriesSplit

In [294]:
trdf,y=prepare_all(df)

In [308]:
# CatBoost model GridSearchCV
my_df_1=df_catb             # catboost with Nans abd cat_features
my_cats_1 = CAT_FEATURES
my_df_2 = trdf              # dataset common for all estimators
my_cats_2 = []

my_df = my_df_2
my_cats = my_cats_2
my_pool = Pool(my_df,y,cat_features=my_cats)
my_catb = CatBoostClassifier(cat_features=my_cats)

tscv = TimeSeriesSplit(n_splits=5)
params = {'iterations': [50,100,500],
          'depth': [4, 5, 6],
          'loss_function': ['Logloss'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
          'eval_metric': ['F1'],
        #   'use_best_model': ['True'],
          'logging_level':['Silent'],
          'random_seed': [2021],
         }
# clf.fit(df_catb,y)

scorer = make_scorer(matthews_corrcoef)
catb_grid = RandomizedSearchCV(estimator=my_catb, param_distributions=params, scoring=scorer, cv=tscv)

# GridSearchCV  - compare with default
catb_grid.fit(my_df,y)


RandomizedSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
                   estimator=<catboost.core.CatBoostClassifier object at 0x000001B547641FD0>,
                   param_distributions={'depth': [4, 5, 6],
                                        'eval_metric': ['F1'],
                                        'iterations': [50, 100, 500],
                                        'l2_leaf_reg': array([1.00000000e-20, 3.16227766e-20, 1.00000000e-19]),
                                        'leaf_estimation_iterations': [10],
                                        'logging_level': ['Silent'],
                                        'loss_function': ['Logloss'],
                                        'random_seed': [2021]},
                   scoring=make_scorer(matthews_corrcoef))

In [310]:
# get best estimator and params
best_catb = catb_grid.best_estimator_
print('best Logloss gridsearch',clf_grid.best_score_)
best_param = catb_grid.best_params_
best_param

best Logloss gridsearch 0.1766571257827087


{'random_seed': 2021,
 'loss_function': 'Logloss',
 'logging_level': 'Silent',
 'leaf_estimation_iterations': 10,
 'l2_leaf_reg': 1e-20,
 'iterations': 500,
 'eval_metric': 'F1',
 'depth': 5}

In [311]:
print(best_catb.fit(my_df,y).best_score_)
best_catb.get_feature_importance(prettified=True).head(30)

{'learn': {'Logloss': 0.09385285316573008, 'F1': 0.7868249456612607}}


Unnamed: 0,Feature Id,Importances
0,h5_ekipage,14.942294
1,h2_ekipage,14.366222
2,h1_ekipage,12.717147
3,h3_ekipage,12.436779
4,h4_ekipage,12.268079
5,ekipage,11.128203
6,streck,7.1507
7,h2_kmtid,0.853184
8,senast,0.823465
9,datum,0.806665


In [309]:

params = {
         'use_best_model': True,
         'eval_metric' : 'F1',
         "loss_function": "Logloss",
         'early_stopping_rounds': 100,
         # 'verbose': 50,
         'iterations': 2000,
         'logging_level': 'Silent',
         'leaf_estimation_iterations': 10,
         'l2_leaf_reg': 3.162277660168379e-20,
         'depth': 5,
}

cv_score =cv(pool=my_pool, 
   params=params, 
   seed=2021, 
   stratified=True,
   as_pandas=True,
   type='TimeSeries')

In [312]:
cv_score[['test-F1-mean','train-F1-mean','test-Logloss-mean','train-Logloss-mean']].describe()

Unnamed: 0,test-F1-mean,train-F1-mean,test-Logloss-mean,train-Logloss-mean
count,649.0,649.0,649.0,649.0
mean,0.677948,0.729751,0.138852,0.118658
std,0.08236,0.123635,0.059768,0.067113
min,0.240333,0.210173,0.117617,0.071732
25%,0.677019,0.684495,0.118504,0.084709
50%,0.706888,0.763873,0.121889,0.102425
75%,0.72264,0.818602,0.130317,0.125003
max,0.729156,0.855998,0.654153,0.653558


## other models that need no NaN and no Categorical

In [314]:
# XGBoost model 
# cat?
# NaN's?

In [None]:
# ANN model - Approx near neighbours
# kör all preproc ovan
# GridSearchCV

### RandomForrest

In [315]:
# GridSearch
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

tscv = TimeSeriesSplit()
params = {'n_estimators': [5,10,100],
          'max_depth': [4, 5, 6, None],
          'class_weight': ['balanced'],
        #   'loss_function': ['Logloss'],
        #   'eval_metric': ['F1'],
        #   'logging_level':['Silent'],
          'random_state': [2021],
         }
# clf.fit(df_catb,y)

scorer = make_scorer(matthews_corrcoef)
rf_grid = GridSearchCV(estimator=rf, param_grid=params, scoring=scorer, cv=tscv)

# GridSearchCV  
rf_grid.fit(trdf, y)



GridSearchCV(cv=TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None),
             estimator=RandomForestClassifier(),
             param_grid={'class_weight': ['balanced'],
                         'max_depth': [4, 5, 6, None],
                         'n_estimators': [5, 10, 100], 'random_state': [2021]},
             scoring=make_scorer(matthews_corrcoef))

In [316]:
# get best estimator and params
best_rf = rf_grid.best_estimator_
print('best gridsearch',rf_grid.best_score_)
best_param = rf_grid.best_params_
best_param

best gridsearch 0.4254736603787804


{'class_weight': 'balanced',
 'max_depth': 6,
 'n_estimators': 100,
 'random_state': 2021}

In [317]:

pd.DataFrame(best_rf.feature_importances_,index=trdf.columns, columns=['importance']).sort_values(by='importance',ascending=False)

Unnamed: 0,importance
h2_ekipage,1.547132e-01
h1_ekipage,1.420154e-01
h3_ekipage,1.410606e-01
h5_ekipage,1.345333e-01
streck,1.303589e-01
...,...
kön_v,1.062269e-04
kön_s,8.433718e-05
h4_auto,8.115417e-05
h3_auto,6.096262e-05


In [None]:
# SVM  model
# All preproc ovan?
# GridSearchCV

## Stack'em

In [None]:
#
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegressionCV
base_models = [('random_forest', best_rf),
               ('catboost', best_catb),
            #    ('knn', KNeighborsClassifier(n_neighbors=11))
               ]
meta_model = LogisticRegressionCV()
stacking_model = StackingClassifier(estimators=base_models, 
                                    final_estimator=meta_model, 
                                    passthrough=True, 
                                    cv=tscv,
                                    verbose=2)

stacking_model.fit(trdf,y)

In [346]:
## Testar TimeSeries folders manuell loop
tscv = TimeSeriesSplit()
print(tscv)
i=1
for train_index, test_index in tscv.split(trdf):
    print(f"TRAIN1_{i}, {train_index[-1:]}, TEST_{i}, {test_index[-1:]}")
    X_train, X_test = trdf.iloc[train_index], trdf.iloc[test_index]
    y_train, y_test = y[train_index], y[test_index]
    for name,model in base_models:
        print(name)
        model.fit(X_train,y_train)
        y_hat = model.predict_proba(X_test)
    i+=1
    

TimeSeriesSplit(gap=0, max_train_size=None, n_splits=5, test_size=None)
TRAIN1_1, [6948], TEST_1, [13894]
random_forest
catboost
TRAIN1_2, [13894], TEST_2, [20840]
random_forest
catboost
TRAIN1_3, [20840], TEST_3, [27786]
random_forest
catboost
TRAIN1_4, [27786], TEST_4, [34732]
random_forest
catboost
TRAIN1_5, [34732], TEST_5, [41678]
random_forest
catboost


## Walkthrough-funktionen  här

In [None]:

### Kör en walkthrough learn här, en datum i taget framåt

# Jag har ändrat till att alla steg kör utan test-datam ed fast iterations=100
def walkthrough(classic_test=False, verbose=False):
    
    df, nya_lopp, alla_datum, split_ix = get_alla_datum()

    l2_leaf_regs=2
    model=get_model(use_best=False,iterations=100)
    df=remove_features(df.copy())
    cat_features = list(df.loc[:,df.dtypes=='O'].columns)
    df,_ = replace_NaN(df.copy(), cat_features=cat_features)    
    print(f'cat_features {cat_features}\n')

    df['plac']=(df.plac==1)*1
        
    for nr,datum in enumerate(alla_datum[split_ix:]):
        print(f'walk-iter {nr+1} av {len(alla_datum[split_ix:])} ',end=': ')

        X_train = df.loc[df.datum<datum,:].copy()
        y_train = X_train.plac; X_train.drop(['plac'],axis=1,inplace=True)

        if classic_test:    ### klassisk train/test utan walkthrough
            X_test  = df.loc[df.datum>=datum,:].copy()
            y_test  = X_test.plac;  X_test.drop(['plac'],axis=1,inplace=True)
            train_pool = Pool(X_train,y_train,cat_features=cat_features)
            test_pool = Pool(X_test,y_test,cat_features=cat_features)
            model.fit(train_pool,use_best_model=True, verbose=verbose,eval_set=test_pool)
        else:
            X_test  = df.loc[df.datum==datum,:].copy()
            y_test  = X_test.plac;  X_test.drop(['plac'],axis=1,inplace=True)
            train_pool = Pool(X_train,y_train,cat_features=cat_features)
            test_pool = Pool(X_test,y_test,cat_features=cat_features)
            model.fit(train_pool,use_best_model=False, verbose=verbose)

        print('best iteration',model.get_best_iteration(), '\tbest score', round(model.get_best_score()['learn']['Accuracy'],3) )
        ##['validation']['Logloss'],3),'\t', round(model.get_best_score()['validation']['Accuracy:use_weights=true'],3))
        
        if classic_test:    ### klassisk train/test utan walkthrough
            return model,cat_features
    
        model.save_model('modeller/model_'+datum)

    X_train =df.copy().drop('plac',axis=1)
    y_train = df.plac 
    model.fit(X_train,y=y_train,cat_features=cat_features)
    print(f'spara model_senaste',datum)
    model.save_model('modeller/model_senaste')

    return df,nya_lopp, model,cat_features

### Här körs hela walkthrough

In [None]:
df, nya_lopp, model, cat_features = walkthrough(classic_test=False, verbose=False)


In [None]:
model = get_model().load_model('modeller/model_senaste')
df = pd.read_csv('all_data.csv')     
# print(df.columns)
df=remove_features(df.copy())
cat_features = list(df.loc[:,df.dtypes=='O'].columns)
df,_ = replace_NaN(df.copy(), cat_features=cat_features)    
y=df.plac
y=(y==1)*1
df.drop('plac',axis=1,inplace=True)


In [None]:
df[df.columns[(df.dtypes=='object').values.tolist()]].info()

## cv

In [None]:


cv_pool = Pool(df,y,cat_features=cat_features)

params = {
         'use_best_model': True,
         'eval_metric' : 'Recall',
         "loss_function": "Logloss",
         'early_stopping_rounds': 100,
         'verbose': 50,
         'iterations': 2000,
         'seed': 2021,
         'startified': True,
         'as_pandas': True,
         'type': 'TimeSeries'
}

cv_score =cv(pool=cv_pool, 
   params=params, 
   dtrain=None, 
   iterations=2000, 
   num_boost_round=None,
   fold_count=5, 
   nfold=None,
   inverted=False,
   partition_random_seed=0,
   seed=2021, 
   shuffle=False, 
   logging_level=None, 
   stratified=True,
   as_pandas=True,
   type='TimeSeries')

In [166]:
cv_score

Unnamed: 0,iterations,test-F1-mean,test-F1-std,train-F1-mean,train-F1-std,test-Logloss-mean,test-Logloss-std,train-Logloss-mean,train-Logloss-std
0,0,0.000000,0.000000,0.000000,0.000000,0.660234,0.003815,0.659794,0.004020
1,1,0.000000,0.000000,0.000000,0.000000,0.629065,0.003617,0.628580,0.003901
2,2,0.000000,0.000000,0.000741,0.001284,0.598386,0.009334,0.600561,0.006937
3,3,0.000000,0.000000,0.000371,0.000642,0.572443,0.008103,0.574281,0.005280
4,4,0.002953,0.005115,0.006238,0.009857,0.548483,0.007865,0.550048,0.005760
...,...,...,...,...,...,...,...,...,...
184,184,0.120758,0.017292,0.280924,0.003395,0.244545,0.004970,0.207284,0.006843
185,185,0.121312,0.017374,0.282253,0.004709,0.244574,0.004972,0.207173,0.006899
186,186,0.122547,0.015863,0.282494,0.003678,0.244520,0.004856,0.207063,0.006966
187,187,0.119446,0.016485,0.283570,0.004286,0.244538,0.004805,0.206988,0.007005


In [57]:
cv

<function catboost.core.cv(pool=None, params=None, dtrain=None, iterations=None, num_boost_round=None, fold_count=None, nfold=None, inverted=False, partition_random_seed=0, seed=None, shuffle=True, logging_level=None, stratified=None, as_pandas=True, metric_period=None, verbose=None, verbose_eval=None, plot=False, early_stopping_rounds=None, save_snapshot=None, snapshot_file=None, snapshot_interval=None, metric_update_interval=0.5, folds=None, type='Classical', return_models=False, log_cout=<ipykernel.iostream.OutStream object at 0x000001B50082C1C0>, log_cerr=<ipykernel.iostream.OutStream object at 0x000001B50082CF70>)>

In [None]:
cv_score[cv_score['test-Logloss-mean'].min() == cv_score['test-Logloss-mean']]