# Try stacking for V75

In [176]:
import pandas as pd 
import numpy as np 
from IPython.display import display 

from catboost import CatBoostClassifier,Pool, cv, utils 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree   import DecisionTreeClassifier
from sklearn.linear_model import LinearRegression


In [5]:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, Pipeline
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, LabelBinarizer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer
# import sklearn.metrics, sklearn.compose

from sklearn.model_selection import train_test_split


### ***What to do***:
- Kör en enkel modell utan 'streck'  
- kolla om ekipage-nr gör någon skillnad  optimera m
- kolla om 'avd' göt någon skillnad 
- skapa en flaml-ensemble utan streck
- Se hu bra denna ensemble är jämfört med 'streck' inkluderat
- Blir den lika bra som min pipe?
- Kan  med och utan 'streck' användas tillsammans ?
  - Man kan ha som innan på den första och proba-order eller f/insats som kriterium för den andra som ju bara skall hitta överraskningar


In [6]:
### läs in data och returnera df, alla datum samt index till split-punkt
def load_data(proc=0.75):
    
    df = pd.read_csv('..\\all_data.csv')     
    alla_datum = list(df.datum.unique())
    split_ix = int(len(df)*proc)
    
    return df,alla_datum,split_ix

In [7]:
### return a CatBoost model with some default parameters
def get_model(d=6,l2=2,iterations=3000,use_best=True,verbose=False):
    model = CatBoostClassifier(iterations=iterations,use_best_model=use_best, 
        custom_metric=['Logloss', 'AUC','Recall', 'Precision', 'F1', 'Accuracy'],

        eval_metric='Accuracy', 
        depth=d,l2_leaf_reg=l2,
        auto_class_weights='Balanced',verbose=verbose, random_state=2021) 
    return model                

In [8]:
### Features som inte används vid träning
def remove_features(df,remove_mer=[]):
    #remove_mer=['h5_perf','h5_auto','h4_perf','h4_auto', 'h3_perf', 'h2_perf']
    df.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1,inplace=True) #
    if remove_mer:
        df.drop(remove_mer,axis=1,inplace=True)
    
    # df=check_unique(df.copy())
    # df=check_corr(df.copy())
    return df

## Bygg min första riktiga Pipeline

In [177]:
# Set a smooth mean value to the features in X_train  ##
def calc_smooth_mean(X, y, by, m=100, tot_mean=None):
    Xcopy = X.copy()
    Xcopy['plac'] = y

    # Compute the number of values and the mean of each group
    agg = Xcopy.groupby(by)['plac'].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + m * tot_mean) / (counts + m)
    
    return smooth.to_dict()


# transform model to stacking estimator
class ModelTransformer(TransformerMixin):
    
    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict_proba(X))
    
def create_ekipage(df):
    prefix = ['','h1_','h2_','h3_','h4_','h5_',]
    ekipage=[]
    for pref in prefix:
        df[pref+'ekipage'] = df[pref+'kusk'].str.cat(df['häst'], sep =", ")  # concatenate 'häst' and 'kusk' into one column
        ekipage.append(pref+'ekipage')
        df.drop([pref+'kusk'],axis=1,inplace=True)
        
    df.drop(['häst'],axis=1,inplace=True )   
        
    return df, ekipage   
    
# Handle ekipage (häst and kusk)
class CustomSmoothMean(BaseEstimator, TransformerMixin):
    def __init__(self,cols,plac='plac', m=100):
        super().__init__()
        self.map = {}
        self.total_mean=None
        self.cols=cols
        self.plac=plac
        self.m=m

    def fit(self, X, y=None):
        # print('CustSmooth fit start')
        self.total_mean=y.mean()
        self.map = calc_smooth_mean(X, y, by='ekipage', m=self.m, tot_mean=self.total_mean)
        self.map['missing'] = 0

        display(f'using m={self.m}')
        # print('CustSmooth fit end')
        return self

    def transform(self, X, y=None):
        # print('CustSmooth transform start')   
        for col in self.cols:
            X[col] = X[col].map(self.map)
            X[col].fillna(0,inplace=True)
            
        # display(X.isna().sum())    
        # print('CustSmooth transform end')    
        return X
    
    def get_feature_names(self):
        return self.cols, self.plac, self.m, self.total_mean,self.map    
####

def set_lower(dfo):
    # print('set_lower start')
    df=dfo.copy()
    for c in df.columns:
        df[c] = df[c].str.lower()
    # print('set_lower end')    
    return df

lower =  FunctionTransformer(set_lower)

def datum_to_num(df):
    dfc=df.copy()
    dfc['datum'] = pd.to_datetime(df.datum).view(float)*10e210
    return dfc

# Handle ekipage (häst and kusk)
class transf_bana(BaseEstimator, TransformerMixin):
    def __init__(self,banor):
        super().__init__()
        self.map = {}
        self.banor = banor

    def fit(self, X, y=None):
        # print('fit bana start')
        huvud_bana=self.banor[0]
        
        self.map = X[huvud_bana].str.lower().value_counts() 
        self.map[None] = 0
        self.map['missing'] = 0
        # print('fit bana end')
        return self

    def transform(self, X, y=None):
        # print('transform bana',X.shape)
        for bana in self.banor:
            X[bana] = X[bana].fillna('missing-1')
            X[bana] = X[bana].str.lower()
            X[bana] = [item[0] for item in X[bana].str.split('-')]  # remove '-10' from 'solvalla-10' etc
            # print('transform bana effter split',bana)
            # print(X[bana][:10])
            X[bana] = X[bana].map(self.map)
            X[bana] = X[bana].fillna(0)
            # print('transform bana innan return')
        return X
    
    def get_feature_names(self):
        return self.map, self.banor
    
####


### Själva pipen

In [190]:
df,alla_datum,split_ix = load_data()
df,ekipage = create_ekipage(df.copy())      # alla ekipage, h1_ekipage-h5_ekipage
# df = remove_features(df)

X_train=df[:split_ix].copy()
y_train=(X_train.plac==1)*1
X_train.drop('plac',axis=1,inplace=True)
X_test=df[split_ix:].copy()
y_test=(X_test.plac==1)*1
X_test.drop('plac',axis=1,inplace=True)
X_train = remove_features(X_train.copy())
X_test  = remove_features(X_test.copy())

no_streck=True
if no_streck:
    X_train.drop('streck',axis=1,inplace=True)
    X_test.drop('streck',axis=1,inplace=True)

könen=['kön1','kön2','kön3']
banor = ['bana','h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]
all_nums = list(X_train.select_dtypes('number').columns)

lower_and_ohe = Pipeline([
        ('lower', lower),
        ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ])

############ Parms ###############################
rf_parms={ 'n_jobs': 5,'class_weight': 'balanced'}
etr_parms={'n_jobs': 5,'class_weight': 'balanced'}
cb_parms={'iterations': 200,'early_stopping_rounds': 50,'auto_class_weights': 'Balanced','verbose': False}
final_parms={'n_jobs': 5,'class_weight': 'balanced'}
################################################### 

col_pipe = make_column_transformer(
                        (transf_bana(banor), banor),
                        (FunctionTransformer(datum_to_num), ['datum']),
                        (lower_and_ohe, ['kön']), 
                        (CustomSmoothMean(ekipage,m=100), ekipage),
                        (SimpleImputer(strategy='constant', fill_value=-1),all_nums  ),
                        remainder='passthrough',
                        )

pipe = Pipeline(steps=[
    ('transformers', col_pipe ),
    
    ('estimators', FeatureUnion([
        ('knn', ModelTransformer(KNeighborsClassifier(n_neighbors=5, n_jobs=4))),
        ('cat', ModelTransformer(CatBoostClassifier(**cb_parms))),
        ('dtr', ModelTransformer(DecisionTreeClassifier(class_weight='balanced'))),
        ('etr', ModelTransformer(ExtraTreesClassifier(**etr_parms))),
        ('rf', ModelTransformer(RandomForestClassifier(**rf_parms))),  
    ])),
    ('estimator', RandomForestClassifier(**final_parms)),
])


# with_streck = pipe.fit(X_train,y=y_train)
# no_streck = pipe.fit(X_train_nostr,y=y_train)
# the_stack = pipe.fit(X_train,y_train)


### MyOwnSearchCV

## Jämför olika modellers score (med och utan 'streck')

In [166]:
# räkna ut proba ordning per datum,avd
def proba_ordning(X,y,df,proba) :   
    kassa=200
    X['vann'] = y
    X['avd'] =df.avd
    X['vodds'] = df.vodds
    X['proba'] = proba[:,1]
    X['f'] = (X.proba*X.vodds - 1) / (X.vodds-1)  # kelly formel
    X['spela'] = X.f >0
    X['insats'] = X.spela * X.f * kassa
    X.sort_values(['datum','avd','proba'],ascending=[False,True,False],inplace=True)
    proba_order=X.groupby(['datum','avd']).proba.cumcount()

    X['prob_order']=proba_order+1

    return(X)

In [175]:
from sklearn.metrics import roc_auc_score,mean_absolute_error,accuracy_score,matthews_corrcoef,f1_score
if no_streck:
    print("utan 'streck'")
train_pred= pipe.predict_proba(X_train)
print('auc train',roc_auc_score(y_train, train_pred[:,1]))
test_pred = pipe.predict_proba(X_test)
print('auc test', roc_auc_score(y_test,test_pred[:,1]))
print('mae test', mean_absolute_error(y_test,test_pred[:,1]))
po=proba_ordning(X_test.copy(),y_test,df.copy(),test_pred )[['datum','avd','proba','prob_order','vann']]
print('mean proba-order för vunna',po.loc[po.vann==1].prob_order.mean())
print()

utan 'streck'
auc train 1.0
auc test 0.5117973504026595
mae test 0.0957246445949814
mean vann order 6.3240534521158125

auc 0.5117973504026595
mae 0.0957246445949814
auc 0.5117973504026595
mae 0.0957246445949814


0.02033407572383073

In [165]:
#Jämfr CatBoost
cb = CatBoostClassifier(**cb_parms)
cb_pipe = make_pipeline(col_pipe, cb)
cb_pipe.fit(X_train,y_train)
cb_train_pred= cb_pipe.predict_proba(X_train)
cb_test_pred =cb_pipe.predict_proba(X_test)

if no_streck:
    print("utan 'streck'")
print('auc', roc_auc_score(y_test,cb_test_pred[:,1]))
print('mae', mean_absolute_error(y_test,cb_test_pred[:,1]))
po=proba_ordning(X_test.copy(),y_test,df.copy(),cb_test_pred )[['datum','avd','proba','prob_order','vann']]
print('cb mean vann ',po.loc[po.vann==1].prob_order.mean())

used m =  100


'using m=100'

auc 0.5753861410576032
mae 0.12954190828951184
cb mean vann  5.387527839643653


### FLAML

In [199]:
#jmfr FLAML

from flaml import AutoML 
automl, automl_raw = None, None
if False: # med transformed data
    flml_parms={'automl__task': 'classification','automl__split_type':None, 'automl__verbose':100,
            'automl__time_budget':500, 'automl__n_jobs':5, 'automl__early_stop':True, 'automl__ensemble':True}

    automl = AutoML( )
    flm_pipe = make_pipeline(col_pipe, automl)
    flm=flm_pipe.fit(X_train,y_train, **flml_parms)
if True: # raw - utan transformed data
    flml_raw_parms={'task': 'classification','split_type':None, 'verbose': 100,
            'time_budget':500, 'n_jobs':5, 'early_stop':True, 'ensemble':True}

    automl_raw = AutoML()
    automl_raw.fit(X_train,y_train, **flml_raw_parms)

[flaml.automl: 09-21 00:18:35] {1427} INFO - Evaluation method: holdout
[flaml.automl: 09-21 00:18:36] {1473} INFO - Minimizing error metric: 1-roc_auc
[flaml.automl: 09-21 00:18:36] {1505} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'catboost', 'xgboost', 'extra_tree', 'lrl1']
[flaml.automl: 09-21 00:18:36] {1735} INFO - iteration 0, current learner lgbm
[flaml.automl: 09-21 00:18:36] {1914} INFO -  at 1.4s,	best lgbm's error=0.3203,	best lgbm's error=0.3203
[flaml.automl: 09-21 00:18:36] {1735} INFO - iteration 1, current learner lgbm
[flaml.automl: 09-21 00:18:37] {1914} INFO -  at 1.9s,	best lgbm's error=0.3154,	best lgbm's error=0.3154
[flaml.automl: 09-21 00:18:37] {1735} INFO - iteration 2, current learner lgbm
[flaml.automl: 09-21 00:18:37] {1914} INFO -  at 2.4s,	best lgbm's error=0.3154,	best lgbm's error=0.3154
[flaml.automl: 09-21 00:18:37] {1735} INFO - iteration 3, current learner lgbm
[flaml.automl: 09-21 00:18:38] {1914} INFO -  at 3.2s,	best lgbm's error=0

In [None]:
automl_raw.best_estimator()

In [174]:
# print('indata flm',  len(banor+['datum']+könen+ekipage+all_nums))

if no_streck:
    print("utan 'streck'")
   
if True :   # raw - ej transformed data
    # print('indata flm_raw',len(X_train.columns))
    flm_raw_train_pred= automl_raw.predict_proba(X_train)
    flm_raw_test_pred = automl_raw.predict_proba(X_test)
    print('flm_raw test',roc_auc_score(y_test, flm_raw_test_pred[:,1]))
    print('flm_raw train',roc_auc_score(y_train, flm_raw_train_pred[:,1]))  
    print('mae', mean_absolute_error(y_test,cb_test_pred[:,1]))
    po=proba_ordning(X_test.copy(),y_test,df.copy(),flm_raw_test_pred )[['datum','avd','proba','prob_order','vann']]
    print('flm_raw mean vann',po.loc[po.vann==1].prob_order.mean())
if True:  # med transformde data
    flm_train_pred= flm_pipe.predict_proba(X_train)
    flm_test_pred = flm_pipe.predict_proba(X_test)
    print('flm test',roc_auc_score(y_test, flm_test_pred[:,1]))
    print('flm train',roc_auc_score(y_train, flm_train_pred[:,1]))
    po=proba_ordning(X_test.copy(),y_test,df.copy(),flm_test_pred )[['datum','avd','proba','prob_order','vann']]
    print('flm test mean vann',po.loc[po.vann==1].prob_order.mean())


utan 'streck'
flm_raw test 0.726974642131447
flm_raw train 0.8327028954305613
mae 0.12954190828951184
flm_raw mean vann 3.736080178173719


### End Min första Pipeline

In [227]:
# # senaste veckan
df_nu = pd.read_csv('..//sparad_scrape.csv')
print(df_nu.datum.unique())

df_nu,ekipage = create_ekipage(df_nu.copy())      # alla ekipage, h1_ekipage-h5_ekipage

X_train_nu = remove_features(df_nu.copy())

# X_test_nu = X_test.loc[X_test.datum=='2021-09-11']
# y_test_nu=y_test[X_test_nu.index]
# X_test_nu.shape

flm_raw_train_pred=automl_raw.predict_proba(X_train_nu)
# flm_raw_test_pred=automl_raw.predict_proba(X_test_nu)
# print(roc_auc_score(y_test_nu,flm_raw_test_pred[:,1]))
res = pd.DataFrame()
# res['y']=y_test_nu
res['proba']=flm_raw_train_pred[:,1]
res['avd'] = df_nu.avd
res['vodds']=df_nu.vodds
res['datum']=df_nu.datum
res['startnr']=df_nu.startnr
res['ekipage']=df_nu.ekipage
# display(f'mean for test==1 {res.loc[res.y==1].yhat}')

res['f'] = (res.proba*res.vodds - 1) / (res.vodds-1)  # kelly formel
res['spela'] = res.f >0
res['insats'] = res.spela * res.f * 200

# Ta ut de 2 bästa per avd
res.sort_values(['datum','avd','proba'],ascending=[True,True,False],inplace=True)
proba_order=res.groupby(['datum','avd']).proba.cumcount()

res['prob_order']=proba_order+1

# res.loc[(res.prob_order<5) & (res.prob_order !=0)].sort_values(by='proba',ascending=False)
res.loc[res.prob_order<8]


['2021-09-18']


Unnamed: 0,proba,avd,vodds,datum,startnr,ekipage,f,spela,insats,prob_order
0,0.298143,1,3.83,2021-09-18,1,"Carl Johan Jepson, BEARTIME",0.050137,True,10.027442,1
2,0.223853,1,6.01,2021-09-18,2,"Conrad Lugauer, SANDSJÖNS ENZO",0.068933,True,13.786591,2
1,0.17519,1,6.31,2021-09-18,7,"Jorma Kontio, MAS CAPACITY",0.019859,True,3.971789,3
4,0.132337,1,9.49,2021-09-18,4,"Erik Adielsson, ÖNAS NOUGAT",0.030139,True,6.027786,4
3,0.131445,1,5.88,2021-09-18,5,"Kim Eriksson, BO C.",-0.046538,False,-0.0,5
5,0.065016,1,18.92,2021-09-18,6,"Sören Boel, ADDE S.H.",0.01284,True,2.568053,6
6,0.055523,1,17.01,2021-09-18,3,"Christoffer Eriksson, INGO",-0.00347,False,-0.0,7
11,0.422429,2,3.38,2021-09-18,2,"Conrad Lugauer, MISTER K.O.Z.",0.179752,True,35.95038,1
12,0.175806,2,6.09,2021-09-18,6,"Mikael J Andersson, BLUE FRONTLINE",0.013882,True,2.776432,2
13,0.102996,2,3.92,2021-09-18,7,"Torbjörn Jansson, PERFECT SCORE ÅS",-0.204198,False,-0.0,3


In [None]:
df,alla_datum,split_ix = load_data() 
df = remove_features(df.copy())
CAT_FEATURES=['datum', 'bana', 'häst', 'kusk', 'kön',
        'h1_kusk', 'h1_bana',
        'h2_kusk', 'h2_bana', 
        'h3_kusk',  'h3_bana', 
        'h4_kusk', 'h4_bana', 
        'h5_kusk', 'h5_bana',]

NUM_FEATURES=[item for item in df.columns if item not in CAT_FEATURES and item !='plac']

PLAC_MEAN=df.plac.mean()
PLAC_MEAN

In [None]:
# den hittade inget, kanske skall testa igen längre fram
def remove_low_variance_features(df):
    from sklearn.feature_selection import VarianceThreshold
    print(df.shape)
    selection = VarianceThreshold(threshold=(0.1))
    X=selection.fit_transform(df)
    print(X.shape)
    return X

# Min manuella stacking (TimeSeries)
TimeSeries kan iinte använda sklearn.stacking

## Functions that are doing the transformations

In [None]:
# för ´categorical
def impute_test(df):
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df=imp1.fit_transform(df)  # replae NaN's with 'missing'
    return df


In [None]:
# fill missing values in categorical features
def impute_cat_features(df, cat_features=CAT_FEATURES):
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df[cat_features]=imp1.fit_transform(df[cat_features])  # replae NaN's with 'missing'
    return df

In [None]:
# Handle h1-h5_bana
def transform_hx_bana(df,hx,the_map):
    from sklearn.impute import SimpleImputer
    df[hx] = df[hx].str.lower()
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df[hx]=imp1.fit_transform(df[[hx]])  # replae NaN's with 'missing'

    df[hx] = [item[0] for item in df[hx].str.split('-')]  # remove '-10' from 'solvalla-10' etc
    
    df[hx]=df[hx].map(the_map)  # transform column to numeric by mapping
    # after mapping we get new NaN's - now impute 0
    imp2 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
    df[hx] = imp2.fit_transform(df[[hx]])
    return df
    

In [None]:

# Handle bana and hx_bana  
def transf_bana(df):
    df['bana'] = df.bana.str.lower()
    the_map = df.bana.value_counts() 
    the_map['missing']=0    

    df=transform_hx_bana(df,'h1_bana',the_map)
    df=transform_hx_bana(df,'h2_bana',the_map)
    df=transform_hx_bana(df,'h3_bana',the_map)
    df=transform_hx_bana(df,'h4_bana',the_map)
    df=transform_hx_bana(df,'h5_bana',the_map)

    df['bana']=df.bana.map(the_map)  # transform column to numeric by mapping 
    if df[['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]].isna().sum().sum() != 0:
        print('bana NaNs not 0:',df[['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]].isna().sum())
    
    df.drop(['bana','h1_bana','h2_bana','h3_bana','h4_bana','h5_bana'],axis=1,inplace=True)
    return df


In [113]:
# SKIPPA DENNA
def transf_kusk_häst(df,pref='',m=50,):
    df[pref+'ekipage'] = df[pref+'kusk'].str.cat(df['häst'], sep =", ")  # concatenate 'häst' and 'kusk' into one column
    df[pref+'ekipage'] = calc_smooth_mean(df, y, by=pref+'ekipage',m=50) # make numeric with Target encoding with smooth mean
    df.drop([pref+'kusk'],axis=1,inplace=True)
    return df

In [None]:
print(list(df.select_dtypes('object').columns))
print()
print(list(df.select_dtypes('number').columns))

In [None]:
# Handle kön  
def transf_kön(df):
    from sklearn.preprocessing import OneHotEncoder
    df['kön'] = df['kön'].str.lower()
    ohe = OneHotEncoder(sparse=False)
    dftemp=pd.DataFrame(ohe.fit_transform(df[['kön']]),columns=['kön_h','kön_s','kön_v'] )  # replae kön with One Hot Encoding
    # df=pd.concat([df,dftemp],axis=1)

    # check that kön is correct encoded
    if len(df.loc[(df.kön=='h') & (df.kön_h != 1),'kön']):
        assert False, 'Felaktigt kön h'
    if len(df.loc[(df.kön=='s') & (df.kön_s != 1),'kön']):
       assert False, 'Felaktigt kön s'
    if len(df.loc[(df.kön=='v') & (df.kön_v != 1),'kön']):
        assert False, 'Felaktigt kön v'
    df.drop(['kön'],axis=1,inplace=True)
    return df
# s_c = FunctionTransformer(set_cols)
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# union = FeatureUnion([('o',df.select_dtypes('object')), 
#                      ('n',df.select_dtypes('object')), 
#                       ]
#                       )

pipe=make_pipeline(SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=-1))

mapper = DataFrameMapper([
    (['datum'], None),
    (['bana'], [lower,
                SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing'), ]),
    (['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana'], [lower,
                SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing'), ],{'alias':'hx_bana'}),
    (['kön'], OneHotEncoder(sparse=False),{'alias':'kön'}),
    
    (['kusk','h1_kusk','häst','plac'], lower, CustomSmoothMean(cols=['kusk','h1_kusk'],col2='häst',y='plac')),
    (['h1_kusk','häst'], lower,{'alias':'h1ekipage'}),
    (['h2_kusk','häst'], lower,{'alias':'h2ekipage'}),
    (['h3_kusk','häst'], lower,{'alias':'h3ekipage'}),
    (['h4_kusk','häst'], lower,{'alias':'h4ekipage'}),
    (['h5_kusk','häst'], lower,{'alias':'h5ekipage'}),
    
],df_out=True,input_df=True)
pipe2=Pipeline([('the_mapper',mapper), ('the_pipe',pipe)])
display(CustomSmoothMean(['kusk','h1_kusk'],['häst'],y='plac').fit(df).transform(df))

# svar1f = CustomSmoothMean.fit(df)
# svar1=mapper.fit_transform(df)
svar2=pipe.fit_transform(df.select_dtypes(include='number'))

In [None]:

# test_pipe=make_pipeline(CustomSmoothMean(col1='kusk',col2='häst',y='plac'))
def date_to_num(df):
    return pd.DataFrame(pd.to_datetime(df.datum).view(float)*10e210)

tranf_datum = FunctionTransformer(date_to_num)
    
preprocessor = make_column_transformer(
                                    
                                    (CustomSmoothMean(cols=['kusk','h1_kusk'],col2='häst',y='plac',m=30), ['kusk','h1_kusk','häst','plac']),
                                    (tranf_datum, ['datum']),
                                    (OneHotEncoder(), ['kön']), 
                                     remainder='drop')

# test_pipe.fit_transform(df.copy())
display(preprocessor.fit_transform(df.copy()))

# type((pd.to_datetime(df.datum).view(float)*10e210).values)


In [None]:
## test test
# Partition data
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['plac']), 
                                                    df['plac'], 
                                                    test_size=.2, 
                                                    random_state=2021)

# Define categorical columns
categorical = list(X_train.select_dtypes('object').columns)
print(f"Categorical columns are: {categorical}")

# Define numerical columns
numerical = list(X_train.select_dtypes('number').columns)
print(f"Numerical columns are: {numerical}")# Define categorical pipeline
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Define numerical pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Fit column transformer to training data
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, categorical),
    ('num', num_pipe, numerical)
])
preprocessor.fit(X_train)

# Prepare column names
cat_columns = preprocessor.named_transformers_['cat']['encoder'].get_feature_names(categorical)
columns = np.append(cat_columns, numerical)

# Inspect training data before and after
print("******************** Training data ********************")
display(X_train.shape)
display(len(columns))
display(preprocessor.transform(X_train).shape)
final=pd.DataFrame(preprocessor.transform(X_train),columns=columns)

# Inspect test data before and after
print("******************** Test data ********************")
# display(X_test)
display(pd.DataFrame(preprocessor.transform(X_test), columns=columns))

In [None]:

def impute_all_numeric_NaNs(df):
    # all features must be numeric
    from sklearn.impute import SimpleImputer
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=-1)
    trdf=imp1.fit_transform(df)  # replae NaN's with 'missing'
    return pd.DataFrame(trdf,columns=df.columns)

## All the transformations in one function

In [None]:

def transf_all(df):
    
    trdf=transf_bana(df.copy())
    trdf=transf_kusk_häst(trdf)
    trdf=transf_kusk_häst(trdf,pref='h1_')
    trdf=transf_kusk_häst(trdf,pref='h2_')
    trdf=transf_kusk_häst(trdf,pref='h3_')
    trdf=transf_kusk_häst(trdf,pref='h4_')
    trdf=transf_kusk_häst(trdf,pref='h5_')
    trdf.drop(['häst'],axis=1,inplace=True)
    trdf=transf_kön(trdf)
    trdf['datum']=pd.to_datetime(trdf.datum).view(float)*10e210
    
    return impute_all_numeric_NaNs(trdf)

In [None]:

# transform all categoricals and impute all NaNs
def prepare_all(df):
    trdf = transf_all(df)
    
    y = (trdf.plac==1) * 1
    trdf = trdf.drop('plac',axis=1)
    
    # all features are now numeric
    trdf = impute_all_numeric_NaNs(trdf)
    if trdf.isna().sum().sum() != 0:
        print('still NaNs in data')
        assert False
    return trdf,y

## stacking prepare and run

In [None]:
# metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# for tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

### CatBoost

In [None]:
#catBoost preprocessing
def catB_preprocess(df):
        y = (df.plac==1) * 1
        df = df.drop('plac',axis=1)
        df = impute_cat_features(df,cat_features=CAT_FEATURES)

        return df,y


In [None]:

# clean the cat_features
df_catb, y = catB_preprocess(df.copy())
df_catb[CAT_FEATURES].isna().sum().sum()

In [None]:
trdf,y=prepare_all(df)
scorer = make_scorer(roc_auc_score)

In [None]:
# CatBoost model GridSearchCV
my_df_1=df_catb             # catboost with Nans abd cat_features
my_cats_1 = CAT_FEATURES
my_df_2 = trdf              # dataset common for all estimators
my_cats_2 = []

my_df = my_df_2
my_cats = my_cats_2
my_pool = Pool(my_df,y,cat_features=my_cats)
my_catb = CatBoostClassifier(cat_features=my_cats)

tscv = TimeSeriesSplit(n_splits=5)
params = {'iterations': [50,100,500,1000],
          'depth': [2,3,4, 5, 6],
          'loss_function': ['Logloss'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
          'eval_metric': ['AUC'],
        #   'use_best_model': ['True'],
          'logging_level':['Silent'],
          'random_seed': [2021],
         }
# clf.fit(df_catb,y)

catb_grid = RandomizedSearchCV(estimator=my_catb, param_distributions=params, scoring=scorer, cv=tscv)

# GridSearchCV  - compare with default
catb_grid.fit(my_df,y)


In [None]:
# get best estimator and params
best_catb = catb_grid.best_estimator_
print('best gridsearch',catb_grid.best_score_)
best_param = catb_grid.best_params_
best_param

In [None]:
# print(best_catb.fit(my_df,y).best_score_)
best_catb.get_feature_importance(prettified=True).head(8)

### XGBoost

In [None]:
# XGBoost model 
import xgboost as xgb
label = y
dtrain = xgb.DMatrix(trdf, label=label)
param = {'max_depth':2, 'eta':1 }
num_round = 10

# GridSearchCV
params = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.09,0.1,0.15], #so called `eta` value
              'max_depth': [7,8,9],
              'min_child_weight': [9,10,11],
              'use_label_encoder':[False],
            #   'silent': [1],
              'eval_metric': ['logloss'],
              'subsample': [0.5,0.9,1.0],
              'colsample_bytree': [0.7, 0.9, 1.0],
              'n_estimators': [7,8,9], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [2021],
              }

xgb_clf = xgb.XGBClassifier(num_round=num_round)
xgb_grid = GridSearchCV(estimator=xgb_clf, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)
xgb_grid.fit(trdf, y )

In [None]:
# get best estimator and params
best_xgb = xgb_grid.best_estimator_
print('best gridsearch', xgb_grid.best_score_)
best_param = xgb_grid.best_params_
best_param

In [None]:
pd.DataFrame(best_xgb.feature_importances_,index=trdf.columns).sort_values(by=0,ascending=False).head(6)

### ExtraTree

In [None]:
# ExtraTree  model
tscv = TimeSeriesSplit()
from sklearn.tree import ExtraTreeClassifier
et = ExtraTreeClassifier(min_samples_split=2, random_state=2021,class_weight='balanced')

# GridSearchCV
params = {'class_weight': [ 'balanced'  ,None], 
          'max_depth': [None, 5, 10  ,15 ,20],
          'min_samples_leaf': [1, 2 ,3, 4,],
          'min_samples_split': [2,30, 30,  40  ,45],   
          'criterion': [ 'gini'  ,'entropy'],   
          'splitter': ['random',  'best']
         }

et_grid = GridSearchCV(estimator=et, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)
et_grid.fit(trdf, y)

In [None]:
# get best estimator and params
best_et = et_grid.best_estimator_
print('best gridsearch', et_grid.best_score_)
best_param = et_grid.best_params_
best_param

### KNN

In [None]:
# KNN model
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=4, )

# GridSearchCV

tscv = TimeSeriesSplit()
params = {'n_neighbors': [10,15,20],
          
         }

knn_grid = GridSearchCV(estimator=knn, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# GridSearchCV  
knn_grid.fit(trdf, y)

In [None]:
best_knn = knn_grid.best_estimator_
print('best gridsearch',knn_grid.best_score_)
best_param = knn_grid.best_params_
best_param

### RandomForrest

In [None]:
# GridSearch
rf = RandomForestClassifier()

tscv = TimeSeriesSplit()
params = {'n_estimators': [5,10,100],
          'max_depth': [4, 5, 6, None],
          'class_weight': ['balanced'],
        #   'loss_function': ['Logloss'],
        #   'eval_metric': ['F1'],
        #   'logging_level':['Silent'],
          'random_state': [2021],
         }
# clf.fit(df_catb,y)

rf_grid = GridSearchCV(estimator=rf, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# GridSearchCV  
rf_grid.fit(trdf, y)

In [None]:
best_rf = rf_grid.best_estimator_
print('best gridsearch',rf_grid.best_score_)
best_param = rf_grid.best_params_
best_param

In [None]:

pd.DataFrame(best_rf.feature_importances_,index=trdf.columns, columns=['importance']).sort_values(by='importance',ascending=False)

### SVC

In [None]:
# GridSearchCV
# from sklearn.svm import SVC
# svc = SVC(C=1.0, gamma='scale', tol=0.001, cache_size=200, class_weight='balanced', random_state=2021)

# tscv = TimeSeriesSplit()
# params = {'C': [1,2,3],
#           'gamma': ['scale','auto'],
#           'class_weight': ['balanced'],
#           'random_state': [2021],
#          }

# svc_grid = GridSearchCV(estimator=svc, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# svc_grid.fit(trdf, y)

In [None]:
# # get best estimator and params
# best_svc = svc_grid.best_estimator_
# print('best gridsearch', svc_grid.best_score_)
# best_param = svc_grid.best_params_
# best_param

## Stack'em

In [None]:
#
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegressionCV
base_models = [('xgb',best_xgb,),
               ('rf',best_rf,),
               ('catb', best_catb,),
              ('knn', best_knn, ),
              ('et', best_et)              # ger  sämre res
               # ('ridge', best_ridge, ) ,   # saknar predict_proba - usless!
            #    ('svc', best_svc, ),        # tar extremt lång tid för fit
               ]
meta_model = LogisticRegressionCV(class_weight='balanced')


In [None]:
def evaluate_model(model, X, y, scoring=scorer):
    print('scorer =',scorer)
    tscv = TimeSeriesSplit(n_splits=5)
    scores = cross_val_score(model, X, y, scoring=scoring, cv=tscv, verbose=1, n_jobs=3, error_score='raise')
    return scores

def Stacking(model_item, X_tr, y_tr, X_final, n_fold):
    model=model_item[1]
    print(model_item[0], end=' ')
    tscv = TimeSeriesSplit(n_splits=n_fold)
    # valid_pred=np.empty((X_valid.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    for n, (train_indices, test_indices) in enumerate(tscv.split(X_tr)):
        if n==0:
            the_first_set_len = len(train_indices) # the first set that cannot be used i timeSeries stacking
            
        X_train, X_test = X_tr.iloc[train_indices], X_tr.iloc[test_indices]
        y_train, y_test = y_tr.iloc[train_indices], y_tr.iloc[test_indices]
        print(n,end=' ')
        model.fit(X=X_train,y=y_train)
        train_pred=np.append(train_pred,model.predict_proba(X_test)[:,1])
    print(f'- final fit (the_first_set_len={the_first_set_len})' )
    model.fit(X=X_tr,y=y_tr) # fit on all data (except the final data)   
    valid_pred = model.predict_proba(X_final)[:,1]
    return model,valid_pred.reshape(-1,1), train_pred, the_first_set_len


In [None]:
split_ix = int(len(trdf)*.8)
train_X = trdf[trdf.index <  split_ix]
valid_X = trdf[trdf.index >= split_ix]
train_y = y[y.index <  split_ix]
valid_y = y[y.index >=  split_ix]

# the estimators
valid_pred=[None] * len(base_models)
train_pred=[None] * len(base_models)
model=[None] * len(base_models)
for n, model_item in enumerate(base_models):
    model[n],valid_pred[n] ,train_pred[n], the_first_set_len = Stacking(model_item,n_fold=5, X_tr=train_X, y_tr= train_y, X_final=valid_X)
    train_pred[n]=pd.DataFrame(train_pred[n],columns=[model_item[0]])
    valid_pred[n]=pd.DataFrame(valid_pred[n],columns=[model_item[0]])
    
    scores=evaluate_model(model[n],train_X,train_y)
    print(f'mean={np.mean(scores)}: {scores}')
train_y = train_y.iloc[the_first_set_len:]      # remove the first set that can't be used in timeseries stacking
train_pred=pd.concat(train_pred,axis=1)
valid_pred=pd.concat(valid_pred,axis=1)

## Final estimation with the meta model

In [None]:
import time
meta_model.fit(train_pred,train_y)
scores=evaluate_model(meta_model,valid_pred,valid_y)
time.sleep(0.2)
print('models', list(valid_pred.columns))
print(f'mean={np.mean(scores)}: {scores}')

## rf, catb, knn, et - 0.88803
## xgb, rf, catb, knn, et - 0.88720