# Try stacking for V75

In [89]:
import pandas as pd 
import numpy as np 
from IPython.display import display 

from catboost import CatBoostClassifier,Pool, cv, utils 
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier,ExtraTreesClassifier
from sklearn.tree   import DecisionTreeClassifier


In [90]:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline, Pipeline
from sklearn.impute import SimpleImputer

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer, make_column_transformer

### ***What to do***:
- Kör en enkel modell utan 'streck'  
- kolla om ekipage-nr gör någon skillnad  optimera m
- kolla om 'avd' göt någon skillnad 
- skapa en flaml-ensemble utan streck
- Se hu bra denna ensemble är jämfört med 'streck' inkluderat
- Blir den lika bra som min pipe?
- Kan  med och utan 'streck' användas tillsammans ?
  - Man kan ha som innan på den första och proba-order eller f/insats som kriterium för den andra som ju bara skall hitta överraskningar


In [91]:
## byt ut alla NaN till text för cat_features
def replace_NaN(X_train,X_test=None, cat_features=[]):
    # print('cat_features',cat_features)
    X_train[cat_features]=X_train[cat_features].fillna('missing')
    if X_test is not None:  ## om X_test är med
        X_test[cat_features]=X_test[cat_features].fillnal('missing')    ### byt ut None-värden till texten 'None

    return X_train,X_test

In [92]:
### läs in data och returnera df, alla datum samt index till split-punkt
def basic_data(df, NaN=True, frac=0.25):
    X_train,X_test,y_train,y_test = None,None,None,None
    dfnew = remove_features(df.copy())
    dfnew['plac'] = (dfnew.plac==1) * 1
    cat_features = list(dfnew.loc[:,df.dtypes=='O'].columns)
    if NaN:
        dfnew,_ = replace_NaN(dfnew.copy(), cat_features=cat_features)    
    
    alla_datum = df.datum.unique()
    split_dat = alla_datum[int(len(alla_datum)* (1 - 0.25))]     # större än split_dat är test

    X_train = dfnew.loc[dfnew.datum <= split_dat].copy()
    y_train=X_train.plac
    X_train.drop('plac',axis=1,inplace=True)
    
    X_test = dfnew.loc[dfnew.datum > split_dat].copy()
    y_test=X_test.plac
    X_test.drop('plac',axis=1,inplace=True)
    
    return X_train,X_test, y_train,y_test

In [93]:
# df skall innehålla datum,avd,vodds
def proba_order_score(df_, y,proba):
    kassa=1000
    df = df_.copy()
    df['proba'] = proba[:,1]
    df['f'] = (df.proba*df.vodds - 1) / (df.vodds-1)  # kelly formel
    df['spela'] = df.f >0
    df['insats'] = df.spela * df.f * kassa

    df.sort_values(['datum','avd','proba'],ascending=[True,True,False],inplace=True)
    proba_order=df.groupby(['datum','avd']).proba.cumcount()

    df['prob_order']=proba_order+1
    df['y'] = y
    
    return df, df.loc[df.y==1].prob_order.mean()   # mean prob_order för vinnarhäst

In [94]:
### Features som inte används vid träning
def remove_features(df,remove_mer=[]):
    #remove_mer=['h5_perf','h5_auto','h4_perf','h4_auto', 'h3_perf', 'h2_perf']
    df.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1,inplace=True) #
    if remove_mer:
        df.drop(remove_mer,axis=1,inplace=True)
    
    # df=check_unique(df.copy())
    # df=check_corr(df.copy())
    return df

## Bygg min första riktiga Pipeline

In [95]:
# Set a smooth mean value to the features in X_train  ##
def calc_smooth_mean(X, y, by, m=100, tot_mean=None):
    Xcopy = X.copy()
    Xcopy[by] = Xcopy[by].str.lower()
    Xcopy['plac'] = y

    # Compute the number of values and the mean of each group
    agg = Xcopy.groupby(by)['plac'].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + m * tot_mean) / (counts + m)
    
    return smooth.to_dict()


# transform model to stacking estimator
class ModelTransformer(TransformerMixin):
    
    def __init__(self, model):
        self.model = model

    def fit(self, *args, **kwargs):
        self.model.fit(*args, **kwargs)
        return self

    def transform(self, X, **transform_params):
        return pd.DataFrame(self.model.predict_proba(X))
    
def create_ekipage(df):
    prefix = ['','h1_','h2_','h3_','h4_','h5_',]
    ekipage=[]
    for pref in prefix:
        df[pref+'ekipage'] = df[pref+'kusk'].str.cat(df['häst'], sep =", ")  # concatenate 'häst' and 'kusk' into one column
        ekipage.append(pref+'ekipage')
        df.drop([pref+'kusk'],axis=1,inplace=True)
        
    df.drop(['häst'],axis=1,inplace=True )   
        
    return df, ekipage   
    
# Handle ekipage (häst and kusk)
class CustomSmoothMean(BaseEstimator, TransformerMixin):
    def __init__(self,cols,plac='plac', m=100):
        super().__init__()
        self.map = {}
        self.total_mean=None
        self.cols = cols    
        self.plac=plac
        self.m=m

    def fit(self, X, y=None):
        # print('CustSmooth fit start')
        self.total_mean=y.mean()
        self.map = calc_smooth_mean(X, y, by='ekipage', m=self.m, tot_mean=self.total_mean)
        self.map['missing'] = 0

        display(f'using m={self.m}')
        # print('CustSmooth fit end')
        return self

    def transform(self, X, y=None):
        # print('CustSmooth transform start')   
        for col in self.cols:
            X[col] = X[col].str.lower()
            X[col] = X[col].map(self.map)
            X[col].fillna(0,inplace=True)
            
        # display(X.isna().sum())    
        # print('CustSmooth transform end')    
        return X
    
    def get_feature_names(self):
        return self.cols, self.plac, self.m, self.total_mean,self.map    
####

def set_lower(dfo):
    # print('set_lower start')
    df=dfo.copy()
    for c in df.columns:
        df[c] = df[c].str.lower()
    # print('set_lower end')    
    return df

lower =  FunctionTransformer(set_lower)

def datum_to_num(df):
    dfc=df.copy()
    dfc['datum'] = pd.to_datetime(df.datum).view(float)*10e210
    return dfc

# Handle ekipage (häst and kusk)
class transf_bana(BaseEstimator, TransformerMixin):
    def __init__(self,banor):
        super().__init__()
        self.map = {}
        self.banor = banor

    def fit(self, X, y=None):
        # print('fit bana start')
        huvud_bana=self.banor[0]
        
        self.map = X[huvud_bana].str.lower().value_counts() 
        self.map[None] = 0
        self.map['missing'] = 0
        # print('fit bana end')
        return self

    def transform(self, X, y=None):
        # print('transform bana',X.shape)
        for bana in self.banor:
            X[bana] = X[bana].fillna('missing-1')
            X[bana] = X[bana].str.lower()
            X[bana] = [item[0] for item in X[bana].str.split('-')]  # remove '-10' from 'solvalla-10' etc
            # print('transform bana effter split',bana)
            # print(X[bana][:10])
            X[bana] = X[bana].map(self.map)
            X[bana] = X[bana].fillna(0)
            # print('transform bana innan return')
        return X
    
    def get_feature_names(self):
        return self.map, self.banor
    
####


### Själva pipen

In [99]:
dforg = pd.read_csv('..\\all_data.csv')     
dforg,ekipage = create_ekipage(dforg.copy())      # alla ekipage, h1_ekipage-h5_ekipage
X_train,X_test,y_train,y_test = basic_data(dforg.copy())
cat_features = list(X_train.loc[:,X_train.dtypes=='O'].columns)


streck=True
if not streck:
    X_train.drop('streck',axis=1,inplace=True)
    X_test.drop('streck',axis=1,inplace=True)

könen=['kön1','kön2','kön3']
banor = ['bana','h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]
all_nums = list(X_train.select_dtypes('number').columns)

lower_and_ohe = Pipeline([
        ('lower', lower),
        ('encode', OneHotEncoder(sparse=False, handle_unknown='ignore'))
    ])

############ Parms ###############################
rf_parms={ 'n_jobs': 5,'class_weight': 'balanced'}
etr_parms={'n_jobs': 5,'class_weight': 'balanced'}
cb_parms={'iterations': 200,'early_stopping_rounds': 50,'auto_class_weights': 'Balanced','verbose': False}
final_parms={'n_jobs': 5,'class_weight': 'balanced'}
################################################### 

col_pipe = make_column_transformer(
                        (transf_bana(banor), banor),
                        (FunctionTransformer(datum_to_num), ['datum']),
                        (lower_and_ohe, ['kön']), 
                        (CustomSmoothMean(ekipage,m=100), ekipage),
                        (SimpleImputer(strategy='constant', fill_value=-1),all_nums  ),
                        remainder='passthrough',
                        )

pipe = Pipeline(steps=[
    ('transformers', col_pipe ),
    
    ('estimators', FeatureUnion([
        ('knn', ModelTransformer(KNeighborsClassifier(n_neighbors=5, n_jobs=4))),
        ('cat', ModelTransformer(CatBoostClassifier(**cb_parms))),
        ('dtr', ModelTransformer(DecisionTreeClassifier(class_weight='balanced'))),
        ('etr', ModelTransformer(ExtraTreesClassifier(**etr_parms))),
        ('rf', ModelTransformer(RandomForestClassifier(**rf_parms))),  
    ])),
    ('estimator', RandomForestClassifier(**final_parms)),
])


with_streck = pipe.fit(X_train,y=y_train)
# no_streck = pipe.fit(X_train_nostr,y=y_train)
# the_stack = pipe.fit(X_train,y_train)


'using m=100'

## Jämför olika modellers score (med och utan 'streck')

In [100]:
from sklearn.metrics import roc_auc_score,mean_absolute_error,accuracy_score,matthews_corrcoef,f1_score
X_train,X_test,y_train,y_test = basic_data(dforg)

if not streck:
    print("utan 'streck'")
train_pred= pipe.predict_proba(X_train)
print('auc train',roc_auc_score(y_train, train_pred[:,1]))
test_pred = pipe.predict_proba(X_test)
print('auc test', roc_auc_score(y_test,test_pred[:,1]))
# print('mae test', mean_absolute_error(y_test,test_pred[:,1]))
# po=proba_ordning(X_test.copy(),y_test,df.copy(),test_pred )[['datum','avd','proba','prob_order','vann']]
# print('mean proba-order för vunna',po.loc[po.vann==1].prob_order.mean())

X_test[['datum','avd','vodds']] = dforg[['datum','avd','vodds']]
_,prob_score = proba_order_score(X_test,y_test,test_pred)
print('prob_score test',prob_score,roc_auc_score(y_test,test_pred[:,1]))
X_train[['datum','avd','vodds']] = dforg[['datum','avd','vodds']]
_,prob_score = proba_order_score(X_train,y_train,train_pred)
print('prob_score train',prob_score,roc_auc_score(y_train,train_pred[:,1]))
print()

auc train 1.0
auc test 0.5184634774239365
prob_score test 6.272321428571429 0.5184634774239365
prob_score train 1.0022066936373666 1.0



### Klart överinlärd - prova en gridsearch om det går att nå estimators parametrar

### jämför med CatBoost

In [107]:
#Jämfr CatBoost

X_train,X_test,y_train,y_test = basic_data(dforg)

cb = CatBoostClassifier(iterations=500,  early_stopping_rounds=100,auto_class_weights='Balanced',verbose=100)
cb_pipe = make_pipeline(col_pipe, cb)
cb_pipe.fit(X_train,y_train)
cb_train_pred= cb_pipe.predict_proba(X_train)
cb_test_pred =cb_pipe.predict_proba(X_test)

if not streck:
    print("utan 'streck'")
# print('auc', roc_auc_score(y_test,cb_test_pred[:,1]))
print('mae', mean_absolute_error(y_test,cb_test_pred[:,1]))
# po=proba_ordning(X_test.copy(),y_test,df.copy(),cb_test_pred )[['datum','avd','proba','prob_order','vann']]
# print('cb mean vann ',po.loc[po.vann==1].prob_order.mean())

X_test[['datum','avd','vodds']] = dforg[['datum','avd','vodds']]
_,prob_score = proba_order_score(X_test,y_test,cb_test_pred)

print('prob_score',prob_score,roc_auc_score(y_test,test_pred[:,1]))

'using m=100'

Learning rate set to 0.084837
0:	learn: 0.5977540	total: 14.9ms	remaining: 7.42s
100:	learn: 0.2203682	total: 1.61s	remaining: 6.36s
200:	learn: 0.1781405	total: 3.19s	remaining: 4.74s
300:	learn: 0.1464653	total: 4.67s	remaining: 3.09s
400:	learn: 0.1228111	total: 5.92s	remaining: 1.46s
499:	learn: 0.1037990	total: 7.14s	remaining: 0us
mae 0.11940676036559213
prob_score 4.234375 0.5184634774239365


### FLAML

In [108]:
# FLAML med samma transformers som alla andra och helt utan transformers (raw)
temp_pipe = make_column_transformer(
                        (transf_bana(banor), banor),
                        (FunctionTransformer(datum_to_num), ['datum']),
                        (lower_and_ohe, ['kön']), 
                        # (CustomSmCustomSmoothMean(ekipage,m=100), ekipage),
                        (SimpleImputer(strategy='constant', fill_value=-1),all_nums  ),
                        remainder='passthrough',
                        )

from flaml import AutoML 
automl, automl_raw = None, None
X_train,X_test,y_train,y_test = basic_data(dforg.copy())
# # X_train.drop('streck',axis=1,inplace=True)
# # X_test.drop('streck',axis=1,inplace=True)
# # X_train.drop(['avd'],axis=1,inplace=True)
# # X_test.drop(['avd'],axis=1,inplace=True)
# cat_features = list(X_train.loc[:,X_train.dtypes=='O'].columns)

flml_parms= {'automl__task': 'classification',  'automl__verbose': False, #'automl__X_val': X_test, 'automl__y_val':y_test,
             'automl__split_type':'time', 'automl__metric': 'roc_auc', 'automl__time_budget':320, 
             'automl__max_iter':20000000, 'automl__n_jobs':5,'automl__seed':2021, 'automl__early_stop':True, 'automl__ensemble':True}

automl = AutoML( )
flm_pipe = make_pipeline(temp_pipe, automl)
flm=flm_pipe.fit(X_train,y_train, **flml_parms)

if True: # raw - utan transformed data
    flml_raw_parms={'task': 'classification','split_type':None, 'verbose': 1, 'metric':'roc_auc', 'verbose':False,
            'time_budget':500, 'max_iter':20000000,'n_jobs':5, 'X_val': X_test, 'y_val':y_test,'early_stop':True, 'ensemble':True}

    automl_raw = AutoML()
    automl_raw.fit(X_train,y_train, **flml_raw_parms)

No low-cost partial config given to the search algorithm. For cost-frugal search, consider providing low-cost values for cost-related hps via 'low_cost_partial_config'.


In [109]:
# print('indata flm',  len(banor+['datum']+könen+ekipage+all_nums))

if not streck:
    print("utan 'streck'")
X_train,X_test,y_train,y_test = basic_data(dforg.copy())

if True :   # raw - ej transformed data
    flm_raw_train_pred= automl_raw.predict_proba(X_train)
    flm_raw_test_pred = automl_raw.predict_proba(X_test)
    
    X_test_raw = X_test.copy()
    X_test_raw[['datum','avd','vodds']] = dforg[['datum','avd','vodds']]
    _,prob_score = proba_order_score(X_test_raw,y_test, flm_raw_test_pred)

    print('raw prob_score och auc', prob_score,roc_auc_score(y_test,flm_raw_test_pred[:,1]))
    
if True:  # med transformde data
    flm_train_pred= flm_pipe.predict_proba(X_train)
    flm_test_pred = flm_pipe.predict_proba(X_test)
    # print('flm test auc',roc_auc_score(y_test, flm_test_pred[:,1]))
    # print('flm train',roc_auc_score(y_train, flm_train_pred[:,1]))
    # po=proba_ordning(X_test.copy(),y_test,df.copy(),flm_test_pred )[['datum','avd','proba','prob_order','vann']]
    # print('flm test mean vann',po.loc[po.vann==1].prob_order.mean())
    
    X_test[['datum','avd','vodds']] = dforg[['datum','avd','vodds']]
    _,prob_score = proba_order_score(X_test,y_test, flm_test_pred)

    print('flm prob_score och auc',prob_score,roc_auc_score(y_test,flm_test_pred[:,1]))
    

raw prob_score och auc 3.2020089285714284 0.8131355942102115
flm prob_score och auc 3.1328125 0.8166181926067131


In [66]:
# flm_pipe['columntransformer'].fit_transform(X_train,y_train)[0]
temp_pipe = make_column_transformer(
                        # (transf_bana(banor), banor),
                        # (FunctionTransformer(datum_to_num), ['datum']),
                        # (lower_and_ohe, ['kön']), 
                        (CustomSmoothMean(ekipage,m=100), ekipage),
                        # (SimpleImputer(strategy='constant', fill_value=-1),all_nums  ),
                        remainder='passthrough',
                        )
temp_pipe.fit_transform(X_train,y_train)[0]

'using m=100'

array([0.08554158456740465, 0.08554158456740465, 0.08554158456740465,
       0.08554158456740465, 0.08554158456740465, 0.0, '2014-12-28',
       'ÖREBRO', 5.0, 21018.0, 6.0, 2100.0, 2100.0, 0, 6, 'v', 125000.0,
       'Eskilstuna', 3.0, 2.0, 35.0, 3.92, 16.8, 'Eskilstuna', 3.0, 1.0,
       30.0, 3.7, 14.9, 'Eskilstuna', 3.0, 15.0, 125.0, 52.42, 14.3,
       'Solvalla', 3.0, 15.0, 70.0, 5.2, 13.9, 'Örebro', 3.0, 15.0, 25.0,
       2.2, 12.3, 2140.0, 2140.0, 2640.0, 2140.0, 1609.0, 1, 1, 1, 1, 1,
       3935.030968151612, 6006.507181794034, 11.180339887498944,
       8.366600265340756, 5.0, 21.0, 19.0, 17.0, 10.0, 18.0], dtype=object)