# Try stacking for V75

In [374]:
import pandas as pd 
import numpy as np 
from catboost import CatBoostClassifier,Pool, cv, utils 
from sklearn.impute import SimpleImputer

In [375]:
### return a CatBoost model with some default parameters
def get_model(d=6,l2=2,iterations=3000,use_best=True,verbose=False):
    model = CatBoostClassifier(iterations=iterations,use_best_model=use_best, 
        custom_metric=['Logloss', 'AUC','Recall', 'Precision', 'F1', 'Accuracy'],

        eval_metric='Accuracy', 
        depth=d,l2_leaf_reg=l2,
        auto_class_weights='Balanced',verbose=verbose, random_state=2021) 
    return model                

In [376]:
### Features som inte används vid träning
def remove_features(df,remove_mer=[]):
    #remove_mer=['h5_perf','h5_auto','h4_perf','h4_auto', 'h3_perf', 'h2_perf']
    df.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1,inplace=True) #
    if remove_mer:
        df.drop(remove_mer,axis=1,inplace=True)
    
    # df=check_unique(df.copy())
    # df=check_corr(df.copy())
    return df

In [377]:
 ## byt ut alla NaN till text för cat_features
# def replace_NaN(X_train,X_test=None, cat_features=[]):
#     # print('cat_features',cat_features)
#     for c in cat_features:
#         # print(c)
#         X_train.loc[X_train[c].isna(),c] = 'missing'       ### byt ut None-värden till texten 'Missing'
#         if X_test is not None:  ## om X_test är med
#             X_test.loc [X_test[c].isna(),c] = 'missing'    ### byt ut None-värden till texten 'Missing'
    
#     return X_train, X_test

In [378]:
### läs in data och returnera df, alla datum samt index till split-punkt
def load_data(proc=0.75):
    
    df = pd.read_csv('..\\all_data.csv')     
    alla_datum = list(df.datum.unique())
    split_ix = int(len(alla_datum)*proc)
    
    return df,alla_datum,split_ix

In [379]:
def remove_not_used_features(df):
    df.drop(['avd','startnr','vodds','podds','bins','h1_dat','h2_dat','h3_dat','h4_dat','h5_dat'],axis=1,inplace=True) 
    return df

In [362]:
df,alla_datum,split_ix = load_data() 
df = remove_not_used_features(df.copy())
CAT_FEATURES=['datum', 'bana', 'häst', 'kusk', 'kön',
        'h1_kusk', 'h1_bana',
        'h2_kusk', 'h2_bana', 
        'h3_kusk',  'h3_bana', 
        'h4_kusk', 'h4_bana', 
        'h5_kusk', 'h5_bana',]

NUM_FEATURES=[item for item in df.columns if item not in CAT_FEATURES and item !='plac']

PLAC_MEAN=df.plac.mean()
PLAC_MEAN

9.208773316093193

In [363]:
# den hittade inget, kanske skall testa igen längre fram
def remove_low_variance_features(df):
    from sklearn.feature_selection import VarianceThreshold
    print(df.shape)
    selection = VarianceThreshold(threshold=(0.1))
    X=selection.fit_transform(df)
    print(X.shape)
    return X

## Functions that are doing the transformations

In [364]:
# för ´categorical
def impute_test(df):
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df=imp1.fit_transform(df)  # replae NaN's with 'missing'
    return df


In [365]:
# from sklearn.datasets import load_iris
from sklearn_pandas import DataFrameMapper
from sklearn.pipeline import FeatureUnion, make_pipeline, Pipeline
import sklearn.pipeline, sklearn.metrics, sklearn.compose
from sklearn.preprocessing import FunctionTransformer
# from sklearn.feature_extraction.text import CountVectorizer
# Import modules for feature engineering and modelling
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression



In [366]:
# fill missing values in categorical features
def impute_cat_features(df, cat_features=CAT_FEATURES):
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df[cat_features]=imp1.fit_transform(df[cat_features])  # replae NaN's with 'missing'
    return df

In [455]:
# Set a smooth mean value to the features in df
def calc_smooth_mean(df, by, y, m=300, tot_mean=PLAC_MEAN):

    # Compute the number of values and the mean of each group
    agg = df.groupby(by)[y].agg(['count', 'mean'])
    counts = agg['count']
    means = agg['mean']

    # Compute the "smoothed" means
    smooth = (counts * means + m * tot_mean) / (counts + m)

    # return dict with value for each 'häst'
    # df['smooth']=df[by].map(smooth)
    
    # return df[['häst','smooth']].to_dict()
    display('used m = ',m)
    return smooth.to_dict()

# dict1=calc_smooth_mean(df,'häst','plac',tot_mean=df.plac.mean())



# pd.DataFrame.from_dict(dict,orient='index',columns=['smooth'])

In [368]:
# Handle h1-h5_bana
def transform_hx_bana(df,hx,the_map):
    from sklearn.impute import SimpleImputer
    df[hx] = df[hx].str.lower()
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing')
    df[hx]=imp1.fit_transform(df[[hx]])  # replae NaN's with 'missing'

    df[hx] = [item[0] for item in df[hx].str.split('-')]  # remove '-10' from 'solvalla-10' etc
    
    df[hx]=df[hx].map(the_map)  # transform column to numeric by mapping
    # after mapping we get new NaN's - now impute 0
    imp2 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=0)
    df[hx] = imp2.fit_transform(df[[hx]])
    return df
    

In [369]:

# Handle bana and hx_bana  
def transf_bana(df):
    df['bana'] = df.bana.str.lower()
    the_map = df.bana.value_counts() 
    the_map['missing']=0    

    df=transform_hx_bana(df,'h1_bana',the_map)
    df=transform_hx_bana(df,'h2_bana',the_map)
    df=transform_hx_bana(df,'h3_bana',the_map)
    df=transform_hx_bana(df,'h4_bana',the_map)
    df=transform_hx_bana(df,'h5_bana',the_map)

    df['bana']=df.bana.map(the_map)  # transform column to numeric by mapping 
    if df[['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]].isna().sum().sum() != 0:
        print('bana NaNs not 0:',df[['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana',]].isna().sum())
    
    df.drop(['bana','h1_bana','h2_bana','h3_bana','h4_bana','h5_bana'],axis=1,inplace=True)
    return df


In [515]:
# Handle häst and kusk 
class CustomSmoothMean(BaseEstimator, TransformerMixin):
    def __init__(self,cols,col2,y,m=30):
        super().__init__()
        self.map = {}
        self.total_mean=None
        self.cols=cols
        self.col2=col2
        self.y = y
        self.m=m

    def fit(self, df, y=None):
        
        self.total_mean=df[self.y].mean()
        self.map = calc_smooth_mean(df, y=self.y,by=self.cols[0],m=self.m,tot_mean=df[self.y].mean())
        self.map[None] = 'missing'
        self.map[np.nan] = 'missing'
        for col in self.cols:
            df[col] = df[col].str.cat(df[self.col2], sep =", ")
            
        display(f'using m={self.m}')
    
        return self

    def transform(self, df, y=None):
        print('shape',df.shape)
        for col in self.cols:
            df[col] = df[col].map(self.map)

        # df.drop(self.col2,axis=1)
        return df
    def get_feature_names(self):
        return self.cols,self.col2,self.y

def transf_kusk_häst(df,pref='',m=50,):
    df[pref+'ekipage'] = df[pref+'kusk'].str.cat(df['häst'], sep =", ")  # concatenate 'häst' and 'kusk' into one column
    df[pref+'ekipage'] = calc_smooth_mean(df, by=pref+'ekipage', y='plac',m=50) # make numeric with Target encoding with smooth mean
    df.drop([pref+'kusk'],axis=1,inplace=True)
    return df

In [371]:
print(list(df.select_dtypes('object').columns))
print()
print(list(df.select_dtypes('number').columns))

['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_kusk', 'h1_bana', 'h2_kusk', 'h2_bana', 'h3_kusk', 'h3_bana', 'h4_kusk', 'h4_bana', 'h5_kusk', 'h5_bana']

['streck', 'kr', 'spår', 'dist', 'lopp_dist', 'start', 'ålder', 'plac', 'pris', 'h1_spår', 'h1_plac', 'h1_pris', 'h1_odds', 'h1_kmtid', 'h2_spår', 'h2_plac', 'h2_pris', 'h2_odds', 'h2_kmtid', 'h3_spår', 'h3_plac', 'h3_pris', 'h3_odds', 'h3_kmtid', 'h4_spår', 'h4_plac', 'h4_pris', 'h4_odds', 'h4_kmtid', 'h5_spår', 'h5_plac', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_dist', 'h2_dist', 'h3_dist', 'h4_dist', 'h5_dist', 'h1_auto', 'h2_auto', 'h3_auto', 'h4_auto', 'h5_auto', 'h1_perf', 'h2_perf', 'h3_perf', 'h4_perf', 'h5_perf', 'senast', 'delta1', 'delta2', 'delta3', 'delta4']


In [542]:
# Handle kön  
def transf_kön(df):
    from sklearn.preprocessing import OneHotEncoder
    df['kön'] = df['kön'].str.lower()
    ohe = OneHotEncoder(sparse=False)
    dftemp=pd.DataFrame(ohe.fit_transform(df[['kön']]),columns=['kön_h','kön_s','kön_v'] )  # replae kön with One Hot Encoding
    # df=pd.concat([df,dftemp],axis=1)

    # check that kön is correct encoded
    if len(df.loc[(df.kön=='h') & (df.kön_h != 1),'kön']):
        assert False, 'Felaktigt kön h'
    if len(df.loc[(df.kön=='s') & (df.kön_s != 1),'kön']):
       assert False, 'Felaktigt kön s'
    if len(df.loc[(df.kön=='v') & (df.kön_v != 1),'kön']):
        assert False, 'Felaktigt kön v'
    df.drop(['kön'],axis=1,inplace=True)
    return df
def set_lower(dfo):
    df=dfo.copy()
    for c in df.columns:
        df[c] = df[c].str.lower()
    return df
def set_cols(df):
    return pd.DataFrame(df,columns=['kön_h','kön_s','kön_v'])

# print(lower(df[['kön']]))
lower =  FunctionTransformer(set_lower)
# s_c = FunctionTransformer(set_cols)
from sklearn.compose import ColumnTransformer,make_column_transformer
from sklearn.preprocessing import OneHotEncoder

# union = FeatureUnion([('o',df.select_dtypes('object')), 
#                      ('n',df.select_dtypes('object')), 
#                       ]
#                       )

pipe=make_pipeline(SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=-1))

mapper = DataFrameMapper([
    (['datum'], None),
    (['bana'], [lower,
                SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing'), ]),
    (['h1_bana','h2_bana','h3_bana','h4_bana','h5_bana'], [lower,
                SimpleImputer(missing_values=np.nan, strategy='constant',fill_value='missing'), ],{'alias':'hx_bana'}),
    (['kön'], OneHotEncoder(sparse=False),{'alias':'kön'}),
    
    (['kusk','h1_kusk','häst','plac'], lower, CustomSmoothMean(cols=['kusk','h1_kusk'],col2='häst',y='plac')),
    (['h1_kusk','häst'], lower,{'alias':'h1ekipage'}),
    (['h2_kusk','häst'], lower,{'alias':'h2ekipage'}),
    (['h3_kusk','häst'], lower,{'alias':'h3ekipage'}),
    (['h4_kusk','häst'], lower,{'alias':'h4ekipage'}),
    (['h5_kusk','häst'], lower,{'alias':'h5ekipage'}),
    
],df_out=True,input_df=True)
pipe2=Pipeline([('the_mapper',mapper), ('the_pipe',pipe)])
display(CustomSmoothMean(['kusk','h1_kusk'],['häst'],y='plac').fit(df).transform(df))

# svar1f = CustomSmoothMean.fit(df)
# svar1=mapper.fit_transform(df)
svar2=pipe.fit_transform(df.select_dtypes(include='number'))

'used m = '

30

'using m=30'

shape (41763, 69)


Unnamed: 0,datum,bana,häst,kusk,streck,kr,spår,dist,lopp_dist,start,...,h1_perf,h2_perf,h3_perf,h4_perf,h5_perf,senast,delta1,delta2,delta3,delta4
0,2014-12-28,ÖREBRO,ALLABALLAKAITOZ,,5.0,21018.0,6.0,2100.0,2100.0,0,...,3935.030968,6006.507182,11.180340,8.366600,5.000000,21.0,19.0,17.0,10.0,18.0
1,2014-12-28,ÖREBRO,ARISTOCAT BOKO,,7.0,23466.0,12.0,2100.0,2100.0,0,...,2735.738970,1484.131591,753.137355,753.137355,900.171313,8.0,7.0,24.0,14.0,11.0
2,2014-12-28,ÖREBRO,ART ON LINE,,23.0,20696.0,2.0,2100.0,2100.0,0,...,2974.603812,0.917738,4904.292577,6006.507182,6935.717077,9.0,6.0,19.0,41.0,15.0
3,2014-12-28,ÖREBRO,BEAR DANCER,,48.0,27477.0,1.0,2100.0,2100.0,0,...,10966.331584,14.142136,11501.585598,10966.331584,11501.585598,18.0,25.0,14.0,13.0,8.0
4,2014-12-28,ÖREBRO,BY AIR,,5.0,30589.0,5.0,2100.0,2100.0,0,...,5.477226,11.180340,2735.738970,2735.738970,3629.367876,16.0,230.0,14.0,49.0,14.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41758,2021-09-04,JÄGERSRO,ULTRA BRIGHT,,4.0,74765.0,6.0,2140.0,2140.0,0,...,,6976.064300,4231.196882,5983.816016,,7.0,49.0,13.0,29.0,13.0
41759,2021-09-04,JÄGERSRO,WILD LOVE,,8.0,58857.0,7.0,2140.0,2140.0,0,...,4034.287935,15.811388,5471.477941,3460.466492,7737.838310,13.0,26.0,10.0,9.0,19.0
41760,2021-09-04,JÄGERSRO,RACING BRODDA,,3.0,135036.0,8.0,2140.0,2140.0,0,...,6651.416330,6651.416330,1484.131591,1484.131591,878.024090,17.0,10.0,25.0,32.0,11.0
41761,2021-09-04,JÄGERSRO,KALI SMART,,3.0,53634.0,9.0,2140.0,2140.0,0,...,4693.236175,5705.344712,11501.585598,18994.243477,11501.585598,17.0,22.0,17.0,41.0,36.0


In [None]:
pd.DataFrame(svar1, columns=list(svar1.columns))
# svar1.columns

In [517]:

# test_pipe=make_pipeline(CustomSmoothMean(col1='kusk',col2='häst',y='plac'))
def date_to_num(df):
    return pd.DataFrame(pd.to_datetime(df.datum).view(float)*10e210)

tranf_datum = FunctionTransformer(date_to_num)
    
preprocessor = make_column_transformer(
                                    
                                    (CustomSmoothMean(cols=['kusk','h1_kusk'],col2='häst',y='plac',m=30), ['kusk','h1_kusk','häst','plac']),
                                    (tranf_datum, ['datum']),
                                    (OneHotEncoder(), ['kön']), 
                                     remainder='drop')

# test_pipe.fit_transform(df.copy())
display(preprocessor.fit_transform(df.copy()))

# type((pd.to_datetime(df.datum).view(float)*10e210).values)


'used m = '

30

'using m=30'

shape (41763, 4)


array([[nan, nan, 'ALLABALLAKAITOZ', ..., 0.0, 0.0, 1.0],
       [nan, nan, 'ARISTOCAT BOKO', ..., 0.0, 0.0, 1.0],
       [nan, nan, 'ART ON LINE', ..., 0.0, 0.0, 1.0],
       ...,
       [nan, nan, 'RACING BRODDA', ..., 0.0, 1.0, 0.0],
       [nan, nan, 'KALI SMART', ..., 0.0, 1.0, 0.0],
       [nan, nan, 'DEVS DAFFODIL', ..., 0.0, 1.0, 0.0]], dtype=object)

In [399]:
## TEST TEST

# Set seed for reproducibility
seed = 123

# Import package/module for data
import pandas as pd
from seaborn import load_dataset

# Import modules for feature engineering and modelling
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression

# Load dataset
# df = load_dataset('tips').drop(columns=['tip', 'sex']).sample(n=5, random_state=seed)

# # Add missing values
# df.iloc[[1, 2, 4], [2, 4]] = np.nan
# df


In [421]:
## test test
# Partition data
X_train, X_test, y_train, y_test = train_test_split(df.drop(columns=['plac']), 
                                                    df['plac'], 
                                                    test_size=.2, 
                                                    random_state=seed)

# Define categorical columns
categorical = list(X_train.select_dtypes('object').columns)
print(f"Categorical columns are: {categorical}")

# Define numerical columns
numerical = list(X_train.select_dtypes('number').columns)
print(f"Numerical columns are: {numerical}")# Define categorical pipeline
cat_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore', sparse=False))
])

# Define numerical pipeline
num_pipe = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler())
])

# Fit column transformer to training data
preprocessor = ColumnTransformer([
    ('cat', cat_pipe, categorical),
    ('num', num_pipe, numerical)
])
preprocessor.fit(X_train)

# Prepare column names
cat_columns = preprocessor.named_transformers_['cat']['encoder'].get_feature_names(categorical)
columns = np.append(cat_columns, numerical)

# Inspect training data before and after
print("******************** Training data ********************")
display(X_train.shape)
display(len(columns))
display(preprocessor.transform(X_train).shape)
final=pd.DataFrame(preprocessor.transform(X_train),columns=columns)

# Inspect test data before and after
print("******************** Test data ********************")
# display(X_test)
display(pd.DataFrame(preprocessor.transform(X_test), columns=columns))

Categorical columns are: ['datum', 'bana', 'häst', 'kusk', 'kön', 'h1_kusk', 'h1_bana', 'h2_kusk', 'h2_bana', 'h3_kusk', 'h3_bana', 'h4_kusk', 'h4_bana', 'h5_kusk', 'h5_bana']
Numerical columns are: ['streck', 'kr', 'spår', 'dist', 'lopp_dist', 'start', 'ålder', 'pris', 'h1_spår', 'h1_plac', 'h1_pris', 'h1_odds', 'h1_kmtid', 'h2_spår', 'h2_plac', 'h2_pris', 'h2_odds', 'h2_kmtid', 'h3_spår', 'h3_plac', 'h3_pris', 'h3_odds', 'h3_kmtid', 'h4_spår', 'h4_plac', 'h4_pris', 'h4_odds', 'h4_kmtid', 'h5_spår', 'h5_plac', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_dist', 'h2_dist', 'h3_dist', 'h4_dist', 'h5_dist', 'h1_auto', 'h2_auto', 'h3_auto', 'h4_auto', 'h5_auto', 'h1_perf', 'h2_perf', 'h3_perf', 'h4_perf', 'h5_perf', 'senast', 'delta1', 'delta2', 'delta3', 'delta4']
******************** Training data ********************


(33410, 68)

21098

(33410, 21098)

******************** Test data ********************


Unnamed: 0,datum_2014-12-28,datum_2015-01-03,datum_2015-01-10,datum_2015-01-17,datum_2015-01-24,datum_2015-01-25,datum_2015-01-31,datum_2015-02-07,datum_2015-02-14,datum_2015-02-21,...,h1_perf,h2_perf,h3_perf,h4_perf,h5_perf,senast,delta1,delta2,delta3,delta4
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.018350,0.049891,0.070556,0.040735,0.040735,0.041209,0.052317,0.007299,0.013034,0.021239
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.049889,0.132286,0.122471,0.099997,0.270799,0.065934,0.022422,0.006257,0.014599,0.010619
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.000116,0.135622,0.023437,0.017469,0.011047,0.060440,0.028401,0.005735,0.013556,0.008850
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.122008,0.249998,0.033830,0.014615,0.024706,0.076923,0.031390,0.003650,0.009906,0.115929
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.122471,0.000109,0.036784,0.036785,0.047490,0.030220,0.037369,0.007299,0.008863,0.013274
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8348,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.234517,0.055780,0.054246,0.223604,0.258197,0.079670,0.091181,0.003650,0.007821,0.013274
8349,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.060649,0.055780,0.054246,0.042476,0.040735,0.057692,0.020927,0.005735,0.042231,0.020354
8350,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.244946,0.158112,0.095898,0.060650,0.040735,0.453297,0.079223,0.008863,0.007299,0.024779
8351,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.041740,0.091968,0.054246,0.042476,0.040735,0.027473,0.016442,0.011992,0.003128,0.014159


In [None]:

def impute_all_numeric_NaNs(df):
    # all features must be numeric
    from sklearn.impute import SimpleImputer
    imp1 = SimpleImputer(missing_values=np.nan, strategy='constant',fill_value=-1)
    trdf=imp1.fit_transform(df)  # replae NaN's with 'missing'
    return pd.DataFrame(trdf,columns=df.columns)

## All the transformations in one function

In [None]:

def transf_all(df):
    
    trdf=transf_bana(df.copy())
    trdf=transf_kusk_häst(trdf)
    trdf=transf_kusk_häst(trdf,pref='h1_')
    trdf=transf_kusk_häst(trdf,pref='h2_')
    trdf=transf_kusk_häst(trdf,pref='h3_')
    trdf=transf_kusk_häst(trdf,pref='h4_')
    trdf=transf_kusk_häst(trdf,pref='h5_')
    trdf.drop(['häst'],axis=1,inplace=True)
    trdf=transf_kön(trdf)
    trdf['datum']=pd.to_datetime(trdf.datum).view(float)*10e210
    
    return impute_all_numeric_NaNs(trdf)

0        9.224990e-03
1        9.224990e-03
2        9.224990e-03
3        9.224990e-03
4        9.224990e-03
             ...     
41758    1.140408e+12
41759    1.140408e+12
41760    1.140408e+12
41761    1.140408e+12
41762    1.140408e+12
Name: datum, Length: 41763, dtype: float64

In [None]:

# transform all categoricals and impute all NaNs
def prepare_all(df):
    trdf = transf_all(df)
    
    y = (trdf.plac==1) * 1
    trdf = trdf.drop('plac',axis=1)
    
    # all features are now numeric
    trdf = impute_all_numeric_NaNs(trdf)
    if trdf.isna().sum().sum() != 0:
        print('still NaNs in data')
        assert False
    return trdf,y

## stacking prepare and run

In [None]:
# metrics
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score

# for tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import TimeSeriesSplit

### CatBoost

In [None]:
#catBoost preprocessing
def catB_preprocess(df):
        y = (df.plac==1) * 1
        df = df.drop('plac',axis=1)
        df = impute_cat_features(df,cat_features=CAT_FEATURES)

        return df,y


In [None]:

# clean the cat_features
df_catb, y = catB_preprocess(df.copy())
df_catb[CAT_FEATURES].isna().sum().sum()

In [None]:
trdf,y=prepare_all(df)
scorer = make_scorer(roc_auc_score)

In [None]:
# CatBoost model GridSearchCV
my_df_1=df_catb             # catboost with Nans abd cat_features
my_cats_1 = CAT_FEATURES
my_df_2 = trdf              # dataset common for all estimators
my_cats_2 = []

my_df = my_df_2
my_cats = my_cats_2
my_pool = Pool(my_df,y,cat_features=my_cats)
my_catb = CatBoostClassifier(cat_features=my_cats)

tscv = TimeSeriesSplit(n_splits=5)
params = {'iterations': [50,100,500,1000],
          'depth': [2,3,4, 5, 6],
          'loss_function': ['Logloss'],
          'l2_leaf_reg': np.logspace(-20, -19, 3),
          'leaf_estimation_iterations': [10],
          'eval_metric': ['AUC'],
        #   'use_best_model': ['True'],
          'logging_level':['Silent'],
          'random_seed': [2021],
         }
# clf.fit(df_catb,y)

catb_grid = RandomizedSearchCV(estimator=my_catb, param_distributions=params, scoring=scorer, cv=tscv)

# GridSearchCV  - compare with default
catb_grid.fit(my_df,y)


In [None]:
# get best estimator and params
best_catb = catb_grid.best_estimator_
print('best gridsearch',catb_grid.best_score_)
best_param = catb_grid.best_params_
best_param

In [None]:
# print(best_catb.fit(my_df,y).best_score_)
best_catb.get_feature_importance(prettified=True).head(8)

### XGBoost

In [None]:
# XGBoost model 
import xgboost as xgb
label = y
dtrain = xgb.DMatrix(trdf, label=label)
param = {'max_depth':2, 'eta':1 }
num_round = 10

# GridSearchCV
params = {'nthread':[4], #when use hyperthread, xgboost may become slower
              'objective':['binary:logistic'],
              'learning_rate': [0.09,0.1,0.15], #so called `eta` value
              'max_depth': [7,8,9],
              'min_child_weight': [9,10,11],
              'use_label_encoder':[False],
            #   'silent': [1],
              'eval_metric': ['logloss'],
              'subsample': [0.5,0.9,1.0],
              'colsample_bytree': [0.7, 0.9, 1.0],
              'n_estimators': [7,8,9], #number of trees, change it to 1000 for better results
              'missing':[-999],
              'seed': [2021],
              }

xgb_clf = xgb.XGBClassifier(num_round=num_round)
xgb_grid = GridSearchCV(estimator=xgb_clf, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)
xgb_grid.fit(trdf, y )

In [None]:
# get best estimator and params
best_xgb = xgb_grid.best_estimator_
print('best gridsearch', xgb_grid.best_score_)
best_param = xgb_grid.best_params_
best_param

In [None]:
pd.DataFrame(best_xgb.feature_importances_,index=trdf.columns).sort_values(by=0,ascending=False).head(6)

### ExtraTree

In [None]:
# ExtraTree  model
tscv = TimeSeriesSplit()
from sklearn.tree import ExtraTreeClassifier
et = ExtraTreeClassifier(min_samples_split=2, random_state=2021,class_weight='balanced')

# GridSearchCV
params = {'class_weight': [ 'balanced'  ,None], 
          'max_depth': [None, 5, 10  ,15 ,20],
          'min_samples_leaf': [1, 2 ,3, 4,],
          'min_samples_split': [2,30, 30,  40  ,45],   
          'criterion': [ 'gini'  ,'entropy'],   
          'splitter': ['random',  'best']
         }

et_grid = GridSearchCV(estimator=et, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)
et_grid.fit(trdf, y)

In [None]:
# get best estimator and params
best_et = et_grid.best_estimator_
print('best gridsearch', et_grid.best_score_)
best_param = et_grid.best_params_
best_param

### KNN

In [None]:
# KNN model
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=4, )

# GridSearchCV

tscv = TimeSeriesSplit()
params = {'n_neighbors': [10,15,20],
          
         }

knn_grid = GridSearchCV(estimator=knn, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# GridSearchCV  
knn_grid.fit(trdf, y)

In [None]:
best_knn = knn_grid.best_estimator_
print('best gridsearch',knn_grid.best_score_)
best_param = knn_grid.best_params_
best_param

### RandomForrest

In [None]:
# GridSearch
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

tscv = TimeSeriesSplit()
params = {'n_estimators': [5,10,100],
          'max_depth': [4, 5, 6, None],
          'class_weight': ['balanced'],
        #   'loss_function': ['Logloss'],
        #   'eval_metric': ['F1'],
        #   'logging_level':['Silent'],
          'random_state': [2021],
         }
# clf.fit(df_catb,y)

rf_grid = GridSearchCV(estimator=rf, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# GridSearchCV  
rf_grid.fit(trdf, y)

In [None]:
best_rf = rf_grid.best_estimator_
print('best gridsearch',rf_grid.best_score_)
best_param = rf_grid.best_params_
best_param

In [None]:

pd.DataFrame(best_rf.feature_importances_,index=trdf.columns, columns=['importance']).sort_values(by='importance',ascending=False)

### SVC

In [None]:
# GridSearchCV
# from sklearn.svm import SVC
# svc = SVC(C=1.0, gamma='scale', tol=0.001, cache_size=200, class_weight='balanced', random_state=2021)

# tscv = TimeSeriesSplit()
# params = {'C': [1,2,3],
#           'gamma': ['scale','auto'],
#           'class_weight': ['balanced'],
#           'random_state': [2021],
#          }

# svc_grid = GridSearchCV(estimator=svc, param_grid=params, n_jobs=3,scoring=scorer, cv=tscv)

# svc_grid.fit(trdf, y)

In [None]:
# # get best estimator and params
# best_svc = svc_grid.best_estimator_
# print('best gridsearch', svc_grid.best_score_)
# best_param = svc_grid.best_params_
# best_param

## Stack'em

In [None]:
#
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegressionCV
base_models = [('xgb',best_xgb,),
               ('rf',best_rf,),
               ('catb', best_catb,),
              ('knn', best_knn, ),
              ('et', best_et)              # ger  sämre res
               # ('ridge', best_ridge, ) ,   # saknar predict_proba - usless!
            #    ('svc', best_svc, ),        # tar extremt lång tid för fit
               ]
meta_model = LogisticRegressionCV(class_weight='balanced')


In [None]:
def evaluate_model(model, X, y, scoring=scorer):
    print('scorer =',scorer)
    tscv = TimeSeriesSplit(n_splits=5)
    scores = cross_val_score(model, X, y, scoring=scoring, cv=tscv, verbose=1, n_jobs=3, error_score='raise')
    return scores

def Stacking(model_item, X_tr, y_tr, X_final, n_fold):
    model=model_item[1]
    print(model_item[0], end=' ')
    tscv = TimeSeriesSplit(n_splits=n_fold)
    # valid_pred=np.empty((X_valid.shape[0],1),float)
    train_pred=np.empty((0,1),float)
    for n, (train_indices, test_indices) in enumerate(tscv.split(X_tr)):
        if n==0:
            the_first_set_len = len(train_indices) # the first set that cannot be used i timeSeries stacking
            
        X_train, X_test = X_tr.iloc[train_indices], X_tr.iloc[test_indices]
        y_train, y_test = y_tr.iloc[train_indices], y_tr.iloc[test_indices]
        print(n,end=' ')
        model.fit(X=X_train,y=y_train)
        train_pred=np.append(train_pred,model.predict_proba(X_test)[:,1])
    print(f'- final fit (the_first_set_len={the_first_set_len})' )
    model.fit(X=X_tr,y=y_tr) # fit on all data (except the final data)   
    valid_pred = model.predict_proba(X_final)[:,1]
    return model,valid_pred.reshape(-1,1), train_pred, the_first_set_len


In [None]:
split_ix = int(len(trdf)*.8)
train_X = trdf[trdf.index <  split_ix]
valid_X = trdf[trdf.index >= split_ix]
train_y = y[y.index <  split_ix]
valid_y = y[y.index >=  split_ix]
# test 2 models
valid_pred=[None] * len(base_models)
train_pred=[None] * len(base_models)
model=[None] * len(base_models)
for n, model_item in enumerate(base_models):
    model[n],valid_pred[n] ,train_pred[n], the_first_set_len = Stacking(model_item,n_fold=5, X_tr=train_X, y_tr= train_y, X_final=valid_X)
    train_pred[n]=pd.DataFrame(train_pred[n],columns=[model_item[0]])
    valid_pred[n]=pd.DataFrame(valid_pred[n],columns=[model_item[0]])
    
    scores=evaluate_model(model[n],train_X,train_y)
    print(f'mean={np.mean(scores)}: {scores}')
train_y = train_y.iloc[the_first_set_len:]      # remove the first set that can't be used in timeseries stacking
train_pred=pd.concat(train_pred,axis=1)
valid_pred=pd.concat(valid_pred,axis=1)


## Final estimation with the meta model

In [None]:
import time
meta_model.fit(train_pred,train_y)
scores=evaluate_model(meta_model,valid_pred,valid_y)
time.sleep(0.2)
print('models', list(valid_pred.columns))
print(f'mean={np.mean(scores)}: {scores}')

## rf, catb, knn, et - 0.88803
## xgb, rf, catb, knn, et - 0.88720