# Alternativa feature selection metoder

In [1]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
# from sklearn.linear_model import LinearRegression

from sklearn.ensemble import RandomForestRegressor as rf

# preprocessing for numeric columns
imp_median = SimpleImputer(strategy='median', add_indicator=True)
scaler = StandardScaler()
# preprocessing for categorical columns
imp_constant = SimpleImputer(strategy='constant',) # fill_value='missing')
ohe = OneHotEncoder(handle_unknown='ignore')

In [2]:
df=pd.read_csv('../all_data.csv')
df.set_index('datum',inplace=True)
df.drop(['avd', 'bana', 'häst', 'kusk', 'vodds','podds', 'startnr', 'bins',
         'h1_dat', 'h1_bana', 'h1_kusk', 
         'h2_dat', 'h2_bana', 'h2_kusk',
         'h3_dat', 'h3_bana', 'h3_kusk',
         'h4_dat', 'h4_bana', 'h4_kusk',
         'h5_dat', 'h5_bana', 'h5_kusk'], axis=1, inplace=True)

# test with and without streck
df.drop('streck', axis=1, inplace=True)

# df['kön']=df['kön'].astype('category') # endast om ej catboost
# df['kön']=df['kön'].cat.codes

X = df.drop(['plac'], axis=1)
y = (df['plac']==1)*1


In [3]:
# catboost regression
from catboost import CatBoostClassifier,cv,Pool
# setup catboost model
cat_features = [i for i in X.columns if X[i].dtype == 'object']
# pool=Pool(data=X,
     # label=y,
     # cat_features=[])

model = CatBoostClassifier(iterations=100,                 
                            # learning_rate=0.1,
                            # depth=6,
                            loss_function='Logloss',
                         #    eval_metric='MAE',
                            verbose=False,
                            random_seed=42)  
# model=model.fit(pool)

In [4]:
# select columns by data type
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_exclude='number')


In [5]:
# do all preprocessing in one pipeline
preprocessor = make_column_transformer(
    (make_pipeline(imp_median, scaler), num_cols),
    (make_pipeline(imp_constant, ohe), cat_cols)
    )

pipe = make_pipeline(preprocessor, model)
pipe.fit(X, y)

Pipeline(steps=[('columntransformer',
                 ColumnTransformer(transformers=[('pipeline-1',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(add_indicator=True,
                                                                                 strategy='median')),
                                                                  ('standardscaler',
                                                                   StandardScaler())]),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x000001FA5C018BE0>),
                                                 ('pipeline-2',
                                                  Pipeline(steps=[('simpleimputer',
                                                                   SimpleImputer(strategy='constant')),
                                              

# Feature shuffling

In [6]:
# %conda install feature_engine
from feature_engine.selection import SelectByShuffling

In [7]:
# meassure performance of the feature selections
def catboost_cv(X,y,cat_features):
    labels = y

    cv_dataset = Pool(data=X,
                    label=labels,
                    cat_features=cat_features)

    params = {"iterations": 1000,
              'early_stopping_rounds': 100,
              "loss_function": "Logloss",
            #   'eval_metric': 'RMSE',
              "verbose": False}

    scores = cv(cv_dataset,
                params,
                fold_count=3, 
                )
    score =scores['test-Logloss-mean'].min()
    return score
with_all_features = catboost_cv(X,y,cat_features)


Stopped by overfitting detector  (100 iterations wait)


In [8]:
# We will select features based on the drop in the neg_MAE using 3 fold cross-validation:

# initialize the feature selector
tr = SelectByShuffling(estimator=pipe, scoring="neg_mean_absolute_error", cv=3, threshold=None)

# With the method fit() the transformer finds the important variables —those that cause a drop in score when shuffled. By default, 
# features will be selected if the performance drop is bigger than the mean drop caused by all features.
tr.fit(X, y)


SelectByShuffling(estimator=Pipeline(steps=[('columntransformer',
                                             ColumnTransformer(transformers=[('pipeline-1',
                                                                              Pipeline(steps=[('simpleimputer',
                                                                                               SimpleImputer(add_indicator=True,
                                                                                                             strategy='median')),
                                                                                              ('standardscaler',
                                                                                               StandardScaler())]),
                                                                              <sklearn.compose._column_transformer.make_column_selector object at 0x000001FA5C018BE0>),
                                                                          

In [9]:
# With the method transform() we drop the unselected features from the dataset:

Xt = tr.transform(X)

# We can inspect the individual feature’s importance through one of the transformer’s attributes:
# print('resultat\n',tr.performance_drifts_)

Xt.columns
# tr.initial_model_performance_


Index(['kr', 'spår', 'kön', 'pris', 'h1_pris', 'h1_odds', 'h2_pris', 'h2_odds',
       'h2_kmtid', 'h3_pris', 'h3_odds', 'h3_kmtid', 'h4_pris', 'h4_odds',
       'h4_kmtid', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_perf', 'h2_perf',
       'h5_perf', 'delta1', 'delta3'],
      dtype='object')

In [10]:
print('all features cv:',with_all_features)
print('sel features cv:',catboost_cv(Xt, y, cat_features))


all features cv: 0.26451734324327303
Stopped by overfitting detector  (100 iterations wait)
sel features cv: 0.26499260726977464


Med 'streck'  
all features cv: 0.23681292646342436  
sel features cv: 0.2368586479074274  

Wthout 'streck'  
all features cv:   0.26451734324327303  
sel features cv:   0.26499260726977464  

# Feature performance

In [11]:
from feature_engine.selection import SelectBySingleFeaturePerformance

# We want to select features whose r2 > 0.01, utilizing a linear regression and using 3 fold cross-validation.

# initialize the feature selector
sel = SelectBySingleFeaturePerformance(
    estimator=pipe, scoring="r2", cv=3, threshold=0.01)


# The transformer uses the method fit() to fit 1 model per feature, determine performance, and select the important features.

# fit transformer
sel.fit(X, y)

# We can explore the features that will be dropped:
print(sel.features_to_drop_)

# We can also examine each individual feature’s performance:
# sel.feature_performance_

['kr', 'spår', 'dist', 'lopp_dist', 'start', 'ålder', 'pris', 'h1_spår', 'h1_plac', 'h1_pris', 'h1_odds', 'h1_kmtid', 'h2_spår', 'h2_plac', 'h2_pris', 'h2_odds', 'h2_kmtid', 'h3_spår', 'h3_plac', 'h3_pris', 'h3_odds', 'h3_kmtid', 'h4_spår', 'h4_plac', 'h4_pris', 'h4_odds', 'h4_kmtid', 'h5_spår', 'h5_plac', 'h5_pris', 'h5_odds', 'h5_kmtid', 'h1_dist', 'h2_dist', 'h3_dist', 'h4_dist', 'h5_dist', 'h1_auto', 'h2_auto', 'h3_auto', 'h4_auto', 'h5_auto', 'h1_perf', 'h2_perf', 'h3_perf', 'h4_perf', 'h5_perf', 'senast', 'delta1', 'delta2', 'delta3', 'delta4']


In [12]:
# drop variables
Xt = sel.transform(X)
print(Xt.columns)

print('all features cv:',with_all_features)
print('sel features cv:',catboost_cv(Xt, y,['kön']))

Index(['kön'], dtype='object')
all features cv: 0.26451734324327303
Stopped by overfitting detector  (100 iterations wait)
sel features cv: 0.29262238873721497


Med 'streck'  
all features cv: 0.23681292646342436  
sel features cv: 0.29262238873721497  

Utan 'streck' ok   
all features cv: 0.26451734324327303  
sel features cv: 0.29262238873721497  

# Target mean performance

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from feature_engine.selection import SelectByTargetMeanPerformance

X.fillna(0, inplace=True)
# print(sum(X_tranf==np.nan))
# separate train and test sets
X_train, X_test, y_train, y_test = train_test_split(
    X,y,
    test_size=0.3,
    random_state=0)

# is there any values in the test set that are missing?
    

In [14]:
# Feature-engine automates the selection of
# categorical and numerical variables

sel = SelectByTargetMeanPerformance(
    variables=None,
    scoring="roc_auc_score",
    threshold=0.6,
    bins=3,
    strategy="equal_frequency",
    cv=3,  # cross validation
    random_state=1,  # seed for reproducibility
)


In [15]:
# find important features
sel.fit(X_train, y_train)

# We can explore the ROC-AUC for each feature:
# print(sel.feature_performance_)

SelectByTargetMeanPerformance(bins=3, random_state=1,
                              strategy='equal_frequency', threshold=0.6)

In [16]:
# We can find the features that will be dropped from the data:
sel.features_to_drop_


['spår',
 'dist',
 'lopp_dist',
 'start',
 'ålder',
 'kön',
 'pris',
 'h1_spår',
 'h1_plac',
 'h1_pris',
 'h1_kmtid',
 'h2_spår',
 'h2_plac',
 'h2_pris',
 'h2_kmtid',
 'h3_spår',
 'h3_plac',
 'h3_pris',
 'h3_kmtid',
 'h4_spår',
 'h4_plac',
 'h4_pris',
 'h4_odds',
 'h4_kmtid',
 'h5_spår',
 'h5_plac',
 'h5_pris',
 'h5_odds',
 'h5_kmtid',
 'h1_dist',
 'h2_dist',
 'h3_dist',
 'h4_dist',
 'h5_dist',
 'h1_auto',
 'h2_auto',
 'h3_auto',
 'h4_auto',
 'h5_auto',
 'h1_perf',
 'h2_perf',
 'h3_perf',
 'h4_perf',
 'h5_perf',
 'senast',
 'delta1',
 'delta2',
 'delta3',
 'delta4']

In [17]:
# remove features
X_tr = sel.transform(X_train)
X_te = sel.transform(X_test)
X_tr.columns 

Index(['kr', 'h1_odds', 'h2_odds', 'h3_odds'], dtype='object')

In [18]:
X[X_tr.columns]
print('all features cv:',with_all_features)
print('sel features cv:', catboost_cv(X[X_tr.columns], y,[]))

all features cv: 0.26451734324327303
Stopped by overfitting detector  (100 iterations wait)
sel features cv: 0.274721721060698


Med 'streck'  
all features cv: 0.23681292646342436   
sel features cv: 0.23722614924386112   

Utan 'streck'  
all features cv: 0.26451734324327303  
sel features cv: 0.274721721060698  