In [2]:
import time

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_classif,
    mutual_info_classif,
)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from utils import (
    experiment,
    get_data,
    get_param_combinations,
    get_params_json,
    save_results,
)

In [3]:
# Load the dataset through the project helper `get_data` (imported from `utils`).
# `X` and `y` are read as notebook-level globals by `run_experiment`, so this
# cell has to be executed before any experiment cell.
X, y = get_data()

In [4]:
# Name handed to `save_results` by `run_experiment` (read as a global at save time).
# Intentionally left empty here: change the value before running an experiment;
# each experiment section below assigns its own name.
filename = ""

In [5]:
# TODO: remove before the final run - uncomment the two lines below to smoke-test the script on the first 100 samples.
# X = X[:100]
# y = y[:100]

In [6]:
def run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds=None):
    """Run `experiment` for every selector / classifier / k / parameter combination.

    Each run is executed on the notebook-level `X` and `y`. The returned row is
    printed, appended to `results`, and the complete `results` list is then
    written out with `save_results` under the notebook-level `filename`, so
    progress is persisted after every single run.

    Parameters
    ----------
    feature_selectors : iterable of 4-tuples
        `(selector_class, param_grid, k_param_name, requires_estimator)`.
    classifiers : iterable of 2-tuples
        `(classifier_class, param_grid)`.
    ks : iterable
        Values forwarded to `experiment` as `k`.
    results : list
        Mutated in place; one entry is appended per run.
    train_test_seeds : sequence, optional
        Forwarded unchanged to `experiment`. Defaults to `[42]`.

    Notes
    -----
    The last element of each result is printed as the elapsed time in seconds.
    """
    if train_test_seeds is None:
        # Build the default per call rather than sharing one mutable list
        # object between all invocations.
        train_test_seeds = [42]

    for fs_cls, fs_grid, k_param_name, requires_estimator in feature_selectors:
        for clf_cls, clf_grid in classifiers:
            for k in ks:
                # Materialise both grids. The classifier grid is walked once per
                # selector combination, so a one-shot iterator returned by
                # `get_param_combinations` would silently yield nothing after
                # the first pass. The grids are rebuilt for every k so each k
                # starts from fresh parameter objects, as before.
                fs_combinations = list(get_param_combinations(fs_grid))
                clf_combinations = list(get_param_combinations(clf_grid))

                for fs_params in fs_combinations:
                    for clf_params in clf_combinations:
                        result = experiment(
                            X,
                            y,
                            fs_cls,
                            fs_params,
                            clf_cls,
                            clf_params,
                            k,
                            k_param_name,
                            requires_estimator,
                            train_test_seeds,
                        )

                        print(result)
                        print(f"Elapsed time: {result[-1]:.2f}s\n")
                        results.append(result)
                        # Save after every run so an interrupted sweep keeps
                        # everything finished so far.
                        save_results(results, filename)

## Experiments


In [7]:
# Shared accumulator: `run_experiment` appends one row per run to this list and
# re-saves the whole list after every run.
# NOTE(review): the list is not reset between experiment sections, so rows
# collected under one `filename` would also be written to the next one when the
# sections are run back to back - confirm that this is intended.
results = []

### Experiment 1: parameters of GradientBoostingClassifier


In [8]:
# Output name used by `save_results` for the experiments that follow, until
# `filename` is reassigned further down.
filename = "comparison"

In [9]:
%%script skip

# Selector under test: SelectKBest with two alternative scoring functions.
# Tuple layout follows the unpacking in `run_experiment`:
# (selector class, parameter grid, k_param_name, requires_estimator).
feature_selectors = [
    (
        SelectKBest,
        {"score_func": [f_classif, mutual_info_classif]},
        "k",
        False,
    ),
]

# GradientBoostingClassifier grid with two candidate values per parameter.
# The grid is expanded by `get_param_combinations` inside `run_experiment`
# (presumably into every combination of the listed values - verify in `utils`).
classifiers = [
    (
        GradientBoostingClassifier,
        {
            "n_estimators": [100, 200],
            "learning_rate": [0.1, 0.2],
            "subsample": [0.5, 1],
        },
    ),
]

# Denser alternative grid of k values, currently disabled:
# ks = np.concatenate([np.arange(1, 20, 1), np.arange(20, 45, 5)])
# Active grid: k = 1, 4, 7, 10, 13, 16, 19.
ks = np.arange(1, 21, 3)

# No seeds passed, so `run_experiment` falls back to its default seed list.
run_experiment(feature_selectors, classifiers, ks, results)

Couldn't find program: 'skip'


### Experiment 2: simple feature selection with various classifiers


In [10]:
%%script skip

# Fixed selector: SelectKBest scored with mutual information only.
# Tuple layout: (selector class, parameter grid, k_param_name, requires_estimator).
feature_selectors = [
    (SelectKBest, {"score_func": [mutual_info_classif]}, "k", False),
]


# Classifier families compared on top of the fixed selector. An empty dict
# means no parameters are listed for that classifier; SVC is tried with both a
# linear and an RBF kernel.
classifiers = [
    (GradientBoostingClassifier, {}),
    (RandomForestClassifier, {"random_state": [42]}),
    (SVC, {"kernel": ["linear", "rbf"], "probability": [True], "random_state": [42]}),
    (LinearDiscriminantAnalysis, {}),
    (QuadraticDiscriminantAnalysis, {}),
]

# k = 1, 4, 7, 10, 13, 16, 19.
ks = np.arange(1, 21, 3)

# No seeds passed, so `run_experiment` falls back to its default seed list.
run_experiment(feature_selectors, classifiers, ks, results)

Couldn't find program: 'skip'


### Experiment 3: simple classifier with various feature selectors


In [11]:
%%script skip

# Estimator-based selectors; all three are flagged `requires_estimator=True`
# (fourth tuple element). The third element names the selector argument that
# `run_experiment` unpacks as `k_param_name`.
feature_selectors = [
    (RFE, {}, "n_features_to_select", True),
    # threshold=-inf disables the importance threshold, so only `max_features`
    # limits how many features SelectFromModel keeps.
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
    (
        SequentialFeatureSelector,
        {
            "direction": [
                # "backward", # too slow
                "forward",
            ],
            # -2 = use all CPU cores but one (joblib convention).
            "n_jobs": [-2],
        },
        "n_features_to_select",
        True,
    ),
]

# Single seeded RandomForest as the fixed classifier for this comparison.
classifiers = [
    (RandomForestClassifier, {"random_state": [42]}),
]

# k = 1, 4, 7, 10, 13, 16, 19.
ks = np.arange(1, 21, 3)

# No seeds passed, so `run_experiment` falls back to its default seed list.
run_experiment(feature_selectors, classifiers, ks, results)

Couldn't find program: 'skip'


### Experiment 4: SelectFromModel with best classifiers


In [12]:
# Output name used by `save_results` for the RandomForest run below.
# NOTE(review): `results` is not cleared here, so any rows already collected
# would be saved under this name as well - confirm that this is intended.
filename = "comparison_2"

In [13]:
%%script skip

# SelectFromModel with the importance threshold disabled (threshold=-inf), so
# only `max_features` limits the number of kept features.
feature_selectors = [
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
]

# Seeded RandomForest with two forest sizes; n_jobs=-2 uses all CPU cores but one.
classifiers = [
    (
        RandomForestClassifier,
        {"random_state": [42], "n_jobs": [-2], "n_estimators": [100, 200]},
    ),
]

# Every k from 1 to 20.
ks = np.arange(1, 21, 1)
# Five seeds (42..46), forwarded to `experiment` by `run_experiment`.
train_test_seeds = list(range(42, 47))

run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds)

Couldn't find program: 'skip'


In [14]:
# Output name used by `save_results` for the GradientBoosting run below.
# NOTE(review): `results` is not cleared here, so any rows already collected
# would be saved under this name as well - confirm that this is intended.
filename = "comparison_3"

In [15]:
%%script skip

# NOTE(review): cell-local import; consider moving it to the import cell at the
# top of the notebook.
from joblib import parallel_backend

# SelectFromModel with the importance threshold disabled (threshold=-inf), so
# only `max_features` limits the number of kept features.
feature_selectors = [
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
]

# One fixed GradientBoosting configuration; the two commented-out entries are
# further candidates that are currently disabled.
classifiers = [
    (
        GradientBoostingClassifier,
        {
            "n_estimators": [100],
            "learning_rate": [0.1],
            "subsample": [0.5],
        },
    ),
    # (SVC, {"kernel": ["rbf"], "probability": [True], "random_state": [42]}),
    # (QuadraticDiscriminantAnalysis, {}),
]

# k = 5, 6, 7, 8, 9, 10.
ks = np.arange(5, 11, 1)
# Five seeds (42..46), forwarded to `experiment` by `run_experiment`.
train_test_seeds = list(range(42, 47))

# Make joblib's thread-based backend (all cores but one) the default for any
# joblib-parallel code executed inside the sweep.
# NOTE(review): this only has an effect if something called from `experiment`
# actually goes through joblib - confirm.
with parallel_backend("threading", n_jobs=-2):
    run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds)

Couldn't find program: 'skip'


### Experiment 5: XGBoost


In [16]:
# Output name used by `save_results` for the XGBoost run below.
# NOTE(review): `results` is not cleared here, so any rows already collected
# would be saved under this name as well - confirm that this is intended.
filename = "comparison_xgboost"

In [19]:
# NOTE(review): cell-local imports; xgboost is only needed for this experiment.
import xgboost as xgb
from joblib import parallel_backend

# SelectFromModel with the importance threshold disabled (threshold=-inf), so
# only `max_features` limits the number of kept features.
feature_selectors = [
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
]

# XGBoost classifier configured to run on a CUDA device; the commented-out
# empty dict is the alternative without a `device` setting.
# NOTE(review): the saved output of this cell starts with XGBoost's advice to
# "use a data structure that matches the device ordinal in the booster", which
# suggests the input data and the booster live on different devices - confirm
# inside `experiment`.
classifiers = [
    (
        xgb.XGBClassifier,
        # {},
        {"device": ["cuda"]},
    ),
]

# k = 1, 2, ..., 10.
ks = np.arange(1, 11, 1)
# Five seeds (42..46), forwarded to `experiment` by `run_experiment`.
train_test_seeds = list(range(42, 47))

# Make joblib's thread-based backend (all cores but one) the default for any
# joblib-parallel code executed inside the sweep.
# NOTE(review): this only has an effect if something called from `experiment`
# actually goes through joblib - confirm.
with parallel_backend("threading", n_jobs=-2):
    run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds)

Potential solutions:
- Use a data structure that matches the device ordinal in the booster.
- Set the device for booster before call to inplace_predict.




42 0.52
43 0.534
44 0.573
45 0.507
46 0.544
('SelectFromModel', "{'threshold': -Infinity}", 'XGBClassifier', "{'device': 'cuda'}", 1, 0.5356, 0.022508664998173465, 0.58, 2.738766002655029)
Elapsed time: 2.74s

42 0.548
43 0.571
44 0.526
45 0.527
46 0.558
('SelectFromModel', "{'threshold': -Infinity}", 'XGBClassifier', "{'device': 'cuda'}", 2, 0.546, 0.017515707236649036, 0.594, 2.976771831512451)
Elapsed time: 2.98s

42 0.568
43 0.581
44 0.55
45 0.515
46 0.596
('SelectFromModel', "{'threshold': -Infinity}", 'XGBClassifier', "{'device': 'cuda'}", 3, 0.562, 0.027949955277245055, 0.641, 3.132274293899536)
Elapsed time: 3.13s

42 0.601
43 0.621
44 0.553
45 0.547
46 0.628
('SelectFromModel', "{'threshold': -Infinity}", 'XGBClassifier', "{'device': 'cuda'}", 4, 0.5900000000000001, 0.03389395226290375, 0.6890000000000001, 2.9012763500213623)
Elapsed time: 2.90s

42 0.641
43 0.623
44 0.564
45 0.594
46 0.627
('SelectFromModel', "{'threshold': -Infinity}", 'XGBClassifier', "{'device': 'cuda'}", 