In [1]:
import time

import numpy as np
import pandas as pd
from joblib import parallel_backend
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_classif,
    mutual_info_classif,
)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from utils import (
    experiment,
    get_data,
    get_param_combinations,
    get_params_json,
    save_results,
)

In [2]:
# Load the dataset through the project helper. X and y are read as globals by
# run_experiment() and passed unchanged to experiment()
# (presumably features and labels - confirm against utils.get_data).
X, y = get_data()

In [3]:
# Change value before running experiment
# Placeholder for the name run_experiment() hands to save_results(); the
# experiment sections below reassign it before they run.
filename = ""

In [4]:
# TODO: remove before the final run - uncomment the lines below to smoke-test the notebook on the first 100 samples.
# X = X[:100]
# y = y[:100]

In [5]:
def run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds=None):
    """Run ``experiment`` for every selector x classifier x k x parameter combination.

    Each value returned by ``experiment`` is printed, appended to ``results``,
    and the whole ``results`` list is immediately re-saved via ``save_results``,
    so an interrupted run keeps everything that finished before the interrupt.

    Parameters
    ----------
    feature_selectors : iterable of tuple
        Entries of the form
        ``(selector_cls, param_grid, k_param_name, requires_estimator)``.
    classifiers : iterable of tuple
        Entries of the form ``(classifier_cls, param_grid)``.
    ks : iterable
        Values forwarded to ``experiment`` as ``k`` together with
        ``k_param_name``.
    results : list
        Extended in place with every value returned by ``experiment``.
    train_test_seeds : list, optional
        Seeds forwarded to ``experiment``. Defaults to ``[42]``.

    Notes
    -----
    Reads the notebook globals ``X``, ``y`` and ``filename``. ``filename`` is
    looked up at save time, so it must be set before this function is called.
    The last element of each result is printed as the elapsed time in seconds.
    """
    # Build the default per call instead of sharing one mutable default list.
    seeds = [42] if train_test_seeds is None else train_test_seeds

    for fs_cls, fs_grid, k_param_name, requires_estimator in feature_selectors:
        for clf_cls, clf_grid in classifiers:
            for k in ks:
                # Materialise both grids: the classifier grid is walked once per
                # selector combination, which would silently drop runs if
                # get_param_combinations returned a one-shot iterator.
                fs_combinations = list(get_param_combinations(fs_grid))
                clf_combinations = list(get_param_combinations(clf_grid))

                for fs_kwargs in fs_combinations:
                    for clf_kwargs in clf_combinations:
                        result = experiment(
                            X,
                            y,
                            fs_cls,
                            fs_kwargs,
                            clf_cls,
                            clf_kwargs,
                            k,
                            k_param_name,
                            requires_estimator,
                            seeds,
                        )

                        print(result)
                        print(f"Elapsed time: {result[-1]:.2f}s\n")
                        results.append(result)
                        # Save after every run so partial progress survives an interrupt.
                        save_results(results, filename)

## Experiments


In [6]:
# Shared accumulator: every run_experiment() call below appends to this same
# list and re-saves all of it, so each saved file also contains the results of
# any experiment run earlier in the same kernel session.
results = []

### Experiment 1: parameters of GradientBoostingClassifier


In [7]:
# Name run_experiment() passes to save_results() until it is reassigned further down.
filename = "comparison"

In [8]:
# %%script skip  (uncomment, keeping it as the very first line, to skip this cell)

# Selector entries are unpacked by run_experiment() as
# (selector class, parameter grid, name of the "number of features" argument,
#  requires_estimator flag).
feature_selectors = [
    (
        SelectKBest,
        {"score_func": [f_classif, mutual_info_classif]},
        "k",
        False,
    ),
]

# Classifier entries are unpacked as (classifier class, parameter grid).
classifiers = [
    (
        GradientBoostingClassifier,
        {
            "n_estimators": [100, 200],
            "learning_rate": [0.1, 0.2],
            "subsample": [0.5, 1],
        },
    ),
]

# Denser alternative sweep, kept for reference:
# ks = np.concatenate([np.arange(1, 20, 1), np.arange(20, 45, 5)])
ks = np.arange(1, 21, 3)  # k = 1, 4, 7, ..., 19

# No seeds passed, so run_experiment() falls back to its default train/test seed list.
run_experiment(feature_selectors, classifiers, ks, results)

42 0.52
('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 0.5}", 1, 0.52, 0.0, 0.515, 0.32425856590270996)
Elapsed time: 0.32s

42 0.503
('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 1}", 1, 0.503, 0.0, 0.48, 0.3221566677093506)
Elapsed time: 0.32s

42 0.534
('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.2, 'subsample': 0.5}", 1, 0.534, 0.0, 0.49, 0.25923585891723633)
Elapsed time: 0.26s

42 0.516
('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.2, 'subsample': 1}", 1, 0.516, 0.0, 0.49, 0.3314220905303955)
Elapsed time: 0.33s

42 0.514
('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 200, 'learning_rate': 0.1, 'subsample': 0.5}", 

KeyboardInterrupt: 

### Experiment 2: simple feature selection with various classifiers


In [None]:
%%script skip
# The magic above disables this cell: no program named `skip` exists, so IPython
# prints "Couldn't find program: 'skip'" and never executes the body.
# Delete the magic line to run the experiment.

# Single selector (mutual information), crossed with several classifiers.
feature_selectors = [
    (SelectKBest, {"score_func": [mutual_info_classif]}, "k", False),
]


# An empty grid ({}) presumably means "run once with the classifier's defaults";
# that depends on get_param_combinations - confirm in utils.
classifiers = [
    (GradientBoostingClassifier, {}),
    (RandomForestClassifier, {"random_state": [42]}),
    (SVC, {"kernel": ["linear", "rbf"], "probability": [True], "random_state": [42]}),
    (LinearDiscriminantAnalysis, {}),
    (QuadraticDiscriminantAnalysis, {}),
]

ks = np.arange(1, 21, 3)  # k = 1, 4, 7, ..., 19

run_experiment(feature_selectors, classifiers, ks, results)

Couldn't find program: 'skip'


### Experiment 3: simple classifier with various feature selectors


In [None]:
%%script skip
# The magic above disables this cell: no program named `skip` exists, so IPython
# prints "Couldn't find program: 'skip'" and never executes the body.
# Delete the magic line to run the experiment.

# Model-based selectors: the last tuple element (requires_estimator) is True for
# all of them, and the third element names each selector's feature-count argument.
feature_selectors = [
    (RFE, {}, "n_features_to_select", True),
    # threshold=-np.inf: per the scikit-learn docs, SelectFromModel then selects
    # purely on max_features.
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
    (
        SequentialFeatureSelector,
        {
            "direction": [
                # "backward", # too slow
                "forward",
            ],
            "n_jobs": [-2],  # joblib convention: all CPUs but one
        },
        "n_features_to_select",
        True,
    ),
]

# One fixed classifier so that only the selector varies.
classifiers = [
    (RandomForestClassifier, {"random_state": [42]}),
]

ks = np.arange(1, 21, 3)  # k = 1, 4, 7, ..., 19

run_experiment(feature_selectors, classifiers, ks, results)

Couldn't find program: 'skip'


### Experiment 4: SelectFromModel with best classifiers


In [None]:
# From here on run_experiment() passes this name to save_results().
filename = "comparison_2"

In [None]:
%%script skip
# The magic above disables this cell: no program named `skip` exists, so IPython
# prints "Couldn't find program: 'skip'" and never executes the body.
# Delete the magic line to run the experiment.

# threshold=-np.inf: per the scikit-learn docs, SelectFromModel then selects
# purely on max_features.
feature_selectors = [
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
]

classifiers = [
    (
        RandomForestClassifier,
        {"random_state": [42], "n_jobs": [-2], "n_estimators": [100, 200]},
    ),
]

ks = np.arange(1, 21, 1)  # k = 1, 2, ..., 20
train_test_seeds = list(range(42, 47))  # seeds 42..46

run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds)

Couldn't find program: 'skip'


In [None]:
# From here on run_experiment() passes this name to save_results().
filename = "comparison_3"

In [None]:
# SelectFromModel with a single GradientBoostingClassifier configuration,
# repeated over five train/test seeds.
# `parallel_backend` is imported in the notebook's import cell.

# threshold=-np.inf: per the scikit-learn docs, SelectFromModel then selects
# purely on max_features.
feature_selectors = [
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
]

classifiers = [
    (
        GradientBoostingClassifier,
        {
            "n_estimators": [100],
            "learning_rate": [0.1],
            "subsample": [0.5],
        },
    ),
    # Alternative classifiers, currently disabled:
    # (SVC, {"kernel": ["rbf"], "probability": [True], "random_state": [42]}),
    # (QuadraticDiscriminantAnalysis, {}),
]

ks = np.arange(5, 11, 1)  # k = 5, 6, ..., 10
train_test_seeds = list(range(42, 47))  # seeds 42..46

# Any joblib-parallel work inside the run uses threads on all CPUs but one.
with parallel_backend("threading", n_jobs=-2):
    run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds)

KeyboardInterrupt: 