In [33]:
import time

import numpy as np
import pandas as pd
from joblib import parallel_backend
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_classif,
    mutual_info_classif,
)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

from utils import (
    experiment,
    get_data,
    get_param_combinations,
    get_params_json,
    save_results,
)

In [4]:
# Load the dataset once; `run_experiment` reads `X` and `y` as notebook-level
# globals and forwards them unchanged to `utils.experiment`.
# NOTE(review): presumably X is the feature matrix and y the target vector - confirm in utils.get_data.
X, y = get_data()

In [35]:
# Output name handed to `save_results(results, filename)` inside `run_experiment`,
# which reads it as a notebook-level global (it is not a function parameter).
# This empty value is only a placeholder: change it before running an experiment
# (the cells further down reassign it before each group of experiments).
filename = ""

In [36]:
# TODO: remove before the final run - debugging aid only.
# Uncomment to restrict the data to the first 100 rows for a quick test of the script:
# X = X[:100]
# y = y[:100]

In [41]:
def run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds=None):
    """Run every selector / classifier / k / hyper-parameter combination.

    Each combination is evaluated by ``experiment`` on the notebook-level
    globals ``X`` and ``y``. After every single run a summary row is printed,
    appended to ``results``, and the complete ``results`` list is written with
    ``save_results`` under the notebook-level global ``filename`` (so partial
    progress is persisted if a later run fails).

    Parameters
    ----------
    feature_selectors : sequence of 4-tuples
        ``(selector_class, param_grid, k_param_name, requires_estimator)``;
        all four items are forwarded to ``experiment`` (the grid after being
        expanded by ``get_param_combinations``).
    classifiers : sequence of 2-tuples
        ``(classifier_class, param_grid)``. Must be re-iterable, since it is
        traversed once per feature selector.
    ks : re-iterable
        Values passed to ``experiment`` as ``k``; traversed once per
        (selector, classifier) pair.
    results : list
        Mutated in place: one tuple is appended per run, with the fields
        ``(selector name, selector params JSON, classifier name,
        classifier params JSON, k, mean accuracy, accuracy std,
        mean top-20pc accuracy, elapsed seconds per seed)``.
    train_test_seeds : sized sequence, optional
        Seeds forwarded to ``experiment``. Defaults to ``[42]``.

    Raises
    ------
    ValueError
        If ``train_test_seeds`` is empty (the per-seed timing would otherwise
        divide by zero only after a full experiment had already run).
    """
    # `None` sentinel instead of a mutable list default shared between calls.
    if train_test_seeds is None:
        train_test_seeds = [42]
    n_seeds = len(train_test_seeds)
    if n_seeds == 0:
        raise ValueError("train_test_seeds must contain at least one seed")

    for fs_cls, fs_grid, k_param_name, requires_estimator in feature_selectors:
        for clf_cls, clf_grid in classifiers:
            for k in ks:
                # Expanded anew for every k, as before, so each run starts from
                # freshly generated parameter sets. They are materialised as
                # lists because the classifier combinations are traversed once
                # per selector combination; a one-shot iterator would be
                # exhausted after the first pass.
                fs_param_combinations = list(get_param_combinations(fs_grid))
                clf_param_combinations = list(get_param_combinations(clf_grid))

                for fs_params in fs_param_combinations:
                    for clf_params in clf_param_combinations:
                        # perf_counter is monotonic, unlike the wall clock.
                        start = time.perf_counter()
                        accs, accs_top_20pc = experiment(
                            X,
                            y,
                            fs_cls,
                            fs_params,
                            clf_cls,
                            clf_params,
                            k,
                            k_param_name,
                            requires_estimator,
                            train_test_seeds,
                        )
                        # Report the average time per train/test seed.
                        elapsed = (time.perf_counter() - start) / n_seeds

                        result = (
                            fs_cls.__name__,
                            get_params_json(fs_params),
                            clf_cls.__name__,
                            get_params_json(clf_params),
                            k,
                            accs.mean(),
                            accs.std(),
                            accs_top_20pc.mean(),
                            elapsed,
                        )

                        print(result)
                        print(f"Elapsed time: {elapsed:.2f}s\n")
                        results.append(result)
                        # Save after every run so partial results survive a crash.
                        save_results(results, filename)

## Experiments


In [42]:
# Shared accumulator: every `run_experiment` call below appends to this list and
# then saves the whole list under the current `filename`.
# NOTE(review): it is never reset between experiments, so a file written by a later
# experiment also contains the rows of every experiment run earlier in the same
# kernel session - confirm that this is intended.
results = []

### Experiment 1: parameters of GradientBoostingClassifier


In [43]:
# Output name used by `run_experiment` -> `save_results` for Experiments 1-3
# (it is not reassigned again until Experiment 4).
filename = "comparison"

In [44]:
%%script skip

# NOTE: the `%%script skip` cell magic on the first line disables this cell. It hands
# the cell to an external program called `skip`; the recorded output
# ("Couldn't find program: 'skip'") shows no such program exists, so nothing below
# is executed. Delete the magic line to run the experiment.

# Each entry: (selector class, parameter grid, name of the selector argument that
# receives k, requires_estimator flag) - unpacked in this order by `run_experiment`.
# NOTE(review): `requires_estimator` presumably tells `experiment` whether the selector
# must be built around an estimator - confirm in utils.experiment.
feature_selectors = [
    (
        SelectKBest,
        {"score_func": [f_classif, mutual_info_classif]},
        "k",
        False,
    ),
]

# Each entry: (classifier class, parameter grid). Every listed value of every
# parameter is expanded by `get_param_combinations` inside `run_experiment`.
classifiers = [
    (
        GradientBoostingClassifier,
        {
            "n_estimators": [100, 200],
            "learning_rate": [0.1, 0.2],
            "subsample": [0.5, 1],
        },
    ),
]

# Alternative, denser grid of k values (1..19 in steps of 1, then 20..40 in steps of 5),
# currently disabled:
# ks = np.concatenate([np.arange(1, 20, 1), np.arange(20, 45, 5)])
# Numbers of features to keep: 1, 4, 7, 10, 13, 16, 19.
ks = np.arange(1, 21, 3)

# Uses the default train/test seed of `run_experiment`.
run_experiment(feature_selectors, classifiers, ks, results)

Couldn't find program: 'skip'


### Experiment 2: simple feature selection with various classifiers


In [45]:
%%script skip

# NOTE: the `%%script skip` cell magic on the first line disables this cell (the
# recorded output "Couldn't find program: 'skip'" shows the body is not executed).
# Delete the magic line to run the experiment.

# One fixed selector: SelectKBest scored with mutual information; "k" is the name of
# the selector argument that receives each value from `ks`.
feature_selectors = [
    (SelectKBest, {"score_func": [mutual_info_classif]}, "k", False),
]


# Classifiers under comparison, as (class, parameter grid). An empty grid means the
# class is used with its default parameters only.
# NOTE(review): `probability=True` on SVC is presumably needed because `experiment`
# uses class probabilities for the top-20% accuracy - confirm in utils.experiment.
classifiers = [
    (GradientBoostingClassifier, {}),
    (RandomForestClassifier, {"random_state": [42]}),
    (SVC, {"kernel": ["linear", "rbf"], "probability": [True], "random_state": [42]}),
    (LinearDiscriminantAnalysis, {}),
    (QuadraticDiscriminantAnalysis, {}),
]

# Numbers of features to keep: 1, 4, 7, 10, 13, 16, 19.
ks = np.arange(1, 21, 3)

# Uses the default train/test seed of `run_experiment`.
run_experiment(feature_selectors, classifiers, ks, results)

Couldn't find program: 'skip'


### Experiment 3: simple classifier with various feature selectors


In [46]:
%%script skip

# NOTE: the `%%script skip` cell magic on the first line disables this cell (the
# recorded output "Couldn't find program: 'skip'" shows the body is not executed).
# Delete the magic line to run the experiment.

# Selectors under comparison, as (class, parameter grid, name of the argument that
# receives k, requires_estimator flag). All three are flagged `requires_estimator=True`.
feature_selectors = [
    (RFE, {}, "n_features_to_select", True),
    # Per the scikit-learn docs, threshold=-np.inf makes SelectFromModel select on
    # `max_features` alone, so exactly the top-k features are kept.
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
    (
        SequentialFeatureSelector,
        {
            "direction": [
                # "backward", # too slow
                "forward",
            ],
            # joblib convention: -2 means all CPU cores but one.
            "n_jobs": [-2],
        },
        "n_features_to_select",
        True,
    ),
]

# Single fixed classifier so that only the feature selector varies.
classifiers = [
    (RandomForestClassifier, {"random_state": [42]}),
]

# Numbers of features to keep: 1, 4, 7, 10, 13, 16, 19.
ks = np.arange(1, 21, 3)

# Uses the default train/test seed of `run_experiment`.
run_experiment(feature_selectors, classifiers, ks, results)

Couldn't find program: 'skip'


### Experiment 4: SelectFromModel with best classifiers


In [47]:
# Output name used by `run_experiment` -> `save_results` for Experiment 4.
filename = "comparison_2"

In [48]:
%%script skip

# NOTE: the `%%script skip` cell magic on the first line disables this cell (the
# recorded output "Couldn't find program: 'skip'" shows the body is not executed).
# Delete the magic line to run the experiment.

# Per the scikit-learn docs, threshold=-np.inf makes SelectFromModel select on
# `max_features` alone, so exactly the top-k features are kept.
feature_selectors = [
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
]

# Random forest with two ensemble sizes; n_jobs=-2 follows the joblib convention of
# using all CPU cores but one.
classifiers = [
    (
        RandomForestClassifier,
        {"random_state": [42], "n_jobs": [-2], "n_estimators": [100, 200]},
    ),
]

# Every number of features from 1 to 20.
ks = np.arange(1, 21, 1)
# Five train/test seeds (42..46); `run_experiment` reports mean and std of the
# accuracies returned by `experiment` for them.
train_test_seeds = list(range(42, 47))

run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds)

Couldn't find program: 'skip'


In [49]:
# Output name used by `run_experiment` -> `save_results` for the
# SelectFromModel + GradientBoostingClassifier run in the next cell.
filename = "comparison_3"

In [55]:
# SelectFromModel feature selection followed by a single GradientBoostingClassifier
# configuration, evaluated for k = 5..10 over five train/test seeds.
# `parallel_backend` is imported together with the other dependencies in the first
# (imports) cell of the notebook.

# Per the scikit-learn docs, threshold=-np.inf makes SelectFromModel select on
# `max_features` alone, so exactly the top-k features are kept.
feature_selectors = [
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
]

# One fixed parameter set (each grid holds a single value).
classifiers = [
    (
        GradientBoostingClassifier,
        {
            "n_estimators": [100],
            "learning_rate": [0.1],
            "subsample": [0.5],
        },
    ),
    # (SVC, {"kernel": ["rbf"], "probability": [True], "random_state": [42]}),
    # (QuadraticDiscriminantAnalysis, {}),
]

# Numbers of features to keep: 5, 6, 7, 8, 9, 10.
ks = np.arange(5, 11, 1)
# Five train/test seeds (42..46).
train_test_seeds = list(range(42, 47))

# Run with joblib's threading backend on all CPU cores but one (n_jobs=-2).
# NOTE(review): this only has an effect where the code being run dispatches work
# through joblib - confirm that `experiment` / the estimators used here do.
with parallel_backend("threading", n_jobs=-2):
    run_experiment(feature_selectors, classifiers, ks, results, train_test_seeds)

42 0.669
43 0.683
44 0.68
45 0.684
46 0.679
('SelectFromModel', "{'threshold': -Infinity}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 0.5}", 5, 0.679, 0.005329165037789696, 0.768, 45.349039268493655)
Elapsed time: 45.35s

42 0.693
43 0.702
44 0.712
45 0.697
46 0.7
('SelectFromModel', "{'threshold': -Infinity}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 0.5}", 6, 0.7008000000000001, 0.0063686733312362685, 0.766, 48.50391154289245)
Elapsed time: 48.50s

42 0.689
43 0.695
44 0.698
45 0.698
46 0.693
('SelectFromModel', "{'threshold': -Infinity}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 0.5}", 7, 0.6946, 0.0033823069050575557, 0.779, 44.32134146690369)
Elapsed time: 44.32s

42 0.679
43 0.693
44 0.701
45 0.691
46 0.698
('SelectFromModel', "{'threshold': -Infinity}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 0