In [11]:
import json
import time
from itertools import product

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_classif,
    mutual_info_classif,
)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

In [12]:
X = pd.read_csv("../data/x_train.txt", sep=" ", header=None)
y = pd.read_csv("../data/y_train.txt", header=None)
scaler = StandardScaler()

X = scaler.fit_transform(X)
y = y.values.ravel()

In [13]:
#  TODO remove - this is for testing the script!
# X = X[:100]
# y = y[:100]

In [14]:
def get_param_combinations(param_dict):
    value_prod = list(product(*param_dict.values()))
    keys = param_dict.keys()
    return [dict(zip(keys, values)) for values in value_prod]

In [15]:
def param_json_to_str(param_json):
    if type(param_json).__name__ == "function":
        return param_json.__name__
    return param_json


def get_params_json(params):
    params_mapped = {k: param_json_to_str(v) for k, v in params.items()}
    return json.dumps(params_mapped).replace('"', "'")

In [16]:
def experiment(
    fs_cls,
    fs_kwargs,
    clf_cls,
    clf_kwargs,
    n_features,
    k_param_name,
    requires_estimator,
):
    clf = clf_cls(**clf_kwargs)

    fs_kwargs = {
        k_param_name: n_features,
        **fs_kwargs,
    }
    if requires_estimator:
        feature_selector = fs_cls(estimator=clf, **fs_kwargs)
    else:
        feature_selector = fs_cls(**fs_kwargs)

    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.2,
        stratify=y,
        random_state=42,
    )

    # Feature selection
    X_train = feature_selector.fit_transform(X_train, y_train)
    X_test = feature_selector.transform(X_test)

    # Training
    clf.fit(X_train, y_train)

    # Prediction
    pred = clf.predict(X_test)

    proba_1 = clf.predict_proba(X_test)[:, 1]
    proba_1 = np.array([proba_1, y_test]).T
    proba_1 = proba_1[proba_1[:, 0].argsort()][::-1]

    # Evaluation
    acc = accuracy_score(y_test, pred)
    top_20pc = proba_1[: int(len(proba_1) * 0.2)]
    acc_top_20pc = accuracy_score(top_20pc[:, 1], np.round(top_20pc[:, 0]))

    return acc, acc_top_20pc

In [17]:
def save_results(results):
    df = pd.DataFrame(
        results,
        columns=[
            "feature_selector",
            "feature_selector_params",
            "classifier",
            "classifier_params",
            "n_features",
            "accuracy",
            "accuracy_top_20pc",
            "elapsed_time",
        ],
    )
    df.to_csv("../results/comparison.csv", index=False)

In [18]:
def run_experiment(feature_selectors, classifiers, ks, results):
    for fs in feature_selectors:
        for clf in classifiers:
            for k in ks:
                # Generate parameter combinations
                fs_cls, fs_params, k_param_name, requires_estimator = fs
                clf_cls, clf_params = clf

                fs_param_combinations = get_param_combinations(fs_params)
                clf_param_combinations = get_param_combinations(clf_params)

                for fs_params in fs_param_combinations:
                    for clf_params in clf_param_combinations:
                        # Run experiment
                        start = time.time()
                        acc, acc_top_20pc = experiment(
                            fs_cls,
                            fs_params,
                            clf_cls,
                            clf_params,
                            k,
                            k_param_name,
                            requires_estimator,
                        )
                        elapsed = time.time() - start

                        # Save results
                        result = (
                            fs_cls.__name__,
                            get_params_json(fs_params),
                            clf_cls.__name__,
                            get_params_json(clf_params),
                            k,
                            acc,
                            acc_top_20pc,
                            elapsed,
                        )

                        print(result)
                        print(f"Elapsed time: {elapsed:.2f}s\n")
                        results.append(result)
                        save_results(results)

## Experiments


In [19]:
results = []

### Experiment 1 - parameters of GradientBoostingClassifier


In [20]:
feature_selectors = [
    (
        SelectKBest,
        {"score_func": [f_classif, mutual_info_classif]},
        "k",
        False,
    ),
]

classifiers = [
    (
        GradientBoostingClassifier,
        {
            "n_estimators": [100, 200],
            "learning_rate": [0.1, 0.2],
            "subsample": [0.5, 1],
        },
    ),
]

# ks = np.concatenate([np.arange(1, 20, 1), np.arange(20, 45, 5)])
ks = np.arange(1, 21, 3)

run_experiment(feature_selectors, classifiers, ks, results)

('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 0.5}", 1, 0.516, 0.505, 0.3451216220855713)
Elapsed time: 0.35s

('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.1, 'subsample': 1}", 1, 0.503, 0.48, 0.34575676918029785)
Elapsed time: 0.35s

('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.2, 'subsample': 0.5}", 1, 0.517, 0.45, 0.2805194854736328)
Elapsed time: 0.28s

('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 100, 'learning_rate': 0.2, 'subsample': 1}", 1, 0.516, 0.49, 0.32715272903442383)
Elapsed time: 0.33s

('SelectKBest', "{'score_func': 'f_classif'}", 'GradientBoostingClassifier', "{'n_estimators': 200, 'learning_rate': 0.1, 'subsample': 0.5}", 1, 0.516, 0.505, 0.47340965270996094)
Elapsed time: 0.47s

('Se

### Experiment 2: simple feature selection with various classifiers


In [21]:
feature_selectors = [
    (SelectKBest, {"score_func": [mutual_info_classif]}, "k", False),
]


classifiers = [
    (GradientBoostingClassifier, {}),
    (RandomForestClassifier, {"random_state": [42]}),
    (SVC, {"kernel": ["linear", "rbf"], "probability": [True], "random_state": [42]}),
    (LinearDiscriminantAnalysis, {}),
    (QuadraticDiscriminantAnalysis, {}),
]

ks = np.arange(1, 21, 3)

run_experiment(feature_selectors, classifiers, ks, results)

('SelectKBest', "{'score_func': 'mutual_info_classif'}", 'GradientBoostingClassifier', '{}', 1, 0.542, 0.58, 7.726355314254761)
Elapsed time: 7.73s

('SelectKBest', "{'score_func': 'mutual_info_classif'}", 'GradientBoostingClassifier', '{}', 4, 0.549, 0.595, 7.934386253356934)
Elapsed time: 7.93s

('SelectKBest', "{'score_func': 'mutual_info_classif'}", 'GradientBoostingClassifier', '{}', 7, 0.551, 0.575, 8.281240463256836)
Elapsed time: 8.28s

('SelectKBest', "{'score_func': 'mutual_info_classif'}", 'GradientBoostingClassifier', '{}', 10, 0.541, 0.555, 8.773930311203003)
Elapsed time: 8.77s

('SelectKBest', "{'score_func': 'mutual_info_classif'}", 'GradientBoostingClassifier', '{}', 13, 0.584, 0.635, 9.282235145568848)
Elapsed time: 9.28s

('SelectKBest', "{'score_func': 'mutual_info_classif'}", 'GradientBoostingClassifier', '{}', 16, 0.583, 0.665, 9.65318775177002)
Elapsed time: 9.65s

('SelectKBest', "{'score_func': 'mutual_info_classif'}", 'GradientBoostingClassifier', '{}', 19, 0.

### Experiment 3: simple classifier with various feature selectors


In [22]:
feature_selectors = [
    (RFE, {}, "n_features_to_select", True),
    (SelectFromModel, {"threshold": [-np.inf]}, "max_features", True),
    (
        SequentialFeatureSelector,
        {
            "direction": [
                # "backward", # too slow
                "forward",
            ],
            "n_jobs": [-2],
        },
        "n_features_to_select",
        True,
    ),
]

classifiers = [
    (RandomForestClassifier, {"random_state": [42]}),
]

ks = np.arange(1, 21, 3)

run_experiment(feature_selectors, classifiers, ks, results)

('RFE', '{}', 'RandomForestClassifier', "{'random_state': 42}", 1, 0.545, 0.565, 3266.4229078292847)
Elapsed time: 3266.42s

('RFE', '{}', 'RandomForestClassifier', "{'random_state': 42}", 4, 0.655, 0.705, 3278.729455471039)
Elapsed time: 3278.73s

('RFE', '{}', 'RandomForestClassifier', "{'random_state': 42}", 7, 0.683, 0.755, 3248.617998600006)
Elapsed time: 3248.62s

('RFE', '{}', 'RandomForestClassifier', "{'random_state': 42}", 10, 0.672, 0.75, 3241.1974375247955)
Elapsed time: 3241.20s

('RFE', '{}', 'RandomForestClassifier', "{'random_state': 42}", 13, 0.674, 0.73, 3242.608179807663)
Elapsed time: 3242.61s

('RFE', '{}', 'RandomForestClassifier', "{'random_state': 42}", 16, 0.681, 0.715, 3241.4741683006287)
Elapsed time: 3241.47s

('RFE', '{}', 'RandomForestClassifier', "{'random_state': 42}", 19, 0.683, 0.715, 3251.128825902939)
Elapsed time: 3251.13s

('SelectFromModel', "{'threshold': -Infinity}", 'RandomForestClassifier', "{'random_state': 42}", 1, 0.545, 0.565, 10.658958673

KeyboardInterrupt: 