In [2]:
import time

import numpy as np
import pandas as pd
from sklearn.discriminant_analysis import (
    LinearDiscriminantAnalysis,
    QuadraticDiscriminantAnalysis,
)
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.feature_selection import (
    RFE,
    SelectFromModel,
    SelectKBest,
    SequentialFeatureSelector,
    f_classif,
    mutual_info_classif,
)
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from utils import (
    experiment,
    get_data,
    get_param_combinations,
    get_params_json,
    save_results,
)

from xgboost import XGBClassifier

In [3]:
# Load the dataset with the project-local ``utils.get_data`` helper.  ``X`` and
# ``y`` are kept as module-level globals because ``run_experiment`` reads them
# directly instead of taking them as arguments.
# NOTE(review): presumably X is the feature matrix and y the labels - confirm
# against utils.get_data.
X, y = get_data()

In [4]:
def run_experiment(
    feature_selectors,
    classifiers,
    ks,
    results,
    train_test_seeds=None,
    results_filename=None,
):
    """Run ``experiment`` for every selector / classifier / k / parameter combination.

    Iteration order is: feature selector -> classifier -> k -> selector
    parameter combination -> classifier parameter combination.  After each
    individual run the result is printed, appended to ``results`` and the whole
    ``results`` list is handed to ``save_results`` again.

    Parameters
    ----------
    feature_selectors : iterable of 4-tuples
        ``(selector_class, param_grid, k_param_name, requires_estimator)``.
        ``param_grid`` is expanded with ``get_param_combinations``; the other
        three items are forwarded to ``experiment`` unchanged.
    classifiers : iterable of 2-tuples
        ``(classifier_class, param_grid)``, handled the same way.
    ks : iterable
        Values forwarded to ``experiment`` as ``k``.
    results : list
        Mutated in place: every result returned by ``experiment`` is appended.
    train_test_seeds : list, optional
        Forwarded to ``experiment``.  Defaults to a fresh ``[42]`` per call.
    results_filename : str, optional
        Name passed to ``save_results``.  When omitted, the module-level
        ``filename`` variable is used, so existing callers keep working.

    Returns
    -------
    None

    Notes
    -----
    The data passed to ``experiment`` is read from the module-level ``X`` and
    ``y``.  The last element of each result is printed as the elapsed time in
    seconds.
    """
    if train_test_seeds is None:
        # Build the default per call: a literal ``[42]`` in the signature is a
        # single shared object that a callee could mutate for all later calls.
        train_test_seeds = [42]

    for fs_cls, fs_grid, k_param_name, requires_estimator in feature_selectors:
        for clf_cls, clf_grid in classifiers:
            for k in ks:
                # Expanded afresh for every k so each k starts from unshared
                # combination objects, and materialised so the classifier
                # combinations can be walked once per selector combination
                # even if the helper returns a one-shot iterator.
                fs_param_combinations = list(get_param_combinations(fs_grid))
                clf_param_combinations = list(get_param_combinations(clf_grid))

                for fs_params in fs_param_combinations:
                    for clf_params in clf_param_combinations:
                        result = experiment(
                            X,
                            y,
                            fs_cls,
                            fs_params,
                            clf_cls,
                            clf_params,
                            k,
                            k_param_name,
                            requires_estimator,
                            train_test_seeds,
                        )

                        print(result)
                        print(f"Elapsed time: {result[-1]:.2f}s\n")
                        results.append(result)
                        # Save after every run so partial progress is handed
                        # to ``save_results`` even if a later run fails.  The
                        # global ``filename`` is only looked up when no
                        # explicit name was given.
                        save_results(
                            results,
                            filename if results_filename is None else results_filename,
                        )

In [7]:
# Accumulator for result rows; run_experiment appends to it in place.
results = []

# Read as a module-level global by run_experiment when it calls save_results.
filename = "piotrkowe_RF2"

# Values of k to sweep and the train/test split seeds used for every run.
ks = np.arange(5, 7)
train_test_seeds = [42, 43, 44, 45, 46]

# Selector entries are (class, parameter grid, k parameter name, requires_estimator).
# A threshold of -inf leaves "max_features" as the only selection criterion.
feature_selectors = [
    (
        SelectFromModel,
        {"threshold": [-np.inf]},
        "max_features",
        True,
    ),
]

# Classifier entries are (class, parameter grid).
# NOTE(review): scikit-learn documents "entropy" and "log_loss" as the same
# Shannon information gain criterion, so those two settings are expected to
# train equivalent forests - confirm both are wanted in the grid.
classifiers = [
    (
        RandomForestClassifier,
        dict(
            criterion=["gini", "entropy", "log_loss"],
            n_estimators=[10, 75, 150],
            min_samples_split=[2, 4, 8],
            max_depth=[2, 6, 10],
            ccp_alpha=[0, 0.001],
        ),
    ),
]

run_experiment(
    feature_selectors=feature_selectors,
    classifiers=classifiers,
    ks=ks,
    results=results,
    train_test_seeds=train_test_seeds,
)

42 0.601
43 0.584
44 0.602
45 0.597
46 0.62
('SelectFromModel', "{'threshold': -Infinity}", 'RandomForestClassifier', "{'criterion': 'gini', 'n_estimators': 10, 'min_samples_split': 2, 'max_depth': 2, 'ccp_alpha': 0}", 5, 0.6008, 0.011548160026601651, 0.6809999999999999, 0.28538026809692385)
Elapsed time: 0.29s

42 0.609
43 0.542
44 0.618
45 0.525
46 0.6
('SelectFromModel', "{'threshold': -Infinity}", 'RandomForestClassifier', "{'criterion': 'gini', 'n_estimators': 10, 'min_samples_split': 2, 'max_depth': 2, 'ccp_alpha': 0.001}", 5, 0.5788, 0.03780687768118386, 0.645, 0.30038881301879883)
Elapsed time: 0.30s

42 0.638
43 0.616
44 0.582
45 0.65
46 0.64
('SelectFromModel', "{'threshold': -Infinity}", 'RandomForestClassifier', "{'criterion': 'gini', 'n_estimators': 10, 'min_samples_split': 2, 'max_depth': 6, 'ccp_alpha': 0}", 5, 0.6252, 0.024284974778656886, 0.696, 0.6303155422210693)
Elapsed time: 0.63s

42 0.651
43 0.651
44 0.636
45 0.657
46 0.608
('SelectFromModel', "{'threshold': -Inf