In [None]:
# Load IPython's autoreload extension; mode 2 re-imports modules before each
# cell execution, so edits to imported project code are picked up without
# restarting the kernel.
%load_ext autoreload
%autoreload 2

import logging

# Configure the root logger at INFO level so INFO-and-above records are shown
# in the notebook output.
logging.basicConfig(level=logging.INFO)

In [None]:
from pathlib import Path

import pandas as pd

from paddel.preprocessing import get_data

# Input locations are relative to the notebook's working directory.
raw_dir = Path("../data/raw")
cache_dir = Path("../data/cache")  # presumably where get_data stores intermediate results — TODO confirm

# get_data returns four objects: three feature tables (combined column-wise
# further down) and the target `y` used for feature selection and fitting.
misc_df, classic_df, fresh_df, y = get_data(raw_dir, cache_dir)

In [None]:
# Feature counts tried for the datasets that contain the `fresh_df` columns.
n_features_grid = (10, 20, 40, 80, 160, 240, 320, 400, 480, 560, 640, 720)

# Column-wise combinations of the feature tables; every variant starts from
# the `misc_df` columns.
classic_features = pd.concat([misc_df, classic_df], axis=1)
fresh_features = pd.concat([misc_df, fresh_df], axis=1)
full_features = pd.concat([misc_df, classic_df, fresh_df], axis=1)

# Experiment registry: dataset name -> feature table ("data") plus
# per-dataset settings ("params"). An "n_features" entry lists the feature
# counts to evaluate; datasets without it carry an empty params dict.
datasets = {
    "basic": dict(data=misc_df, params={}),
    "classic": dict(data=classic_features, params={}),
    "fresh": dict(
        data=fresh_features,
        params={"n_features": list(n_features_grid)},
    ),
    "full": dict(
        data=full_features,
        params={"n_features": list(n_features_grid)},
    ),
}

In [None]:
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

# Classifier classes (not instances) taking part in the comparison.
# MLPClassifier, AdaBoostClassifier and XGBClassifier are imported as well but
# are currently left out of the run.
enabled_classifiers = (SVC, KNeighborsClassifier, RandomForestClassifier)

# Registry keyed by class name; the same names index the `parameters` search
# spaces, so each key here needs a matching entry there.
models = {classifier.__name__: classifier for classifier in enabled_classifiers}

In [None]:
# Hyper-parameter search spaces, one per entry of `models`, written in the
# shorthand accepted by BayesSearchCV's `search_spaces`: a (low, high[, prior])
# tuple describes a numeric range and a list enumerates categorical choices.

svc_space = {
    "C": (1e-6, 1e6, "log-uniform"),
    "gamma": (1e-6, 1e1, "log-uniform"),
    "degree": (1, 8),
    "kernel": ["linear", "poly", "rbf", "sigmoid"],
}

knn_space = {
    "n_neighbors": (1, 31),
    "metric": ["cityblock", "cosine", "l1", "l2", "nan_euclidean"],
}

forest_space = {
    "n_estimators": (5, 10000),
    "criterion": ["gini", "entropy", "log_loss"],
    "max_features": ["sqrt", "log2", None],
}

parameters = {
    "SVC": svc_space,
    "KNeighborsClassifier": knn_space,
    "RandomForestClassifier": forest_space,
}

In [None]:
from skopt import BayesSearchCV
from paddel import settings
from sklearn.metrics import make_scorer, f1_score
from sklearn.model_selection import RepeatedKFold
from paddel.preprocessing.transformer import FeatureSelector
from sklearn.preprocessing import QuantileTransformer

# Seed shared by the CV splitter, the Bayesian search and every estimator that
# accepts one, so that re-running this cell reproduces the same splits,
# candidate points and scores.
RANDOM_STATE = 42

# One cv_results_ frame per (dataset, model, n_features) combination; the
# frames are tagged with those three values before being collected here.
all_results = []

for dataset_name, dataset in datasets.items():
    # NOTE(review): the scaler below and the FeatureSelector further down are
    # fitted on the complete dataset (the selector also sees `y`) before
    # cross-validation starts, so the reported CV scores may be optimistic.
    # Moving both into a Pipeline handed to BayesSearchCV would avoid that —
    # confirm FeatureSelector is pipeline-compatible first.
    scaler = QuantileTransformer(n_quantiles=20).set_output(transform="pandas")
    scaled_data = scaler.fit_transform(dataset["data"])

    # Datasets without an "n_features" entry are evaluated once with
    # n_features=0 (presumably "keep every column" — TODO confirm against
    # FeatureSelector).
    n_features_list = dataset["params"].get("n_features", [0])

    # The selected subset depends only on (data, y, n_features), not on the
    # model, so it is computed once per dataset and reused for every model.
    selected_by_n_features = {}

    for model_name, model in models.items():
        estimator = model()
        # Not every classifier has a random_state parameter; seed the ones
        # that do.
        if "random_state" in estimator.get_params():
            estimator.set_params(random_state=RANDOM_STATE)

        grid = BayesSearchCV(
            estimator=estimator,
            search_spaces=parameters[model_name],
            n_iter=1000,
            cv=RepeatedKFold(n_splits=2, n_repeats=5, random_state=RANDOM_STATE),
            n_jobs=settings.max_processes,
            verbose=10,
            scoring=make_scorer(f1_score),
            random_state=RANDOM_STATE,
        )

        for n_features in n_features_list:
            if n_features not in selected_by_n_features:
                selector = FeatureSelector(n_features=n_features)
                selected_by_n_features[n_features] = selector.fit_transform(scaled_data, y)
            selected_data = selected_by_n_features[n_features]

            print(f"Doing dataset: {dataset_name}, model: {model_name}, features: {selected_data.shape[1]}")

            grid.fit(selected_data, y)
            results = pd.DataFrame(grid.cv_results_)

            # Each insert goes to position 0, so the final leading columns are
            # model, dataset, n_features (in that order).
            results.insert(0, 'n_features', n_features)
            results.insert(0, 'dataset', dataset_name)
            results.insert(0, 'model', model_name)

            all_results.append(results)

In [None]:
# Merge the per-run result frames into one table and write it to disk.
# The guard keeps this cell re-runnable: after the first execution
# `all_results` is already the merged DataFrame, and handing a DataFrame
# (rather than a list of frames) to pd.concat raises TypeError.
if not isinstance(all_results, pd.DataFrame):
    all_results = pd.concat(all_results, ignore_index=True)

results_path = Path("../data/results/bayes_results.csv")
# to_csv does not create missing directories; create the target folder first
# so a long search is not lost at the final write.
results_path.parent.mkdir(parents=True, exist_ok=True)
all_results.to_csv(results_path, index=False)