In [None]:
# Reload imported modules automatically before each cell executes, so edits to
# project code are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

import logging

# Configure the root logger so INFO-level (and more severe) records are shown.
logging.basicConfig(level=logging.INFO)

In [None]:
from pathlib import Path

import pandas as pd

from paddel.preprocessing import get_data

# Directories handed to get_data, relative to the notebook's working directory.
# NOTE(review): presumably the first holds the raw inputs and the second is
# used as an on-disk cache — confirm against paddel.preprocessing.get_data.
raw_dir = Path("../data/raw")
cache_dir = Path("../data/cache")

# get_data returns four objects: three feature tables that are concatenated
# column-wise further down, and `y`, which is later passed as the fit target.
misc_df, classic_df, fresh_df, y = get_data(raw_dir, cache_dir)

In [None]:
# Candidate values for the "select_features" pipeline step's `n_features`
# parameter. Only the datasets that include `fresh_df` search over it.
n_features_grid = [5, 10, 20, 40, 80, 160, 360, 720]

# Each entry pairs a feature table ("data") with dataset-specific grid-search
# parameters ("params") that are merged into every model's parameter grid.
datasets = {
    # Only the misc features; no dataset-specific search parameters.
    "basic": dict(data=misc_df, params={}),
    # Misc + classic features, joined column-wise.
    "classic": dict(
        data=pd.concat([misc_df, classic_df], axis=1),
        params={},
    ),
    # Misc + fresh features, joined column-wise.
    "fresh": dict(
        data=pd.concat([misc_df, fresh_df], axis=1),
        params={"select_features__n_features": list(n_features_grid)},
    ),
    # All three feature tables, joined column-wise.
    "full": dict(
        data=pd.concat([misc_df, classic_df, fresh_df], axis=1),
        params={"select_features__n_features": list(n_features_grid)},
    ),
}

In [None]:
from xgboost import XGBClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Registry of classifier classes to evaluate, keyed by display name.
# The values are the classes themselves (not instances); a fresh instance is
# created wherever a model is needed. Insertion order is the evaluation order.
models = dict(
    [
        ("SVC", SVC),
        ("GaussianNB", GaussianNB),
        ("KNeighborsClassifier", KNeighborsClassifier),
        ("RandomForestClassifier", RandomForestClassifier),
        ("DecisionTreeClassifier", DecisionTreeClassifier),
        ("MLPClassifier", MLPClassifier),
        ("AdaBoostClassifier", AdaBoostClassifier),
        ("XGBClassifier", XGBClassifier),
    ]
)

In [None]:
from paddel import settings
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from paddel.hyper_parameters.parameters import model_parameter_rules
from paddel.hyper_parameters.parser import parse_hyper_parameters
from paddel.preprocessing.transformer import FeatureSelector
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import QuantileTransformer

# Single seed shared by every stochastic component below, so that a
# "Restart Kernel -> Run All" reproduces the same numbers and every
# (dataset, model) combination is evaluated on the same CV folds.
RANDOM_STATE = 42

# Metrics recorded for every candidate; the best "f1" candidate is refit.
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
    "g-mean": make_scorer(geometric_mean_score),
}

# One cv_results_ table per (dataset, model) pair; reset on every run of this
# cell so re-running it never accumulates stale results.
all_results = []

for dataset_name, dataset in datasets.items():
    X = dataset["data"]
    dataset_params = dataset["params"]

    for model_name, model in models.items():
        # The rules are looked up by the classifier class itself and expanded
        # into parameter dicts whose keys carry the "model__" step prefix.
        model_param_grid = parse_hyper_parameters(model_parameter_rules[model], prefix="model__")

        # Merge the dataset-level parameters into every model-level grid;
        # on a key clash the model-level value wins.
        param_grid = [dataset_params | model_params for model_params in model_param_grid]

        estimator = model()
        # Seed estimators that expose `random_state` (not all of them do).
        # A "model__random_state" entry in the grid still overrides this.
        if "random_state" in estimator.get_params():
            estimator.set_params(random_state=RANDOM_STATE)

        pipe = Pipeline([
            (
                "scale",
                QuantileTransformer(n_quantiles=20, random_state=RANDOM_STATE).set_output(transform="pandas"),
            ),
            ("select_features", FeatureSelector()),
            ("model", estimator),
        ])

        grid = GridSearchCV(
            estimator=pipe,
            param_grid=param_grid,
            scoring=scoring,
            refit="f1",
            # 2 folds x 5 repeats = 10 fits per candidate. The fixed seed
            # makes the splits identical across datasets, models and re-runs.
            cv=RepeatedKFold(n_splits=2, n_repeats=5, random_state=RANDOM_STATE),
            n_jobs=settings.max_processes,
            verbose=0,
        )

        grid.fit(X, y)
        results = pd.DataFrame(grid.cv_results_)

        # Tag every row with its origin; both inserts target position 0, so
        # the final column order is: model, dataset, <cv_results_ columns>.
        results.insert(0, 'dataset', dataset_name)
        results.insert(0, 'model', model_name)

        all_results.append(results)

        print(f"Done dataset: {dataset_name}, model: {model_name}")

In [None]:
# Stack the per-(dataset, model) result tables into a single frame.
# The guard keeps this cell re-runnable: after the first run `all_results` is
# already a DataFrame, and passing a DataFrame to pd.concat raises TypeError.
if not isinstance(all_results, pd.DataFrame):
    all_results = pd.concat(all_results, ignore_index=True)

# to_csv does not create missing directories, so make sure the target exists.
results_path = Path("../data/results/all_results.csv")
results_path.parent.mkdir(parents=True, exist_ok=True)
all_results.to_csv(results_path, index=False)