In [None]:
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before executing code so edits to imported project files are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import os
import warnings

# Set CUDA_VISIBLE_DEVICES to '-1' before any later import reads it
# (conventionally this hides all GPUs from CUDA-aware libraries).
os.environ.update({'CUDA_VISIBLE_DEVICES': '-1'})

# Install a blanket "ignore" filter: every warning category is silenced
# for the remainder of the session.
warnings.filterwarnings(action='ignore')

In [None]:
import pandas as pd
from pathlib import Path

# Output directory for every CSV written by this notebook; created on demand
# (including missing parents) and left untouched if it already exists.
results_folder = Path("..") / "data" / "results"
results_folder.mkdir(exist_ok=True, parents=True)

# Load data

In [None]:
from paddel.preprocessing.features import get_data, clean_data

# Load the target and three feature tables. The first path is the raw data
# directory; the second is presumably a cache location — verify against
# paddel.preprocessing.features.get_data.
(
    y,
    misc_features,
    classic_features,
    fresh_features,
) = get_data(
    Path("../data/raw"),
    Path("../data/cache"),
)

In [None]:
from tsfresh import select_features

# Clean the target and the three feature tables loaded above. The return value
# is discarded, so this presumably modifies its arguments in place — TODO
# confirm against paddel.preprocessing.features.clean_data.
clean_data(y, misc_features, classic_features, fresh_features)

# Models to try

In [None]:
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import QuadraticDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Candidate classifiers, keyed by class name. Values are the classes themselves
# (not instances); the training cell instantiates them with default arguments.
models = dict(
    SVC=SVC,
    GaussianNB=GaussianNB,
    KNeighborsClassifier=KNeighborsClassifier,
    RandomForestClassifier=RandomForestClassifier,
    DecisionTreeClassifier=DecisionTreeClassifier,
    MLPClassifier=MLPClassifier,
    AdaBoostClassifier=AdaBoostClassifier,
    XGBClassifier=XGBClassifier,
    QuadraticDiscriminantAnalysis=QuadraticDiscriminantAnalysis,
)

# Features to try

In [None]:
from tsfresh.feature_selection.relevance import calculate_relevance_table

def _relevant_features_by_p_value(features, target):
    """Return the tsfresh relevance table limited to relevant features.

    Args:
        features: Feature table, passed unchanged to ``calculate_relevance_table``.
        target: Target values, passed unchanged to ``calculate_relevance_table``.

    Returns:
        The rows of the relevance table whose ``relevant`` flag is set, sorted
        by ascending ``p_value`` (most significant first).
    """
    relevance_table = calculate_relevance_table(features, target)
    # Filter and sort in one non-inplace chain: sorting a boolean-filtered
    # frame with inplace=True mutates a derived object and can trigger
    # SettingWithCopyWarning.
    return relevance_table[relevance_table.relevant].sort_values("p_value")


# Candidate feature tables, all joined on the index (pd.merge defaults to an
# inner join).
extended_classic_features = pd.merge(misc_features, classic_features, left_index=True, right_index=True)
extended_fresh_features = pd.merge(misc_features, fresh_features, left_index=True, right_index=True)
all_features = pd.merge(extended_classic_features, fresh_features, left_index=True, right_index=True)

# NOTE(review): relevance is computed against the full target before the
# cross-validation in the training cell, so CV scores for the selected subsets
# may be optimistic — confirm this is acceptable.
extended_fresh_features_rt = _relevant_features_by_p_value(extended_fresh_features, y)
all_features_rt = _relevant_features_by_p_value(all_features, y)

# Each dataset name maps to a list of feature tables to evaluate. The classic
# set is tried once as-is (reusing the frame merged above instead of merging a
# second time); the tsfresh-based sets are tried at several sizes.
features_to_try = {
    "extended_classic_features": [extended_classic_features],
    "extended_fresh_features": [],
    "all_features": [],
}

for num_features in [20, 40, 60, 80, 100]:
    # Take the num_features most significant feature names. If fewer relevant
    # features exist, head() simply returns all of them.
    extended_fresh_feature_names = extended_fresh_features_rt["feature"].head(num_features)
    all_feature_names = all_features_rt["feature"].head(num_features)

    features_to_try["extended_fresh_features"].append(extended_fresh_features[extended_fresh_feature_names])
    features_to_try["all_features"].append(all_features[all_feature_names])

# Training

In [None]:
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from paddel import settings
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from paddel.hyper_parameters.parameters import model_parameter_rules
from paddel.hyper_parameters.parser import parse_hyper_parameters

import warnings

# Seed for the cross-validation splitter, so every grid search is scored on the
# same folds and the comparison between models and feature sets is repeatable.
# The estimators themselves are still created with their default (unseeded)
# random state.
CV_RANDOM_STATE = 0

# Metrics recorded for every parameter combination; "f1" selects the refit model.
scoring = {
    "accuracy": make_scorer(accuracy_score),
    "f1": make_scorer(f1_score),
    "g-mean": make_scorer(geometric_mean_score),
}

# One results frame per (model, dataset, feature subset) fit.
result_frames = []

for model_name, model in models.items():
    clf = Pipeline([
        ("scale", QuantileTransformer(n_quantiles=20)),
        ("model", model()),
    ])

    # The "model__" prefix routes each hyper-parameter to the pipeline's
    # "model" step.
    param_grid = parse_hyper_parameters(model_parameter_rules[model], prefix="model__")

    for features_name, feature_sets in features_to_try.items():
        # Results for this (model, dataset) pair across all feature subsets.
        dataset_frames = []

        for features in feature_sets:
            grid = GridSearchCV(
                estimator=clf,
                param_grid=param_grid,
                scoring=scoring,
                refit="f1",
                cv=RepeatedKFold(n_splits=2, n_repeats=5, random_state=CV_RANDOM_STATE),
                n_jobs=settings.max_processes,
                verbose=2,
            )

            grid.fit(features, y)

            results = pd.DataFrame(grid.cv_results_)
            results.insert(0, 'feature_amount', features.shape[1])
            results.insert(0, 'dataset', features_name)
            results.insert(0, 'model', model_name)

            dataset_frames.append(results)
            result_frames.append(results)

            # Previously this file was overwritten on every feature subset, so
            # only the last subset's results survived. Rewrite it after each
            # fit with everything gathered so far for this (model, dataset)
            # pair: progress is still persisted incrementally, and the
            # 'feature_amount' column tells the subsets apart.
            pd.concat(dataset_frames, ignore_index=True).to_csv(
                results_folder / f"{model_name}-{features_name}.csv", index=False
            )

all_results = pd.concat(result_frames, ignore_index=True)
all_results.to_csv(results_folder / "all_results.csv", index=False)