In [None]:
import os
import warnings

# Point CUDA at a non-existent device index; presumably this forces every
# CUDA-aware library imported later to run on the CPU.
os.environ.update({'CUDA_VISIBLE_DEVICES': '-1'})

# Blanket-suppress all warning categories for the rest of the session.
warnings.simplefilter('ignore')

In [None]:
import pandas as pd
from pathlib import Path

# Directory (relative to the notebook's working directory) that receives one
# CSV of cross-validation results per model / feature-set combination.
results_folder = Path("..") / "data" / "results"

# Create the directory and any missing parents; a pre-existing folder is fine.
results_folder.mkdir(exist_ok=True, parents=True)

# Load data

In [None]:
from paddel.preprocessing.features import get_data, clean_data

# get_data receives the raw-data folder and a cache folder and yields four
# objects. Presumably `y` holds the target labels and the other three are
# feature tables sharing its index — TODO confirm against paddel.preprocessing.
(
    y,
    misc_features,
    classic_features,
    fresh_features,
) = get_data(Path("../data/raw"), Path("../data/cache"))

In [None]:
from tsfresh import select_features

# The return value is discarded, so clean_data presumably modifies the four
# objects in place — TODO confirm against paddel.preprocessing.features.
clean_data(y, misc_features, classic_features, fresh_features)

# Reduce the tsfresh feature table to the columns select_features keeps for `y`.
# NOTE: this rebinds `fresh_features`, so the unfiltered table is no longer
# reachable under this name after the cell has run.
fresh_features = select_features(X=fresh_features, y=y)

# Models to try

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Registry of candidate estimators, keyed by class name. The values are the
# classes themselves (not instances); the training cell instantiates them.
models = {
    estimator_cls.__name__: estimator_cls
    for estimator_cls in (
        SVC,
        GaussianNB,
        KNeighborsClassifier,
        RandomForestClassifier,
        DecisionTreeClassifier,
    )
}

# Features to try

In [None]:
# Build each candidate feature table exactly once. All joins are index-aligned
# merges (pd.merge defaults to an inner join), so only rows present in both
# operands survive.
extended_classic_features = pd.merge(misc_features, classic_features, left_index=True, right_index=True)
extended_fresh_features = pd.merge(misc_features, fresh_features, left_index=True, right_index=True)
all_features = pd.merge(extended_classic_features, fresh_features, left_index=True, right_index=True)

# Feature sets evaluated by the training cell, keyed by the name used in the
# result file names. The frames built above are reused here rather than
# recomputing the same three merges a second time.
features_to_try = {
    "extended_classic_features": extended_classic_features,
    "extended_fresh_features": extended_fresh_features,
    "all_features": all_features,
}

# Training

In [None]:
from sklearn.exceptions import UndefinedMetricWarning
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import make_scorer, accuracy_score, f1_score
from tsfresh import select_features
from paddel import settings
from sklearn.model_selection import GridSearchCV, RepeatedKFold
from sklearn.preprocessing import QuantileTransformer
from sklearn.pipeline import Pipeline
from paddel.hyper_parameters.parameters import model_parameter_rules
from paddel.hyper_parameters.parser import parse_hyper_parameters

import warnings

# Grid-search every model on every feature set and write the cross-validation
# results to "<results_folder>/<model_name>-<features_name>.csv".

# Fixed seed for the CV splitter: every model / feature-set combination is
# scored on the same folds and a re-run reproduces the same splits.
CV_RANDOM_STATE = 0

with warnings.catch_warnings():
    # Metrics such as f1 can be undefined on some folds (e.g. no predicted
    # positives); silence only that warning category inside this block.
    warnings.simplefilter("ignore", category=UndefinedMetricWarning)

    # Feature selection depends only on (features, y), not on the model, so it
    # is computed once per feature set instead of once per model.
    # NOTE(review): select_features sees the labels of *all* samples before the
    # cross-validation split, so CV scores may be optimistically biased.
    # Consider moving selection into the pipeline (tsfresh ships a
    # scikit-learn compatible FeatureSelector transformer) — TODO confirm.
    selected_features_to_try = {
        features_name: select_features(features, y)
        for features_name, features in features_to_try.items()
    }

    # Metrics recorded for every parameter combination; "accuracy" drives refit.
    scoring = {
        "accuracy": make_scorer(accuracy_score),
        "f1": make_scorer(f1_score),
        "g-mean": make_scorer(geometric_mean_score),
    }

    for model_name, model in models.items():
        # `model` is an estimator class; it is instantiated with its defaults
        # and the grid search overrides the parameters under "model__".
        clf = Pipeline([
            ("scale", QuantileTransformer(n_quantiles=20)),
            ("model", model()),
        ])

        # Hyper-parameter rules are looked up by estimator class and prefixed
        # so they address the "model" step of the pipeline.
        param_grid = parse_hyper_parameters(model_parameter_rules[model], prefix="model__")

        grid = GridSearchCV(
            estimator=clf,
            param_grid=param_grid,
            scoring=scoring,
            refit="accuracy",
            cv=RepeatedKFold(n_splits=2, n_repeats=5, random_state=CV_RANDOM_STATE),
            n_jobs=settings.max_processes,
            verbose=2,
        )

        # The same GridSearchCV object is re-fitted for each feature set.
        for features_name, features in selected_features_to_try.items():
            grid.fit(features, y)
            results = pd.DataFrame(grid.cv_results_)
            results.to_csv(results_folder / f"{model_name}-{features_name}.csv", index=False)
