In [1]:
import datetime

from joblib import Parallel, delayed

import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score

from smote_variants.oversampling import SMOTE
from common_datasets.binary_classification import get_filtered_data_loaders

In [2]:
import logging

# Restrict the 'smote_variants' logger to ERROR records and above so that
# lower-severity messages do not flood the notebook output.
logger = logging.getLogger(name='smote_variants')
logger.setLevel(level=logging.ERROR)

In [3]:
# Hyper-parameter grid of the evaluation: maps each classifier class to the
# list of keyword-argument dicts it is instantiated with.
classifiers = {
    DecisionTreeClassifier: [
        {'max_depth': depth, 'random_state': 5} for depth in range(4, 10, 2)
    ],
    RandomForestClassifier: [
        {'max_depth': depth, 'random_state': 5} for depth in range(4, 10, 2)
    ],
    KNeighborsClassifier: [
        {'n_neighbors': k} for k in range(1, 10, 2)
    ],
    # SVC: first with no explicit kernel, then polynomial kernels of degree 2
    # and 3; every variant is combined with the same three C values.
    # `probability=True` is set because the evaluation calls `predict_proba`.
    SVC: [
        {'C': c, 'probability': True, **kernel_kwargs, 'random_state': 5}
        for kernel_kwargs in (
            {},
            {'kernel': 'poly', 'degree': 2},
            {'kernel': 'poly', 'degree': 3},
        )
        for c in [0.001, 0.01, 0.1]
    ],
}

In [4]:
# Collect the data loaders used in the experiment. Each entry is later called
# with no arguments to obtain a dataset dict.
# NOTE(review): the keyword names suggest bounds on column count, sample count
# and minority-class size - confirm against the common_datasets documentation.
datasets = get_filtered_data_loaders(
    n_col_bounds=(2, 40),
    n_bounds=(10, 500),
    n_minority_bounds=(10, 500),
    n_from_phenotypes=1,
    n_smallest=20,
)

In [5]:
# SMOTE configurations under evaluation. Only n_neighbors=5 / proportion=1.0
# is active, once per metric learning method ('id' and 'MI_weighted').
# A wider grid (n_neighbors in {3, 5, 7} x proportion in {0.5, 1.0, 1.5}, for
# the 'id' and 'n_unique_inv' methods) was previously listed here but disabled.
smote_params = [
    {
        'n_neighbors': 5,
        'proportion': 1.0,
        'random_state': 5,
        'nn_params': {
            'metric': 'precomputed',
            'metric_learning_method': method,
        },
    }
    for method in ('id', 'MI_weighted')
]

In [6]:
# Start from an empty result container. The previous version re-assigned the
# same empty list once per data loader inside a loop that did nothing else, so
# `results` stayed undefined whenever `datasets` was empty.
results = []

In [7]:
def job_generator(data_loader, *, n_splits=5, n_repeats=200, random_state=5):
    """Lazily yield one evaluation job per (fold, SMOTE setting, classifier setting).

    The dataset is split with repeated stratified k-fold cross-validation. For
    every fold the features are standardized using training-fold statistics
    only, the training fold is oversampled once per entry of the module-level
    ``smote_params``, and one job is emitted for every classifier
    parameterization in the module-level ``classifiers`` mapping.

    Parameters
    ----------
    data_loader : callable
        Zero-argument callable returning a dict with the keys ``'name'``,
        ``'data'`` and ``'target'``. The data and target are indexed with the
        index arrays produced by the splitter (presumably numpy arrays -
        confirm against the loader).
    n_splits : int, default 5
        Number of folds per repetition.
    n_repeats : int, default 200
        Number of cross-validation repetitions. Lower it for a quick smoke run.
    random_state : int, default 5
        Seed passed to the cross-validation splitter.

    Yields
    ------
    tuple of (dict, dict)
        ``job`` holds the oversampled training data, the scaled test data, the
        classifier class and its parameters. ``description`` holds the
        metadata identifying the job (dataset name, fold index, SMOTE
        parameters, classifier name and classifier parameters).
    """
    dataset = data_loader()
    # Progress trace: one timestamped line per dataset.
    print(datetime.datetime.now(), dataset['name'])

    X = dataset['data']
    y = dataset['target']

    validator = RepeatedStratifiedKFold(n_splits=n_splits,
                                        n_repeats=n_repeats,
                                        random_state=random_state)

    # Stratification is driven by `y` alone. The labels used to be passed a
    # second time as `groups`, which the stratified splitter does not use.
    for fidx, (train, test) in enumerate(validator.split(X, y)):
        y_train = y[train]
        y_test = y[test]

        # Fit the scaler on the training fold only so that no statistics of
        # the test fold leak into the training data.
        ss = StandardScaler()
        X_train = ss.fit_transform(X[train])
        X_test = ss.transform(X[test])

        for sparam in smote_params:
            # Oversample once per fold and SMOTE setting; every classifier job
            # emitted below shares these arrays.
            oversampling = SMOTE(**sparam)
            X_samp, y_samp = oversampling.sample(X_train, y_train)

            for classifier, cparams in classifiers.items():
                for param in cparams:
                    job = {
                        'X_samp': X_samp,
                        'y_samp': y_samp,
                        'X_test': X_test,
                        'y_test': y_test,
                        'classifier': classifier,
                        'param': param
                    }
                    description = {
                        'name': dataset['name'],
                        'fold': fidx,
                        'sparam': sparam,
                        'classifier': classifier.__name__,
                        'cparam': param
                    }
                    yield job, description

In [8]:
def do_job(job, description):
    """Train one classifier on the oversampled data and score it on the test fold.

    Parameters
    ----------
    job : dict
        Must provide ``'classifier'`` (a class instantiated with the keyword
        arguments in ``'param'``), the training data ``'X_samp'`` / ``'y_samp'``
        and the evaluation data ``'X_test'`` / ``'y_test'``.
    description : dict
        Metadata identifying the job; it is not modified.

    Returns
    -------
    dict
        A new dict containing every entry of ``description`` plus ``'auc'``,
        the ROC AUC computed from the second column of ``predict_proba``.
    """
    model = job['classifier'](**job['param'])
    model.fit(job['X_samp'], job['y_samp'])

    # Score with the probabilities in column 1 of the predict_proba output.
    scores = model.predict_proba(job['X_test'])[:, 1]
    auc = roc_auc_score(job['y_test'], scores)

    return {**description, 'auc': auc}

In [9]:
# Run the full evaluation: for every dataset, execute all generated jobs on
# three parallel workers and write the scores to '<dataset name>-ml.csv'.
for data_loader in datasets:
    # Loaded here only to obtain the name; job_generator calls the loader
    # again on its own.
    dataset = data_loader()

    # Two datasets are excluded from the run.
    if dataset['name'] not in ('iris0', 'dermatology-6'):
        results = pd.DataFrame(
            Parallel(n_jobs=3)(
                delayed(do_job)(job, description)
                for job, description in job_generator(data_loader)
            )
        )
        results.to_csv(f"{dataset['name']}-ml.csv")

2023-12-16 11:14:50.048117 appendicitis


2023-12-16 11:21:45.349117 bupa
2023-12-16 11:31:09.550091 cleveland-0_vs_4
2023-12-16 11:42:26.786704 CM1


In [None]:
# Write the `results` frame currently in the kernel to '<dataset name>.csv'.
# This relies on `results` and `dataset` left behind by an earlier cell.
csv_path = f"{dataset['name']}.csv"
results.to_csv(csv_path)

In [None]:
# Smoke test: take only the first generated job of the first dataset and run it.
first_job, first_description = next(job_generator(datasets[0]))
do_job(first_job, first_description)

2023-12-14 21:42:48.609799 appendicitis


{'name': 'appendicitis',
 'fold': 0,
 'sparam': {'n_neighbors': 3,
  'proportion': 0.5,
  'random_state': 5,
  'ss_params': {'within_simplex_sampling': 'random'}},
 'classifier': 'DecisionTreeClassifier',
 'cparam': {'max_depth': 2, 'random_state': 5},
 'auc': 0.5823529411764707}

In [None]:
# Wrap whatever `results` currently holds in a DataFrame for inspection.
tmp = pd.DataFrame(results)

Unnamed: 0,name,fold,sparam,classifier,cparam,auc
0,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.582353
1,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.723529
2,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.764706
3,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.782353
4,appendicitis,0,"{'n_neighbors': 3, 'proportion': 0.5, 'random_...",RandomForestClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.976471
...,...,...,...,...,...,...
441,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.647059
442,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.661765
443,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",DecisionTreeClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.661765
444,appendicitis,1,"{'n_neighbors': 7, 'proportion': 0.5, 'random_...",RandomForestClassifier,"[{'max_depth': 2, 'random_state': 5}, {'max_de...",0.647059
