In [2]:
import os
import pandas as pd
from pathlib import Path
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
import pickle
from sklearn.metrics import roc_auc_score, average_precision_score
import optuna
import deslib.des.knora_u as des
import deslib.des.knora_e as knora_e
import deslib.des.des_p as desp
import deslib.static as static
import deslib.des.meta_des as meta
import numpy as np
from sklearn.ensemble import AdaBoostClassifier

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
path = Path(os.path.abspath(''))
data_path = path.resolve().parents[0]/'data'
with open(data_path/'pool_classifiers.pkl', 'rb') as model_file:
    pool = pickle.load(model_file)

In [6]:
train = pd.read_pickle(data_path/'train_OHE.pkl')
validation = pd.read_pickle(data_path/'validation_OHE.pkl')
train_x = train.drop(['label'], axis = 1)
train_y = train['label']
validation_x = validation.drop(['label'], axis = 1)
validation_y = validation['label']

In [7]:
test_OHE = pd.read_pickle(data_path/'test_final_OHE.pkl')
test_X = test_OHE.drop(['label'], axis = 1)
test_Y = test_OHE['label']

In [8]:
def probs(model,xtest):
    probs = model.predict_proba(xtest)
    if len(probs[0])>1:
        return [prob[1] for prob in probs]
    return probs

In [9]:
def perf_report(probs, testy):
    print(f"ROC_AUC is {roc_auc_score(y_true = testy, y_score = probs)}")
    print(f"PR_AUC is {average_precision_score(y_true = testy, y_score = probs)}")


In [26]:
pool_probs = probs(pool, test_X)

In [27]:
# performance of 50 bagged trees
roc_auc_score(y_true = test_Y, y_score = pool_probs)

0.8880216305468979

In [174]:
average_precision_score(y_true= test_Y, y_score= pool_probs)

0.3790472567667988

In [22]:
with open(data_path/'knu_homo_baggedtrees.pkl', 'rb') as model_file:
    ens = pickle.load(model_file)

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [28]:
ens_probs = probs(ens,test_X)

In [30]:
# performance of KNU using 50 bagged trees
roc_auc_score(y_true = test_Y, y_score = ens_probs)

0.8879578922910064

In [175]:
average_precision_score(y_true= test_Y, y_score= ens_probs)

0.3844581832481087

In [31]:
with open(data_path/'knu_hetero_rf_nn_NB.pkl', 'rb') as model_file:
    het_ens = pickle.load(model_file)

In [32]:
het_ens_probs = probs(het_ens, test_X)



In [33]:
# performance of hetero ensemble using nn, rf and NB
roc_auc_score(y_true = test_Y, y_score = het_ens_probs)

0.8864593346432779

In [176]:
average_precision_score(y_true= test_Y, y_score= het_ens_probs)

0.4032371269903813

In [23]:
with open(data_path/'rf.pkl', 'rb') as model_file:
    rf = pickle.load(model_file)

In [24]:
with open(data_path/'nn.pkl', 'rb') as model_file:
    nn = pickle.load(model_file)

In [25]:
with open(data_path/'naive_bayes.pkl', 'rb') as model_file:
    NB = pickle.load(model_file)

In [42]:
classifiers = [rf, nn, NB, pool]

In [77]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,20)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    knu = des.KNORAU(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state = 42)
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_knorau',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )    

[I 2023-10-30 14:45:50,442] A new study created in memory with name: heterogeneous_knorau


In [78]:
study.optimize(lambda trial: objective(trial, classifiers, train_x, train_y, validation_x, validation_y), n_trials= 60)

[W 2023-10-30 14:45:53,003] Trial 0 failed with parameters: {'k': 19, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.477447244685287} because of the following error: ValueError('Invalid value for parameter "voting". "voting" should be one of these options {selection, hybrid, weighting}').
Traceback (most recent call last):
  File "c:\Users\65829\anaconda3\envs\wp\lib\site-packages\optuna\study\_optimize.py", line 200, in _run_trial
    value_or_values = func(trial)
  File "C:\Users\65829\AppData\Local\Temp\ipykernel_23688\2213714495.py", line 1, in <lambda>
    study.optimize(lambda trial: objective(trial, classifiers, train_x, train_y, validation_x, validation_y), n_trials= 60)
  File "C:\Users\65829\AppData\Local\Temp\ipykernel_23688\1178912775.py", line 7, in objective
    knu.fit(X_train, y_train)
  File "c:\Users\65829\anaconda3\envs\wp\lib\site-packages\deslib\base.py", line 120, in fit
    self._validate_parameters()
  File "c:\Users\65829\anaconda3\envs\wp\lib\site-packages\deslib\

ValueError: Invalid value for parameter "voting". "voting" should be one of these options {selection, hybrid, weighting}

In [49]:
study.best_params

{'k': 19, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.23793040741239418}

In [15]:
trainf = pd.read_pickle(data_path/'train_final_OHE.pkl')
testf = pd.read_pickle(data_path/'test_final_OHE.pkl')
trainf_x = trainf.drop(['label'], axis = 1)
trainf_y = trainf['label']
testf_x = testf.drop(['label'], axis = 1)
testf_y = testf['label']

In [55]:
knu = des.KNORAU(classifiers, k = 19, with_IH = False, DFP = False, IH_rate=0.23793040741239418)

In [56]:
knu.fit(trainf_x, trainf_y)



In [114]:
new_het_probs = probs(knu, testf_x)



In [115]:
# performance of hetero ensemble using nn, rf and NB and POOL and KNORA-U
roc_auc_score(y_true = testf_y, y_score = new_het_probs)

0.8932043486924863

In [177]:
average_precision_score(y_true= test_Y, y_score= new_het_probs)

0.4117238064544785

In [83]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,20)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    voting = trial.suggest_categorical('voting', [True, False])
    kne = knora_e.KNORAE(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state = 42)
    kne.fit(X_train, y_train)
    probs = kne.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_knorae',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )  

[I 2023-10-30 14:47:45,585] A new study created in memory with name: heterogeneous_knorae


In [85]:
study.optimize(lambda trial: objective(trial, classifiers, train_x, train_y, validation_x, validation_y), n_trials= 30)

[I 2023-10-30 15:01:47,242] Trial 60 finished with value: 0.7437138257020447 and parameters: {'k': 19, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.48010221713449736, 'voting': False}. Best is trial 59 with value: 0.8536028987229447.
[I 2023-10-30 15:02:01,341] Trial 61 finished with value: 0.839338229120786 and parameters: {'k': 18, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.49851394018178924, 'voting': False}. Best is trial 59 with value: 0.8536028987229447.
[I 2023-10-30 15:02:14,463] Trial 62 finished with value: 0.8269923069272529 and parameters: {'k': 17, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.45296012389977774, 'voting': False}. Best is trial 59 with value: 0.8536028987229447.
[I 2023-10-30 15:02:28,803] Trial 63 finished with value: 0.839338229120786 and parameters: {'k': 18, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.4793291681824581, 'voting': False}. Best is trial 59 with value: 0.8536028987229447.
[I 2023-10-30 15:02:42,142] Trial 64 finished with value: 0.83396689886244

In [87]:
study.best_params

{'k': 19,
 'ih_ind': True,
 'dfp': False,
 'ih_rate': 0.49902251082966376,
 'voting': False}

In [121]:
kne = knora_e.KNORAE(classifiers, k = 19, with_IH = True, DFP = False, IH_rate = 0.49902251082966376, random_state=42)

In [122]:
kne.fit(trainf_x, trainf_y)



In [123]:
het_kne_probs = probs(kne, testf_x)



In [125]:
# performance of hetero ensemble using nn, rf and NB and POOL and KNORA-E
roc_auc_score(y_true = testf_y, y_score = het_kne_probs)

0.8566613990871392

In [178]:
average_precision_score(y_true= test_Y, y_score= het_kne_probs)

0.22564371449556567

In [19]:
# try upsizing the pooled classifier
rng = np.random.RandomState(42)
pool_classifiers = BaggingClassifier(DecisionTreeClassifier(random_state=rng),
                                     random_state=rng, n_estimators= 100, n_jobs= -1)

In [20]:
pool_classifiers.fit(train_x, train_y)

In [21]:
perf_report(probs(pool_classifiers, test_X), test_Y)

ROC_AUC is 0.8919952050504152
PR_AUC is 0.38672349804710643


In [22]:
model_filename = 'pool_100.pkl'

# Open the file in write-binary mode and save the model
with open(data_path/model_filename, 'wb') as file:
    pickle.dump(pool_classifiers, file)

In [132]:
# try adding a knn classifier
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=7, weights = 'distance', n_jobs = -1)
neigh.fit(train_x, train_y)

In [134]:
L_pool_probs = probs(pool_classifiers, test_X)

In [135]:
# performance of 100 bagged trees
roc_auc_score(y_true = testf_y, y_score = L_pool_probs)

0.8919952050504152

In [179]:
average_precision_score(y_true= test_Y, y_score= L_pool_probs)

0.38672349804710643

In [140]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,20)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    knu = des.KNORAU(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state= 42)
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'homogeneous_knorau',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )    

[I 2023-10-30 16:08:38,427] A new study created in memory with name: homogeneous_knorau


In [141]:
study.optimize(lambda trial: objective(trial, pool_classifiers, train_x, train_y, validation_x, validation_y), n_trials= 60)

[I 2023-10-30 16:09:46,602] Trial 0 finished with value: 0.8749438454772198 and parameters: {'k': 18, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.41991598958386905}. Best is trial 0 with value: 0.8749438454772198.
[I 2023-10-30 16:10:50,206] Trial 1 finished with value: 0.8095132994834688 and parameters: {'k': 18, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.4601683685757624}. Best is trial 0 with value: 0.8749438454772198.
[I 2023-10-30 16:11:48,681] Trial 2 finished with value: 0.8030062528819961 and parameters: {'k': 7, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.38118374798116156}. Best is trial 0 with value: 0.8749438454772198.
[I 2023-10-30 16:12:21,013] Trial 3 finished with value: 0.8342882616058062 and parameters: {'k': 8, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.11780882785553337}. Best is trial 0 with value: 0.8749438454772198.
[I 2023-10-30 16:12:51,310] Trial 4 finished with value: 0.8484972933455437 and parameters: {'k': 15, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.0802079

In [143]:
study.best_params

{'k': 5, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.42013941411333383}

In [144]:
large_homo = des.KNORAU(pool_classifiers, k = 5, with_IH = False, DFP = False, random_state= 42)

In [145]:
large_homo.fit(trainf_x, trainf_y)

In [146]:
large_homo_p = probs(large_homo, testf_x)

In [147]:
# KNORAU 100 bagged trees
roc_auc_score(y_true = testf_y, y_score = large_homo_p)

0.892250721098477

In [183]:
average_precision_score(y_true = testf_y, y_score = large_homo_p)

0.38900615218763934

In [162]:
# try using 100 pooled DTs in knora-u
new_classifiers = [pool_classifiers, nn, rf, NB]


In [163]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,20)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    voting = trial.suggest_categorical('voting', [True, False])
    kne = knora_e.KNORAE(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state = 42)
    kne.fit(X_train, y_train)
    probs = kne.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_knorae',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )  

[I 2023-10-31 18:19:41,871] A new study created in memory with name: heterogeneous_knorae


In [164]:
study.optimize(lambda trial: objective(trial, new_classifiers, train_x, train_y, validation_x, validation_y), n_trials= 60)

[I 2023-10-31 18:20:01,968] Trial 0 finished with value: 0.7429957646767062 and parameters: {'k': 19, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.08667761331725266, 'voting': False}. Best is trial 0 with value: 0.7429957646767062.
[I 2023-10-31 18:20:18,730] Trial 1 finished with value: 0.8412129958163428 and parameters: {'k': 13, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.47870470025148665, 'voting': True}. Best is trial 1 with value: 0.8412129958163428.
[I 2023-10-31 18:20:31,578] Trial 2 finished with value: 0.7627793033610047 and parameters: {'k': 8, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.0011836128751728792, 'voting': False}. Best is trial 1 with value: 0.8412129958163428.
[I 2023-10-31 18:20:45,128] Trial 3 finished with value: 0.7747098629170902 and parameters: {'k': 12, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.17692014261856442, 'voting': True}. Best is trial 1 with value: 0.8412129958163428.
[I 2023-10-31 18:20:56,542] Trial 4 finished with value: 0.7894005835678756 and pa

In [165]:
study.best_params

{'k': 19,
 'ih_ind': True,
 'dfp': True,
 'ih_rate': 0.48442576184359243,
 'voting': True}

In [167]:
kne = knora_e.KNORAE(new_classifiers, k = 19, with_IH = True, DFP = True, IH_rate=0.48442576184359243, random_state = 42)

In [168]:
kne.fit(trainf_x, trainf_y)



In [169]:
large_kne_probs = probs(kne, testf_x)



In [171]:
# KNORAE (100 bagged trees, nn, rf, NB)
roc_auc_score(y_true = testf_y, y_score = large_kne_probs)

0.8577112650407139

In [184]:
average_precision_score(y_true = testf_y, y_score = large_kne_probs)

0.2253622861483084

In [185]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,20)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    knu = des.KNORAU(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state= 42)
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_knorau',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )    

[I 2023-10-31 19:06:14,705] A new study created in memory with name: heterogeneous_knorau


In [186]:
study.optimize(lambda trial: objective(trial, new_classifiers, train_x, train_y, validation_x, validation_y), n_trials= 60)

[I 2023-10-31 19:06:54,569] Trial 0 finished with value: 0.8686811146430587 and parameters: {'k': 15, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.1374677126957008}. Best is trial 0 with value: 0.8686811146430587.
[I 2023-10-31 19:07:16,332] Trial 1 finished with value: 0.8941084573425833 and parameters: {'k': 7, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.1364014667945675}. Best is trial 1 with value: 0.8941084573425833.
[I 2023-10-31 19:07:46,886] Trial 2 finished with value: 0.8667541975425049 and parameters: {'k': 17, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.23487182375145965}. Best is trial 1 with value: 0.8941084573425833.
[I 2023-10-31 19:07:58,196] Trial 3 finished with value: 0.9121018324651285 and parameters: {'k': 7, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.21201472923561204}. Best is trial 3 with value: 0.9121018324651285.
[I 2023-10-31 19:08:10,665] Trial 4 finished with value: 0.8714264863349794 and parameters: {'k': 15, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.11883642

In [189]:
study.best_params

{'k': 19, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.2975670784147758}

In [190]:
knu = des.KNORAU(new_classifiers, k = 19, with_IH = False, DFP = False, IH_rate=0.2975670784147758, random_state= 42)

In [191]:
knu.fit(trainf_x, trainf_y)



In [193]:
best_knu_probs = probs(knu, test_X) 



In [195]:
# KNORAU (100 bagged trees, nn, rf, NB)
roc_auc_score(y_true = testf_y, y_score = best_knu_probs)

0.8939318345854221

In [196]:
average_precision_score(y_true = testf_y, y_score = best_knu_probs)

0.4161210342204853

In [197]:
knn_probs = probs(neigh,testf_x)

In [205]:
perf_report(knn_probs, test_Y)

ROC_AUC is 0.8005969729318013
PR_AUC is 0.15994111483792048


In [207]:
most_classifiers = [neigh, nn, NB, rf, pool_classifiers]

In [210]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,25)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    knu = des.KNORAU(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state= 42)
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_knorau',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )    

[I 2023-10-31 23:10:26,464] A new study created in memory with name: heterogeneous_knorau


In [211]:
study.optimize(lambda trial: objective(trial, most_classifiers, train_x, train_y, validation_x, validation_y), n_trials= 60)

[I 2023-10-31 23:12:21,414] Trial 0 finished with value: 0.8309353619141233 and parameters: {'k': 8, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.11284873292412623}. Best is trial 0 with value: 0.8309353619141233.
[I 2023-10-31 23:14:06,454] Trial 1 finished with value: 0.8479495364910755 and parameters: {'k': 16, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.17803084687887832}. Best is trial 1 with value: 0.8479495364910755.
[I 2023-10-31 23:16:06,535] Trial 2 finished with value: 0.8920990683322075 and parameters: {'k': 5, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.15177432787339895}. Best is trial 2 with value: 0.8920990683322075.
[I 2023-10-31 23:18:02,895] Trial 3 finished with value: 0.8958358466707054 and parameters: {'k': 10, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.03740453111090747}. Best is trial 3 with value: 0.8958358466707054.
[I 2023-10-31 23:20:05,650] Trial 4 finished with value: 0.8540893344510418 and parameters: {'k': 24, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.229674

In [212]:
study.best_params

{'k': 19, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.3166105946899518}

In [222]:
bknu = des.KNORAU(new_classifiers, k = 19, with_IH = False, DFP = False, IH_rate=0.3166105946899518, random_state= 42)

In [223]:
bknu.fit(trainf_x, trainf_y)



In [224]:
bknu_probs = probs(bknu, test_X)



In [225]:
perf_report(bknu_probs, test_Y)

ROC_AUC is 0.8939318345854221
PR_AUC is 0.4161210342204853


In [226]:
perf_report(probs(nn,test_X), test_Y)

ROC_AUC is 0.8882016624832005
PR_AUC is 0.3774998767083372


In [227]:
perf_report(probs(NB,test_X), test_Y)

ROC_AUC is 0.7796173405953397
PR_AUC is 0.11218142453121212


In [228]:
perf_report(probs(rf,test_X), test_Y)

ROC_AUC is 0.8878310370476331
PR_AUC is 0.3773455197352872


In [229]:
perf_report(probs(pool_classifiers,test_X), test_Y)

ROC_AUC is 0.8919952050504152
PR_AUC is 0.38672349804710643


In [230]:
import pandas as pd
import pickle
import optuna.integration.lightgbm as lgb
from lightgbm import early_stopping
from lightgbm import log_evaluation

# Load the model from the 'model.pkl' file
loaded_model = pickle.load(open(data_path/'model.pkl', 'rb'))
test=pd.read_pickle(data_path/'test_final.pkl')
X_test=test.drop("label",axis=1)
y_test=test["label"]
categorical_features = ["sequence", "m1_seq", "p1_seq"]
for c in categorical_features:
    X_test[c]=X_test[c].astype("category")
predictions = loaded_model.predict(X_test,raw_scores=True)



In [239]:
loaded_model.params

{'objective': 'binary',
 'metric': 'binary_logloss',
 'verbosity': -1,
 'boosting_type': 'gbdt',
 'feature_pre_filter': False,
 'lambda_l1': 6.929425894844475,
 'lambda_l2': 1.1426652031102964e-08,
 'num_leaves': 247,
 'feature_fraction': 0.7799999999999999,
 'bagging_fraction': 0.9901149131368316,
 'bagging_freq': 1,
 'min_child_samples': 20,
 'categorical_column': [0, 1, 2],
 'num_iterations': 1000}

In [231]:
perf_report(predictions, test_Y)

ROC_AUC is 0.9116249182061309
PR_AUC is 0.4259408207595274


In [237]:
from sklearn.ensemble import VotingClassifier
voting_classifiers = [("perceptron", nn),
                      ("bayes", NB),
                   ("tree", pool_classifiers),
                      ("knn", neigh),
                    #  ("random_forest"), rf
                    ]
model_voting = VotingClassifier(estimators=voting_classifiers, voting = 'soft').fit(
    trainf_x, trainf_y)


In [238]:
perf_report(probs(model_voting,test_X), test_Y)

ROC_AUC is 0.881317930846924
PR_AUC is 0.3817898206586108


In [244]:
stacked_dt = static.StackedClassifier(new_classifiers,
                               random_state=42,
                               meta_classifier=DecisionTreeClassifier())

In [245]:
stacked_dt.fit(trainf_x, trainf_y)



In [246]:
perf_report(probs(stacked_dt, test_X),test_Y)





ROC_AUC is 0.7518043964369635
PR_AUC is 0.20847399064942276




In [247]:
stacked_lr = static.StackedClassifier(new_classifiers,
                               random_state=42)

In [248]:
stacked_lr.fit(trainf_x, trainf_y)



In [249]:
perf_report(probs(stacked_lr, test_X),test_Y)



ROC_AUC is 0.8810642591894527
PR_AUC is 0.398491036040853


In [250]:
stacked_lr = static.StackedClassifier(most_classifiers,
                               random_state=42)

In [251]:
stacked_lr.fit(trainf_x, trainf_y)



In [252]:
perf_report(probs(stacked_lr, test_X),test_Y)



ROC_AUC is 0.8854886998265574
PR_AUC is 0.3807323312788329


In [264]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,25)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    knu = desp.DESP(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state= 42, voting = 'soft')
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_desp',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )    

[I 2023-11-01 09:41:30,119] A new study created in memory with name: heterogeneous_desp


In [255]:
study.optimize(lambda trial: objective(trial, new_classifiers, train_x, train_y, validation_x, validation_y), n_trials= 60)

[I 2023-11-01 09:18:29,729] Trial 0 finished with value: 0.9144118209306862 and parameters: {'k': 6, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.39759090858917373}. Best is trial 0 with value: 0.9144118209306862.
[I 2023-11-01 09:18:55,035] Trial 1 finished with value: 0.9160800526476907 and parameters: {'k': 18, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.37088816397932034}. Best is trial 1 with value: 0.9160800526476907.
[I 2023-11-01 09:19:18,282] Trial 2 finished with value: 0.9164099402869791 and parameters: {'k': 16, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.36252684239221405}. Best is trial 2 with value: 0.9164099402869791.
[I 2023-11-01 09:19:41,141] Trial 3 finished with value: 0.9140040790335352 and parameters: {'k': 8, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.3137865142094863}. Best is trial 2 with value: 0.9164099402869791.
[I 2023-11-01 09:20:04,738] Trial 4 finished with value: 0.9121154916733724 and parameters: {'k': 22, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.216

In [257]:
study.best_params

{'k': 10, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.20646492058689186}

In [269]:
desp_model = desp.DESP(new_classifiers, k = 10, with_IH = False, DFP = True, IH_rate=0.20646492058689186, random_state= 42, voting = 'soft')

AttributeError: 'DESP' object has no attribute 'DESP'

In [260]:
desp.fit(trainf_x, trainf_y)



In [261]:
perf_report(probs(desp, test_X),test_Y)



ROC_AUC is 0.8990381561312746
PR_AUC is 0.42561512367948906


In [271]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,25)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    knu = desp.DESP(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state= 42, voting = 'soft')
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_desp',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )    

[I 2023-11-01 09:43:01,132] A new study created in memory with name: heterogeneous_desp


In [272]:
study.optimize(lambda trial: objective(trial, most_classifiers, train_x, train_y, validation_x, validation_y), n_trials= 60)

[I 2023-11-01 09:46:35,022] Trial 0 finished with value: 0.870807979302586 and parameters: {'k': 21, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.13625878888830706}. Best is trial 0 with value: 0.870807979302586.
[I 2023-11-01 09:49:30,729] Trial 1 finished with value: 0.9012682720165126 and parameters: {'k': 16, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.4127632371488429}. Best is trial 1 with value: 0.9012682720165126.
[I 2023-11-01 09:52:24,600] Trial 2 finished with value: 0.905050839379336 and parameters: {'k': 9, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.4431043961378874}. Best is trial 2 with value: 0.905050839379336.
[I 2023-11-01 09:56:01,144] Trial 3 finished with value: 0.9051661515132335 and parameters: {'k': 14, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.047490033154447664}. Best is trial 3 with value: 0.9051661515132335.
[I 2023-11-01 09:58:00,128] Trial 4 finished with value: 0.7889201863077255 and parameters: {'k': 6, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.191334377

In [273]:
study.best_params

{'k': 20, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.061371970722223765}

In [277]:
desp_B = desp.DESP(most_classifiers, k = 20, with_IH = False, DFP = True, IH_rate=0.061371970722223765, random_state= 42, voting = 'soft')

In [278]:
desp_B.fit(trainf_x, trainf_y)



In [279]:
perf_report(probs(desp_B, test_X),test_Y)



ROC_AUC is 0.895051729137535
PR_AUC is 0.3823656357587157


In [298]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,25)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    knu = desp.DESP(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state= 42, voting = 'soft')
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return average_precision_score(y_true=y_valid,y_score=true_probs), roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_desp', directions = ["maximize","maximize"] )    

[I 2023-11-01 12:25:01,700] A new study created in memory with name: heterogeneous_desp


In [299]:
study.optimize(lambda trial: objective(trial, new_classifiers, train_x, train_y, validation_x, validation_y), n_trials= 60)

[I 2023-11-01 12:25:24,608] Trial 0 finished with values: [0.25440127190966105, 0.8878434338280786] and parameters: {'k': 21, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.0959566887816522}. 
[I 2023-11-01 12:25:37,301] Trial 1 finished with values: [0.45191029760574314, 0.9123476659222454] and parameters: {'k': 17, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.024209491754130785}. 
[I 2023-11-01 12:25:49,549] Trial 2 finished with values: [0.45029030650500257, 0.9150965896541345] and parameters: {'k': 21, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.20779164138672157}. 
[I 2023-11-01 12:26:01,458] Trial 3 finished with values: [0.4496822784538822, 0.9164691301893689] and parameters: {'k': 5, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.2970759033133715}. 
[I 2023-11-01 12:26:15,303] Trial 4 finished with values: [0.2119876403446445, 0.8619803849894666] and parameters: {'k': 15, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.25174862505069795}. 
[I 2023-11-01 12:26:27,316] Trial 5 finished with value

In [300]:
study.best_trials

[FrozenTrial(number=1, state=TrialState.COMPLETE, values=[0.45191029760574314, 0.9123476659222454], datetime_start=datetime.datetime(2023, 11, 1, 12, 25, 24, 609745), datetime_complete=datetime.datetime(2023, 11, 1, 12, 25, 37, 301827), params={'k': 17, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.024209491754130785}, user_attrs={}, system_attrs={'nsga2:generation': 0}, intermediate_values={}, distributions={'k': IntDistribution(high=25, log=False, low=5, step=1), 'ih_ind': CategoricalDistribution(choices=(True, False)), 'dfp': CategoricalDistribution(choices=(True, False)), 'ih_rate': FloatDistribution(high=0.5, log=False, low=0.0, step=None)}, trial_id=1, value=None),
 FrozenTrial(number=24, state=TrialState.COMPLETE, values=[0.451030886349055, 0.9165993608911357], datetime_start=datetime.datetime(2023, 11, 1, 12, 30, 32, 320569), datetime_complete=datetime.datetime(2023, 11, 1, 12, 30, 44, 200979), params={'k': 14, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.02143242982711463}, user_a

In [313]:
opt = desp.DESP(new_classifiers, k = 14, with_IH = False, DFP = True, IH_rate=0.2650472926942986, random_state= 42, voting = 'soft')

In [314]:
opt.fit(trainf_x, trainf_y)



In [315]:
perf_report(probs(opt, test_X), test_Y)



ROC_AUC is 0.8952215878036313
PR_AUC is 0.419030986253681


In [319]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,25)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    Kp = trial.suggest_int('Kp',3,7)
    selection_threshold = trial.suggest_float('selection_threshold',0.5,1.0)
    knu = meta.METADES(pool, k = k, with_IH = ih_ind, DFP = True, Kp = Kp, selection_threshold = selection_threshold, IH_rate=ih_rate, random_state= 42, voting = 'soft')
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_METADES', direction = 'maximize')    

[I 2023-11-01 13:09:28,922] A new study created in memory with name: heterogeneous_METADES


In [320]:
study.optimize(lambda trial: objective(trial, new_classifiers, train_x, train_y, validation_x, validation_y), n_trials= 70)

[I 2023-11-01 13:11:34,150] Trial 0 finished with value: 0.8969806045701192 and parameters: {'k': 25, 'ih_ind': False, 'ih_rate': 0.32911710820999796, 'Kp': 7, 'selection_threshold': 0.8184094366310651}. Best is trial 0 with value: 0.8969806045701192.
[I 2023-11-01 13:13:18,732] Trial 1 finished with value: 0.8608023993707077 and parameters: {'k': 23, 'ih_ind': True, 'ih_rate': 0.3406689405855055, 'Kp': 7, 'selection_threshold': 0.8822867041276499}. Best is trial 0 with value: 0.8969806045701192.
[I 2023-11-01 13:15:38,377] Trial 2 finished with value: 0.863288504436175 and parameters: {'k': 19, 'ih_ind': True, 'ih_rate': 0.16584982051575997, 'Kp': 7, 'selection_threshold': 0.729992490003902}. Best is trial 0 with value: 0.8969806045701192.
[I 2023-11-01 13:19:26,830] Trial 3 finished with value: 0.8499493349935352 and parameters: {'k': 12, 'ih_ind': True, 'ih_rate': 0.10748479910376141, 'Kp': 6, 'selection_threshold': 0.5589753615931787}. Best is trial 0 with value: 0.8969806045701192

In [321]:
study.best_params

{'k': 24,
 'ih_ind': False,
 'ih_rate': 0.4940692081713565,
 'Kp': 5,
 'selection_threshold': 0.6544931130898003}

In [322]:
metaD = meta.METADES(new_classifiers, k = 24, with_IH = False, DFP = True, Kp = 5, selection_threshold = 0.6544931130898003, IH_rate=0.2, random_state= 42, voting = 'soft')

In [323]:
metaD.fit(trainf_x, trainf_y)



In [324]:
perf_report(probs(metaD, test_X), test_Y)



ROC_AUC is 0.8880722833369133
PR_AUC is 0.3801563683493936


In [3]:
def objective(trial, X_train, y_train, X_valid, y_valid):
    n_estimators = trial.suggest_int('n_estimators',50, 350)
    learning_rate = trial.suggest_float('lr', 0.01, 10) 
    adaB = AdaBoostClassifier(n_estimators= n_estimators, learning_rate= learning_rate, random_state= 42)
    adaB.fit(X_train, y_train)
    probs = adaB.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'adaB', direction = 'maximize')    

[I 2023-11-01 16:25:02,610] A new study created in memory with name: adaB


In [11]:
study.optimize(lambda trial: objective(trial, train_x, train_y, validation_x, validation_y), n_trials= 40, n_jobs= -1)

[I 2023-11-01 16:33:12,315] Trial 2 finished with value: 0.6678811952420748 and parameters: {'n_estimators': 70, 'lr': 6.928955093144314}. Best is trial 2 with value: 0.6678811952420748.
[I 2023-11-01 16:36:05,356] Trial 4 finished with value: 0.6678811952420748 and parameters: {'n_estimators': 128, 'lr': 27.72602963144483}. Best is trial 2 with value: 0.6678811952420748.
[I 2023-11-01 16:36:09,993] Trial 9 finished with value: 0.7766286749082605 and parameters: {'n_estimators': 134, 'lr': 0.011315048325636646}. Best is trial 9 with value: 0.7766286749082605.
[I 2023-11-01 16:38:22,429] Trial 8 finished with value: 0.7911327520043192 and parameters: {'n_estimators': 179, 'lr': 0.01285761480905331}. Best is trial 8 with value: 0.7911327520043192.
[I 2023-11-01 16:38:59,739] Trial 6 finished with value: 0.8835080721722856 and parameters: {'n_estimators': 197, 'lr': 0.7582447017231118}. Best is trial 6 with value: 0.8835080721722856.
[I 2023-11-01 16:41:24,417] Trial 5 finished with value

In [12]:
study.best_params

{'n_estimators': 205, 'lr': 0.730555370099575}

In [13]:
adaB = AdaBoostClassifier(n_estimators = 205, learning_rate = 0.730555370099575)

In [16]:
adaB.fit(trainf_x, trainf_y)

In [17]:
perf_report(probs(adaB, test_X), test_Y)

ROC_AUC is 0.8944882386900804
PR_AUC is 0.399560486943237


In [18]:
model_filename = 'adaboost.pkl'

# Open the file in write-binary mode and save the model
with open(data_path/model_filename, 'wb') as file:
    pickle.dump(adaB, file)

In [26]:
class_v0 = [adaB, pool_classifiers, rf, nn, NB]

In [42]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,25)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    desp_model = desp.DESP(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state= 42, voting = 'soft')
    desp_model.fit(X_train, y_train)
    probs = desp_model.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_desp',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )    

[I 2023-11-01 23:55:35,456] A new study created in memory with name: heterogeneous_desp


In [43]:
study.optimize(lambda trial: objective(trial, class_v0, train_x, train_y, validation_x, validation_y), n_trials= 60, n_jobs= -1)

[I 2023-11-01 23:57:16,392] Trial 7 finished with value: 0.9118276472708062 and parameters: {'k': 5, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.3312643627630331}. Best is trial 7 with value: 0.9118276472708062.
[I 2023-11-01 23:57:31,226] Trial 1 finished with value: 0.912524815842873 and parameters: {'k': 20, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.23674675215560648}. Best is trial 1 with value: 0.912524815842873.
[I 2023-11-01 23:57:34,636] Trial 4 finished with value: 0.8297437493783929 and parameters: {'k': 15, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.3534551790779074}. Best is trial 1 with value: 0.912524815842873.
[I 2023-11-01 23:57:36,479] Trial 3 finished with value: 0.8403865006981374 and parameters: {'k': 20, 'ih_ind': True, 'dfp': True, 'ih_rate': 0.3331976143251249}. Best is trial 1 with value: 0.912524815842873.
[I 2023-11-01 23:57:39,519] Trial 6 finished with value: 0.8256301641817456 and parameters: {'k': 5, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.07197060426890

In [44]:
study.best_params

{'k': 6, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.4565716319490343}

In [45]:
desp_model = desp.DESP(class_v0, k = 6, with_IH = False, DFP = True, IH_rate=0.4, random_state= 42, voting = 'soft')

In [46]:
desp_model.fit(trainf_x, trainf_y)



In [47]:
perf_report(probs(desp_model, test_X), test_Y)



ROC_AUC is 0.8961270670959012
PR_AUC is 0.407286596404152


In [48]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,25)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    dfp = trial.suggest_categorical('dfp', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    knu = des.KNORAU(pool, k = k, with_IH = ih_ind, DFP = dfp, IH_rate=ih_rate, random_state= 42)
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_knorau',pruner = optuna.pruners.HyperbandPruner(
        min_resource=1, reduction_factor=3
    ), direction = "maximize", )    

[I 2023-11-02 00:13:51,606] A new study created in memory with name: heterogeneous_knorau


In [49]:
study.optimize(lambda trial: objective(trial, class_v0, train_x, train_y, validation_x, validation_y), n_trials= 60, n_jobs= -1)

[I 2023-11-02 00:15:49,751] Trial 3 finished with value: 0.9120148720684368 and parameters: {'k': 21, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.33661534100265017}. Best is trial 3 with value: 0.9120148720684368.
[I 2023-11-02 00:15:50,029] Trial 7 finished with value: 0.8936530535273048 and parameters: {'k': 8, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.4157728450413322}. Best is trial 3 with value: 0.9120148720684368.
[I 2023-11-02 00:15:50,726] Trial 4 finished with value: 0.9125405739838259 and parameters: {'k': 16, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.0829297299824236}. Best is trial 4 with value: 0.9125405739838259.
[I 2023-11-02 00:15:57,620] Trial 0 finished with value: 0.8972950569811793 and parameters: {'k': 16, 'ih_ind': False, 'dfp': True, 'ih_rate': 0.41051074176870805}. Best is trial 4 with value: 0.9125405739838259.
[I 2023-11-02 00:16:02,613] Trial 2 finished with value: 0.8459162842303634 and parameters: {'k': 22, 'ih_ind': True, 'dfp': False, 'ih_rate': 0.3220

In [50]:
study.best_params

{'k': 18, 'ih_ind': False, 'dfp': False, 'ih_rate': 0.2107380391285169}

In [51]:
knu_v0 = des.KNORAU(class_v0, k = 18, with_IH = False, DFP = False, IH_rate=0.03, random_state= 42)

In [52]:
knu_v0.fit(trainf_x, trainf_y)



In [53]:
perf_report(probs(knu_v0,test_X), test_Y)



ROC_AUC is 0.8987165332415098
PR_AUC is 0.3985937736713405


In [54]:
def objective(trial, pool, X_train, y_train, X_valid, y_valid):
    k = trial.suggest_int('k', 5,25)
    ih_ind = trial.suggest_categorical('ih_ind', [True, False])
    ih_rate = trial.suggest_float('ih_rate', 0.0,0.5)
    # Kp = trial.suggest_int('Kp',3,7)
    selection_threshold = trial.suggest_float('selection_threshold',0.5,1.0)
    knu = meta.METADES(pool, k = k, with_IH = ih_ind, DFP = True, Kp = 5, selection_threshold = selection_threshold, IH_rate=ih_rate, random_state= 42, voting = 'soft')
    knu.fit(X_train, y_train)
    probs = knu.predict_proba(X_valid)
    true_probs = [entry[1] for entry in probs]
    return roc_auc_score(y_true=y_valid,y_score=true_probs)

study = optuna.create_study(study_name = 'heterogeneous_METADES', direction = 'maximize')    

[I 2023-11-02 01:07:36,305] A new study created in memory with name: heterogeneous_METADES


In [55]:
study.optimize(lambda trial: objective(trial, class_v0, train_x, train_y, validation_x, validation_y), n_trials= 60, n_jobs= -1)

[I 2023-11-02 01:22:27,497] Trial 6 finished with value: 0.8615419663834938 and parameters: {'k': 23, 'ih_ind': True, 'ih_rate': 0.06255761379758434, 'selection_threshold': 0.5629197260306139}. Best is trial 6 with value: 0.8615419663834938.
[I 2023-11-02 01:22:32,888] Trial 1 finished with value: 0.8739744614784495 and parameters: {'k': 9, 'ih_ind': False, 'ih_rate': 0.04040183086071908, 'selection_threshold': 0.9639061068683175}. Best is trial 1 with value: 0.8739744614784495.
[I 2023-11-02 01:22:37,598] Trial 5 finished with value: 0.8751847706609249 and parameters: {'k': 12, 'ih_ind': False, 'ih_rate': 0.04397474882861713, 'selection_threshold': 0.6158491983248146}. Best is trial 5 with value: 0.8751847706609249.
[I 2023-11-02 01:22:52,264] Trial 0 finished with value: 0.8764932130203578 and parameters: {'k': 23, 'ih_ind': False, 'ih_rate': 0.3236974924630173, 'selection_threshold': 0.5487939907692118}. Best is trial 0 with value: 0.8764932130203578.
[I 2023-11-02 01:22:53,684] Tri