In [7]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import os

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import minmax_scale, normalize, robust_scale, OneHotEncoder

from sklearn.kernel_approximation import Nystroem
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.utils import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import roc_auc_score

# `sys` is not imported anywhere else in this cell, so on a fresh kernel the
# original bare `sys.path.append(...)` raised NameError. Import it here.
import sys

# Directory that holds the project-local packages imported just below
# (Preprocessing_Methods, TSquared).
# NOTE(review): this is a machine-specific absolute path; consider deriving it
# from an environment variable or a path relative to the notebook.
_PROJECT_DIR = '/mnt/c/Users/conor/Git_Projects/PHD/'

# Guard the append so re-running the cell does not stack duplicate entries.
if _PROJECT_DIR not in sys.path:
    sys.path.append(_PROJECT_DIR)

from Preprocessing_Methods import *

from TSquared.hotelling_t2 import HotellingT2

from tqdm.notebook import tqdm

import optuna

In [2]:
# Name of the index level that carries the binary target.
y_label = 'Opt_Group'

# Full data table. NOTE: pickles execute code on load -- only read trusted files.
total = pd.read_pickle('./December_Datat_32.pickle')
# Optional restriction of the spectral window (disabled):
#total = truncate(total, start=1000, end=1800)

# A Hotelling T2 model is fitted on the whole table; the outlier filter that
# would consume it is currently switched off, so `ht2` is only kept around.
ht2 = HotellingT2().fit(total)
#total = total.loc[ht2.predict(total)==1,:]

# Row count of the least-represented patient (only needed by the disabled
# per-patient down-sampling: total.groupby('Patient_Number').sample(min_)).
min_ = (
    total.index
    .get_level_values('Patient_Number')
    .value_counts()
    .min()
)

# Move the 'ASMA' level into the columns and drop incomplete rows
# (an optional `.sample(2000)` sub-sample is disabled).
sel = total.reset_index(['ASMA']).dropna()

# Append a running row counter as an extra index level called 'IDX'.
sel = sel.assign(IDX=np.arange(len(sel))).set_index('IDX', append=True)

# Force every column label to a string.
sel.columns = list(map(str, sel.columns))

# 'balanced' weights computed once per patient and once per target label,
# both read straight from the index levels, then combined multiplicatively.
patient_weights = compute_sample_weight(
    'balanced', sel.index.get_level_values('Patient_Number'))
label_weights = compute_sample_weight(
    'balanced', sel.index.get_level_values(y_label))

weights = patient_weights * label_weights

In [4]:
# Column labels of the full table, as a plain Python list.
cols = total.columns.to_list()

In [5]:
# Table of pre-drawn bootstrap splits, consumed row by row by bootstrap_it().
# NOTE: pickles execute code on load -- only read trusted files.
bootstrap_pats = pd.read_pickle(
    './train_indices_20_December.pickle'
)

def bootstrap_it(n):
    """Yield train/test row indices for the first ``n`` bootstrap splits.

    Every row of the module-level ``bootstrap_pats`` table is read as one
    split: its first cell holds the patient numbers used for training and
    its second cell the patient numbers used for testing. For each split the
    rows of the module-level ``sel`` frame belonging to those patients are
    looked up through the ``Patient_Number`` index level.

    The lookup uses ``Index.isin`` rather than a string-built
    ``DataFrame.query`` expression: interpolating ``list(...)`` into a query
    breaks whenever an element's ``repr`` is not a valid query literal
    (e.g. NumPy 2 scalars print as ``np.int64(3)``), and forces pandas to
    re-parse an expression for every split.

    Parameters
    ----------
    n : int
        Number of leading rows of ``bootstrap_pats`` to iterate over.

    Yields
    ------
    tuple of numpy.ndarray
        ``(train_i, test_i)`` -- the values of the ``IDX`` index level of
        ``sel`` for the training and the test patients respectively.
    """
    # Both levels are loop-invariant, so read them once per generator.
    patient_ids = sel.index.get_level_values('Patient_Number')
    row_ids = sel.index.get_level_values('IDX')

    for _, row in bootstrap_pats.iloc[:n, :].iterrows():
        train_mask = patient_ids.isin(list(row.iloc[0]))
        test_mask = patient_ids.isin(list(row.iloc[1]))

        yield row_ids[train_mask].values, row_ids[test_mask].values

In [6]:
def _dense_one_hot_encoder():
    """Return a dense-output ``OneHotEncoder`` on old and new scikit-learn."""
    try:
        # scikit-learn >= 1.2 names the option `sparse_output`; the old
        # `sparse` keyword was removed in 1.4 and now raises TypeError.
        return OneHotEncoder(sparse_output=False, handle_unknown='ignore', dtype=int)
    except TypeError:
        # Older scikit-learn only knows the original keyword.
        return OneHotEncoder(sparse=False, handle_unknown='ignore', dtype=int)


def objective(trial, n_splits=5):
    """Optuna objective: median weighted ROC AUC over bootstrap splits.

    Builds a preprocessing + logistic-regression pipeline from the
    hyper-parameters suggested by ``trial``, fits it on each split produced
    by ``bootstrap_it(n_splits)`` and scores the held-out rows. Reads the
    module-level ``sel``, ``weights`` and ``y_label``.

    Parameters
    ----------
    trial : optuna.trial.Trial
        Trial used to sample ``Scaler1``, ``Scaler2``, ``pc_comps`` and ``C``
        and to report intermediate scores for pruning.
    n_splits : int, default 5
        Number of bootstrap splits requested from ``bootstrap_it``.

    Returns
    -------
    float
        Median of the per-split sample-weighted ROC AUC values.

    Raises
    ------
    optuna.exceptions.TrialPruned
        When the pruner decides, after any split, that the trial should stop.
    """
    # NOTE(review): the 'Std' key maps to sklearn's `normalize` (unit-norm
    # scaling), not to standardisation. The keys are recorded as Optuna
    # parameter values, so they are kept as they are.
    scalers = {'Std': normalize, 'Robust': robust_scale, 'MinMax': minmax_scale}
    # Hand Optuna a real sequence rather than a dict view.
    scaler_names = list(scalers)

    # NOTE(review): FunctionTransformer is stateless, so the axis=0 step
    # recomputes its column statistics on whatever batch it receives --
    # including the held-out rows at predict time -- instead of reusing the
    # training statistics. Confirm this is intended.
    num_pipe = Pipeline([
        ("Normalise spectra", FunctionTransformer(
            scalers[trial.suggest_categorical('Scaler1', scaler_names)],
            kw_args={"axis": 1})),
        ("Feature Scale", FunctionTransformer(
            scalers[trial.suggest_categorical('Scaler2', scaler_names)],
            kw_args={"axis": 0})),
        ("PCA", PCA(trial.suggest_int('pc_comps', 2, 10))),
    ])

    cat_pipe = Pipeline([("OneHot", _dense_one_hot_encoder())])

    # Numeric columns go through scaling + PCA, object columns are one-hot encoded.
    ct = make_column_transformer(
        (num_pipe, make_column_selector(dtype_include=np.number)),
        (cat_pipe, make_column_selector(dtype_include=object))
    )

    clf = LogisticRegression(C=trial.suggest_float('C', 1e3, 1e10, log=True))

    pipe = Pipeline([('Preprocess', ct), ('Classifier', clf)])

    scores = []

    # Wrap the generator itself (not enumerate) so the bar can show a total.
    splits = tqdm(bootstrap_it(n_splits), total=n_splits)
    for step, (train_i, test_i) in enumerate(splits):
        train = sel.iloc[train_i, :]
        test = sel.iloc[test_i, :]

        # The target lives in the index, not in the columns.
        y_train = train.index.get_level_values(y_label).astype(int)
        y_test = test.index.get_level_values(y_label).astype(int)

        pipe.fit(train, y_train, Classifier__sample_weight=weights[train_i])

        # Probability of the second class in `classes_` order.
        proba = pipe.predict_proba(test)[:, 1]
        score = roc_auc_score(y_test, proba, sample_weight=weights[test_i])
        scores.append(score)

        # Report the per-split score so the pruner can stop weak trials early.
        trial.report(score, step)
        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    return np.median(scores)


In [7]:
# Search settings gathered in one place instead of inline magic numbers.
OPTUNA_SEED = 42   # seed for the hyper-parameter sampler
N_TRIALS = 30      # total number of trials to run
N_JOBS = 4         # trials evaluated concurrently

# Create a new in-memory study. The sampler is seeded so that its random
# draws are repeatable; TPESampler and MedianPruner are Optuna's defaults and
# are spelled out because objective() relies on pruning.
# NOTE: with N_JOBS > 1 trials complete in a non-deterministic order, so the
# search is only fully reproducible when N_JOBS is set to 1.
study = optuna.create_study(
    direction="maximize",
    sampler=optuna.samplers.TPESampler(seed=OPTUNA_SEED),
    pruner=optuna.pruners.MedianPruner(),
)
study.optimize(objective, n_trials=N_TRIALS, n_jobs=N_JOBS)

[32m[I 2021-12-29 19:26:44,319][0m A new study created in memory with name: no-name-9687d4f0-8235-4572-af6c-0e71a7b1c893[0m


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:27:09,599][0m Trial 3 finished with value: 0.5902052840114088 and parameters: {'Scaler1': 'Std', 'Scaler2': 'MinMax', 'pc_comps': 4, 'C': 22360229.985547673}. Best is trial 3 with value: 0.5902052840114088.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:27:12,458][0m Trial 1 finished with value: 0.7233597738420243 and parameters: {'Scaler1': 'MinMax', 'Scaler2': 'MinMax', 'pc_comps': 7, 'C': 107210153.49450926}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:27:16,151][0m Trial 0 finished with value: 0.6081340953496017 and parameters: {'Scaler1': 'Std', 'Scaler2': 'Std', 'pc_comps': 4, 'C': 288424963.2406135}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:27:40,190][0m Trial 6 finished with value: 0.5949709592913288 and parameters: {'Scaler1': 'Std', 'Scaler2': 'Robust', 'pc_comps': 4, 'C': 12121120.982066708}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:28:06,697][0m Trial 7 finished with value: 0.695095565977632 and parameters: {'Scaler1': 'MinMax', 'Scaler2': 'Robust', 'pc_comps': 6, 'C': 1444191.594697875}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:28:16,832][0m Trial 8 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:28:32,952][0m Trial 9 finished with value: 0.720200622905358 and parameters: {'Scaler1': 'Std', 'Scaler2': 'MinMax', 'pc_comps': 8, 'C': 4666087.136651044}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:28:45,595][0m Trial 10 finished with value: 0.7181028229303829 and parameters: {'Scaler1': 'Std', 'Scaler2': 'Std', 'pc_comps': 2, 'C': 3272728.456253602}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:36:32,691][0m Trial 5 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:36:33,400][0m Trial 4 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:36:36,899][0m Trial 12 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:36:40,943][0m Trial 14 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:36:42,337][0m Trial 13 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:36:45,222][0m Trial 15 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:36:46,256][0m Trial 16 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:44:44,989][0m Trial 2 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:44:59,890][0m Trial 19 finished with value: 0.7201995594490569 and parameters: {'Scaler1': 'Std', 'Scaler2': 'MinMax', 'pc_comps': 8, 'C': 90374.70788678847}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:45:14,325][0m Trial 20 finished with value: 0.6902655584032424 and parameters: {'Scaler1': 'Std', 'Scaler2': 'MinMax', 'pc_comps': 7, 'C': 5650144386.951477}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:45:17,175][0m Trial 21 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:45:50,245][0m Trial 18 pruned. [0m





[32m[I 2021-12-29 19:45:50,295][0m Trial 17 pruned. [0m


HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…




HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:45:55,270][0m Trial 23 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:46:17,286][0m Trial 24 finished with value: 0.6902658289549817 and parameters: {'Scaler1': 'Std', 'Scaler2': 'MinMax', 'pc_comps': 7, 'C': 509958.9648970875}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:46:24,007][0m Trial 25 finished with value: 0.6902672904410261 and parameters: {'Scaler1': 'Std', 'Scaler2': 'MinMax', 'pc_comps': 7, 'C': 112929.6182223323}. Best is trial 1 with value: 0.7233597738420243.[0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:46:24,375][0m Trial 26 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:46:28,017][0m Trial 11 pruned. [0m





HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…

[32m[I 2021-12-29 19:46:29,554][0m Trial 27 pruned. [0m
[32m[I 2021-12-29 19:46:29,592][0m Trial 28 pruned. [0m






[32m[I 2021-12-29 19:46:31,754][0m Trial 29 pruned. [0m





[32m[I 2021-12-29 19:46:49,339][0m Trial 22 pruned. [0m





In [9]:
# Display the best trial of the study (its parameters, objective value and
# the intermediate score reported after each split).
study.best_trial

FrozenTrial(number=1, values=[0.7233597738420243], datetime_start=datetime.datetime(2021, 12, 29, 19, 26, 44, 346118), datetime_complete=datetime.datetime(2021, 12, 29, 19, 27, 12, 458390), params={'Scaler1': 'MinMax', 'Scaler2': 'MinMax', 'pc_comps': 7, 'C': 107210153.49450926}, distributions={'Scaler1': CategoricalDistribution(choices=('Std', 'Robust', 'MinMax')), 'Scaler2': CategoricalDistribution(choices=('Std', 'Robust', 'MinMax')), 'pc_comps': IntUniformDistribution(high=10, low=2, step=1), 'C': LogUniformDistribution(high=10000000000.0, low=1000.0)}, user_attrs={}, system_attrs={}, intermediate_values={0: 0.6425992535687283, 1: 0.7813838589145391, 2: 0.6573327156368153, 3: 0.7233597738420243, 4: 0.7696154044947654}, trial_id=1, state=TrialState.COMPLETE, value=None)