In [7]:
%load_ext autoreload

%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
import os
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import minmax_scale, normalize, robust_scale, OrdinalEncoder,OneHotEncoder

from sklearn.kernel_approximation import Nystroem
from sklearn.compose import make_column_transformer
from sklearn.compose import make_column_selector

from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.utils import compute_sample_weight
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

from sklearn.metrics import roc_auc_score

sys.path.append('/mnt/c/Users/conor/Git_Projects/PHD/')

from Preprocessing_Methods import *

from TSquared.hotelling_t2 import HotellingT2

from tqdm.notebook import tqdm

In [11]:
def best_pipe():
    """Build the preprocessing + logistic-regression pipeline.

    Numeric columns (selected by ``dtype_include=np.number``) go through:
      1. a row-wise min-max rescale (``axis=1``), applied to each sample
         independently;
      2. a column-wise min-max rescale whose range is learned during ``fit``;
      3. a PCA projection onto 7 components.
    Object-dtype columns are one-hot encoded, with categories unseen during
    ``fit`` ignored at transform time.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline with steps ``'Preprocess'`` and ``'Classifier'``.
    """

    num_pipe = Pipeline([
    ("Normalise spectra", FunctionTransformer(minmax_scale, kw_args = {"axis": 1})),
    # A fitted MinMaxScaler instead of FunctionTransformer(minmax_scale, axis=0):
    # the stateless version re-derived each column's min/max from whatever batch
    # was passed to predict, so test rows were scaled differently from training
    # rows. On the training data both forms give the same values.
    ("Feature Scale", MinMaxScaler()),
    ("PCA", PCA(7)),
    ])

    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
    # 1.4; fall back to the old keyword on releases that predate the rename.
    try:
        one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore', dtype=int)
    except TypeError:
        one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore', dtype=int)

    cat_pipe = Pipeline([
        ("OneHot", one_hot)
    ])

    ct = make_column_transformer(
        (num_pipe, make_column_selector(dtype_include=np.number)),
        (cat_pipe, make_column_selector(dtype_include=object))
    )

    # C=1e5 makes the L2 penalty very weak
    clf = LogisticRegression(C=1e5)

    pipe = Pipeline([('Preprocess', ct), ('Classifier', clf)])

    return pipe

In [10]:
def create_pipe():
    """Build a baseline preprocessing + logistic-regression pipeline.

    Numeric columns (selected by ``dtype_include=np.number``) are min-max
    rescaled row-wise (``axis=1``), standardised per column with
    ``StandardScaler`` and reduced with PCA keeping enough components to
    explain 99% of the variance. Object-dtype columns are one-hot encoded,
    with categories unseen during ``fit`` ignored at transform time.

    Returns
    -------
    sklearn.pipeline.Pipeline
        An unfitted pipeline with steps ``'Preprocess'`` and ``'Classifier'``.
    """

    numeric_pipe = Pipeline([
    ("Normalise spectra", FunctionTransformer(minmax_scale, kw_args = {"axis": 1})),
    ("Feature Scale", StandardScaler()),
    ("PCA", PCA(0.99))
    ])

    # handle_unknown='use_encoded_value' is an OrdinalEncoder option and is
    # rejected by OneHotEncoder when fitting; 'ignore' is the one-hot equivalent.
    # `sparse` was renamed to `sparse_output` in scikit-learn 1.2 and removed in
    # 1.4; fall back to the old keyword on releases that predate the rename.
    try:
        one_hot = OneHotEncoder(sparse_output=False, handle_unknown='ignore', dtype=int)
    except TypeError:
        one_hot = OneHotEncoder(sparse=False, handle_unknown='ignore', dtype=int)

    categorical_pipe = Pipeline([
        ("OneHot", one_hot)
    ])

    ct = make_column_transformer(
        (numeric_pipe,     make_column_selector(dtype_include=np.number)),
        (categorical_pipe, make_column_selector(dtype_include=object))
    )

    pipe = Pipeline([('Preprocess', ct), ('Classifier', LogisticRegression())])

    return pipe

In [12]:
# Table of precomputed bootstrap splits: bootstrap_it() reads one split per row,
# taking the training patients from the first column and the test patients from
# the second.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
bootstrap_pats = pd.read_pickle('./train_indices_20_December.pickle')

def bootstrap_it(n):
    """Yield ``(train_i, test_i)`` for the first ``n`` rows of ``bootstrap_pats``.

    Each row of the global ``bootstrap_pats`` holds an iterable of training
    patient numbers in its first column and of test patient numbers in its
    second. For every row, the ``'IDX'`` values of all rows of the global
    ``sel`` belonging to those patients are returned as NumPy arrays, in the
    row order of ``sel``.

    Parameters
    ----------
    n : int
        Maximum number of splits to yield; fewer are produced when
        ``bootstrap_pats`` has fewer than ``n`` rows.

    Yields
    ------
    tuple of (numpy.ndarray, numpy.ndarray)
        ``'IDX'`` values of the training rows and of the test rows.
    """

    # Flatten the index once instead of twice per split.
    flat = sel.reset_index()
    patient_ids = flat['Patient_Number']
    row_ids = flat['IDX']

    for _, row in bootstrap_pats.iloc[:n,:].iterrows():

        # Boolean masks instead of an f-string query: interpolating the list
        # into the query text breaks when the ids are NumPy scalars, whose repr
        # under NumPy >= 2 is e.g. "np.int64(3)".
        train_i = row_ids[patient_ids.isin(list(row.iloc[0]))].values
        test_i = row_ids[patient_ids.isin(list(row.iloc[1]))].values

        yield train_i, test_i

In [14]:
# Default outcome label (the evaluation loop further down reassigns y_label).
y_label = 'Opt_Group'

# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
total = pd.read_pickle('./December_Datat_32.pickle')
#total = truncate(total, start=1000, end=1800)

# Hotelling T2 model fitted on the full table; within this cell it is only
# referenced by the disabled filter on the next line.
ht2 = HotellingT2().fit(total)
#total = total.loc[ht2.predict(total)==1,:]

# Smallest number of rows any single patient has; within this cell it is only
# referenced by the disabled per-patient subsampling on the next line.
min_ = total.index.get_level_values('Patient_Number').value_counts().min()
sel = total#.groupby('Patient_Number').sample(min_)
# Move the 'ASMA' index level into a column, then drop rows with missing values.
sel = sel.reset_index(['ASMA']).dropna()#.sample(2000)
# Append a 0..n-1 counter as an extra index level named 'IDX'.
sel['IDX'] = np.arange(sel.shape[0])
sel = sel.set_index('IDX', append=True)

# Convert every column label to a string.
sel.columns = [str(col) for col in sel.columns]

In [17]:
# Outcomes to evaluate. This cell used to loop over
# ['1yeardeath', '2year', 'Opt_Group'] while overwriting the loop variable with
# '2year' and breaking after the first pass, so only '2year' was ever run.
# That effective behaviour is kept; extend the list to evaluate other outcomes.
Y_LABELS = ['2year']

# Number of bootstrap splits requested from bootstrap_it().
N_BOOTSTRAPS = 100

# Balanced weights over patients do not depend on the outcome: compute them once.
patient_weights = compute_sample_weight('balanced', sel.reset_index()['Patient_Number'])

for y_label in Y_LABELS:

    label_weights = compute_sample_weight('balanced', sel.reset_index()[y_label])

    # Per-row weight balancing both patients and outcome classes.
    weights = patient_weights * label_weights

    # Feature subsets: first column only, every column, every column but the first.
    feature_sets = {
        'ASMA': sel.iloc[:,[0]],
        'ASMA_FTIR': sel,
        'FTIR': sel.iloc[:,1:],
    }

    for name, X_data in feature_sets.items():

        straps = dict()

        # bootstrap_it() is a generator with no length, so give tqdm the total.
        n_splits = min(N_BOOTSTRAPS, len(bootstrap_pats))

        for i, (train_i, test_i) in enumerate(tqdm(bootstrap_it(N_BOOTSTRAPS), total=n_splits)):

            # Outcome labels are read from the index level named y_label.
            y_train = sel.iloc[train_i,:].index.get_level_values(f'{y_label}').astype(int)
            y_test = sel.iloc[test_i,:].index.get_level_values(f'{y_label}').astype(int)

            pipe = best_pipe()
            pipe.fit(X_data.iloc[train_i,:], y_train, Classifier__sample_weight=weights[train_i])

            # Probability of the second class (column 1) for every test row.
            pred_df = pd.DataFrame(pipe.predict_proba(X_data.iloc[test_i])[:,1], columns=['Preds'], index=sel.iloc[test_i].index)
            pred_df['Y_true'] = y_test
            pred_df['Weights'] = weights[test_i]

            # NOTE(review): float16 keeps only ~3 significant digits of the
            # predicted probabilities and weights; confirm this is acceptable
            # for the downstream metrics.
            straps[i] = pred_df.astype(np.float16)

        save_path = f'./Results/best_Opt/{y_label}/{name}'

        os.makedirs(save_path, exist_ok=True)

        pd.concat(straps).to_pickle(f'{save_path}/results_March.pickle')

0it [00:00, ?it/s]

0it [00:00, ?it/s]

0it [00:00, ?it/s]