In [None]:
import pandas as pd
import numpy as np
import random
from sklearn.metrics import f1_score, recall_score, precision_score, roc_auc_score, confusion_matrix,accuracy_score
import json
from sklearn.ensemble import RandomForestClassifier
import operator
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook as tqdm
from IPython.display import display

import warnings
warnings.filterwarnings("ignore")

In [None]:
from functions.preprocessing import *
from functions.functions import *
from functions.df_modifications import *
from functions.feature_selection import *
from functions.other import *

In [None]:
# Suffix of the input file loaded later in the notebook: data/TG_<type_of_df>.csv.
# NOTE(review): this is 'vl' (letter l) while the mask label below is 'v1' (digit one) — confirm it is not a typo.
type_of_df = 'vl'
# Mask-version label; it is not referenced by any other visible cell (the column names used
# later hard-code the '__v1' suffix instead).
type_of_mask = 'v1'

In [None]:
# Load the feature table; the first CSV column becomes the row index.
df = pd.read_csv(f'data/TG_{type_of_df}.csv', index_col=0)

# Strip the '__gen' marker from column names and the '_pat' / '_non' markers from row labels.
df.columns = [name.replace('__gen', '') for name in df.columns]
df.index = [label.replace('_pat', '').replace('_non', '') for label in df.index]

# The id is rebuilt from the first two underscore-separated tokens of the row label.
df['id'] = df.index.map(lambda label: label.split('_')[0] + '_' + label.split('_')[1])

# Snapshot used by the following cell as its starting point.
df1 = df.copy()
df.head()

In [None]:
# Start from the df1 snapshot.
# NOTE: df1 is overwritten at the end of this cell, so running the cell a second time feeds it
# the already-split frame instead of the freshly loaded one.
df = df1.copy()
# Rows whose right and left f_1_1_1 '__v1' values differ, plus every row with pathology == 0.
df_non_symm = df[(df.f_1_1_1__right__v1 != df.f_1_1_1__left__v1) | (df.pathology == 0)]
# Rows where both the right and the left value equal the side-less f_1_1_1__v1 column.
# NOTE(review): a row with pathology == 0 that also satisfies this mask is selected by both
# filters and may end up twice in the concatenated frame — confirm against split_areas.
df_symm = df[(df.f_1_1_1__right__v1 == df.f_1_1_1__v1).values & (df.f_1_1_1__left__v1 == df.f_1_1_1__v1).values]

# split_areas comes from the project's functions.* star imports; presumably it produces one row
# per area/side with the side encoded in the index label — TODO confirm.
df_non_symm = split_areas(df_non_symm)
df_symm = split_areas(df_symm)
# For the symmetric group keep only rows whose index label contains 'right'.
df_symm = df_symm.loc[[ind for ind in df_symm.index if ind.find('right') != -1], :]

# Stack both groups (symmetric rows first) and refresh the snapshot.
df = pd.concat([df_symm,df_non_symm],axis=0)
df1 = df.copy()

In [None]:
# Ids whose rows are held out of `df`; they become the "control" set in the next cell.
test_id = ['PAC_69','PAC_48','PAC_14','PAC_65','PAC_63','PAC_66','PAC_61','PAC_64','PAC_62']
#df = df[[col for col in df.columns if col.find('f_1') != -1 or col.find('f_4') != -1 ] + ['id', 'pathology']]
# Held-out rows are selected by id ...
df_test = df[df.id.isin(test_id)]
# ... and removed from the working frame by index label.
# NOTE: this cell is not idempotent — on a second run `df` no longer contains those ids,
# so df_test would come out empty.
df = df.drop(df_test.index)
df1 = df.copy()

In [None]:
# Target, feature matrix and grouping ids for the cross-validation data.
y = df.pathology
# Positional slice: everything except the last two columns.
# NOTE(review): presumably those two columns are 'pathology' and 'id' — verify the column order
# produced by split_areas, otherwise the label leaks into X.
X = df.iloc[:,:-2]
id_column = df.id

# Same construction for the held-out ("control") rows.
y_control = df_test.pathology
X_control = df_test.iloc[:,:-2]
# Untouched copy of the training features.
X1 = X.copy()

#cols = [col for col in X.columns if col.find('f_3') == -1 and col.find('f_2') == -1]

#X = X[cols]
#X_control = X_control[cols]

In [None]:
def first_round(X_train, X_test, X_control):
    """Apply the first feature-filtering pass and align all three matrices to its result.

    The training frame is passed through the project helpers ``variance``,
    ``outliers`` and ``correlation`` (the latter with ``thr=0.95``), in that
    order. The first two return a pair of which only the first element is
    used. Only ``X_train`` drives the filtering; ``X_test`` and ``X_control``
    are merely restricted to the surviving columns.

    Returns:
        (filtered X_train, X_test[kept columns], X_control[kept columns])
    """
    filtered, _ = variance(X_train)
    filtered, _ = outliers(filtered)
    filtered = correlation(filtered, thr=0.95)
    kept = filtered.columns
    return filtered, X_test[kept], X_control[kept]


# Second-round feature selectors (provided by the functions.* star imports). The evaluation loop
# calls each one as func(X_train, y_train, X_test, y_test, X_control) and keys results by func.__name__.
second_round = [ttest, pca, permut, lr_selection, boruta_selection, greedy]

In [None]:
# Pristine copy of the control features; the evaluation loop re-copies it for every fold
# because X_control itself is reassigned there.
X_control_main = X_control.copy()

In [None]:
# Repeated grouped cross-validation: N random splits x n folds. For every fold the first-round
# filter is applied, then every second-round selector, then every model on the selector's features.
#
# Fixes compared with the previous version of this cell:
#   * every model of a selector is now trained on that selector's features (the matrices used to
#     be reset to the first-round copies inside the model loop, so only 'lr' saw the selection);
#   * scaled SVM data lives in per-model locals, so nothing has to be "restored" afterwards
#     (the old restore hit an undefined name for 'lr', which the bare `except` swallowed and
#     then recorded an extra NaN row after the real result);
#   * a failed selector appends ONE NaN to 'n_features' (it used to append one per model);
#   * `except Exception` + a printed message instead of bare `except`, so Ctrl-C works and
#     failures are visible.
n = 5    # folds per split
N = 10   # number of repeated splits

model_names = ['lr', 'svm', 'rf', 'xgb']
metric_names = ['acc_test', 'auc_test', 'acc_control', 'auc_control']

# res[split][selector][model][metric] -> list with one entry per fold
# res[split][selector]['n_features']  -> list with one entry per fold
res = {
    f'split_{s}': {
        func.__name__: {
            **{model_name: {metric: [] for metric in metric_names} for model_name in model_names},
            'n_features': [],
        }
        for func in second_round
    }
    for s in range(N)
}

for s in tqdm(range(N)):
    split_key = f'split_{s}'
    X_fold, y_fold = make_folds(X, y, n=n, id_column=id_column)
    for j in tqdm(range(n)):
        # Fold j is the test fold, the remaining folds form the training set.
        X_control = X_control_main.copy()
        X_train = pd.concat([X_fold[k] for k in range(len(X_fold)) if k != j])
        y_train = pd.concat([y_fold[k] for k in range(len(X_fold)) if k != j])
        X_test, y_test = X_fold[j].copy(), y_fold[j].copy()

        X_train, X_test, X_control = first_round(X_train, X_test, X_control)

        print(f'FOLD {j}')
        print(f'First round selection: {X_train.columns.size} features')

        for func in second_round:
            func_key = func.__name__
            try:
                # Selectors get copies so an in-place edit cannot leak into the next selector.
                X_train_sel, X_test_sel, X_control_sel = func(
                    X_train.copy(), y_train, X_test.copy(), y_test, X_control.copy())
            except Exception as err:
                print(f'Second round selection failed ({func_key}): {err!r}')
                for model_name in model_names:
                    res = write_dict(res, split_key, func_key, model_name, y_test, X_test,
                                     y_control, X_control, None, fillna=True)
                # One placeholder per fold keeps 'n_features' aligned with the metric lists.
                res[split_key][func_key]['n_features'].append(float('nan'))
                continue

            print(f'Second round selection: {X_train_sel.columns.size} features ({func_key})')
            res[split_key][func_key]['n_features'].append(X_train_sel.columns.size)

            for model_name in model_names:
                try:
                    # Per-model working copies of the selector output.
                    X_tr, X_te, X_ctl = X_train_sel.copy(), X_test_sel.copy(), X_control_sel.copy()
                    y_tr, y_te = y_train, y_test
                    if model_name == 'svm':
                        dict_concat = {'probability': True}
                        # Only the SVM is trained on normalized features.
                        # NOTE(review): normalization() presumably returns the labels unchanged — confirm.
                        X_tr, y_tr, X_te, y_te, scaler = normalization(X_tr, y_tr, X_te, y_te)
                        X_ctl.loc[:, :] = scaler.transform(X_ctl)
                    else:
                        dict_concat = {}
                    print(model_name)

                    clf = hp_model(model_name, X_tr, y_tr, evals=15, max_iterations=51, metric=roc_auc_score,
                                   dict_concat=dict_concat, id_column=id_column, oversampling=False, random_state=42,
                                   class_w='balanced', n_folds=4, thr_diff=0.2, thr_min=0.6, print_scores=False)
                    res = write_dict(res, split_key, func_key, model_name, y_te, X_te,
                                     y_control, X_ctl, clf, fillna=False)
                except Exception as err:
                    print(f'{model_name} failed ({func_key}): {err!r}')
                    res = write_dict(res, split_key, func_key, model_name, y_test, X_test_sel,
                                     y_control, X_control_sel, None, fillna=True)

                # Checkpoint after every model so a crash loses at most one fit.
                with open('results.json', 'w+') as f:
                    json.dump(res, f)


In [None]:
# Keep a shallow backup of the full results, then narrow `res` to the first three splits.
res1 = dict(res)
res = {
    split_name: split_results
    for split_name, split_results in res1.items()
    if split_name in ('split_0', 'split_1', 'split_2')
}

In [None]:
# Regroup the results as new_res[model][selector] -> list of per-split metric dicts
# (first three splits only).
new_res = {'lr': {}, 'svm': {}, 'rf': {}, 'xgb': {}}
for s in [f'split_{i}' for i in range(3)]:
    for f in [func.__name__ for func in second_round]:
        for m in ['lr', 'svm', 'rf', 'xgb']:
            # The list is created the first time a (model, selector) pair is seen.
            new_res[m].setdefault(f, []).append(res[s][f][m])

In [None]:
# new_res[model][selector] -> DataFrame with one row per metric and one column per split,
# each cell being the mean of that metric over the folds of the split (first three splits only).
#
# Changes: the dead `df = pd.DataFrame()` assignment is gone (it overwrote the global training
# frame `df`), the bare `except` used as a "key missing" probe is replaced by dict.get, and the
# metric dict is read directly instead of building a DataFrame of the whole split per lookup.
new_res = {'lr': {}, 'svm': {}, 'rf': {}, 'xgb': {}}
for s in [f'split_{i}' for i in range(3)]:
    for f in [func.__name__ for func in second_round]:
        for m in ['lr', 'svm', 'rf', 'xgb']:
            # res[s][f][m] maps metric name -> list of per-fold values.
            split_mean = pd.DataFrame(res[s][f][m]).mean()
            collected = new_res[m].get(f, pd.DataFrame())
            new_res[m][f] = pd.concat([collected, split_mean], axis=1)
            

In [None]:
# dfs[model] -> DataFrame (rows: metrics, columns: selectors) holding the mean over splits.
selector_names = [func.__name__ for func in second_round]
dfs = {}
for m in ['lr', 'svm', 'rf', 'xgb']:
    per_selector = [
        new_res[m][name].apply(lambda row: np.mean(row), axis=1)
        for name in selector_names
    ]
    dfs[m] = pd.DataFrame(per_selector, index=selector_names).T

In [None]:
# Independent backup of the (filtered) results before `res` is reloaded from disk.
# `deepcopy` is not imported in the notebook's import cell; importing it here removes the
# dependency on one of the `from functions.* import *` lines happening to re-export it.
from copy import deepcopy
res1 = deepcopy(res)

In [None]:
# Reload the checkpoint written by the cross-validation loop.
# NOTE: this replaces the in-memory `res` (previously narrowed to three splits) with whatever
# is stored in results.json.
# `json` is already imported in the first cell; the re-import is redundant but harmless.
import json
# 'r+' opens the file for reading and writing, although it is only read here.
with open('results.json', 'r+') as f:
    res = json.load(f)

In [None]:
# dfs1[model] -> DataFrame (rows: metrics, columns: selectors) holding the spread over splits,
# computed with np.std exactly as before.
selector_names = [func.__name__ for func in second_round]
dfs1 = {}
for m in ['lr', 'svm', 'rf', 'xgb']:
    per_selector = [
        new_res[m][name].apply(lambda row: np.std(row), axis=1)
        for name in selector_names
    ]
    dfs1[m] = pd.DataFrame(per_selector, index=selector_names).T

In [None]:
# Previously reported LaTeX table rows: one inner list per model table, four rows each
# (acc test, auc test, acc control, auc control); every row holds a label plus six ' & '-separated
# cells. A later cell splits these rows on ' & ' and overwrites one cell with fresh numbers.
#
# Raw strings are required: in a normal literal the '\t' of '\textbf' is a TAB character (which
# silently corrupted the bold cells) and '\p' is an invalid escape sequence.
s = [
    [
        r'acc test & 0.805 $\pm$ 0.017 & 0.9103 $\pm$ 0.008 & 0.8152 $\pm$ 0.013 & 0.4091 $\pm$ 0.01 & 0.7778 $\pm$ 0.016 & 0.8078 $\pm$ 0.01',
        r'auc test & 0.9044 $\pm$ 0.025 & 0.972 $\pm$ 0.006 & 0.8998 $\pm$ 0.032 & 0.5 $\pm$ 0.0 & 0.885 $\pm$ 0.016 & 0.9143 $\pm$ 0.017 ',
        r'acc control & 0.5082 & 0.8175 & 0.6054 & 0.4025 & 0.5265 & 0.7066 ',
        r'auc control & 0.7017 & 0.9607 & 0.704 & 0.5042 & 0.6922 & 0.7977 ',
    ],
    [
        r'acc test & 0.9084 $\pm$ 0.017 & 0.8914 $\pm$ 0.034 & 0.9276 $\pm$ 0.003 & 0.4266 $\pm$ 0.02 & 0.9082 $\pm$ 0.018 & 0.9026 $\pm$ 0.015 ',
        r'auc test & 0.9739 $\pm$ 0.011 & 0.9721 $\pm$ 0.01 & 0.9851 $\pm$ 0.007 & 0.5 $\pm$ 0.0 & 0.9727 $\pm$ 0.009 & 0.9767 $\pm$ 0.011 ',
        r'acc control & 0.8438 & 0.841 & \textbf{0.8904} & 0.4158 & 0.8363 & \textbf{0.8724} ',
        r'auc control & 0.9742 & 0.966 & \textbf{0.9819} & 0.5 & 0.9716 & \textbf{0.9761} ',
    ],
    [
        r'acc test & 0.8905 $\pm$ 0.014 & 0.8809 $\pm$ 0.017 & 0.878 $\pm$ 0.016 & 0.5242 $\pm$ 0.044 & 0.8938 $\pm$ 0.016 & 0.889 $\pm$ 0.016',
        r'auc test & 0.9649 $\pm$ 0.007 & 0.9491 $\pm$ 0.013 & 0.9732 $\pm$ 0.011 & 0.5221 $\pm$ 0.072 & 0.9641 $\pm$ 0.009 & 0.9744 $\pm$ 0.009',
        r'acc control & 0.7323 & 0.7062 & 0.753 & 0.5149 & 0.733 & 0.7492',
        r'auc control & 0.9448 & 0.8825 & 0.9371 & 0.4681 & 0.9462 & 0.9394',
    ],
    [
        r'acc test & 0.8733 $\pm$ 0.023 & 0.8536 $\pm$ 0.023 & 0.858 $\pm$ 0.022 & 0.4928 $\pm$ 0.042 & 0.8708 $\pm$ 0.019 & 0.8677 $\pm$ 0.022',
        r'auc test & 0.9636 $\pm$ 0.02 & 0.9428 $\pm$ 0.01 & 0.9429 $\pm$ 0.017 & 0.5229 $\pm$ 0.053 & 0.958 $\pm$ 0.019 & 0.9691 $\pm$ 0.017',
        r'acc control & 0.8191 & 0.7371 & 0.7283 & 0.4969 & 0.8325 & 0.7645',
        r'auc control & 0.9407 & 0.893 & 0.8893 & 0.4599 & 0.9428 & 0.9133',
    ],
]

In [None]:

# Build one printable table per model: means rounded to 4 decimals, and for the first two rows
# (the test metrics) the rounded spread appended as "mean +- std".
# The loop variable used to be called `df`, which overwrote the global training DataFrame with a
# model-name string; it is now `m`, like in the other summary cells.
reses = []
for m in dfs:
    sample = dfs[m].apply(lambda x: np.round(x, 4)).astype('str')
    sample2 = dfs1[m].apply(lambda x: np.round(x, 3)).astype('str')
    # Only the first two rows get the " +- std" suffix.
    sample.iloc[:2, :] += ' +- '
    sample.iloc[:2, :] += sample2.iloc[:2, :]
    display(sample)
    reses.append(sample)

In [None]:
# Print the LaTeX rows of the four model tables, patching one cell per row with fresh numbers.
# r'$\pm$' replaces the former '$\pm$' literal, whose '\p' is an invalid escape sequence
# (same resulting text, no deprecation/syntax warning).
for i in range(4):          # model tables, in the order of `reses` / `s`
    for j in range(4):      # metric rows within a table
        a = s[i][j].split(' & ')
        # a[0] is the row label, so a[4] is the 4th result cell; it is replaced with column 0 of
        # the freshly computed table.
        # NOTE(review): with the current selector order the 4th cell corresponds to the 4th
        # selector while column 0 is the first one — confirm this mapping is intended.
        a[4] = reses[i].iloc[j, 0]
        print(' & '.join(a).replace('+-', r'$\pm$') + ' \\\\')
        print('\\hline')
    print('#####################')

In [None]:
# Inspect the regrouped results (nested dict keyed by model, then selector) as the cell output.
new_res

In [None]:
# Collect the selected-feature counts: new_res['n_features'][selector] -> list with one
# per-fold list for each of the first three splits.
new_res = {'n_features': {}}
for s in [f'split_{i}' for i in range(3)]:
    for f in [func.__name__ for func in second_round]:
        for m in ['n_features']:
            # The list is created the first time a selector is seen.
            new_res[m].setdefault(f, []).append(res[s][f][m])

In [None]:
# Average number of selected features per split and selector.
# new_res['n_features'][selector] is a plain list of per-split lists, so it has no `.apply`
# (the previous code raised AttributeError). It is wrapped in a DataFrame (rows: splits,
# columns: folds) and averaged per row; NaN placeholders from failed selections are skipped.
dfs = {}
for m in ['n_features']:
    a = []
    for f in [func.__name__ for func in second_round]:
        per_split = pd.DataFrame(new_res[m][f])
        a.append(per_split.mean(axis=1))
    dfs[m] = pd.DataFrame(a, index=[func.__name__ for func in second_round]).T
    display(dfs[m])