In [7]:
# Importing useful libraries

from os.path import join

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy as sp
import pandas as pd

from sklearn import metrics
from sklearn.preprocessing import normalize, StandardScaler
from sklearn.utils import shuffle

from sklearn.cluster import FeatureAgglomeration
from sklearn.feature_selection import RFE, GenericUnivariateSelect, chi2, mutual_info_classif, f_classif
from sklearn.model_selection import LeaveOneOut, KFold, RepeatedKFold

from neurocombat_sklearn import CombatModel

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from tools import *

# Defining local variables
# Root folder of the dataset; it is handed to get_meta_data() and load_features()
# in the data-loading cell.
# NOTE(review): absolute, machine-specific Windows path - it has to be edited before
# the notebook can run on any other workstation.
data_path = 'C:/Users/b_charmettant/data/parotide_ml/'

In [8]:
# --- Loading ------------------------------------------------------------------
# Exam metadata first, then the features of every acquisition type listed in
# `type_to_include` (switch to ['t1'] to keep a single type).
df_meta = get_meta_data(data_path)

type_to_include = ['t1', 't2', 'gado', 'diff']

ls_exams, id_to_feat, feat_to_id = load_features(
    df_meta,
    data_path,
    verbose=False,
    type_to_include=type_to_include,
)

# Meta data fields to include in meta variables for harmonization
meta_fields = ['sexe', 'tesla', 'age']

# format_exam() returns (label, feature vector, meta values) for one exam;
# the three sequences are accumulated in exam order so that rows stay aligned.
labels, features, meta_variables = [], [], []
for exam in ls_exams:
    lbl, ft, meta = format_exam(exam, feat_to_id, meta_fields=meta_fields)
    labels.append(lbl)
    features.append(ft)
    meta_variables.append(meta)

features, labels, meta_variables = (
    np.array(values) for values in (features, labels, meta_variables)
)

print(f"{len(labels)} with {len(features[0])} features")
print(f"Labels : {len(labels) - sum(labels)} benign lesions and {sum(labels)} malignant")

# --- Scaling ------------------------------------------------------------------
# Adjusting the scales of the different features.
# NOTE(review): the scaler is fitted on every exam, i.e. before any train/test
# split is drawn by the cross-validation further down.
n_exams, n_features = features.shape

scaler = StandardScaler()
rescaled_features = scaler.fit_transform(features)

print(f"Shape features : {rescaled_features.shape}")
print(f"Shape meta variables : {meta_variables.shape}")

# From here on `features` designates the standardized matrix.
features = rescaled_features

107 with 432 features
Labels : 58 benign lesions and 49 malignant
Shape features : (107, 432)
Shape meta variables : (107, 3)


In [9]:
# Feature harmonization using ComBat
#
# The three meta variables are passed to ComBat as column vectors. Columns 0, 1, 2
# of `meta_variables` are read as sexe, tesla and age respectively.
# The arguments of fit_transform are kept positional on purpose: tesla comes first
# (presumably the batch / "sites" variable), followed by sexe and age (presumably
# the discrete and continuous covariates) - TODO confirm against the
# neurocombat_sklearn documentation.
# NOTE(review): this cell overwrites `features`; running it twice harmonizes
# already-harmonized data.

# h_sexe, h_age = None, None
h_sexe, h_tesla, h_age = (
    meta_variables[:, column].reshape(-1, 1) for column in range(3)
)

harmonization = CombatModel()
features = harmonization.fit_transform(features, h_tesla, h_sexe, h_age)

In [10]:
# K fold experiment
#
# Protocol, for every fold produced by the repeated K-fold iterator:
#   1. features are selected using the training fold only;
#   2. `clf` is evaluated with a leave-one-out loop over ALL exams, restricted to
#      the selected features;
#   3. the leave-one-out probabilities are scored (ROC AUC) separately on the exams
#      of the training fold and on the exams of the held-out fold.
# Outputs: ls_auc_train / ls_auc_test (one AUC per evaluated fold) and
# features_summary (feature id -> number of folds in which RFE retained it).
#
# NOTE(review): in step 2 the held-out exams take part in fitting `clf`; only the
# feature selection is blind to them. Confirm this is the intended protocol.

n_selected_features = 10
n_splits = 10
n_repeats = 10

# Seed shared by the fold generator and by `clf`. None keeps every run random;
# set an int to make the experiment reproducible.
random_seed = None

# Feature selection strategy, always fitted on the training fold:
#   'rfe'           - recursive feature elimination driven by `classifier`
#   'p_value'       - ranking returned by feature_t_test / choose_features_from_dict
#   'agglomeration' - FeatureAgglomeration clusters instead of a column subset
selection_method = 'rfe'
if selection_method not in ('rfe', 'p_value', 'agglomeration'):
    raise ValueError("Unknown selection_method: {!r}".format(selection_method))

# Estimator driving the RFE selection
classifier = LogisticRegression(penalty='l2', solver='liblinear', C=0.5)
# classifier = LogisticRegression(penalty='l1', solver='saga', C=1, l1_ratio=0)
# classifier = DecisionTreeClassifier()
# classifier = SVC(C=1, probability=True, kernel='rbf')
# classifier = RandomForestClassifier(n_estimators=20, verbose=0)

# Classifier scored by the leave-one-out evaluation; it is refitted from scratch
# at every leave-one-out step, so a single instance can be shared by all folds.
# clf = classifier
clf = SVC(C=0.8, probability=True, kernel='linear', random_state=random_seed)

# iterator = KFold(n_splits=n_splits, random_state=None, shuffle=True)
# n_repeats is forwarded (it used to be a literal 10) so that the fold generator
# and the progress display below cannot disagree.
iterator = RepeatedKFold(n_splits=n_splits, n_repeats=n_repeats, random_state=random_seed)

# feature id -> number of folds in which the feature was selected by RFE
features_summary = dict(zip(range(n_features), np.zeros(n_features)))

ls_auc_train = []
ls_auc_test = []

n = 0
for train_id, test_id in iterator.split(features):
    n += 1

    # Splitting between testing and training set
    train_features = features[train_id]
    train_labels = labels[train_id]
    test_labels = labels[test_id]

    # The AUC is undefined when the held-out fold contains a single class
    n_positive_test = np.sum(test_labels)
    if n_positive_test == 0 or n_positive_test == len(test_labels):
        print("Skipping this batch", end='\r')
        continue

    # Performing feature selection (on train set only), then restricting the
    # whole feature matrix to the selection
    if selection_method == 'rfe':
        # n_features_to_select must be passed by keyword: recent scikit-learn
        # releases reject it as a positional argument.
        selector = RFE(classifier, n_features_to_select=n_selected_features, step=1)
        selector.fit(train_features, train_labels)
        selected_features_id = np.flatnonzero(selector.support_)

        for i in selected_features_id:
            features_summary[i] += 1

        features_s = features[:, selected_features_id]

    elif selection_method == 'p_value':
        feature_significance_p = feature_t_test(train_features, train_labels, id_to_feat)
        selected_features_id = choose_features_from_dict(feature_significance_p, n_selected_features, feat_to_id)

        features_s = features[:, selected_features_id]

    else:  # 'agglomeration'
        # The clusters replace the column subset, so there are no feature ids
        # to record in features_summary for this strategy.
        agglo = FeatureAgglomeration(n_clusters=n_selected_features).fit(train_features)
        features_s = agglo.transform(features)

    # Evaluation of the classifier using LOO strategy
    X = features_s
    y = labels

    loo = LeaveOneOut()
    loo_predictions = []
    loo_labels = []

    for in_id, out_id in loo.split(X):
        clf.fit(X[in_id], y[in_id])

        # Probability of the positive class for the single left-out exam
        loo_predictions.append(clf.predict_proba(X[out_id])[0, 1])
        # out_id holds exactly one index: store the scalar label so that
        # loo_labels ends up 1-D instead of an (n_exams, 1) column
        loo_labels.append(y[out_id][0])

    loo_predictions = np.array(loo_predictions)
    loo_labels = np.array(loo_labels)

    # Computing metrics on train and test set, separately
    ls_auc_train.append(metrics.roc_auc_score(loo_labels[train_id], loo_predictions[train_id]))
    ls_auc_test.append(metrics.roc_auc_score(loo_labels[test_id], loo_predictions[test_id]))

    print("K-fold {:2}/{:2} - Auc train : {:.3f} - Auc test: {:.3f}"
          .format(n, n_splits*n_repeats, ls_auc_train[-1], ls_auc_test[-1]) , end='\r')

K-fold 100/100 - Auc train : 0.834 - Auc test: 0.938

In [11]:
# Mean ROC AUC over the folds evaluated by the K-fold experiment, for the
# exams of the training folds and for the exams of the held-out folds.
mean_auc_train = np.mean(ls_auc_train)
mean_auc_test = np.mean(ls_auc_test)

print("Average training AUC : {:.3f}".format(mean_auc_train))
print("Average testing AUC : {:.3f}".format(mean_auc_test))

Average training AUC : 0.790
Average testing AUC : 0.485


In [12]:
# Selection frequency of every feature retained at least once during the K-fold
# experiment, listed in the order produced by order_dict().
# The denominator is derived from the experiment settings (n_splits * n_repeats)
# rather than a hard-coded 100, so the display stays correct when they change.
# NOTE(review): folds skipped by the experiment never reach feature selection, so
# the highest reachable count can be lower than this denominator.
n_folds = n_splits * n_repeats

for i in order_dict(features_summary).keys():
    if features_summary[i] > 0:
        print(">{:40.40} - {:3}/{}".format(id_to_feat[i], int(features_summary[i]), n_folds))
    

> t1_OriginalGrayLevelCo-occurrenceMatrixS -  84/100
> t1_OriginalShapeCompactness2             -  76/100
> diff_OriginalGrayLevelDependenceMatrixLa -  64/100
> t1_OriginalGrayLevelSizeZoneMatrixGrayLe -  44/100
> t1_OriginalGrayLevelDependenceMatrixLarg -  44/100
> t2_OriginalFirstOrderMinimum             -  39/100
> t1_OriginalShapeMaximum2DDiameterColumn  -  34/100
> gado_OriginalGrayLevelDependenceMatrixLa -  31/100
> diff_OriginalFirstOrderMedian            -  31/100
> t2_OriginalShapeSurfaceAreatoVolumeRatio -  26/100
> t1_OriginalShapeMajorAxis                -  23/100
> diff_OriginalGrayLevelSizeZoneMatrixSize -  23/100
> t1_OriginalGrayLevelRunLengthMatrixRunEn -  22/100
> gado_OriginalGrayLevelSizeZoneMatrixSmal -  22/100
> gado_OriginalShapeElongation             -  21/100
> t1_OriginalNeighboringGrayToneDifference -  19/100
> diff_OriginalShapeLeastAxis              -  16/100
> t1_OriginalGrayLevelSizeZoneMatrixLargeA -  15/100
> t2_OriginalFirstOrderTotalEnergy         -  