In [None]:
import json
import pandas as pd
from scipy.stats import mannwhitneyu
import numpy as np
import rpy2.robjects as robjects
from rpy2.robjects.conversion import localconverter
from rpy2.robjects import pandas2ri
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from tqdm import tqdm
from sklearn.ensemble import RandomForestClassifier
import os.path
from pymrmr import mRMR
import matplotlib.pyplot as plt
from boruta import BorutaPy
from sklearn.metrics import auc, precision_recall_curve, average_precision_score, PrecisionRecallDisplay
import pandas as pd

# Import raw data

In [None]:
# Load the four raw omics tables; the files are space-separated despite the .csv extension.
cnv = pd.read_csv('dataset_hg19/dataset/cnv.csv', sep=' ')
miRNA = pd.read_csv('dataset_hg19/dataset/mirna.csv', sep=' ')
mRNA = pd.read_csv('dataset_hg19/dataset/mrna.csv', sep=' ')
proteins = pd.read_csv('dataset_hg19/dataset/proteins.csv', sep=' ')

In [None]:
# Outcome labels; the rest of the notebook reads the binary column 'x' of this frame.
label = pd.read_csv('dataset_hg19/dataset/labels_pfi.csv', sep=' ')

In [None]:
def store_filtered_normalize_data(dataset_dict):
    """Filter/normalise every dataset and cache each result as a JSON file.

    Parameters
    --------------------------
    dataset_dict: dict,
        Maps a dataset name to its DataFrame. One file
        ``filtered_normalize_data/<name>.json`` is written per key.
    """
    # Local import: MinMaxScaler is not imported at notebook level.
    from sklearn.preprocessing import MinMaxScaler

    # NOTE(review): create_data_normalized_filtered is not defined in this
    # notebook; it must be provided before this function is called.
    filtered_normalize_data = create_data_normalized_filtered(dataset_dict, MinMaxScaler, 0.05)

    # Create the directory only once the data exists, so a failed computation
    # does not leave an empty cache folder behind. exist_ok keeps re-runs safe.
    os.makedirs('filtered_normalize_data', exist_ok=True)
    for data in dataset_dict:
        filtered_normalize_data[data].to_json('filtered_normalize_data/'+data+'.json')

In [None]:
def load_filtered_normalize_data(dataset_dict):
    """Load the cached filtered/normalised datasets, building the cache if needed.

    Parameters
    --------------------------
    dataset_dict: dict,
        Maps a dataset name to its raw DataFrame. Only the keys are used when
        the cache already exists.

    Returns
    --------------------------
    dict mapping every key of ``dataset_dict`` to the DataFrame read from
    ``filtered_normalize_data/<key>.json``.
    """
    cache_dir = 'filtered_normalize_data'

    # Cold cache: build it first, then fall through to the normal loading path
    # so that the loaded data is actually returned to the caller.
    if not os.path.exists(cache_dir):
        store_filtered_normalize_data(dataset_dict)

    loaded = {}
    for data in dataset_dict:
        with open(cache_dir + '/' + data + '.json', 'r') as fp:
            loaded[data] = pd.read_json(fp)
    return loaded

In [None]:
# Name -> raw table; the keys are also used as the cache file names.
dataset_dict = {'cnv': cnv, 'mirna': miRNA, 'mrna': mRNA, 'proteins': proteins}

# Filtered and normalised tables, read from (or written to) filtered_normalize_data/.
datasets = load_filtered_normalize_data(dataset_dict)

# Concatenate all datasets

In [None]:
# Column-wise concatenation (axis=1): rows are aligned on the index of each table.
dataset_whole_dataset = pd.concat([datasets['cnv'], datasets['proteins'], datasets['mrna'], datasets['mirna']], axis=1)

In [None]:
# Register the concatenated table next to the single-omics ones.
datasets['whole_dataset'] = dataset_whole_dataset

# Create concatenation of dataset: proteins+mirna, proteins+mirna+mrna

In [None]:
# proteins + miRNA, concatenated column-wise and registered in `datasets`.
proteins_mirna_dataset = pd.concat([datasets['proteins'], datasets['mirna']], axis=1)
datasets['proteins_mirna_dataset'] = proteins_mirna_dataset

In [None]:
# proteins + miRNA + mRNA, concatenated column-wise and registered in `datasets`.
proteins_mirna_mrna_dataset = pd.concat([datasets['proteins'], datasets['mirna'], datasets['mrna']], axis=1)
datasets['proteins_mirna_mrna_dataset'] = proteins_mirna_mrna_dataset

# Remove correlated feature using R code

In [None]:
# Install caret only when it is missing, and name a CRAN mirror explicitly so
# the call does not stop in a non-interactive R session asking for one.
robjects.r("""
if (!requireNamespace('caret', quietly = TRUE)) {
  install.packages('caret', repos = 'https://cloud.r-project.org')
}
""")

In [None]:
# Attach doParallel in the embedded R session; the R function defined below
# uses makeCluster / registerDoParallel / foreach.
robjects.r("""
library(doParallel)
""")

In [None]:
# Define remove_correlated_par in the R global environment.
robjects.r("""
# Iteratively drop highly correlated features, working on chunks of features.
#   df        : data frame / matrix with samples on rows and features on columns
#   dim_split : number of features per chunk (non-finite -> one single chunk)
#   maxiter   : maximum number of shuffle-and-filter passes
#   method    : correlation coefficient passed to cor(). The default is
#               "kendall", which is what this function has always computed
#               (the argument used to be ignored).
#   cutoff    : absolute-correlation threshold for caret::findCorrelation
# Returns a matrix with samples on rows and the surviving features on columns.
remove_correlated_par <- function(df, dim_split = 1000, maxiter = 5, method = "kendall", cutoff = 0.8){
  
  #df has features on columns
  X = t(df)
  niter = 0
  if (!is.finite(dim_split)) dim_split = nrow(X)
  cat("dim(X) before starting remove correlation: ", dim(X), "\n")

  # One worker pool for the whole run; on.exit releases it even if a pass fails.
  cl <- makeCluster(4)
  on.exit(stopCluster(cl), add = TRUE)
  registerDoParallel(cl)

  while(niter < maxiter ){
    cat("niter = ", niter, '\n')
    filtered_X = NULL
    
    filtered_X = foreach(nR = seq(1, nrow(X), by=dim_split), 
                         .combine='rbind', .packages = c("caret")) %dopar% {
                           
         # drop = FALSE keeps a one-feature chunk as a matrix (with its row name)
         subX = X[nR:min(nrow(X), (nR+dim_split-1)), , drop = FALSE]

         # A single feature has nothing to be correlated with, and
         # findCorrelation refuses a 1x1 matrix, so only filter real chunks.
         if (nrow(subX) > 1){
           cc = cor(t(subX), use = "pairwise.complete.obs", method = method)
         
           select_corr = caret::findCorrelation(cc, cutoff = cutoff, exact = FALSE)
         
           if (length(select_corr)>0){ 
             subX = subX[-select_corr, , drop = FALSE]
           }
         }
         
         return(subX)
           
     }
    
    print(names(filtered_X))
    no_removed = nrow(X)-nrow(filtered_X)
    cat('Removed = ', no_removed, '\n')
    cat('dim filetered_X =', dim(filtered_X), '\n')

    # Shuffle the features so the next pass compares different chunks.
    X = filtered_X[sample(nrow(filtered_X)), , drop = FALSE]
    
    niter = niter + 1
    cat("nrow(X) =", dim(X), '\n')
    # Nothing was removed in this pass: further passes would not help.
    if (no_removed ==0) break;
  }
  cat('final dimension = ', dim(t(X)), '\n')
  
  
  return(t(X))
    }
    """)

# Convert the data into R data structures

In [None]:
# pandas -> R conversion of the proteins table (pandas2ri converter active only inside the block).
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_proteins = robjects.conversion.py2rpy(datasets['proteins'])

In [None]:
# pandas -> R conversion of the miRNA table.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_miRNA = robjects.conversion.py2rpy(datasets['mirna'])

In [None]:
# pandas -> R conversion of the mRNA table.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_mRNA = robjects.conversion.py2rpy(datasets['mrna'])

In [None]:
# pandas -> R conversion of the CNV table.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_cnv = robjects.conversion.py2rpy(datasets['cnv'])

In [None]:
# pandas -> R conversion of the concatenation of all four tables.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_whole_dataset = robjects.conversion.py2rpy(datasets['whole_dataset'])

In [None]:
# pandas -> R conversion of the proteins + miRNA table.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_proteins_mirna_dataset = robjects.conversion.py2rpy(datasets['proteins_mirna_dataset'])

In [None]:
# pandas -> R conversion of the proteins + miRNA + mRNA table.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_proteins_mirna_mrna_dataset = robjects.conversion.py2rpy(datasets['proteins_mirna_mrna_dataset'])

# Store the data locally

In [None]:
def compute_and_save_result(data, path):
    """Run the R correlation filter on ``data`` and write the result as JSON.

    Parameters
    --------------------------
    data:
        R object handed to the R function ``remove_correlated_par``.
    path: str,
        Destination JSON file. Its parent directory is created when missing.
    """
    # Make sure the destination exists *before* the expensive R call, so the
    # computation is not lost to a missing-directory error at write time.
    out_dir = os.path.dirname(path)
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    data_removed = robjects.globalenv["remove_correlated_par"](data)
    # The R matrix carries the surviving feature names as column names.
    df = pd.DataFrame(data=np.array(data_removed), columns=data_removed.colnames)
    df.to_json(path)
    
def restore_result(file_name):
    """Read ``elaborated_data_with_r_function/<file_name>.json`` into a DataFrame."""
    cached_json = f'elaborated_data_with_r_function/{file_name}.json'
    frame = pd.read_json(cached_json)
    return frame

def load_or_store_removed_correlated(data, file_name):
    """Return the correlation-filtered table for ``file_name``, computing it on a cache miss.

    The table is always read back from
    ``elaborated_data_with_r_function/<file_name>.json``; when that file does
    not exist yet it is first produced from ``data`` by
    ``compute_and_save_result``.
    """
    cached_json = f'elaborated_data_with_r_function/{file_name}.json'
    if not os.path.exists(cached_json):
        compute_and_save_result(data, cached_json)
    return restore_result(file_name)

In [None]:
# Proteins after correlated-feature removal (cached on disk).
df_proteins_removed = load_or_store_removed_correlated(r_proteins, 'r_proteins_removed')

In [None]:
# miRNA after correlated-feature removal (cached on disk).
df_mirna_removed = load_or_store_removed_correlated(r_miRNA, 'r_mirna_removed')

In [None]:
# mRNA after correlated-feature removal (cached on disk).
df_mrna_removed = load_or_store_removed_correlated(r_mRNA, 'r_mrna_removed')

In [None]:
# CNV after correlated-feature removal (cached on disk).
df_cnv_removed = load_or_store_removed_correlated(r_cnv, 'r_cnv_removed')

In [None]:
# Concatenation of all tables after correlated-feature removal (cached on disk).
df_dataset_whole_dataset_removed = load_or_store_removed_correlated(r_whole_dataset, 'r_dataset_whole_dataset_removed')

In [None]:
# proteins + miRNA after correlated-feature removal (cached on disk).
df_dataset_proteins_mirna_removed = load_or_store_removed_correlated(r_proteins_mirna_dataset, 'r_proteins_mirna_dataset_removed')

In [None]:
# proteins + miRNA + mRNA after correlated-feature removal (cached on disk).
df_dataset_proteins_mirna_mrna_removed = load_or_store_removed_correlated(r_proteins_mirna_mrna_dataset, 'r_proteins_mirna_mrna_dataset_removed')

In [None]:
# Status message (Italian): "data processed with the R function, correlated features removed".
print('DATI MANIPOLATI CON LA FUNZIONE R ELIMINANDO LE FEATURE CORRELATE')

# Mann-Whitney

In [None]:
def create_dataframe_pos_neg(dataset, label):
    """Split ``dataset`` row-wise by class.

    Rows are matched to ``label['x']`` by position. Returns the tuple
    ``(rows where x == 1, rows where x == 0)``.
    """
    classes = label['x'].values
    positives = dataset[classes == 1]
    negatives = dataset[classes == 0]
    return positives, negatives

In [None]:
def mann_whitney_features_selection(dataset, label, min_pValue):
    """Keep the columns whose Mann-Whitney U p-value is below ``min_pValue``.

    The test compares, column by column, the rows of the positive class
    against the rows of the negative class. Despite its name, ``min_pValue``
    acts as an upper bound: a column is kept when ``p < min_pValue``.
    Returns the selected column labels of ``dataset``.
    """
    positives, negatives = create_dataframe_pos_neg(dataset, label)
    p_values = mannwhitneyu(positives, negatives)[1]
    return dataset.columns[p_values < min_pValue]

# Multivariate feature selection, mrmr and Boruta 

In [None]:
def execute_mrmr(dataset, n_feature, label=None):
    """Run pymrmr's mRMR (MIQ scheme) and return what it selects.

    Parameters
    --------------------------
    dataset: pd.DataFrame,
        Samples on rows. pymrmr treats the FIRST column as the class variable
        and the remaining columns as candidate features.
    n_feature: int,
        Number of features to select.
    label: array-like, optional
        Class of every row of ``dataset``. When given it is inserted as the
        first column so that pymrmr ranks features against the real target.
        When omitted, ``dataset`` is passed through unchanged (original
        behaviour), which means its first feature plays the role of the class.

    NOTE(review): pymrmr documents its input as discretised data — confirm the
    tables passed here meet that expectation.
    """
    if label is not None:
        target = pd.Series(np.asarray(label).ravel(), index=dataset.index, name='__target__')
        dataset = pd.concat([target, dataset], axis=1)
    return mRMR(dataset, 'MIQ', n_feature)

In [None]:
def execute_boruta_feature_selection(
    X_train: pd.DataFrame,
    y_train: np.ndarray,
    holdout_number: int,
    max_iter: int = 100,
):
    """Returns tuple with list of kept features and list of discarded features.
    
    Parameters
    --------------------------
    X_train: pd.DataFrame,
        The data reserved for the input of the training of the Boruta model.
    y_train: np.ndarray,
        The data reserved for the output of the training of the Boruta model.
        A pandas Series or single-column DataFrame is accepted as well.
    holdout_number: int,
        The current holdout number. Currently unused; kept so existing calls
        keep working.
    max_iter: int = 100,
        Number of iterations to run Boruta for.
    """

    model = RandomForestClassifier(n_jobs=-1, class_weight='balanced_subsample', max_depth=5)
    
    boruta_selector = BorutaPy(
        model,
        n_estimators='auto',
        verbose=False,
        alpha=0.05,
        max_iter=max_iter, 
        random_state=42,
    )

    # np.asarray handles ndarray, Series and DataFrame alike; the previous
    # `y_train.values` failed for the ndarray the signature advertises.
    targets = np.asarray(y_train).ravel()
    boruta_selector.fit(X_train.values, targets)
    
    # support_ is a boolean mask over the columns of X_train.
    kept_features = list(X_train.columns[boruta_selector.support_])
    discarded_features = list(X_train.columns[~boruta_selector.support_])
    
    return kept_features, discarded_features

# Train on fold

In [None]:
def train_using_mann_whitney_on_fold(X,
                            label, 
                            num_external_fold,
                            type_dataset, 
                            tecnique_feature_selection,
                            path):
    """Cross-validate a random forest with per-fold Mann-Whitney feature selection.

    For every fold of a stratified K-fold split (shuffled, random_state=1),
    features are selected on the training part only (p < 0.05), a
    RandomForestClassifier is tuned by GridSearchCV over
    ``model_selection_grid_DT`` and scored on the held-out part. Two
    precision-recall PDFs and a JSON summary are written to
    ``Result/<path><tecnique_feature_selection>/`` (created when missing).

    Parameters
    --------------------------
    X: pd.DataFrame,
        Samples on rows, features on columns.
    label: pd.DataFrame,
        Must contain the binary column 'x', aligned with ``X`` by position.
    num_external_fold: int,
        Number of folds.
    type_dataset: str,
        Prefix of the output file names.
    tecnique_feature_selection: str,
        Output sub-folder and top-level key of the JSON summary.
    path: str,
        Folder below ``Result/``; concatenated as-is, so it should end with '/'.

    Returns
    --------------------------
    dict mapping the fold number (starting at 0) to the features selected on
    that fold.
    """
    out_dir = 'Result/' + path + tecnique_feature_selection
    os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 10))
    average_precision_scores = []
    fold_scores = []
    fold_truths = []
    list_selected_feature = {}

    X_numpy = X.values
    label_numpy = label['x'].values

    external_fold = StratifiedKFold(n_splits=num_external_fold, shuffle=True, random_state=1)
    folds = tqdm(external_fold.split(X, label_numpy), total=num_external_fold,
                 desc="Running fold ", dynamic_ncols=True, leave=False)

    for iterator, (train_index, test_index) in enumerate(folds):

        df_X_train = pd.DataFrame(data=X_numpy[train_index], columns=X.columns)
        df_X_test = pd.DataFrame(data=X_numpy[test_index], columns=X.columns)
        y_train_fold, y_test_fold = label_numpy[train_index], label_numpy[test_index]
        # Only column 'x' was extracted, so name the rebuilt frame accordingly
        # (label.columns would fail when `label` carries extra columns).
        df_y_train = pd.DataFrame(data=y_train_fold, columns=['x'])

        # Selection uses the training rows only, so the test fold stays unseen.
        selected_features = mann_whitney_features_selection(df_X_train, df_y_train, 0.05)
        list_selected_feature[iterator] = selected_features

        X_train_selected_feature = df_X_train[selected_features]
        X_test_selected_feature = df_X_test[selected_features]

        clf = GridSearchCV(estimator=RandomForestClassifier(),
                   scoring='average_precision',
                   param_grid=model_selection_grid_DT, 
                   cv=2, 
                   n_jobs=-1)
        clf.fit(X_train_selected_feature, y_train_fold)

        # Predicted probability of the positive class on the held-out fold.
        y_score = clf.predict_proba(X_test_selected_feature)[:, 1]
        average_precision_scores.append(average_precision_score(y_test_fold, y_score))

        fold_scores.append(y_score)
        fold_truths.append(y_test_fold)

        PrecisionRecallDisplay.from_predictions(y_test_fold, y_score, ax=ax, name=f'Result on {iterator} fold:')

    # Per-fold curves share one figure; it is saved once, when complete.
    ax.set_title('Random_Forest internal')
    ax.legend(title='AUPRC')
    fig.savefig(os.path.join(out_dir, type_dataset + '_AUPRC_Internal.pdf'))

    # "internal" = mean of the per-fold scores,
    # "external" = score of all held-out predictions pooled together.
    y_score_concatenated = np.concatenate(fold_scores)
    y_test_fold_concatenated = np.concatenate(fold_truths)
    internal_average_precision_score = np.mean(average_precision_scores)
    external_average_precision_score = average_precision_score(y_test_fold_concatenated, y_score_concatenated)

    external_display = PrecisionRecallDisplay.from_predictions(
        y_test_fold_concatenated, y_score_concatenated, name='Concat of predictions')
    external_display.ax_.set_title('Random_Forest external')
    external_display.ax_.legend(title='AUPRC')
    external_display.figure_.savefig(os.path.join(out_dir, type_dataset + '_AUPRC_External.pdf'))

    summary = {
        'internal': '{0:.4f}'.format(internal_average_precision_score),
        'external': '{0:.4f}'.format(external_average_precision_score),
        # Only the estimator tuned on the LAST fold is recorded.
        'best_estimator': str(clf.best_estimator_),
        # Convert to plain lists first: str() of a long pandas Index is
        # abbreviated with '...', which would drop feature names.
        'feature_used': str({fold: list(features) for fold, features in list_selected_feature.items()}),
    }

    with open(os.path.join(out_dir, type_dataset + '.json'), 'w', encoding='utf-8') as fp:
        json.dump({tecnique_feature_selection: summary}, fp, ensure_ascii=False, indent=4)

    return list_selected_feature

# GridSearch

In [None]:
# Hyper-parameter grid searched by GridSearchCV in every training function
# below. Despite the "_DT" suffix it is applied to RandomForestClassifier.
model_selection_grid_DT = [
    {'criterion': ['gini', 'entropy'],
     'max_leaf_nodes': [None, 2, 5, 10],
     'max_features': [None, 'sqrt', 'log2'],
    'n_estimators': [51, 101, 251, 500]}
]

# Train using multivariate feature selection after mannwhitney on fold

In [None]:
def train_using_multivariate_on_fold(X,
                             label,
                             list_selected_feature_from_mannwhiney, 
                             num_external_fold,
                             tecnique_feature_selection,
                             path,
                             type_dataset,
                             type_feature_selection=None):
    """Cross-validate a random forest with Mann-Whitney + multivariate selection.

    The folds are rebuilt with the same stratified K-fold settings (shuffled,
    random_state=1) used by ``train_using_mann_whitney_on_fold``, so the i-th
    entry of ``list_selected_feature_from_mannwhiney`` belongs to the i-th fold
    provided ``X``, ``label`` and ``num_external_fold`` are the same. On each
    fold the Mann-Whitney features are optionally refined by mRMR or Boruta
    (training rows only) before a RandomForestClassifier is tuned by
    GridSearchCV over ``model_selection_grid_DT``. Two precision-recall PDFs
    and a JSON summary are written to
    ``Result/<path><tecnique_feature_selection>/`` (created when missing).

    Parameters
    --------------------------
    X: pd.DataFrame,
        Samples on rows, features on columns.
    label: pd.DataFrame,
        Must contain the binary column 'x', aligned with ``X`` by position.
    list_selected_feature_from_mannwhiney: dict,
        Fold number -> features selected by Mann-Whitney on that fold.
    num_external_fold: int,
        Number of folds.
    tecnique_feature_selection: str,
        Output sub-folder (with or without trailing '/') and top-level key of
        the JSON summary.
    path: str,
        Folder below ``Result/``; concatenated as-is, so it should end with '/'.
    type_dataset: str,
        Prefix of the output file names.
    type_feature_selection: {None, 'mrmr', 'boruta'},
        Extra selection step; None keeps the Mann-Whitney features unchanged.

    Returns
    --------------------------
    dict mapping the fold number (starting at 0) to the features used on it.

    Raises
    --------------------------
    ValueError
        On an unknown ``type_feature_selection`` or when the number of feature
        sets differs from ``num_external_fold``.
    """
    if type_feature_selection not in (None, 'mrmr', 'boruta'):
        raise ValueError(f"unknown type_feature_selection: {type_feature_selection!r}")
    # zip() would silently stop at the shorter input and evaluate fewer folds.
    if len(list_selected_feature_from_mannwhiney) != num_external_fold:
        raise ValueError(
            f"expected {num_external_fold} Mann-Whitney feature sets, "
            f"got {len(list_selected_feature_from_mannwhiney)}")

    out_dir = 'Result/' + path + tecnique_feature_selection
    os.makedirs(out_dir, exist_ok=True)

    fig, ax = plt.subplots(figsize=(10, 10))
    average_precision_scores = []
    fold_scores = []
    fold_truths = []
    list_selected_feature = {}

    X_numpy = X.values
    label_numpy = label['x'].values

    external_fold = StratifiedKFold(n_splits=num_external_fold, shuffle=True, random_state=1)
    folds = tqdm(external_fold.split(X, label_numpy), total=num_external_fold,
                 desc="Running fold ", dynamic_ncols=True, leave=False)
    mann_whitney_sets = list_selected_feature_from_mannwhiney.values()

    for iterator, ((train_index, test_index), selected_features) in enumerate(zip(folds, mann_whitney_sets)):

        df_X_train = pd.DataFrame(data=X_numpy[train_index], columns=X.columns)
        df_X_test = pd.DataFrame(data=X_numpy[test_index], columns=X.columns)
        y_train_fold, y_test_fold = label_numpy[train_index], label_numpy[test_index]
        # Only column 'x' was extracted, so name the rebuilt frame accordingly.
        df_y_train = pd.DataFrame(data=y_train_fold, columns=['x'])

        # Start from the Mann-Whitney features of this fold.
        X_train_selected_feature = df_X_train[selected_features]
        X_test_selected_feature = df_X_test[selected_features]

        if type_feature_selection == "mrmr":
            selected_features = execute_mrmr(X_train_selected_feature, 100)
        elif type_feature_selection == 'boruta':
            selected_features, _ = execute_boruta_feature_selection(X_train_selected_feature, df_y_train, 5)

        list_selected_feature[iterator] = selected_features

        X_train_selected_feature = X_train_selected_feature[selected_features]
        X_test_selected_feature = X_test_selected_feature[selected_features]

        clf = GridSearchCV(estimator=RandomForestClassifier(),
                   scoring='average_precision',
                   param_grid=model_selection_grid_DT, 
                   cv=2, 
                   n_jobs=-1)
        clf.fit(X_train_selected_feature, y_train_fold)

        # Predicted probability of the positive class on the held-out fold.
        y_score = clf.predict_proba(X_test_selected_feature)[:, 1]
        average_precision_scores.append(average_precision_score(y_test_fold, y_score))

        fold_scores.append(y_score)
        fold_truths.append(y_test_fold)

        PrecisionRecallDisplay.from_predictions(y_test_fold, y_score, ax=ax, name=f'Result on {iterator} fold:')

    # Per-fold curves share one figure; it is saved once, when complete.
    ax.set_title('Random_Forest internal')
    ax.legend(title='AUPRC')
    fig.savefig(os.path.join(out_dir, type_dataset + '_AUPRC_Internal.pdf'))

    # "internal" = mean of the per-fold scores,
    # "external" = score of all held-out predictions pooled together.
    y_score_concatenated = np.concatenate(fold_scores)
    y_test_fold_concatenated = np.concatenate(fold_truths)
    internal_average_precision_score = np.mean(average_precision_scores)
    external_average_precision_score = average_precision_score(y_test_fold_concatenated, y_score_concatenated)

    external_display = PrecisionRecallDisplay.from_predictions(
        y_test_fold_concatenated, y_score_concatenated, name='Concat of predictions')
    external_display.ax_.set_title('Random_Forest external')
    external_display.ax_.legend(title='AUPRC')
    # os.path.join puts the separator the previous string concatenation lacked
    # for this file, so it lands in the same folder as the other outputs.
    external_display.figure_.savefig(os.path.join(out_dir, type_dataset + '_AUPRC_External.pdf'))

    summary = {
        'internal': '{0:.4f}'.format(internal_average_precision_score),
        'external': '{0:.4f}'.format(external_average_precision_score),
        # Only the estimator tuned on the LAST fold is recorded.
        'best_estimator': str(clf.best_estimator_),
        # Plain lists: str() of a long pandas Index is abbreviated with '...'.
        'feature_used': str({fold: list(features) for fold, features in list_selected_feature.items()}),
    }

    with open(os.path.join(out_dir, type_dataset + '.json'), 'w', encoding='utf-8') as fp:
        json.dump({tecnique_feature_selection: summary}, fp, ensure_ascii=False, indent=4)

    return list_selected_feature

# Train proteins

In [None]:
# os.makedirs creates missing parents (including 'Result') and, unlike a bare
# `!mkdir`, does not fail when the cell is run again.
os.makedirs('Result/Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/r_function_with_mannwhitney', exist_ok=True)
list_selected_feature_proteins = train_using_mann_whitney_on_fold(
    df_proteins_removed, label, 10, 'proteins', 'r_function_with_mannwhitney',
    'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/')

In [None]:
# Portable, re-runnable replacement for `!mkdir` (creates missing parents too).
os.makedirs('Result/Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/r_function_with_mannwhitney_mrmr', exist_ok=True)
train_using_multivariate_on_fold(df_proteins_removed, label, list_selected_feature_proteins, 10,
                         'r_function_with_mannwhitney_mrmr/',
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'proteins',
                         'mrmr')

In [None]:
# Portable, re-runnable replacement for `!mkdir` (creates missing parents too).
os.makedirs('Result/Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/r_function_with_mannwhitney_boruta', exist_ok=True)
train_using_multivariate_on_fold(df_proteins_removed, label, list_selected_feature_proteins, 10,
                         'r_function_with_mannwhitney_boruta/',
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'proteins',
                         'boruta')

# Train miRNA 

In [None]:
# miRNA: per-fold Mann-Whitney selection; the per-fold features are reused by the two cells below.
list_selected_feature_mirna = train_using_mann_whitney_on_fold(
    df_mirna_removed, label, 10, 'mirna', 'r_function_with_mannwhitney',
    'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/')

In [None]:
# miRNA: Mann-Whitney features refined by mRMR on every fold.
# Note: output files are prefixed 'miRNA' here but 'mirna' in the Mann-Whitney-only run.
train_using_multivariate_on_fold(df_mirna_removed, label, list_selected_feature_mirna, 10,
                         'r_function_with_mannwhitney_mrmr/', 
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'miRNA',
                         'mrmr')

In [None]:
# miRNA: Mann-Whitney features refined by Boruta on every fold.
train_using_multivariate_on_fold(df_mirna_removed, label, list_selected_feature_mirna, 10,
                         'r_function_with_mannwhitney_boruta/', 
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'miRNA',
                         'boruta')

# Train mrna

In [None]:
# mRNA: per-fold Mann-Whitney selection; the per-fold features are reused by the two cells below.
list_selected_feature_mrna = train_using_mann_whitney_on_fold(
    df_mrna_removed, label, 10, 'mRNA', 'r_function_with_mannwhitney',
    'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/')

In [None]:
# mRNA: Mann-Whitney features refined by mRMR on every fold.
train_using_multivariate_on_fold(df_mrna_removed, label, list_selected_feature_mrna, 10,
                         'r_function_with_mannwhitney_mrmr/', 
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'mRNA',
                         'mrmr')

In [None]:
# mRNA: Mann-Whitney features refined by Boruta on every fold.
train_using_multivariate_on_fold(df_mrna_removed, label, list_selected_feature_mrna, 10,
                         'r_function_with_mannwhitney_boruta/', 
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'mRNA',
                         'boruta')

# Train cnv

In [None]:
# CNV: per-fold Mann-Whitney selection; the per-fold features are reused by the two cells below.
list_selected_feature_cnv = train_using_mann_whitney_on_fold(
    df_cnv_removed, label, 10, 'cnv', 'r_function_with_mannwhitney',
    'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/')

In [None]:
# CNV: Mann-Whitney features refined by mRMR on every fold.
train_using_multivariate_on_fold(df_cnv_removed, label, list_selected_feature_cnv, 10,
                         'r_function_with_mannwhitney_mrmr/', 
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'cnv',
                         'mrmr')

In [None]:
# CNV: Mann-Whitney features refined by Boruta on every fold.
train_using_multivariate_on_fold(df_cnv_removed, label, list_selected_feature_cnv, 10,
                         'r_function_with_mannwhitney_boruta/', 
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'cnv',
                         'boruta')

# Train whole dataset

In [None]:
# All tables concatenated: per-fold Mann-Whitney selection; features are reused by the two cells below.
list_selected_feature_concat = train_using_mann_whitney_on_fold(
    df_dataset_whole_dataset_removed, label, 10, 'whole_dataset',
    'r_function_with_mannwhitney', 'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/')

In [None]:
# All tables concatenated: Mann-Whitney features refined by mRMR on every fold.
train_using_multivariate_on_fold(df_dataset_whole_dataset_removed, label, list_selected_feature_concat, 10,
                         'r_function_with_mannwhitney_mrmr/', 
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'whole_dataset',
                         'mrmr')

In [None]:
# All tables concatenated: Mann-Whitney features refined by Boruta on every fold.
train_using_multivariate_on_fold(df_dataset_whole_dataset_removed, label, list_selected_feature_concat, 10,
                         'r_function_with_mannwhitney_boruta/', 
                         'Experiments_using_R_Function_mannwhtiney_and_multivariate_feature_selection_on_fold/',
                         'whole_dataset',
                         'boruta')

# Feature selection on the whole dataset before training with: 1) Mann-Whitney, 2) Mann-Whitney + Boruta, 3) Mann-Whitney + mRMR

# Prepare data mannwhitney

In [None]:
# Selection here uses ALL samples (no train/test split), unlike the per-fold experiments above.
proteins_selected_features = mann_whitney_features_selection(df_proteins_removed, label, 0.05)
proteins_mannwhitney = df_proteins_removed[proteins_selected_features]

In [None]:
# CNV columns with Mann-Whitney p < 0.05, computed on all samples.
cnv_selected_features = mann_whitney_features_selection(df_cnv_removed, label, 0.05)
cnv_mannwhitney = df_cnv_removed[cnv_selected_features]

In [None]:
# miRNA columns with Mann-Whitney p < 0.05, computed on all samples.
mirna_selected_features = mann_whitney_features_selection(df_mirna_removed, label, 0.05)
mirna_mannwhitney = df_mirna_removed[mirna_selected_features]

In [None]:
# mRNA columns with Mann-Whitney p < 0.05, computed on all samples.
mrna_selected_features = mann_whitney_features_selection(df_mrna_removed, label, 0.05)
mrna_mannwhitney = df_mrna_removed[mrna_selected_features]

In [None]:
# Concatenated-table columns with Mann-Whitney p < 0.05, computed on all samples.
whole_dataset_selected_features = mann_whitney_features_selection(df_dataset_whole_dataset_removed, label, 0.05)
whole_mannwhitney = df_dataset_whole_dataset_removed[whole_dataset_selected_features]

In [None]:
# proteins + miRNA columns with Mann-Whitney p < 0.05, computed on all samples.
dataset_proteins_mirna_selected_features = mann_whitney_features_selection(
    df_dataset_proteins_mirna_removed,label, 0.05)


dataset_proteins_mirna_mannwhitney = df_dataset_proteins_mirna_removed[dataset_proteins_mirna_selected_features]

In [None]:
# proteins + miRNA + mRNA columns with Mann-Whitney p < 0.05, computed on all samples.
dataset_proteins_mirna_mrna_selected_features = mann_whitney_features_selection(
    df_dataset_proteins_mirna_mrna_removed,label, 0.05)

dataset_proteins_mirna_mrna_mannwhitney = df_dataset_proteins_mirna_mrna_removed[dataset_proteins_mirna_mrna_selected_features]

# Prepare data Boruta

In [None]:
# Result is a list of kept column NAMES (not a DataFrame); index proteins_mannwhitney with it to get data.
proteins_selected_features_boruta, discarded_feature =  execute_boruta_feature_selection(proteins_mannwhitney, label, 5)

In [None]:
# List of CNV column names kept by Boruta (names only, not data).
cnv_selected_features_boruta, discarded_feature =  execute_boruta_feature_selection(cnv_mannwhitney, label, 5)

In [None]:
# List of miRNA column names kept by Boruta (names only, not data).
mirna_selected_features_boruta, discarded_feature =  execute_boruta_feature_selection(mirna_mannwhitney, label, 5)

In [None]:
# List of mRNA column names kept by Boruta (names only, not data).
mrna_selected_features_boruta, discarded_feature =  execute_boruta_feature_selection(mrna_mannwhitney, label, 5)

In [None]:
# List of concatenated-table column names kept by Boruta (names only, not data).
whole_dataset_selected_features_boruta, discarded_feature =  execute_boruta_feature_selection(whole_mannwhitney, label, 5)

In [None]:
# List of proteins + miRNA column names kept by Boruta (names only, not data).
dataset_proteins_mirna_selected_features_boruta, discarded_feature =  execute_boruta_feature_selection(dataset_proteins_mirna_mannwhitney, label, 5)

In [None]:
# List of proteins + miRNA + mRNA column names kept by Boruta (names only, not data).
dataset_proteins_mirna_mrna_selected_features_boruta, discarded_feature =  execute_boruta_feature_selection(dataset_proteins_mirna_mrna_mannwhitney, label, 5)

# Prepare data mrmr

In [None]:
# Whatever pymrmr's mRMR returns for a request of 100 features.
# NOTE(review): expected to be a list of column names rather than a DataFrame — confirm against pymrmr.
proteins_selected_features_mrmr = execute_mrmr(proteins_mannwhitney, 100)

In [None]:
# mRMR selection (100 requested) on the Mann-Whitney-filtered CNV table.
cnv_selected_features_mrmr = execute_mrmr(cnv_mannwhitney, 100)

In [None]:
# mRMR selection (100 requested) on the Mann-Whitney-filtered miRNA table.
mirna_selected_features_mrmr = execute_mrmr(mirna_mannwhitney, 100)

In [None]:
# mRMR selection (100 requested) on the Mann-Whitney-filtered mRNA table.
mrna_selected_features_mrmr = execute_mrmr(mrna_mannwhitney, 100)

In [None]:
# mRMR selection (100 requested) on the Mann-Whitney-filtered concatenated table.
whole_dataset_selected_features_mrmr = execute_mrmr(whole_mannwhitney, 100)

In [None]:
# mRMR selection (100 requested) on the Mann-Whitney-filtered proteins + miRNA table.
dataset_proteins_mirna_selected_features_mrmr = execute_mrmr(dataset_proteins_mirna_mannwhitney, 100)

In [None]:
# mRMR selection (100 requested) on the Mann-Whitney-filtered proteins + miRNA + mRNA table.
dataset_proteins_mirna_mrna_selected_features_mrmr = execute_mrmr(dataset_proteins_mirna_mrna_mannwhitney, 100)

# Training function on whole dataset

In [None]:
def train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(X, label, num_external_fold,
                                          type_dataset, tecnique_feature_selection,
                                          path):
    """Cross-validate a random forest on an already feature-selected matrix.

    No feature selection happens here: ``X`` is used as given. For every fold
    of a stratified K-fold split (shuffled, random_state=1) a
    RandomForestClassifier is tuned by GridSearchCV over
    ``model_selection_grid_DT`` and scored on the held-out part. Two
    precision-recall PDFs and a JSON summary are written to
    ``Result/<path><tecnique_feature_selection>/`` (created when missing).

    Parameters
    --------------------------
    X: array-like of shape (n_samples, n_features),
        Feature matrix; a NumPy array or a DataFrame.
    label: pd.DataFrame,
        Must contain the binary column 'x', aligned with ``X`` by position.
    num_external_fold: int,
        Number of folds.
    type_dataset: str,
        Prefix of the output file names.
    tecnique_feature_selection: str,
        Output sub-folder and top-level key of the JSON summary.
    path: str,
        Folder below ``Result/``; concatenated as-is, so it should end with '/'.

    Returns
    --------------------------
    tuple ``(internal, external, best_estimator)``: the mean per-fold average
    precision and the average precision of the pooled held-out predictions,
    both formatted with four decimals, and the estimator tuned on the last
    fold.
    """
    out_dir = 'Result/' + path + tecnique_feature_selection
    os.makedirs(out_dir, exist_ok=True)

    # Positional row indexing below needs an array; this also accepts a DataFrame.
    X = np.asarray(X)
    label_numpy = label['x'].to_numpy()

    fig, ax = plt.subplots(figsize=(10, 10))
    average_precision_scores = []
    fold_scores = []
    fold_truths = []

    external_fold = StratifiedKFold(n_splits=num_external_fold, shuffle=True, random_state=1)
    folds = tqdm(external_fold.split(X, label_numpy), total=num_external_fold,
                 desc="Running fold ", dynamic_ncols=True, leave=False)

    # Folds are numbered from 1 in the legend of this figure.
    for iterator, (train_index, test_index) in enumerate(folds, start=1):

        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = label_numpy[train_index], label_numpy[test_index]

        clf = GridSearchCV(estimator=RandomForestClassifier(),
                   scoring='average_precision',
                   param_grid=model_selection_grid_DT, 
                   cv=2, 
                   n_jobs=-1)
        clf.fit(X_train_fold, y_train_fold)

        # Predicted probability of the positive class on the held-out fold.
        y_score = clf.predict_proba(X_test_fold)[:, 1]
        average_precision_scores.append(average_precision_score(y_test_fold, y_score))

        fold_scores.append(y_score)
        fold_truths.append(y_test_fold)

        PrecisionRecallDisplay.from_predictions(y_test_fold, y_score, ax=ax, name=f'Result on {iterator} fold:')

    # Per-fold curves share one figure; it is saved once, when complete.
    ax.set_title('Random_Forest internal')
    ax.legend(title='AUPRC')
    fig.savefig(os.path.join(out_dir, type_dataset + '_AUPRC_Internal.pdf'))

    y_score_concatenated = np.concatenate(fold_scores)
    y_test_fold_concatenated = np.concatenate(fold_truths)
    internal_text = '{0:.4f}'.format(np.mean(average_precision_scores))
    external_text = '{0:.4f}'.format(average_precision_score(y_test_fold_concatenated, y_score_concatenated))

    external_display = PrecisionRecallDisplay.from_predictions(
        y_test_fold_concatenated, y_score_concatenated, name='Concat of predictions')
    external_display.ax_.set_title('Random_Forest external')
    external_display.ax_.legend(title='AUPRC')
    external_display.figure_.savefig(os.path.join(out_dir, type_dataset + '_AUPRC_External.pdf'))

    summary = {
        'internal': internal_text,
        'external': external_text,
        # Only the estimator tuned on the LAST fold is recorded.
        'best_estimator': str(clf.best_estimator_),
    }

    with open(os.path.join(out_dir, type_dataset + '.json'), 'w', encoding='utf-8') as fp:
        json.dump({tecnique_feature_selection: summary}, fp, ensure_ascii=False, indent=4)

    return internal_text, external_text, clf.best_estimator_

# Proteins only with mannwhitney feature selection on whole dataset

In [None]:
# os.makedirs creates missing parents (including 'Result') and, unlike a bare
# `!mkdir`, does not fail when the cell is run again.
os.makedirs('Result/Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/r_function_mannwhtiney', exist_ok=True)
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    proteins_mannwhitney.to_numpy(), 
    label, 10, 'proteins', 'r_function_mannwhtiney', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# Proteins mannwhitney + mrmr on whole dataset

In [None]:
# Portable, re-runnable replacement for `!mkdir` (creates missing parents too).
os.makedirs('Result/Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/r_function_with_mannwhitney_mrmr', exist_ok=True)
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # proteins_selected_features_mrmr holds column names, so select those
    # columns from the Mann-Whitney table to obtain the actual data matrix.
    proteins_mannwhitney[proteins_selected_features_mrmr].to_numpy(), 
    label, 
    10, 
    'proteins', 
    'r_function_with_mannwhitney_mrmr', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# Proteins mannwhitney + boruta on whole dataset

In [None]:
# Portable, re-runnable replacement for `!mkdir` (creates missing parents too).
os.makedirs('Result/Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/r_function_with_mannwhitney_boruta', exist_ok=True)
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # proteins_selected_features_boruta is a list of column names (a list has
    # no .to_numpy()), so select those columns from the Mann-Whitney table.
    proteins_mannwhitney[proteins_selected_features_boruta].to_numpy(), 
    label, 
    10, 
    'proteins', 
    'r_function_with_mannwhitney_boruta', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# mRNA only mannwhitney on whole dataset

In [None]:
# mRNA, Mann-Whitney features selected on all samples; relies on the output folder created in the proteins cell.
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    mrna_mannwhitney.to_numpy(), 
    label, 10, 'mrna', 'r_function_mannwhtiney', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# mRNA mannwhitney + mrmr on whole dataset

In [None]:
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # mrna_selected_features_mrmr holds column names: select them from the
    # Mann-Whitney table to obtain the data matrix.
    mrna_mannwhitney[mrna_selected_features_mrmr].to_numpy(), 
    label, 
    10, 
    'mrna', 
    'r_function_with_mannwhitney_mrmr', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# mRNA mannwhitney + boruta on whole dataset

In [None]:
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # The Boruta result is a list of column names: select them from the
    # Mann-Whitney table to obtain the data matrix.
    mrna_mannwhitney[mrna_selected_features_boruta].to_numpy(), 
    label, 
    10, 
    'mrna', 
    # Write to the boruta folder; the mrmr folder used before made this run
    # overwrite the mRMR results of the same dataset.
    'r_function_with_mannwhitney_boruta', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# miRNA only mannwhitney on whole dataset

In [None]:
# miRNA, Mann-Whitney features selected on all samples; relies on the output folder created in the proteins cell.
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    mirna_mannwhitney.to_numpy(), 
    label, 10, 'mirna', 'r_function_mannwhtiney', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# miRNA mannwhitney + mrmr on whole dataset

In [None]:
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # mirna_selected_features_mrmr holds column names: select them from the
    # Mann-Whitney table to obtain the data matrix.
    mirna_mannwhitney[mirna_selected_features_mrmr].to_numpy(), 
    label, 
    10, 
    'mirna', 
    'r_function_with_mannwhitney_mrmr', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# miRNA mannwhitney + boruta on whole dataset

In [None]:
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # The Boruta result is a list of column names: select them from the
    # Mann-Whitney table to obtain the data matrix.
    mirna_mannwhitney[mirna_selected_features_boruta].to_numpy(), 
    label, 
    10, 
    'mirna', 
    # Write to the boruta folder; the mrmr folder used before made this run
    # overwrite the mRMR results of the same dataset.
    'r_function_with_mannwhitney_boruta', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# cnv only mannwhitney on whole dataset

In [None]:
# CNV, Mann-Whitney features selected on all samples; relies on the output folder created in the proteins cell.
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    cnv_mannwhitney.to_numpy(), 
    label, 10, 'cnv', 'r_function_mannwhtiney', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# cnv mannwhitney + mrmr on whole dataset

In [None]:
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # cnv_selected_features_mrmr holds column names: select them from the
    # Mann-Whitney table to obtain the data matrix.
    cnv_mannwhitney[cnv_selected_features_mrmr].to_numpy(), 
    label, 
    10, 
    'cnv', 
    'r_function_with_mannwhitney_mrmr', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# cnv mannwhitney + boruta on whole dataset

In [None]:
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # The Boruta result is a list of column names: select them from the
    # Mann-Whitney table to obtain the data matrix.
    cnv_mannwhitney[cnv_selected_features_boruta].to_numpy(), 
    label, 
    10, 
    'cnv', 
    # Write to the boruta folder; the mrmr folder used before made this run
    # overwrite the mRMR results of the same dataset.
    'r_function_with_mannwhitney_boruta', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# Whole dataset (cnv+proteins+mirna+mrna) only mannwhitney on whole dataset

In [None]:
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    whole_mannwhitney.to_numpy(), 
    # Label the outputs 'whole_dataset': the previous 'cnv' made this run
    # overwrite the files of the CNV-only experiment.
    label, 10, 'whole_dataset', 'r_function_mannwhtiney', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# Whole dataset (cnv+proteins+mirna+mrna) mannwhitney + mrmr on whole dataset

In [None]:
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # whole_dataset_selected_features_mrmr holds column names: select them
    # from the Mann-Whitney table to obtain the data matrix.
    whole_mannwhitney[whole_dataset_selected_features_mrmr].to_numpy(), 
    label, 
    10, 
    # 'whole_dataset' instead of 'cnv', which overwrote the CNV-only results.
    'whole_dataset', 
    'r_function_with_mannwhitney_mrmr', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')

# Whole dataset (cnv+proteins+mirna+mrna) mannwhitney + boruta on whole dataset

In [None]:
train_using_mannwhitney_and_boruta_or_mrmr_whole_dataset(
    # The Boruta result is a list of column names: select them from the
    # Mann-Whitney table to obtain the data matrix.
    whole_mannwhitney[whole_dataset_selected_features_boruta].to_numpy(), 
    label, 
    10, 
    # 'whole_dataset' instead of 'cnv', and the boruta folder instead of the
    # mrmr one: the previous values overwrote the CNV mRMR results.
    'whole_dataset', 
    'r_function_with_mannwhitney_boruta', 
    'Experiments_using_R_Function_mannwhitney_and_multivariate_feature_selection_whole_dataset/')