In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import rpy2.robjects as robjects
from sklearn.preprocessing import MinMaxScaler
from rpy2.robjects.packages import importr
from rpy2.robjects import pandas2ri
from rpy2.robjects.conversion import localconverter
import skdim
import os.path
from boruta import BorutaPy
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import auc, precision_recall_curve, average_precision_score, PrecisionRecallDisplay
import umap.umap_ as umap
from tqdm import tqdm
import json
import numpy as np
import math
from scipy.stats import mannwhitneyu
from json import JSONEncoder
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from rpy2.robjects import pandas2ri
from sklearn.manifold import TSNE

#pandas2ri.activate()

  from pandas.core.index import Index as PandasIndex


# Import data

In [2]:
# Load the `cnv` table from its space-separated CSV file.
cnv = pd.read_csv('dataset_hg19/dataset/cnv.csv', sep=' ')

In [3]:
# Load the `mirna` table from its space-separated CSV file.
mirna = pd.read_csv('dataset_hg19/dataset/mirna.csv', sep=' ')

In [4]:
# Load the `mrna` table from its space-separated CSV file.
mrna = pd.read_csv('dataset_hg19/dataset/mrna.csv', sep=' ')

In [5]:
# Load the `proteins` table, then display how many columns it has.
proteins = pd.read_csv('dataset_hg19/dataset/proteins.csv', sep=' ')
len(proteins.columns)

216

# Import labels

In [6]:
# Load the label table; the training functions further down read its 'x'
# column as the classification target.
label = pd.read_csv('dataset_hg19/dataset/labels_pfi.csv', sep=' ')

# Normalize data

In [7]:
def dataNormalization(scaler, data):
    """Scale every column of `data` with a freshly built scaler.

    Parameters
    ----------
    scaler : callable
        Zero-argument factory (e.g. a scikit-learn scaler class) whose result
        offers `fit(data)` returning the fitted object and `transform(data)`.
    data : pd.DataFrame
        Frame to scale; it is used both to fit and to transform.

    Returns
    -------
    pd.DataFrame
        The transformed values carrying the column labels and index of `data`.
    """
    fitted_scaler = scaler().fit(data)
    return pd.DataFrame(
        fitted_scaler.transform(data),
        index=data.index,
        columns=data.columns,
    )

# Filter features with low variability

In [8]:
def filter_features_with_low_variability(data, alfa) -> pd.DataFrame:
    """Return `data` without the columns whose standard deviation is below `alfa`.

    The input frame is left untouched: a new frame is returned instead of
    dropping columns in place, so re-running a cell on the same frame gives
    the same result.

    Parameters
    ----------
    data : pd.DataFrame
        Frame with one feature per column.
    alfa : float
        Threshold on the per-column standard deviation (pandas default,
        i.e. the sample standard deviation).

    Returns
    -------
    pd.DataFrame
        Columns with `std >= alfa`. A column whose standard deviation is NaN
        (e.g. a single-row frame) is kept, because `NaN < alfa` is False.
    """
    column_std = data.std()
    # Select by label so that only columns that were actually measured as
    # low-variability are removed.
    low_variability_columns = column_std.index[(column_std < alfa).to_numpy()]
    return data.drop(columns=low_variability_columns)

# Normalize and filter datasets

In [9]:
# Map each dataset name to its raw DataFrame; the keys are also used as the
# file names of the on-disk cache written by store_filtered_normalize_data.
dataset_dict = {'cnv': cnv, 'mirna': mirna, 'mrna': mrna, 'proteins': proteins}

In [10]:
def create_data_normalized_filtered(data_dict, scaler, min_variance):
    """Normalize and variability-filter every dataset except 'cnv'.

    Parameters
    ----------
    data_dict : dict
        Maps a dataset name to its DataFrame.
    scaler : callable
        Scaler factory forwarded to `dataNormalization`.
    min_variance : float
        Threshold forwarded to `filter_features_with_low_variability`, which
        compares it against each column's standard deviation.

    Returns
    -------
    dict
        Same keys as `data_dict`. The 'cnv' entry is passed through
        unchanged; every other entry is scaled and then filtered.
    """
    normalized_filtered = {}
    for name, frame in data_dict.items():
        if name == 'cnv':
            normalized_filtered[name] = frame
        else:
            # Use the caller's scaler and threshold; they used to be ignored
            # in favour of hard-coded MinMaxScaler / 0.05.
            normalized_filtered[name] = filter_features_with_low_variability(
                dataNormalization(scaler, frame), min_variance)
    return normalized_filtered

# Store and retrieve the normalized and filtered data

In [11]:
def mkdir_p(path):
    """Create directory `path` together with any missing parents.

    Does nothing when the directory already exists. An OSError
    (FileExistsError) is still raised when `path` exists but is not a
    directory.

    The previous implementation referenced `errno`, which this notebook never
    imports, so calling it on an existing directory raised NameError.
    """
    os.makedirs(path, exist_ok=True)

In [12]:
def store_filtered_normalize_data(dataset_dict):
    """Normalize/filter the datasets and write each one to a JSON file.

    The frames are produced by `create_data_normalized_filtered` with
    `MinMaxScaler` and a threshold of 0.05, and are written to
    'filtered_normalize_data/<name>.json', one file per key of `dataset_dict`.
    """
    mkdir_p('filtered_normalize_data')
    processed = create_data_normalized_filtered(dataset_dict, MinMaxScaler, 0.05)
    for name in dataset_dict:
        target_path = 'filtered_normalize_data/' + name + '.json'
        processed[name].to_json(target_path)

In [13]:
def load_filtered_normalize_data(dataset_dict):
    """Load the normalized/filtered datasets from the JSON cache.

    When any expected cache file is missing, the cache is (re)built with
    `store_filtered_normalize_data` before loading.

    Parameters
    ----------
    dataset_dict : dict
        Maps a dataset name to its raw DataFrame; the keys select the files
        'filtered_normalize_data/<name>.json'.

    Returns
    -------
    dict
        Maps every key of `dataset_dict` to the DataFrame read from its file.
        The previous version returned None on the run that created the cache,
        because the result of the recursive call was discarded.
    """
    cache_paths = {name: 'filtered_normalize_data/' + name + '.json'
                   for name in dataset_dict}

    # Check the individual files, not only the directory: a partially written
    # cache would otherwise fail with FileNotFoundError.
    if not all(os.path.exists(cache_path) for cache_path in cache_paths.values()):
        store_filtered_normalize_data(dataset_dict)

    loaded = {}
    for name, cache_path in cache_paths.items():
        with open(cache_path, 'r') as fp:
            loaded[name] = pd.read_json(fp)
    return loaded

In [14]:
# Same name -> raw DataFrame mapping as defined earlier in the notebook.
dataset_dict = {'cnv': cnv, 'mirna': mirna, 'mrna': mrna, 'proteins': proteins}

# Normalized/filtered frames, keyed by dataset name.
datasets = load_filtered_normalize_data(dataset_dict)

In [15]:
# Progress marker (Italian for "data loaded successfully").
print('DATI CARICATI CON SUCCESSO')

DATI CARICATI CON SUCCESSO


In [16]:
# Repeats the load performed two cells earlier and rebinds `datasets`.
datasets = load_filtered_normalize_data(dataset_dict)

# Concatenate all datasets

In [17]:
# Column-wise concatenation (axis=1) of the four datasets.
whole_dataset = pd.concat([datasets['cnv'], datasets['proteins'], datasets['mrna'], datasets['mirna']], axis=1)

In [18]:
# Register the combined frame alongside the single-source datasets.
datasets['whole_dataset'] = whole_dataset

# Concatenate dataset subsets: proteins+mirna, proteins+mirna+mrna

In [19]:
# Column-wise concatenation of the proteins and mirna frames, registered in `datasets`.
proteins_mirna_dataset = pd.concat([datasets['proteins'], datasets['mirna']], axis=1)
datasets['proteins_mirna_dataset'] = proteins_mirna_dataset

In [20]:
# Column-wise concatenation of the proteins, mirna and mrna frames, registered in `datasets`.
proteins_mirna_mrna_dataset = pd.concat([datasets['proteins'], datasets['mirna'], datasets['mrna']], axis=1)
datasets['proteins_mirna_mrna_dataset'] = proteins_mirna_mrna_dataset

# Remove correlated features using R code

In [21]:
# Install the R package caret only when it is missing. An unconditional
# install.packages() downloads and rebuilds the package from source on every
# run of the notebook.
robjects.r("""
if (!requireNamespace('caret', quietly = TRUE)) {
  install.packages('caret')
}
""")

R[write to console]: Installazione pacchetto in ‘/usr/local/lib/R/site-library’
(perché ‘lib’ non è specificato)

R[write to console]: apertura URL 'https://cloud.r-project.org/src/contrib/caret_6.0-93.tar.gz'

R[write to console]: Content type 'application/x-gzip'
R[write to console]:  length 2273775 bytes (2.2 MB)

R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to console]: =
R[write to con

gcc -I"/usr/share/R/include" -DNDEBUG      -fpic  -g -O2 -ffile-prefix-map=/build/r-base-Faorqz/r-base-4.2.2.20221110=. -fstack-protector-strong -Wformat -Werror=format-security -Wdate-time -D_FORTIFY_SOURCE=2  -c caret.c -o caret.o
gcc -shared -L/usr/lib/R/lib -Wl,-Bsymbolic-functions -flto=auto -ffat-lto-objects -flto=auto -Wl,-z,relro -o caret.so caret.o -L/usr/lib/R/lib -lR


** help
*** installing help indices
** building package indices
** installing vignettes
** testing if installed package can be loaded from temporary location
** checking absolute paths in shared objects and dynamic libraries
** testing if installed package can be loaded from final location
** testing if installed package keeps a record of temporary installation path
* DONE (caret)
R[write to console]: 

R[write to console]: 
R[write to console]: I pacchetti scaricati con il codice sorgente sono in
	‘/tmp/RtmpLs5nK2/downloaded_packages’
R[write to console]: 
R[write to console]: 



<rpy2.rinterface.NULLType object at 0x7fe8485c4ac0> [RTYPES.NILSXP]

In [22]:
# Attach the R package doParallel in the embedded R session; as the log below
# shows, this also attaches foreach, iterators and parallel.
robjects.r("""
library(doParallel)
""")

R[write to console]: Caricamento del pacchetto richiesto: foreach

R[write to console]: Caricamento del pacchetto richiesto: iterators

R[write to console]: Caricamento del pacchetto richiesto: parallel



0,1,2,3,4,5,6
'doParall...,'parallel','iterators',...,'datasets','methods','base'


In [23]:
robjects.r("""
remove_correlated_par <- function(df, dim_split = 1000, maxiter = 5, method = "pearson", cutoff = 0.8){
  
  #df has features on columns
  X = t(df)
  niter = 0
  if (!is.finite(dim_split)) dim_split = nrow(X)
  cat("dim(X) before starting remove correlation: ", dim(X), "\n")
  while(niter < maxiter ){
    cat("niter = ", niter, '\n')
    filtered_X = NULL
    
    cl <- makeCluster(4)
    registerDoParallel(cl)
    
    filtered_X = foreach(nR = seq(1, nrow(X), by=dim_split), 
                         .combine='rbind', .packages = c("caret")) %dopar% {
                           
         subX = X[nR:min(nrow(X), (nR+dim_split-1)), ]
         cc = cor(t(subX), use = "pairwise.complete.obs", method = "kendall")
         
         select_corr = caret::findCorrelation(cc, cutoff = cutoff, exact = FALSE)
         #print(length(select_corr))
         
         if (length(select_corr)>0){ 
           subX = subX[-select_corr, ]
         }
         
         return(subX)
           
     }
    
    stopCluster(cl)
    
    print(names(filtered_X))
    #    if (length(unique(filtered_X[,1]))>1) cat('PROBLEMA!')
    no_removed = nrow(X)-nrow(filtered_X)
    cat('Removed = ', no_removed, '\n')
    cat('dim filetered_X =', dim(filtered_X), '\n')

    X = filtered_X[sample(nrow(filtered_X)), ]
    
    niter = niter + 1
    cat("nrow(X) =", dim(X), '\n')
    if (no_removed ==0) break;
  }
  cat('final dimension = ', dim(t(X)), '\n')
  
  
  return(t(X))
    }
    """)

R object with classes: ('function',) mapped to:

In [24]:
# Convert the proteins frame to an R object using the pandas2ri conversion rules.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_proteins = robjects.conversion.py2rpy(datasets['proteins'])

In [25]:
# Convert the mirna frame to an R object using the pandas2ri conversion rules.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_miRNA = robjects.conversion.py2rpy(datasets['mirna'])

In [26]:
# Convert the mrna frame to an R object using the pandas2ri conversion rules.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_mRNA = robjects.conversion.py2rpy(datasets['mrna'])

In [27]:
# Convert the cnv frame to an R object using the pandas2ri conversion rules.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_cnv = robjects.conversion.py2rpy(datasets['cnv'])

In [28]:
# Convert the concatenation of all four datasets to an R object.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_whole_dataset = robjects.conversion.py2rpy(datasets['whole_dataset'])

In [29]:
# Convert the proteins+mirna concatenation to an R object.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_proteins_mirna_dataset = robjects.conversion.py2rpy(datasets['proteins_mirna_dataset'])

In [30]:
# Convert the proteins+mirna+mrna concatenation to an R object.
with localconverter(robjects.default_converter + pandas2ri.converter):
    r_proteins_mirna_mrna_dataset = robjects.conversion.py2rpy(datasets['proteins_mirna_mrna_dataset'])

# Store data locally

In [31]:
def compute_and_save_result(data, path):
    """Run the R helper `remove_correlated_par` on `data` and save the result as JSON.

    Parameters
    ----------
    data : R object
        Passed unchanged to `remove_correlated_par`, which is looked up in
        R's global environment.
    path : str
        Destination JSON file. Its parent directory is created when missing;
        previously a missing directory made `to_json` fail only after the R
        computation had already finished.
    """
    parent_dir = os.path.dirname(path)
    if parent_dir:
        # Fail (or succeed) on the filesystem before the expensive R call.
        os.makedirs(parent_dir, exist_ok=True)

    data_removed = robjects.globalenv["remove_correlated_par"](data)
    # The frame is rebuilt from the raw values and the R column names only, so
    # it gets a default integer index.
    df = pd.DataFrame(data=np.array(data_removed), columns=data_removed.colnames)
    df.to_json(path)
    
def restore_result(file_name):
    """Read 'elaborated_data_with_r_function/<file_name>.json' into a DataFrame."""
    cached_json = f'elaborated_data_with_r_function/{file_name}.json'
    return pd.read_json(cached_json)

def load_or_store_removed_correlated(data, file_name):
    """Return the correlation-filtered frame for `file_name`, computing it on a cache miss.

    When 'elaborated_data_with_r_function/<file_name>.json' does not exist,
    `compute_and_save_result` creates it from `data`; in both cases the frame
    is then read back from that file with `restore_result`.
    """
    cache_path = f'elaborated_data_with_r_function/{file_name}.json'
    if not os.path.exists(cache_path):
        compute_and_save_result(data, cache_path)
    return restore_result(file_name)

In [32]:
# Correlation-filtered proteins features (read from the JSON cache when present).
df_proteins_removed = load_or_store_removed_correlated(r_proteins, 'r_proteins_removed')

In [33]:
# Correlation-filtered mirna features (read from the JSON cache when present).
df_mirna_removed = load_or_store_removed_correlated(r_miRNA, 'r_mirna_removed')

In [34]:
# Correlation-filtered mrna features (read from the JSON cache when present).
df_mrna_removed = load_or_store_removed_correlated(r_mRNA, 'r_mrna_removed')

In [35]:
# Correlation-filtered cnv features (read from the JSON cache when present).
df_cnv_removed = load_or_store_removed_correlated(r_cnv, 'r_cnv_removed')

In [36]:
# Correlation-filtered features of the four-dataset concatenation.
df_dataset_whole_dataset_removed = load_or_store_removed_correlated(r_whole_dataset, 'r_dataset_whole_dataset_removed')

In [37]:
# Correlation-filtered features of the proteins+mirna concatenation; the log
# below comes from the R helper because no cached file existed on this run.
df_dataset_proteins_mirna_removed = load_or_store_removed_correlated(r_proteins_mirna_dataset, 'r_dataset_proteins_mirna_removed')

dim(X) before starting remove correlation:  989 627 
niter =  0 
NULL
Removed =  12 
dim filetered_X = 977 627 
nrow(X) = 977 627 
niter =  1 
NULL
Removed =  0 
dim filetered_X = 977 627 
nrow(X) = 977 627 
final dimension =  627 977 


In [38]:
# Display the filtered proteins+mirna frame.
df_dataset_proteins_mirna_removed

Unnamed: 0,hsa.mir.326,hsa.mir.505,hsa.mir.3116.1,SHP.2_pY542,hsa.mir.708,hsa.mir.181d,hsa.mir.638,hsa.mir.3133,hsa.mir.105.1,Fibronectin,...,hsa.mir.181b.2,mTOR_pS2448,AMPK_pT172,hsa.mir.1291,hsa.mir.549,hsa.mir.3929,hsa.mir.2115,hsa.mir.1245,hsa.mir.3676,hsa.mir.3928
0,0.015493,0.072874,0.0,0.202014,0.154311,0.029491,0,0.0,0.043026,0.782654,...,0.090909,0.532247,0.609486,0.057143,0.090909,0.0,0.183879,0.072,0.005988,0.070707
1,0.015493,0.065587,0.0,0.200541,0.102927,0.036640,0,0.0,0.000000,0.493293,...,0.026608,0.540128,0.797494,0.057143,0.363636,0.0,0.032746,0.112,0.009980,0.020202
2,0.011268,0.054251,0.0,0.243913,0.189628,0.084540,0,0.0,0.000236,0.746711,...,0.023282,0.570756,0.540565,0.114286,0.000000,0.0,0.000000,0.200,0.043912,0.090909
3,0.022535,0.105263,0.0,0.255635,0.210150,0.018409,0,0.5,0.000236,0.423429,...,0.035477,0.618668,0.715885,0.142857,0.000000,0.0,0.017632,0.208,0.013972,0.070707
4,0.132394,0.044534,0.0,0.152205,0.062997,0.029312,0,0.5,0.000473,0.297057,...,0.060976,0.619638,0.417004,0.000000,0.000000,0.0,0.000000,0.000,0.065868,0.030303
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
622,0.012676,0.051822,0.0,0.308738,0.040566,0.021269,0,0.0,0.000000,1.000000,...,0.005543,0.556632,0.248128,0.000000,0.000000,0.0,0.042821,0.168,0.005988,0.020202
623,0.043662,0.133603,0.0,0.297456,0.178015,0.029491,0,0.0,0.004255,0.442900,...,0.150776,0.636403,0.535566,0.028571,0.090909,0.0,0.000000,0.144,0.000000,0.050505
624,0.084507,0.102024,0.0,0.326805,0.092905,0.050402,0,1.0,0.000473,0.386972,...,0.029933,0.616971,0.652278,0.000000,0.090909,0.0,0.032746,0.208,0.005988,0.101010
625,0.023944,0.159514,0.0,0.317352,0.179605,0.031457,0,0.0,0.000000,0.348027,...,0.042129,0.591668,0.832184,0.085714,0.090909,0.0,0.068010,0.104,0.027944,0.050505


In [39]:
# Correlation-filtered features of the proteins+mirna+mrna concatenation.
# NOTE(review): in the recorded run this cell stopped with
# `RRuntimeError: StopIteration` during the first pass (see the output below),
# and every later cell is unexecuted (`In [None]`).
df_dataset_proteins_mirna_mrna_removed = load_or_store_removed_correlated(r_proteins_mirna_mrna_dataset, 'r_dataset_proteins_mirna_mrna_removed')

dim(X) before starting remove correlation:  19413 627 
niter =  0 


R[write to console]: 



RRuntimeError: StopIteration

In [None]:
# Display the filtered proteins+mirna+mrna frame.
df_dataset_proteins_mirna_mrna_removed

In [None]:
# Progress marker (Italian for "data processed with the R function, correlated features removed").
print('DATI MANIPOLATI CON LA FUNZIONE R ELIMINANDO LE FEATURE CORRELATE')

# Intrinsic dimension estimation using the TwoNN algorithm

In [None]:
def execute_instrinsic_dimension_estimation(data):
    """Estimate the intrinsic dimension of `data` with skdim's TwoNN estimator.

    Two independent TwoNN estimators are fitted: one on the whole data set and
    one pointwise (`fit_pw`) with 100 neighbours and a single job.

    Returns
    -------
    tuple
        `(dimension_, mean of dimension_pw_)`: the global estimate and the
        average of the pointwise estimates.
    """
    global_estimator = skdim.id.TwoNN().fit(data)

    # pointwise estimates, computed in the neighbourhood of each point
    pointwise_estimator = skdim.id.TwoNN().fit_pw(data, n_neighbors=100, n_jobs=1)

    global_dimension = global_estimator.dimension_
    mean_pointwise_dimension = np.mean(pointwise_estimator.dimension_pw_)
    return global_dimension, mean_pointwise_dimension

In [None]:
# Global TwoNN estimate for the proteins features; the averaged pointwise estimate is discarded.
intr_dimension_proteins, _ = execute_instrinsic_dimension_estimation(df_proteins_removed)

In [None]:
# Global TwoNN estimate for the mirna features; the averaged pointwise estimate is discarded.
intr_dimension_mirna, _ = execute_instrinsic_dimension_estimation(df_mirna_removed)

In [None]:
# Global TwoNN estimate for the mrna features; the averaged pointwise estimate is discarded.
intr_dimension_mrna, _ = execute_instrinsic_dimension_estimation(df_mrna_removed)

In [None]:
# Global TwoNN estimate for the cnv features; the averaged pointwise estimate is discarded.
intr_dimension_cnv, _ = execute_instrinsic_dimension_estimation(df_cnv_removed)

In [None]:
# Global TwoNN estimate for the four-dataset concatenation.
intr_dimension_whole_dataset, _ = execute_instrinsic_dimension_estimation(df_dataset_whole_dataset_removed)

In [None]:
# Global TwoNN estimate for the proteins+mirna concatenation.
intr_dimension_proteins_mirna_dataset, _ = execute_instrinsic_dimension_estimation(df_dataset_proteins_mirna_removed)

In [None]:
# Global TwoNN estimate for the proteins+mirna+mrna concatenation.
intr_dimension_proteins_mirna_mrna_dataset, _ = execute_instrinsic_dimension_estimation(df_dataset_proteins_mirna_mrna_removed)

In [None]:
# Progress marker (Italian for "intrinsic dimensions computed").
print('CALCOLATE LE DIMENSIONI INTRINSECHE')

# Feature extraction using UMAP

In [None]:
def execute_umap(n_components, X, y):
    """Embed `X` into `n_components` dimensions with supervised UMAP.

    Parameters
    ----------
    n_components : int
        Dimension of the embedding.
    X : array-like
        Data to embed, one sample per row.
    y : array-like or None
        Target values forwarded to `fit_transform`. UMAP documents the target
        as a 1-D array of shape (n_samples,), so a single-column 2-D input
        (for example a one-column DataFrame) is flattened first.

    Returns
    -------
    The embedding returned by `UMAP.fit_transform` (n_neighbors=5,
    random_state=42).
    """
    if y is not None:
        y = np.asarray(y)
        if y.ndim == 2 and y.shape[1] == 1:
            y = y.ravel()

    reducer = umap.UMAP(n_neighbors=5, n_components=n_components, random_state=42)
    return reducer.fit_transform(X, y)

In [None]:
# UMAP embedding of the proteins features; the truncated intrinsic dimension sets the number of components.
r_proteins_umap = execute_umap(math.trunc(intr_dimension_proteins), df_proteins_removed, label)

In [None]:
# UMAP embedding of the mirna features; the truncated intrinsic dimension sets the number of components.
r_miRNA_umap = execute_umap(math.trunc(intr_dimension_mirna), df_mirna_removed, label)

In [None]:
# UMAP embedding of the mrna features; the truncated intrinsic dimension sets the number of components.
r_mRNA_umap = execute_umap(math.trunc(intr_dimension_mrna), df_mrna_removed, label)

In [None]:
# UMAP embedding of the cnv features; the truncated intrinsic dimension sets the number of components.
r_cnv_umap = execute_umap(math.trunc(intr_dimension_cnv), df_cnv_removed, label)

In [None]:
# UMAP embedding of the four-dataset concatenation.
r_dataset_whole_dataset_umap = execute_umap(math.trunc(intr_dimension_whole_dataset), df_dataset_whole_dataset_removed, label)

In [None]:
# UMAP embedding of the proteins+mirna concatenation.
r_dataset_proteins_mirna_dataset_umap = execute_umap(math.trunc(intr_dimension_proteins_mirna_dataset), df_dataset_proteins_mirna_removed, label)

In [None]:
# UMAP embedding of the proteins+mirna+mrna concatenation.
r_dataset_proteins_mirna_mrna_dataset_umap = execute_umap(math.trunc(intr_dimension_proteins_mirna_mrna_dataset), df_dataset_proteins_mirna_mrna_removed, label)

In [None]:
# Progress marker (Italian for "features extracted using UMAP").
print('ESTRATTE LE FEATURE USANDO UMAP')

# Train using intrinsic dimensionality and UMAP computed on the full dataset (outside the cross-validation folds)

In [None]:
def train_using_intrinsic_dimensionality_and_umap(X, label, num_external_fold,
                                          type_dataset, tecnique_feature_selection,
                                          path):
    
    """
    Cross-validate a grid-searched random forest on a precomputed embedding.

    X : data using number of features of intrinsic dimensionality and appliying umap features extraction.
        It is indexed with integer index arrays, so it must support
        NumPy-style row selection.
    label : object whose 'x' column holds the target values.
    num_external_fold : number of stratified folds (shuffled, random_state=1).
    type_dataset : dataset name, used in the output file names and JSON.
    tecnique_feature_selection : sub-directory name under `path`, also the
        top-level key of the JSON summary.
    path : prefix of the output directory; it is concatenated as-is with
        `tecnique_feature_selection`, so it should end with a separator.

    For every fold a GridSearchCV over the module-level
    `model_selection_grid_DT` (2-fold inner CV, 'average_precision' scoring)
    is fitted on the training part and scored on the held-out part. One
    precision-recall PDF is written per fold, plus one for the pooled
    predictions and a JSON summary.

    Returns (mean per-fold PR-AUC, PR-AUC of the pooled predictions, best
    estimator of the LAST fold); the two AUCs are strings formatted with four
    decimals.

    The output directory is created when missing, and per-fold figures are
    closed after being saved so repeated calls do not accumulate open
    matplotlib figures. The pooled figure is left open so the notebook still
    displays it.
    """
    
    output_dir = path + tecnique_feature_selection
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_prefix = output_dir + '/' + type_dataset
    
    label_numpy = label['x'].to_numpy()
    
    external_fold = StratifiedKFold(n_splits=num_external_fold, shuffle=True, random_state=1)
    
    aucs = []
    fold_scores = []
    fold_targets = []
    
    folds = tqdm(external_fold.split(X, label_numpy), total=num_external_fold,
                 desc="Running fold ", dynamic_ncols=True, leave=False)
    
    for fold_number, (train_index, test_index) in enumerate(folds, start=1):
        
        X_train_fold, X_test_fold = X[train_index], X[test_index]
        y_train_fold, y_test_fold = label_numpy[train_index], label_numpy[test_index]
        
        clf = GridSearchCV(estimator=RandomForestClassifier(),
                   scoring='average_precision',
                   param_grid=model_selection_grid_DT, 
                   cv=2, 
                   n_jobs=-1)
        clf.fit(X_train_fold, y_train_fold)
        
        # Predicted probability of the positive class on the held-out fold
        y_score = clf.predict_proba(X_test_fold)[:, 1]
        
        # Area under this fold's precision-recall curve
        precision, recall, _ = precision_recall_curve(y_test_fold, y_score)
        aucs.append(auc(recall, precision))
        
        fold_scores.append(y_score)
        fold_targets.append(y_test_fold)
        
        fold_display = PrecisionRecallDisplay.from_predictions(y_test_fold, y_score)
        fold_display.figure_.savefig(output_prefix+'_'+str(fold_number)+'_AUPRC_Internal.pdf')
        # pyplot keeps every figure alive until it is closed explicitly
        plt.close(fold_display.figure_)
    
    internal_auc = np.mean(aucs)
    
    # Pool the held-out predictions of all folds into one curve
    y_score_concatenated = np.concatenate(fold_scores, axis=None)
    y_test_fold_concatenated = np.concatenate(fold_targets, axis=None)
    precision, recall, _ = precision_recall_curve(y_test_fold_concatenated, y_score_concatenated)
    external_auc = auc(recall, precision)
    
    pooled_display = PrecisionRecallDisplay.from_predictions(y_test_fold_concatenated, y_score_concatenated)
    pooled_display.figure_.savefig(output_prefix+'_'+'AUPRC_External.pdf')
    
    internal_text = '{0:.4f}'.format(internal_auc)
    external_text = '{0:.4f}'.format(external_auc)
    
    summary = {
        tecnique_feature_selection: {
            'internal': internal_text,
            'external': external_text,
            # `clf` is the search fitted on the last fold only
            'best_estimator': str(clf.best_estimator_),
        }
    }
    
    with open(output_prefix+'.json', 'w', encoding='utf-8') as fp:
        json.dump(summary, fp, ensure_ascii=False, indent=4)
    
    return internal_text, external_text, clf.best_estimator_

In [None]:
# Hyper-parameter grid read as a global by both training functions, where it
# is passed to GridSearchCV around a RandomForestClassifier.
# 2 criteria x 4 max_leaf_nodes x 3 max_features x 4 n_estimators = 96 candidates.
model_selection_grid_DT = [
    {'criterion': ['gini', 'entropy'],
     'max_leaf_nodes': [None, 2, 5, 10],
     'max_features': [None, 'sqrt', 'log2'],
    'n_estimators': [51, 101, 251, 500]}
]

# Train proteins

In [None]:
# Create the output directory from Python: `!mkdir` prints an error when the
# directory already exists (every re-run) and depends on a POSIX shell.
os.makedirs('result_Experiments_using_R_Function_intrinsic_and_umap_whole_dataset/r_function_intrinsic_with_umap', exist_ok=True)
# 10-fold experiment on the proteins UMAP embedding.
train_using_intrinsic_dimensionality_and_umap(r_proteins_umap, label, 10, 'proteins', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_intrinsic_and_umap_whole_dataset/')

In [None]:
# Progress marker (Italian for "training finished: proteins").
print('FINITO ADDESTRAMENTO PROTEINS')

# Train miRNA

In [None]:
# 10-fold experiment on the mirna UMAP embedding; outputs go under the
# directory given as the last argument.
train_using_intrinsic_dimensionality_and_umap(r_miRNA_umap, label, 10, 'mirna', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_intrinsic_and_umap_whole_dataset/')

# Progress marker (Italian for "training finished").
print('FINITO ADDESTRAMENTO MIRNA')

# Train mRNA

In [None]:
# 10-fold experiment on the mrna UMAP embedding; outputs go under the
# directory given as the last argument.
train_using_intrinsic_dimensionality_and_umap(r_mRNA_umap, label, 10, 'mrna', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_intrinsic_and_umap_whole_dataset/')

# Progress marker (Italian for "training finished").
print('FINITO ADDESTRAMENTO MRNA')

# Train cnv

In [None]:
# 10-fold experiment on the cnv UMAP embedding; outputs go under the
# directory given as the last argument.
train_using_intrinsic_dimensionality_and_umap(r_cnv_umap, label, 10, 'cnv', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_intrinsic_and_umap_whole_dataset/')

# Progress marker (Italian for "training finished").
print('FINITO ADDESTRAMENTO CNV')

# Train whole dataset

In [None]:
# 10-fold experiment on the whole-dataset UMAP embedding. The embedding is
# stored in `r_dataset_whole_dataset_umap`; the name `r_whole_dataset_umap`
# used here before is never defined and raised NameError.
train_using_intrinsic_dimensionality_and_umap(r_dataset_whole_dataset_umap, label, 10, 'whole_dataset', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_intrinsic_and_umap_whole_dataset/')

# Progress marker (Italian for "training finished").
print('FINITO ADDESTRAMENTO WHOLE DATASET')

# Train proteins + mirna

In [None]:
# 10-fold experiment on the proteins+mirna UMAP embedding; outputs go under
# the directory given as the last argument.
train_using_intrinsic_dimensionality_and_umap(r_dataset_proteins_mirna_dataset_umap, label, 10, 'proteins_mirna_dataset', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_intrinsic_and_umap_whole_dataset/')

# Progress marker (Italian for "training finished").
print('FINITO ADDESTRAMENTO PROTEINS+MIRNA DATASET')

# Train proteins+mirna+mrna

In [None]:
# 10-fold experiment on the proteins+mirna+mrna UMAP embedding; outputs go
# under the directory given as the last argument.
train_using_intrinsic_dimensionality_and_umap(r_dataset_proteins_mirna_mrna_dataset_umap, label, 10, 'proteins_mirna_mrna_dataset', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_intrinsic_and_umap_whole_dataset/')

# Progress marker (Italian for "training finished").
print('FINITO ADDESTRAMENTO PROTEINS+MIRNA+MRNA DATASET')

# Train using intrinsic dimensionality and UMAP computed inside each cross-validation fold

In [None]:
def train_using_intrinsic_dimensionality_and_umap_internally(X, label, num_external_fold,
                                          type_dataset, tecnique_feature_selection,
                                          path):
    
    """
    Cross-validate a random forest, estimating the intrinsic dimension and
    fitting UMAP inside every fold.

    X : DataFrame of features (converted with `to_numpy()`); unlike the
        "external" variant it is NOT yet reduced with UMAP.
    label : object whose 'x' column holds the target values.
    num_external_fold : number of stratified folds (shuffled, random_state=1).
    type_dataset : dataset name, used in the output file names and JSON.
    tecnique_feature_selection : sub-directory name under `path`, also the
        top-level key of the JSON summary.
    path : prefix of the output directory; it is concatenated as-is with
        `tecnique_feature_selection`, so it should end with a separator.

    Per fold: the TwoNN intrinsic dimension is estimated on the training part,
    a supervised UMAP (n_neighbors=5, random_state=42) with that many
    components is fitted on the training part only, and the held-out part is
    projected with the fitted reducer's `transform`. Previously a second,
    independent UMAP was fitted on the held-out part using its own labels,
    which leaks the test labels and places the test points in an embedding
    space unrelated to the one the classifier was trained on.

    A GridSearchCV over the module-level `model_selection_grid_DT` is then
    fitted on the training embedding. One precision-recall PDF is written per
    fold, plus one for the pooled predictions and a JSON summary. The output
    directory is created when missing and per-fold figures are closed after
    saving; the pooled figure is left open so the notebook still displays it.

    Returns (mean per-fold PR-AUC, PR-AUC of the pooled predictions, best
    estimator of the LAST fold); the two AUCs are strings formatted with four
    decimals.
    """
    
    output_dir = path + tecnique_feature_selection
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)
    output_prefix = output_dir + '/' + type_dataset
    
    X_numpy = X.to_numpy()
    label_numpy = label['x'].to_numpy()
    
    external_fold = StratifiedKFold(n_splits=num_external_fold, shuffle=True, random_state=1)
    
    aucs = []
    fold_scores = []
    fold_targets = []
    
    folds = tqdm(external_fold.split(X_numpy, label_numpy), total=num_external_fold,
                 desc="Running fold ", dynamic_ncols=True, leave=False)
    
    for fold_number, (train_index, test_index) in enumerate(folds, start=1):
        
        X_train_fold, X_test_fold = X_numpy[train_index], X_numpy[test_index]
        y_train_fold, y_test_fold = label_numpy[train_index], label_numpy[test_index]
        
        # Intrinsic dimension of the training part only
        train_intrinsic_dimension, _ = execute_instrinsic_dimension_estimation(X_train_fold)
        
        # Same UMAP settings as execute_umap; the reducer object is kept so the
        # held-out samples can be mapped into the SAME embedding.
        reducer = umap.UMAP(n_neighbors=5,
                            n_components=math.trunc(train_intrinsic_dimension),
                            random_state=42)
        X_train_embedded = reducer.fit_transform(X_train_fold, y_train_fold)
        # No labels are used for the held-out samples
        X_test_embedded = reducer.transform(X_test_fold)
        
        clf = GridSearchCV(estimator=RandomForestClassifier(),
                   scoring='average_precision',
                   param_grid=model_selection_grid_DT, 
                   cv=2, 
                   n_jobs=-1)
        clf.fit(X_train_embedded, y_train_fold)
        
        # Predicted probability of the positive class on the held-out fold
        y_score = clf.predict_proba(X_test_embedded)[:, 1]
        
        # Area under this fold's precision-recall curve
        precision, recall, _ = precision_recall_curve(y_test_fold, y_score)
        aucs.append(auc(recall, precision))
        
        fold_scores.append(y_score)
        fold_targets.append(y_test_fold)
        
        fold_display = PrecisionRecallDisplay.from_predictions(y_test_fold, y_score)
        fold_display.figure_.savefig(output_prefix+'_'+str(fold_number)+'_AUPRC_Internal.pdf')
        # pyplot keeps every figure alive until it is closed explicitly
        plt.close(fold_display.figure_)
    
    internal_auc = np.mean(aucs)
    
    # Pool the held-out predictions of all folds into one curve
    y_score_concatenated = np.concatenate(fold_scores, axis=None)
    y_test_fold_concatenated = np.concatenate(fold_targets, axis=None)
    precision, recall, _ = precision_recall_curve(y_test_fold_concatenated, y_score_concatenated)
    external_auc = auc(recall, precision)
    
    pooled_display = PrecisionRecallDisplay.from_predictions(y_test_fold_concatenated, y_score_concatenated)
    pooled_display.figure_.savefig(output_prefix+'_'+'AUPRC_External.pdf')
    
    internal_text = '{0:.4f}'.format(internal_auc)
    external_text = '{0:.4f}'.format(external_auc)
    
    summary = {
        tecnique_feature_selection: {
            'internal': internal_text,
            'external': external_text,
            # `clf` is the search fitted on the last fold only
            'best_estimator': str(clf.best_estimator_),
        }
    }
    
    with open(output_prefix+'.json', 'w', encoding='utf-8') as fp:
        json.dump(summary, fp, ensure_ascii=False, indent=4)
    
    return internal_text, external_text, clf.best_estimator_

In [None]:
# Create the output directory from Python: `!mkdir` prints an error when the
# directory already exists (every re-run) and depends on a POSIX shell.
os.makedirs('result_Experiments_using_R_Function_whole_intrinsic_and_umap_on_fold/r_function_intrinsic_with_umap', exist_ok=True)
# 10-fold experiment on the proteins features, with UMAP fitted inside each fold.
train_using_intrinsic_dimensionality_and_umap_internally(df_proteins_removed, label, 10, 'proteins', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_whole_intrinsic_and_umap_on_fold/')

In [None]:
# 10-fold experiment on the mirna features, with UMAP computed inside each fold.
train_using_intrinsic_dimensionality_and_umap_internally(df_mirna_removed, label, 10, 'mirna', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_whole_intrinsic_and_umap_on_fold/')

In [None]:
# 10-fold experiment on the mrna features, with UMAP computed inside each fold.
train_using_intrinsic_dimensionality_and_umap_internally(df_mrna_removed, label, 10, 'mrna', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_whole_intrinsic_and_umap_on_fold/')

In [None]:
# 10-fold experiment on the cnv features, with UMAP computed inside each fold.
train_using_intrinsic_dimensionality_and_umap_internally(df_cnv_removed, label, 10, 'cnv', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_whole_intrinsic_and_umap_on_fold/')

In [None]:
# 10-fold experiment on the whole-dataset features. The filtered frame is
# stored in `df_dataset_whole_dataset_removed`; the name
# `df_whole_dataset_removed` used here before is never defined and raised NameError.
train_using_intrinsic_dimensionality_and_umap_internally(df_dataset_whole_dataset_removed, label, 10, 'whole_dataset', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_whole_intrinsic_and_umap_on_fold/')

In [None]:
# 10-fold experiment on the proteins+mirna features, with UMAP computed inside each fold.
train_using_intrinsic_dimensionality_and_umap_internally(df_dataset_proteins_mirna_removed, label, 10, 'proteins_mirna_dataset', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_whole_intrinsic_and_umap_on_fold/')

In [None]:
# 10-fold experiment on the proteins+mirna+mrna features, with UMAP computed inside each fold.
train_using_intrinsic_dimensionality_and_umap_internally(df_dataset_proteins_mirna_mrna_removed, label, 10, 'proteins_mirna_mrna_dataset', 'r_function_intrinsic_with_umap', 'result_Experiments_using_R_Function_whole_intrinsic_and_umap_on_fold/')