## Import

In [134]:
seed = 42

# Import libraries
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

import random
random.seed(seed)
from sklearn.utils import shuffle
import numpy as np

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline

from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score, precision_recall_curve, auc, confusion_matrix
from imblearn.over_sampling import SMOTE

from sklearn.feature_selection import mutual_info_classif
from scipy.spatial.distance import pdist, squareform
from scipy.stats import ttest_ind


from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier

from sklearn.feature_selection import SelectKBest, SelectPercentile, f_classif, f_regression, SelectFromModel
sc = StandardScaler()

## Caricamento Dati

In [145]:

file_path = "/Users/alessiamenozzi/Desktop/ThesisPlaques/data_rad_clin_DEF.csv"
#file_path = "C:\\Users\\bsbar\\Desktop\\Tesi\\ThesisPlaques\\data_rad_clin_DEF.csv"
data = pd.read_csv(file_path)

# Lista degli ID da escludere
ids_to_exclude = ["patient_TC_19", "patient_TC_40", "patient_TC_88", "patient_TC_150", "patient_TC_193", "patient_TC_200", "patient_TC_17", "patient_TC_107", "patient_TC_127" ]


# Filtra il DataFrame per escludere le righe con gli ID specificati
filtered_data = data[~data['IDs_new'].isin(ids_to_exclude)]

# Estrae i valori dalla colonna 'label' del DataFrame filtrato
labels_column = filtered_data['label']

# Converte i valori della colonna 'label' in numeri interi
labels = labels_column.astype(int).tolist()

labels=np.array(labels)
print("Labels:", Y_train)
print("Number of labels:", len(Y_train))


# Carica il file CSV
file_path = "/Users/alessiamenozzi/Desktop/ThesisPlaques/features_radiomiche.csv"  # Sostituisci con il percorso corretto
df = pd.read_csv(file_path, sep=';')

# Colonne da rimuovere
columns_to_remove = [
    'diagnostics_Image-original_Mean',
    'diagnostics_Image-original_Minimum',
    'diagnostics_Image-original_Maximum',
    'diagnostics_Mask-original_VoxelNum',
    'diagnostics_Mask-original_VolumeNum',
]

# Rimuovi le colonne specificate
df_cleaned = df.drop(columns=columns_to_remove)


# Rimuovi la colonna 'Paziente' per ottenere solo le feature
df_features = df_cleaned.drop(columns=['Paziente'])

# Converti le features in un array numpy
features = df_features.to_numpy()

# Stampa i risultati
print(features)
print(features.shape)  # (120, num_features)


Labels: [0 1 1 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0 0 0 1
 1 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 1 0 0 1 0 1 0
 1 1 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 1 1 1 0]
Number of labels: 96
[[6.42638351e-01 4.34811323e-01 1.16096656e+01 ... 2.69972505e+03
  2.80963106e-02 1.00514866e+01]
 [6.29082835e-01 5.34401455e-01 1.06699125e+01 ... 5.06730171e+03
  4.13964322e-02 9.21642811e+00]
 [5.33839928e-01 4.15372150e-01 1.11671922e+01 ... 3.95715175e+03
  1.13273940e-02 2.75076797e+01]
 ...
 [3.60780175e-01 3.10120345e-01 1.09248810e+01 ... 2.38010395e+03
  3.21141803e-02 3.22969245e+00]
 [4.66390756e-01 3.69394607e-01 1.32282545e+01 ... 1.36263582e+02
  6.99679621e-02 4.97074683e-01]
 [4.85039433e-01 3.32916705e-01 9.71060636e+00 ... 4.02397749e+03
  7.17804108e-02 4.97083572e+00]]
(120, 107)


## Funzioni

In [146]:

def compute_correlation(X, threshold=0.85):
    corr_matrix = np.corrcoef(X, rowvar=False)
    upper_triangle = np.triu(corr_matrix, k=1)
    to_drop = [column for column in range(upper_triangle.shape[0]) if any(abs(upper_triangle[column, :]) > threshold)]
    return to_drop


def remove_highly_correlated_features(X, threshold=0.85):
    corr_matrix = np.corrcoef(X, rowvar=False)
    upper_triangle = np.triu(corr_matrix, k=1)
    to_drop = [column for column in range(upper_triangle.shape[0]) if any(abs(upper_triangle[column, :]) > threshold)]
    X_reduced = np.delete(X, to_drop, axis=1)
    return X_reduced, to_drop


def remove_high_pvalue_features(X, y, alpha=0.05):
    selector = SelectKBest(score_func=f_classif, k='all')
    selector.fit(X, y)
    p_values = selector.pvalues_
    features_to_keep = np.where(p_values < alpha)[0]
    X_reduced = X[:, features_to_keep]
    return X_reduced, features_to_keep

## FEATURE SELECTION LASSO
def select_features_with_lasso(X, y, alpha=0.001):
    
    # Fit Lasso regression model
    lasso = Lasso(alpha=alpha)
    lasso.fit(X, y)

    # Get coefficients
    coefficients = lasso.coef_

    # Select features with non-zero coefficients
    selected_features = np.where(coefficients != 0)[0]

    # Create new feature matrix with only selected features
    X_selected = X[:, selected_features]

    return X_selected, selected_features

## FEATURE SELECTION LOGISTIC
def logistic_regression_feature_selection(X, y, num_features):
    lr = LogisticRegression(max_iter=1000, random_state=42)
    lr.fit(X, y)
    coef_abs = np.abs(lr.coef_)
    feature_importances = np.mean(coef_abs, axis=0)
    selected_features = feature_importances.argsort()[-num_features:][::-1]
    # Create new feature matrix with only selected features
    X_selected = X[:, selected_features]
    return X_selected, selected_features


def mrmr_feature_selection(X, y, num_features):
    # Calcolare l'informazione mutua tra ogni caratteristica e il target
    mi = mutual_info_classif(X, y)
    
    # Standardizzare le caratteristiche
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    
    # Calcolare la distanza euclidea tra le caratteristiche
    distances = squareform(pdist(X_scaled.T, 'euclidean'))
    
    selected_features = []
    selected_indices = []
    
    # Selezionare la prima caratteristica con la massima informazione mutua
    first_feature_index = np.argmax(mi)
    selected_features.append(first_feature_index)
    selected_indices.append(first_feature_index)
    
    # Iterare per selezionare le caratteristiche rimanenti
    for _ in range(num_features - 1):
        max_relevance = -np.inf
        selected_feature_index = -1
        
        for i in range(X.shape[1]):
            if i in selected_indices:
                continue
            
            relevance = mi[i]
            redundancy = np.mean(distances[i, selected_indices])
            
            mrmr_score = relevance - redundancy
            
            if mrmr_score > max_relevance:
                max_relevance = mrmr_score
                selected_feature_index = i
        
        selected_features.append(selected_feature_index)
        selected_indices.append(selected_feature_index)

    X_selected = X[:, selected_indices]
    return X_selected, selected_indices



## FEATURE SELECTION RANDOM FOREST
def rf_feature_selection(X, y, num_features):
    # Inizializza il classificatore Random Forest
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    # Addestra il modello
    rf.fit(X, y)
    # Ottieni l'importanza delle caratteristiche
    feature_importances = rf.feature_importances_
    # Seleziona gli indici delle caratteristiche più importanti
    selected_features = np.argsort(feature_importances)[-num_features:][::-1]
    # Crea una nuova matrice di caratteristiche con solo le caratteristiche selezionate
    X_selected = X[:, selected_features]
    return X_selected, selected_features


def p_value_feature_selection(X, num_features):
    """
    Seleziona le prime `num_features` caratteristiche dal vettore di caratteristiche.

    Args:
    X (np.ndarray): Matrice delle caratteristiche (numero di campioni, numero di caratteristiche).
    num_features (int): Numero di caratteristiche da selezionare.

    Returns:
    np.ndarray: Nuova matrice delle caratteristiche con solo le caratteristiche selezionate.
    np.ndarray: Indici delle caratteristiche selezionate.
    """
    # Controlla se num_features è maggiore del numero totale di caratteristiche
    if num_features > X.shape[1]:
        raise ValueError(f"num_features ({num_features}) è maggiore del numero totale di caratteristiche ({X.shape[1]})")

    # Seleziona i primi num_features indici
    selected_features = np.arange(num_features)
    
    # Crea una nuova matrice di caratteristiche con solo le caratteristiche selezionate
    X_selected = X[:, selected_features]
    
    return X_selected, selected_features

def select_features_by_p_value1(x_train_expanded, y_train_expanded, num_features=None):
    """
    Seleziona e ordina le feature basate sui p-value con un test t di Student.
    Ordina le feature in base al p-value in ordine crescente e seleziona le prime `num_features` caratteristiche.

    Args:
    x_train_expanded (np.ndarray): Matrice delle caratteristiche (numero di campioni, numero di caratteristiche).
    y_train_expanded (np.ndarray): Etichette binarie (numero di campioni,).
    num_features (int, opzionale): Numero di caratteristiche da selezionare.

    Returns:
    np.ndarray: Nuova matrice delle caratteristiche con solo le caratteristiche selezionate.
    np.ndarray: Indici delle caratteristiche selezionate, ordinate per p-value.
    """
    p_values = []
    num_features_total = x_train_expanded.shape[1]

    # Calcolo dei p-value per ciascuna caratteristica
    for i in range(num_features_total):
        feature = x_train_expanded[:, i]
        group_0 = feature[y_train_expanded == 0]
        group_1 = feature[y_train_expanded == 1]
        t_stat, p_val = ttest_ind(group_0, group_1, equal_var=False)
        p_values.append(p_val)

    # Convertire i p-value in un array numpy per ordinare più facilmente
    p_values = np.array(p_values)

    # Ordinare tutte le caratteristiche in base ai p-value (dal più piccolo al più grande)
    sorted_indices = np.argsort(p_values)

    # Se num_features è specificato, selezionare solo le prime num_features
    if num_features is not None:
        sorted_indices = sorted_indices[:num_features]

    # Selezionare le colonne della matrice originale che soddisfano il numero di feature
    x_train_selected = x_train_expanded[:, sorted_indices]

    return x_train_selected, sorted_indices



## FUNZIONE PER RIMUOVERE FEATURES SELEZIONATE
def filter_patients_features(filtered_patients, selected_features):
    """
    Removes the non-selected features from the filtered_patients array.

    Parameters:
    filtered_patients (list of numpy.ndarray): The list containing patients' images' features.
    selected_features (numpy.ndarray): The indices of the selected features.

    Returns:
    list of numpy.ndarray: The new filtered_patients array with only the selected features.
    """
    filtered_patients_selected = []

    for patient_features in filtered_patients:
        # Select only the features specified in selected_features
        patient_features_selected = patient_features[:, selected_features]
        filtered_patients_selected.append(patient_features_selected)

    return filtered_patients_selected


def select_features_by_p_value(x_train_expanded, y_train_expanded, p_value_threshold=0.01):

    p_values = []
    num_features = x_train_expanded.shape[1]

    for i in range(num_features):
        feature = x_train_expanded[:, i]
        group_0 = feature[y_train_expanded == 0]
        group_1 = feature[y_train_expanded == 1]
        t_stat, p_val = ttest_ind(group_0, group_1, equal_var=False)
        p_values.append(p_val)

    # Convertire i p-value in un array numpy per ordinare più facilmente
    p_values = np.array(p_values)

    # Selezionare le caratteristiche con p-value < soglia
    selected_features_indices = np.where(p_values < p_value_threshold)[0]

    # Ordinare le caratteristiche selezionate in base ai p-value
    sorted_indices = selected_features_indices[np.argsort(p_values[selected_features_indices])]

    x_train_expanded = x_train_expanded[:, sorted_indices]

    return x_train_expanded, sorted_indices



In [147]:


def classification_method(selector, classifier, alpha, x_train_expanded, y_train_expanded, x_test, y_test, num_features, mode="Val", selected_features=[0], thresholds=np.arange(0.3, 0.6, 0.01)):
    best_f1_score = 0
    best_case = None

    if mode == "Val":
        selected_features = None  # Inizializziamo selected_features per prevenire l'errore UnboundLocalError

        if num_features != len(x_train_expanded[0]) or alpha != 0:
            if selector == "lasso":
                X_selected, selected_features = select_features_with_lasso(x_train_expanded, y_train_expanded, alpha)
            elif selector == "logistic":
                X_selected, selected_features = logistic_regression_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "mrmr":
                X_selected, selected_features = mrmr_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "rf":
                X_selected, selected_features = rf_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "p_value":
                X_selected, selected_features = select_features_by_p_value1(x_train_expanded,y_train_expanded, num_features=num_features)
            else:
                print("Wrong selector. Choose between: mrmr, rf, logistic, lasso")
                return

            x_test = x_test[:, selected_features]  # Applichiamo la selezione delle feature anche su x_test
        else:
            X_selected = x_train_expanded
            selected_features = list(range(len(x_train_expanded[0])))  # Selezioniamo tutte le feature se non si fa feature selection

        number_features = len(selected_features)  # Numero di feature selezionate

        # Addestriamo il classificatore
        classifier.fit(X_selected, y_train_expanded)


    if (mode == "Test"):
        x_test = x_test[:, selected_features]
        number_features = len(selected_features)
    
        # Prevediamo le probabilità sul set di test
    y_proba_test = classifier.predict_proba(x_test)[:, 1]


    if(isinstance(thresholds, np.ndarray)== False):
        thresholds=[thresholds]
        
    
    for threshold in thresholds:
            # Previsioni usando la soglia custom
            y_pred_custom_test = (y_proba_test >= threshold).astype(int)

            # Calcolo metriche
            accuracy = accuracy_score(y_test, y_pred_custom_test)
            f1 = f1_score(y_test, y_pred_custom_test)
            roc_auc = roc_auc_score(y_test, y_proba_test)

            # Precision-recall curve e AUC
            precision, recall, _ = precision_recall_curve(y_test, y_proba_test)
            pr_auc = auc(recall, precision)

            # Miglior precisione e richiamo (basato su soglia personalizzata)
            best_precision = precision[np.argmax(recall)]
            best_recall = recall[np.argmax(recall)]

            # Matrice di confusione
            conf = confusion_matrix(y_test, y_pred_custom_test)

            # Se il nuovo risultato è migliore rispetto al migliore attuale, aggiorniamo
            if f1 > best_f1_score or (f1 == best_f1_score and pr_auc > (best_case['pr_auc'] if best_case else 0)):
                best_f1_score = f1
                best_case = {
                    'alpha': alpha,
                    'num_features': number_features,
                    'selected_features': selected_features,
                    'pr_auc': pr_auc,
                    'best_precision': best_precision,
                    'best_recall': best_recall,
                    'roc_auc': roc_auc,
                    'f1': f1,
                    'accuracy': accuracy,
                    'confusion_matrix': conf,
                    'best_threshold': threshold
                }

    return best_case


## split

In [148]:
#y_train1, y_test, x_train1, X_test= train_test_split(Y_train, x_train, test_size=0.2, shuffle=False, random_state=1)
#y_train, y_val, X_train, X_val= train_test_split(y_train1, x_train1, test_size=0.3, shuffle=True, stratify=y_train1, random_state=2)
#y_train, y_val, X_train, X_val= train_test_split(y_train1, x_train1, test_size=0.3, shuffle=False, random_state=7)
Y_train, y_test, X_train, X_test= train_test_split(labels, features, test_size=0.2, shuffle=False, random_state=1)

## feature correlation and p_value

In [149]:

X_train_reduced, dropped_features = remove_highly_correlated_features(X_train, 0.9)

# Riduci X_val e X_test usando le feature rimosse
#X_val_reduced = np.delete(X_val, dropped_features, axis=1)
X_test_reduced = np.delete(X_test, dropped_features, axis=1)

print(X_train_reduced.shape)

print(X_test_reduced.shape)
#print(X_val_reduced.shape)



# Rimozione delle feature con p-value elevato
#X_train_reduced, features_to_keep = remove_high_pvalue_features(X_train_reduced, y_train, alpha=0.05)
#print(features_to_keep)
#X_val_reduced = X_val_reduced[:, features_to_keep]
#X_test_reduced = X_test_reduced[:, features_to_keep]


print(X_train_reduced.shape)

print(X_test_reduced.shape)
#print(X_val_reduced.shape)

(96, 36)
(24, 36)
(96, 36)
(24, 36)


## parametri

In [150]:
alpha_values = np.linspace(0, 0.006, 30).tolist()
alpha=0.003

thresholds=np.arange(0.001, 0.501, 0.001) 

#selectors=['p_value', 'mrmr','rf', 'logistic', 'lasso']
selectors=['p_value', 'mrmr','rf', 'logistic']
classifiers=['XgBoost', 'MLP', 'SVM', 'ensemble','RandomForest', 'Logistic']
#classifiers=['RandomForest', 'XgBoost', 'MLP', 'SVM']

## Loop

In [151]:
n_folds=5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=42)

scores = []
results_val = [[{} for _ in range(len(classifiers))] for _ in range(n_folds)]


for fold_idx, (train_index, val_index) in enumerate(skf.split(X_train_reduced, Y_train)):
    print("Starting with fold:", fold_idx)

    x_train_reduced, X_val_reduced = X_train_reduced[train_index], X_train_reduced[val_index]
    y_train, y_val = Y_train[train_index], Y_train[val_index]

    for i, classifier in enumerate(classifiers):
        print("Starting with classifier:", classifier)
        for j, selector in enumerate(selectors):
            print("Starting with selector:", selector)

            results_val[fold_idx][i][j] = {
                'fold': fold_idx,
                'classifier': classifier,
                'selector': selector,
                'alpha': [],
                'num_features': [],
                'pr_auc': [],
                'best_precision': [],
                'best_recall': [],
                'roc_auc': [],
                'f1': [],
                'accuracy': [],
                'confusion_matrix': [],
                'best_threshold': [],
                'selected_features': []
            }

            best_f1 = 0
            best_case = None

            limit=len(x_train_reduced[0]) + 1
            for t in range(2, limit):
                #print("Number of features ", t)

                # Selezione del classificatore
                if classifier == 'RandomForest':
                    classi = RandomForestClassifier(n_estimators=100, random_state=42)
                    selector_rf = SelectFromModel(classi, threshold='mean')
                    X_train_rf = selector_rf.fit_transform(x_train_reduced, y_train)
                    X_test_rf = selector_rf.transform(X_test_reduced)
                    X_val_rf = selector_rf.transform(X_val_reduced)
                    if(t>len(X_train_rf[0])):
                        continue

                elif classifier == 'Logistic':
                    classi = LogisticRegression()
                    selector_lr = SelectFromModel(classi, threshold='mean')
                    X_train_lr = selector_lr.fit_transform(x_train_reduced, y_train)
                    X_test_lr = selector_lr.transform(X_test_reduced)
                    X_val_lr = selector_lr.transform(X_val_reduced)
                    if(t>len(X_train_lr[0])):
                        continue
                    
                elif classifier == 'SVM':
                    classi = SVC(kernel='rbf', probability=True, random_state=42)
                elif classifier == 'XgBoost':
                    classi = XGBClassifier()
                elif classifier == 'MLP':
                    classi = MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=1000, random_state=42, early_stopping=True, learning_rate='adaptive')
                elif classifier == 'ensemble':
                    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
                    logistic_model = LogisticRegression(random_state=42)
                    svc_model = SVC(kernel='rbf', probability=True, random_state=42)

                    # Crea l'ensemble con VotingClassifier
                    classi = VotingClassifier(
                        estimators=[
                            ('random_forest', rf_model),
                            ('logistic', logistic_model),
                            ('svc', svc_model)
                        ],
                        voting='soft'  # 'soft' usa le probabilità di classe, 'hard' usa le predizioni di classe
                        )

                
                if classifier == 'RandomForest':
                    best_case = classification_method(selector, classi, alpha, X_train_rf, y_train, X_val_rf, y_val, num_features=t, mode="Val", selected_features=[0])
                elif classifier == 'Logistic':
                    best_case = classification_method(selector, classi, alpha, X_train_lr, y_train, X_val_lr, y_val, num_features=t, mode="Val", selected_features=[0])
                else:
                    best_case = classification_method(selector, classi, alpha, x_train_reduced, y_train, X_val_reduced, y_val, num_features=t, mode="Val", selected_features=[0])


                if best_case:
                    results_val[fold_idx][i][j]['alpha'].append(best_case['alpha'])
                    results_val[fold_idx][i][j]['selected_features'].append(best_case['selected_features'])
                    results_val[fold_idx][i][j]['num_features'].append(best_case['num_features'])
                    results_val[fold_idx][i][j]['pr_auc'].append(best_case['pr_auc'])
                    results_val[fold_idx][i][j]['best_precision'].append(best_case['best_precision'])
                    results_val[fold_idx][i][j]['best_recall'].append(best_case['best_recall'])
                    results_val[fold_idx][i][j]['roc_auc'].append(best_case['roc_auc'])
                    results_val[fold_idx][i][j]['f1'].append(best_case['f1'])
                    results_val[fold_idx][i][j]['accuracy'].append(best_case['accuracy'])
                    results_val[fold_idx][i][j]['confusion_matrix'].append(best_case['confusion_matrix'])
                    results_val[fold_idx][i][j]['best_threshold'].append(best_case['best_threshold'])




Starting with fold: 0
Starting with classifier: XgBoost
Starting with selector: p_value
Starting with selector: mrmr
Starting with selector: rf
Starting with selector: logistic
Starting with classifier: MLP
Starting with selector: p_value
Starting with selector: mrmr
Starting with selector: rf
Starting with selector: logistic
Starting with classifier: SVM
Starting with selector: p_value
Starting with selector: mrmr
Starting with selector: rf
Starting with selector: logistic
Starting with classifier: ensemble
Starting with selector: p_value
Starting with selector: mrmr
Starting with selector: rf
Starting with selector: logistic
Starting with classifier: RandomForest
Starting with selector: p_value
Starting with selector: mrmr
Starting with selector: rf
Starting with selector: logistic
Starting with classifier: Logistic
Starting with selector: p_value
Starting with selector: mrmr
Starting with selector: rf
Starting with selector: logistic
Starting with fold: 1
Starting with classifier: X

In [153]:
### per trovare la migliore configurazione

diz = [[{} for _ in range(len(selectors))] for _ in range(len(classifiers))]

for i, classifier in enumerate(classifiers):
    for j, selector in enumerate(selectors):
        # Inizializza un dizionario per contenere le medie dei fold
        combined_result = {
            'classifier': classifier,
            'selector': selector,
            'alpha': [],
            'num_features': [],
            'pr_auc': [],
            'best_precision': [],
            'best_recall': [],
            'roc_auc': [],
            'f1': [],
            'accuracy': [],
            'best_threshold': [],
            # Lasciamo selected_features e confusion_matrix da parte
            'selected_features': [],  # Lasciamo come lista vuota per ora
            'confusion_matrix': []    # Lasciamo come lista vuota per ora
        }
        
        # Per ogni fold, aggiungi i valori corrispondenti per calcolare la media
        for fold_idx in range(n_folds):
            for key in results_val[fold_idx][i][j].keys():
                if key in ['classifier', 'selector', 'selected_features', 'confusion_matrix']:  
                    # Non calcoliamo la media su 'selected_features' e 'confusion_matrix'
                    continue
                if key not in combined_result:
                    combined_result[key] = []
                
                # Aggiungi la media dei valori del fold
                combined_result[key].append(np.mean(results_val[fold_idx][i][j][key]))
        
        # Calcola la media per ogni chiave e salva nel dizionario finale
        for key in combined_result.keys():
            if key not in ['classifier', 'selector', 'selected_features', 'confusion_matrix']:
                combined_result[key] = np.mean(combined_result[key])
        
        # Lascia selected_features e confusion_matrix vuoti o trattali diversamente se necessario
        combined_result['selected_features'] = "N/A"  # O qualsiasi altro trattamento
        combined_result['confusion_matrix'] = "N/A"   # O qualsiasi altro trattamento
        
        # Salva il risultato medio nel dizionario generale
        diz[i][j] = combined_result

# Lista per contenere le configurazioni migliori
best_configurations = []

# Cerca le due configurazioni migliori, prima ordinando per f1 e poi per pr_auc
for i in range(len(classifiers)):
    for j in range(len(selectors)):
        result = diz[i][j]
        best_configurations.append((result['f1'], result['pr_auc'], result['classifier'], result['selector'], result['num_features'], result['best_threshold']))

# Ordina le configurazioni in base all'f1 e in caso di parità al pr_auc
best_configurations = sorted(best_configurations, key=lambda x: (-x[0], -x[1]))  # Ordina in modo decrescente per f1 e pr_auc

# Estrai le due migliori configurazioni
best_config_1 = best_configurations[0]
best_config_2 = best_configurations[1]


print("La migliore configurazione è:")
print(f"Classifier: {best_config_1[2]}, Selector: {best_config_1[3]}, Num Features: {best_config_1[4]}, F1: {best_config_1[0]}, PR AUC: {best_config_1[1]}, Best Threshold: {best_config_1[5]}")

print("\nLa seconda migliore configurazione è:")
print(f"Classifier: {best_config_2[2]}, Selector: {best_config_2[3]}, Num Features: {best_config_2[4]}, F1: {best_config_2[0]}, PR AUC: {best_config_2[1]}, Best Threshold: {best_config_2[5]}")


La migliore configurazione è:
Classifier: SVM, Selector: logistic, Num Features: 19.0, F1: 0.6593509399275043, PR AUC: 0.46944936230738976, Best Threshold: 0.3649714285714286

La seconda migliore configurazione è:
Classifier: ensemble, Selector: logistic, Num Features: 19.0, F1: 0.6569500317523371, PR AUC: 0.47170518111823584, Best Threshold: 0.3551428571428572


In [154]:
if best_config_2[2] == 'RandomForest':
                    classi = RandomForestClassifier(n_estimators=100, random_state=42)
                    selector_rf = SelectFromModel(classi, threshold='mean')
                    X_train_rf = selector_rf.fit_transform(X_train_reduced, y_train)
                    X_test_rf = selector_rf.transform(X_test_reduced)

elif best_config_2[2] == 'Logistic':
                    classi = LogisticRegression()
                    selector_lr = SelectFromModel(classi, threshold='mean')
                    X_train_lr = selector_lr.fit_transform(X_train_reduced, y_train)
                    X_test_lr = selector_lr.transform(X_test_reduced)
                    
elif best_config_2[2] == 'SVM':
                    classi = SVC(kernel='rbf', probability=True, random_state=42)
elif best_config_2[2] == 'XgBoost':
                    classi = XGBClassifier()
elif best_config_2[2] == 'MLP':
                    classi = MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=1000, random_state=42, early_stopping=True, learning_rate='adaptive')
elif best_config_2[2] == 'ensemble':
                    rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
                    logistic_model = LogisticRegression(random_state=42)
                    svc_model = SVC(kernel='rbf', probability=True, random_state=42)

In [155]:
## qui applico la classificazione usando i parametri miglori però bisogna capire come trattare il numero di features e la best_threshold 

best_case = classification_method('p_value', classi, alpha, X_train_reduced, Y_train, X_test_reduced, y_test, num_features=19, mode="Val", selected_features=[0], thresholds=0.5)

#best_case = classification_method('p_value', classi, alpha, X_train_lr, y_train, X_test_lr, y_test, num_features=6, mode="Val", selected_features=[0], thresholds=best_config_1[5])

In [68]:
print("Metrics from best_case:")
print(f"Alpha: {best_case['alpha']}")
print(f"Number of Features: {best_case['num_features']}")
print(f"Selected Features: {best_case['selected_features']}")
print(f"Precision-Recall AUC: {best_case['pr_auc']}")
print(f"Best Precision: {best_case['best_precision']}")
print(f"Best Recall: {best_case['best_recall']}")
print(f"ROC AUC: {best_case['roc_auc']}")
print(f"F1 Score: {best_case['f1']}")
print(f"Accuracy: {best_case['accuracy']}")
print(f"Confusion Matrix: \n{np.array(best_case['confusion_matrix'])}")
print(f"Best Threshold: {best_case['best_threshold']}")

Metrics from best_case:
Alpha: 0.003
Number of Features: 19
Selected Features: [ 0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18]
Precision-Recall AUC: 0.33403890145008147
Best Precision: 0.3333333333333333
Best Recall: 1.0
ROC AUC: 0.2734375
F1 Score: 0.41379310344827586
Accuracy: 0.2916666666666667
Confusion Matrix: 
[[ 1 15]
 [ 2  6]]
Best Threshold: 0.5


In [168]:
import numpy as np

def find_best_configuration(results_val):
    best_config = None
    best_f1_mean = 0

    # Struttura per memorizzare le statistiche della configurazione migliore
    best_stats = {
        'classifier': None,
        'selector': None,
        'num_features': None,
        'f1_mean': 0,
        'f1_std': 0,
        'threshold_mean': 0,
        'thresholds': []
    }

    # Iterare sui fold, sui classificatori e sui selettori
    for fold_idx in range(len(results_val)):
        for i, classifier_results in enumerate(results_val[fold_idx]):
            for j, selector_results in enumerate(classifier_results):

                # Estrai tutti i valori di f1 e threshold per i diversi fold
                f1_scores = results_val[fold_idx][i][j]['f1']
                thresholds = results_val[fold_idx][i][j]['best_threshold']
                
                if f1_scores:
                    # Calcola media e deviazione standard di f1
                    f1_mean = np.mean(f1_scores)
                    f1_std = np.std(f1_scores)

                    # Calcola la media delle soglie
                    threshold_mean = np.mean(thresholds)

                    # Confronta con la miglior media di f1 finora trovata
                    if f1_mean > best_f1_mean:
                        best_f1_mean = f1_mean
                        best_config = results_val[fold_idx][i][j]

                        # Aggiorna le statistiche migliori
                        best_stats['classifier'] = best_config['classifier']
                        best_stats['selector'] = best_config['selector']
                        best_stats['num_features'] = best_config['num_features'][-1]  # Ultima iterazione con il numero di feature
                        best_stats['f1_mean'] = f1_mean
                        best_stats['f1_std'] = f1_std
                        best_stats['threshold_mean'] = threshold_mean
                        best_stats['thresholds'] = thresholds  # Salva le soglie di ogni fold

    return best_stats


# Esegui il processo per trovare la configurazione migliore
best_configuration = find_best_configuration(results_val)

# Stampa i risultati
print("Best configuration:")
print(f"Classifier: {best_configuration['classifier']}")
print(f"Selector: {best_configuration['selector']}")
print(f"Num Features: {best_configuration['num_features']}")
print(f"F1 Mean: {best_configuration['f1_mean']:.4f}")
print(f"F1 Std: {best_configuration['f1_std']:.4f}")
print(f"Threshold Mean: {best_configuration['threshold_mean']:.4f}")
print(f"Thresholds across folds: {best_configuration['thresholds']}")


Best configuration:
Classifier: RandomForest
Selector: mrmr
Num Features: 15
F1 Mean: 0.7121
F1 Std: 0.0385
Threshold Mean: 0.3686
Thresholds across folds: [0.3, 0.35000000000000003, 0.39000000000000007, 0.39000000000000007, 0.34, 0.3, 0.3, 0.35000000000000003, 0.49000000000000016, 0.36000000000000004, 0.38000000000000006, 0.4000000000000001, 0.39000000000000007, 0.4200000000000001]


In [166]:

classi = RandomForestClassifier(n_estimators=100, random_state=42)
selector_rf = SelectFromModel(classi, threshold='mean')
X_train_rf = selector_rf.fit_transform(X_train_reduced, Y_train)
X_test_rf = selector_rf.transform(X_test_reduced)


best_case = classification_method('mrmr', classi, alpha, X_train_rf, Y_train, X_test_rf, y_test, num_features=15, mode="Val", selected_features=[0])


In [167]:
print("Metrics from best_case:")
print(f"Alpha: {best_case['alpha']}")
print(f"Number of Features: {best_case['num_features']}")
print(f"Selected Features: {best_case['selected_features']}")
print(f"Precision-Recall AUC: {best_case['pr_auc']}")
print(f"Best Precision: {best_case['best_precision']}")
print(f"Best Recall: {best_case['best_recall']}")
print(f"ROC AUC: {best_case['roc_auc']}")
print(f"F1 Score: {best_case['f1']}")
print(f"Accuracy: {best_case['accuracy']}")
print(f"Confusion Matrix: \n{np.array(best_case['confusion_matrix'])}")
print(f"Best Threshold: {best_case['best_threshold']}")

Metrics from best_case:
Alpha: 0.003
Number of Features: 15
Selected Features: [0, 12, 11, 9, 8, 10, 5, 3, 2, 1, 7, 4, 6, -1, -1]
Precision-Recall AUC: 0.5820754817078346
Best Precision: 0.3333333333333333
Best Recall: 1.0
ROC AUC: 0.77734375
F1 Score: 0.7368421052631579
Accuracy: 0.7916666666666666
Confusion Matrix: 
[[12  4]
 [ 1  7]]
Best Threshold: 0.47000000000000014
