## import

In [2]:
seed = 42

# Import libraries
import tensorflow as tf
import warnings
warnings.filterwarnings('ignore')

import random
random.seed(seed)
from sklearn.utils import shuffle
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score, precision_recall_curve, auc, confusion_matrix
from imblearn.over_sampling import SMOTE

from sklearn.feature_selection import mutual_info_classif, SelectKBest, SelectPercentile, f_classif, f_regression, SelectFromModel
from scipy.spatial.distance import pdist, squareform
from scipy.stats import ttest_ind
from xgboost import XGBClassifier
import statistics
from sklearn.decomposition import PCA
from sklearn.metrics import balanced_accuracy_score

import pickle

## caricamento dati

### caricamento labels pazienti

In [3]:

file_path = "../CSV/data_rad_clin_DEF.csv"

data = pd.read_csv(file_path)
labels_column = data['label']
labels = labels_column.astype(int).tolist()

labels=np.array(labels)

# Estrazione dei numeri dai nomi dei pazienti
loaded_patients = data['IDs_new'].str.extract(r'(\d+)').astype(int).squeeze().tolist()

print("Labels:", labels)
print("Number of labels:", len(labels))
print("Patient Names: ", loaded_patients )




Labels: [0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0
 0 0 1 1 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 1 0 0
 0 1 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1]
Number of labels: 129
Patient Names:  [5, 12, 15, 16, 17, 19, 22, 23, 24, 25, 26, 27, 29, 30, 31, 33, 35, 36, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 50, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 64, 65, 68, 69, 70, 71, 74, 75, 76, 78, 79, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 98, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 123, 124, 126, 127, 128, 129, 133, 135, 136, 137, 138, 139, 141, 142, 144, 146, 147, 149, 150, 153, 155, 158, 159, 161, 163, 166, 168, 169, 170, 171, 175, 176, 178, 182, 183, 188, 189, 190, 193, 197, 199, 200, 205]


### caricamento features encoder

In [4]:
file_path = "../CSV/EncodersSliceMaggiore/VGG19_Slice_Maggiore.csv"
#file_path = "../CSV/EncodersSliceMaggiore/InceptionV3_Slice_Maggiore.csv"
#file_path = "../CSV/EncodersSliceMaggiore/RESNET50_Slice_Maggiore.csv"

df = pd.read_csv(file_path, sep=',')


df['Unnamed: 0'] = df['Unnamed: 0'].astype(int)

df_ordered = df.set_index('Unnamed: 0').loc[loaded_patients].reset_index()

df_features = df_ordered.drop(columns=['Unnamed: 0'])

features = df_features.to_numpy()

print(features)
print(features.shape)


[[ 10.644694  42.666046   0.       ...   0.        18.299337   6.166822]
 [  0.       102.23023    0.       ...  18.42938   31.46421    0.      ]
 [  0.946728  28.357668   0.       ...   0.        18.290043   9.094667]
 ...
 [ 11.950816  10.536694  14.736543 ...   0.        27.75255   13.213755]
 [  0.       161.865      0.       ...   0.        19.546183  16.224407]
 [  0.       129.88449    0.       ...   0.        48.395874  11.344899]]
(129, 512)


### caricamento features radiomica

In [210]:
#file_path = "../CSV/EncodersSliceMaggiore/Radiomica_Wavelet_2D.csv"
file_path = "../CSV/EncodersSliceMaggiore/Radiomica_2D.csv"

df = pd.read_csv(file_path, sep=',')
#df = df.astype(float)

# Colonne da rimuovere SOLO PER RADIOMICA
columns_to_remove = [
    'Slice',
    'diagnostics_Image-original_Mean',
    'diagnostics_Image-original_Minimum',
    'diagnostics_Image-original_Maximum',
    'diagnostics_Mask-original_VoxelNum',
    'diagnostics_Mask-original_VolumeNum',
]

df_cleaned = df.drop(columns=columns_to_remove)
df_features = df_cleaned.drop(columns=['Paziente'])

features = df_features.to_numpy()

print(features)
print(features.shape)  

[[5.83888273e-01 2.49810487e+01 2.57099203e+01 ... 1.13404359e+03
  2.37470314e-01 7.12523395e+01]
 [8.68120272e-01 2.78353641e+01 2.75136330e+01 ... 2.79627909e+03
  1.66740377e-01 9.88514518e+01]
 [6.68428011e-01 3.34967625e+01 3.44818793e+01 ... 2.84190381e+02
  4.27515541e-02 4.71863205e+01]
 ...
 [8.95387032e-01 3.24479655e+01 2.80178515e+01 ... 6.33694339e+01
  1.64536668e-01 1.17728372e+01]
 [7.82116308e-01 2.65896102e+01 2.56320112e+01 ... 3.36424176e+03
  3.35445375e-01 6.76993135e+01]
 [5.58702485e-01 3.61138047e+01 3.58468967e+01 ... 2.19527898e+03
  2.01081360e-01 7.96408761e+01]]
(129, 102)


## funzioni

In [5]:

## Rimozione feature correlation
def remove_highly_correlated_features(X, threshold=0.85):
    corr_matrix = np.corrcoef(X, rowvar=False)
    upper_triangle = np.triu(corr_matrix, k=1)
    to_drop = [column for column in range(upper_triangle.shape[0]) if any(abs(upper_triangle[column, :]) > threshold)]
    X_reduced = np.delete(X, to_drop, axis=1)
    return X_reduced, to_drop

## Rimozione features p_value
def remove_high_pvalue_features(X, y, alpha=0.05):
    selector = SelectKBest(score_func=f_classif, k='all')
    selector.fit(X, y)
    p_values = selector.pvalues_
    features_to_keep = np.where(p_values < alpha)[0]
    X_reduced = X[:, features_to_keep]
    return X_reduced, features_to_keep

## FEATURE SELECTION LASSO
def select_features_with_lasso(X, y, alpha=0.001):
    
    lasso = Lasso(alpha=alpha)
    lasso.fit(X, y)
    coefficients = lasso.coef_
    selected_features = np.where(coefficients != 0)[0]
    X_selected = X[:, selected_features]

    return X_selected, selected_features

## FEATURE SELECTION LOGISTIC
def logistic_regression_feature_selection(X, y, num_features):
    lr = LogisticRegression(max_iter=2000, random_state=42)
    lr.fit(X, y)
    coef_abs = np.abs(lr.coef_)
    feature_importances = np.mean(coef_abs, axis=0)
    selected_features = feature_importances.argsort()[-num_features:][::-1]
    X_selected = X[:, selected_features]
    return X_selected, selected_features

## FEATURE SELECTION MRMR
def mrmr_feature_selection(X, y, num_features):
    mi = mutual_info_classif(X, y, random_state=42)
    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(X)
    distances = squareform(pdist(X_scaled.T, 'euclidean'))
    
    selected_features = []
    selected_indices = []

    first_feature_index = np.argmax(mi)
    selected_features.append(first_feature_index)
    selected_indices.append(first_feature_index)
    
    for _ in range(num_features - 1):
        max_relevance = -np.inf
        selected_feature_index = -1
        
        for i in range(X.shape[1]):
            if i in selected_indices:
                continue
            
            relevance = mi[i]
            redundancy = np.mean(distances[i, selected_indices])
            
            mrmr_score = relevance - redundancy
            
            if mrmr_score > max_relevance:
                max_relevance = mrmr_score
                selected_feature_index = i
        
        selected_features.append(selected_feature_index)
        selected_indices.append(selected_feature_index)

    X_selected = X[:, selected_indices]
    return X_selected, selected_indices

## FEATURE SELECTION RANDOM FOREST
def rf_feature_selection(X, y, num_features):
    rf = RandomForestClassifier(n_estimators=100, random_state=42)
    rf.fit(X, y)
    feature_importances = rf.feature_importances_
    selected_features = np.argsort(feature_importances)[-num_features:][::-1]
    X_selected = X[:, selected_features]
    return X_selected, selected_features


## FEATURE SELECTION P_VALUE
# Seleziona e ordina le feature basate sui p-value con un test t di Student poi 
# ordina le feature in base al p-value in ordine crescente e seleziona le prime `num_features` caratteristiche.

def select_features_by_p_value(x_train_expanded, y_train_expanded, num_features):
    p_values = []
    num_features_total = x_train_expanded.shape[1]

    # Calcolo dei p-value per ciascuna feature
    for i in range(num_features_total):
        feature = x_train_expanded[:, i]
        group_0 = feature[y_train_expanded == 0]
        group_1 = feature[y_train_expanded == 1]
        t_stat, p_val = ttest_ind(group_0, group_1, equal_var=False)
        p_values.append(p_val)


    p_values = np.array(p_values)

    # Ordinare tutte le caratteristiche in base ai p-value (dal più piccolo al più grande)
    sorted_indices = np.argsort(p_values)
    sorted_indices = sorted_indices[:num_features]

    x_train_selected = x_train_expanded[:, sorted_indices]

    return x_train_selected, sorted_indices



## FUNZIONE PER RIMUOVERE FEATURES SELEZIONATE
def filter_patients_features(filtered_patients, selected_features):
    filtered_patients_selected = []

    for patient_features in filtered_patients:
        # Select only the features specified in selected_features
        patient_features_selected = patient_features[:, selected_features]
        filtered_patients_selected.append(patient_features_selected)

    return filtered_patients_selected


In [6]:
## classificazione completa che ritorna la threshold migliore per la classificazione
def classification_method(selector, classifier, alpha, x_train_expanded, y_train_expanded, x_test, y_test, num_features, mode="Val", selected_features=[0], thresholds=np.arange(0.4, 0.6, 0.01)):
    best_f1_score = 0
    best_case = None

    if mode == "Val":
        selected_features = None 

        if num_features != len(x_train_expanded[0]) or alpha != 0:
            if selector == "lasso":
                X_selected, selected_features = select_features_with_lasso(x_train_expanded, y_train_expanded, alpha)
                if(len(selected_features)==0):
                    return 0
            elif selector == "logistic":
                X_selected, selected_features = logistic_regression_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "mrmr":
                X_selected, selected_features = mrmr_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "rf":
                X_selected, selected_features = rf_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "p_value":
                X_selected, selected_features = select_features_by_p_value(x_train_expanded,y_train_expanded, num_features=num_features)
            else:
                print("Wrong selector. Choose between: mrmr, rf, logistic, p_value, lasso")
                return

            x_test = x_test[:, selected_features]  # selezione delle feature anche su x_test
        else:
            X_selected = x_train_expanded
            selected_features = list(range(len(x_train_expanded[0])))  # Selezioniamo tutte le feature se non si fa feature selection

        number_features = len(selected_features)  # Numero di feature selezionate

        # Training del classificatore
        classifier.fit(X_selected, y_train_expanded)


    if (mode == "Test"): ## non addestra il classificatore e non fa feature selection
        x_test = x_test[:, selected_features]
        number_features = len(selected_features)
    

    y_proba_test = classifier.predict_proba(x_test)[:, 1]

    if(isinstance(thresholds, np.ndarray)== False): ## se la threshold viene data fissa
        thresholds=[thresholds]
        
    
    for threshold in thresholds:

            y_pred_custom_test = (y_proba_test >= threshold).astype(int)

            accuracy = accuracy_score(y_test, y_pred_custom_test)
            f1 = f1_score(y_test, y_pred_custom_test)
            roc_auc = roc_auc_score(y_test, y_proba_test)

            precision, recall, _ = precision_recall_curve(y_test, y_proba_test)
            pr_auc = auc(recall, precision)

            conf = confusion_matrix(y_test, y_pred_custom_test)
            
            bal_acc = balanced_accuracy_score(y_test, y_pred_custom_test)


            # Se il nuovo risultato è migliore rispetto al migliore attuale (in base all'f1 e altrimenti pr_auc)
            if f1 > best_f1_score or (f1 == best_f1_score and pr_auc > (best_case['pr_auc'] if best_case else 0)):
                best_f1_score = f1
                best_case = {
                    'alpha': alpha,
                    'num_features': number_features,
                    'selected_features': selected_features,
                    'pr_auc': pr_auc,
                    'roc_auc': roc_auc,
                    'f1': f1,
                    'accuracy': accuracy,
                    'confusion_matrix': conf,
                    'best_threshold': threshold,
                    'balanced accuracy': bal_acc
                }

    return best_case


#####################################################################################################################################

### questo ritorna le il vettore di probabilità senza fare la classificazione
def classification_method_withoutThreshold(selector, classifier, alpha, x_train_expanded, y_train_expanded, x_test, y_test, num_features, mode="Val", selected_features=[0]):

    if mode == "Val":
        selected_features = None 

        if num_features != len(x_train_expanded[0]) or alpha != 0:
            if selector == "lasso":
                X_selected, selected_features = select_features_with_lasso(x_train_expanded, y_train_expanded, alpha)
                if (len(selected_features)==0):
                    return [0],0,[0]
            elif selector == "logistic":
                X_selected, selected_features = logistic_regression_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "mrmr":
                X_selected, selected_features = mrmr_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "rf":
                X_selected, selected_features = rf_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "p_value":
                X_selected, selected_features = select_features_by_p_value(x_train_expanded,y_train_expanded, num_features=num_features)
            else:
                print("Wrong selector. Choose between: mrmr, rf, logistic, p_value, lasso")
                return

            x_test = x_test[:, selected_features]  # selezione delle feature anche su x_test
        else:
            X_selected = x_train_expanded
            selected_features = list(range(len(x_train_expanded[0])))  # Selezioniamo tutte le feature se non si fa feature selection

        number_features = len(selected_features)  # Numero di feature selezionate

        # Training del classificatore
        classifier.fit(X_selected, y_train_expanded)


    if (mode == "Test"): ## non addestra il classificatore e non fa feature selection
        x_test = x_test[:, selected_features]
        number_features = len(selected_features)
    

    y_proba_test = classifier.predict_proba(x_test)[:, 1]

 
    return y_proba_test, number_features, selected_features


#####################################################################################################################################


### classificazione effettuata con una threshold specifica
def classification_threshold(y_proba_test,y_test, threshold, alpha, number_features, selected_features):
        
            best_case = None

            y_pred_custom_test = (y_proba_test >= threshold).astype(int)
            accuracy = accuracy_score(y_test, y_pred_custom_test)
            f1 = f1_score(y_test, y_pred_custom_test)
            roc_auc = roc_auc_score(y_test, y_proba_test)

            precision, recall, _ = precision_recall_curve(y_test, y_proba_test)
            pr_auc = auc(recall, precision)
            bal_acc = balanced_accuracy_score(y_test, y_pred_custom_test)

            conf = confusion_matrix(y_test, y_pred_custom_test)
            best_case = {
                    'alpha': alpha,
                    'num_features': number_features,
                    'selected_features': selected_features,
                    'pr_auc': pr_auc,
                    'roc_auc': roc_auc,
                    'f1': f1,
                    'accuracy': accuracy,
                    'confusion_matrix': conf,
                    'threshold': threshold,
                    'balanced accuracy': bal_acc
                }
                
            if not best_case:
                 print("Attenzione caso vuoto") 
            return best_case

#####################################################################################################################################


# metodo che definisce la threshold ottimale attraverso Youden's J statistic (threshold_selection= 'y')
# oppure attraverso la distanza euclidea dalla curva ROC (threshold_selection= 'd')
def classification_method_selection(selector, classifier, alpha, x_train_expanded, y_train_expanded, x_test, y_test, num_features, threshold_selection, mode="Val", selected_features=[0]):
    best_case = None

    if mode == "Val":
        selected_features = None 

        if num_features != len(x_train_expanded[0]) or alpha != 0:
            if selector == "lasso":
                X_selected, selected_features = select_features_with_lasso(x_train_expanded, y_train_expanded, alpha)
                if(len(selected_features)==0):
                    return 0
            elif selector == "logistic":
                X_selected, selected_features = logistic_regression_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "mrmr":
                X_selected, selected_features = mrmr_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "rf":
                X_selected, selected_features = rf_feature_selection(x_train_expanded, y_train_expanded, num_features)
            elif selector == "p_value":
                X_selected, selected_features = select_features_by_p_value(x_train_expanded,y_train_expanded, num_features=num_features)
            else:
                print("Wrong selector. Choose between: mrmr, rf, logistic, p_value, lasso")
                return

            x_test = x_test[:, selected_features]  # selezione delle feature anche su x_test
        else:
            X_selected = x_train_expanded
            selected_features = list(range(len(x_train_expanded[0])))  # Selezioniamo tutte le feature se non si fa feature selection

        number_features = len(selected_features)  # Numero di feature selezionate

        classi=classifierinitialization(classifier, X_selected, y_train_expanded )
        # Training del classificatore
        classi.fit(X_selected, y_train_expanded)


    if (mode == "Test"): ## non addestra il classificatore e non fa feature selection
        x_test = x_test[:, selected_features]
        number_features = len(selected_features)
    

    y_proba_test = classifier.predict_proba(x_test)[:, 1]

    roc_auc = roc_auc_score(y_test, y_proba_test)
    precision, recall, _ = precision_recall_curve(y_test, y_proba_test)
    pr_auc = auc(recall, precision)
        
    fpr,tpr,threshold=roc_curve(y_test,y_proba_test,pos_label=1)
    youden_j = tpr - fpr
    optimal_threshold = threshold[np.argmax(youden_j)]

    ## due modalità 
    if threshold_selection == 'y':
        youden_j = tpr - fpr
        optimal_threshold = threshold[np.argmax(youden_j)]
    elif threshold_selection == 'd':
        distances = np.sqrt((1 - tpr) ** 2 + fpr ** 2)
        optimal_threshold = threshold[np.argmin(distances)]
    else:
        print('Threshold non valida!')
        return None

    
    y_pred_custom_test = (y_proba_test >= optimal_threshold).astype(int)

    accuracy = accuracy_score(y_test, y_pred_custom_test)
    f1 = f1_score(y_test, y_pred_custom_test)
    conf = confusion_matrix(y_test, y_pred_custom_test)


    best_case = {
        'alpha': alpha,
        'num_features': number_features,
        'selected_features': selected_features,
        'pr_auc': pr_auc,
        'roc_auc': roc_auc,
        'f1': f1,
        'accuracy': accuracy,
        'confusion_matrix': conf,
        'best_threshold': optimal_threshold,
        'threshold_mode': threshold_selection
    }

    return best_case

def classifierinitialization(classifier):
    if classifier == 'RandomForest':
                            classi = RandomForestClassifier(n_estimators=100, random_state=42)
    elif classifier == 'Logistic':
                            classi = LogisticRegression(random_state=42, max_iter=2000)
    elif classifier == 'SVM':
                            classi = SVC(kernel='rbf', probability=True, random_state=42)
    elif classifier == 'XgBoost':
                            classi = XGBClassifier(random_state=42)
    elif classifier == 'MLP':
                            classi = MLPClassifier(hidden_layer_sizes=(128, 64, 32), max_iter=1000, random_state=42, early_stopping=True, learning_rate='adaptive', activation = 'logistic')
    elif classifier == 'ensemble':
                            rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
                            logistic_model = LogisticRegression(random_state=42, max_iter=2000)
                            svc_model = SVC(kernel='rbf', probability=True, random_state=42)
                            classi = VotingClassifier(
                                estimators=[
                                    ('random_forest', rf_model),
                                    ('logistic', logistic_model),
                                    ('svc', svc_model)
                                ],
                                voting='soft'
                                )
    return classi

## split

In [7]:
Y_train, y_test, X_train, X_test= train_test_split(labels, features, test_size=0.3, shuffle=False)


print("Number of train patients: ", len(X_train))
print("Number of test patients: ", len(y_test))

print("Number of features for every image: ", X_train[0].shape[0] )


Number of train patients:  90
Number of test patients:  39
Number of features for every image:  512


## correlation e p_value

In [8]:

## FEATURE CORRELATION

X_train_reduced, dropped_features = remove_highly_correlated_features(X_train, 0.8)
X_test_reduced = np.delete(X_test, dropped_features, axis=1)


print(X_train_reduced.shape)
print(X_test_reduced.shape)


# RIMOZIONE FEATURES CON P_VALUE ELEVATO

X_train_reduced, features_to_keep = remove_high_pvalue_features(X_train_reduced, Y_train, alpha=0.01)
X_test_reduced = X_test_reduced[:, features_to_keep]

print(X_train_reduced.shape)
print(X_test_reduced.shape)



(90, 491)
(39, 491)
(90, 42)
(39, 42)


## parametri

In [33]:


#alpha_1 = np.linspace(0.01, 0.6, 30).tolist() ## RANGE PER RESNET

alpha_1 = np.linspace(0.005, 0.5, 30).tolist() ## RANGE PER VGG

#alpha_2 = np.linspace(0, 0.005, 21).tolist()

alpha_2 = np.linspace(0.001, 0.05, 30).tolist() ## range per radiomica
alpha_values=alpha_1
#alpha_values.remove(0.0)


#thresholds=np.arange(0.4, 0.61, 0.01) 

thresholds=[0.5]

#selectors=['lasso', 'mrmr','rf', 'logistic']

#classifiers=['XgBoost',  'SVM', 'ensemble','RandomForest', 'Logistic', 'MLP']
classifiers=['SVM', 'ensemble','RandomForest', 'Logistic']
selectors=['mrmr','rf', 'logistic', 'lasso']


## Loop per Validation seed SPECIFICI

In [34]:

template_dict = {
                'fold': None,
                'classifier': None,
                'selector': None,
                'alpha': None,
                'num_features': None,
                'pr_auc': None,
                'roc_auc': None,
                'f1': None,
                'accuracy': None,
                'confusion_matrix': [],
                'selected_features': [],
                'balanced accuracy': None
            }


results_val_others = [template_dict.copy() for _ in range(5000)]
results_val_others.append(template_dict.copy())

results_val_lasso = [template_dict.copy() for _ in range(5000)]
results_val_lasso.append(template_dict.copy())

results_test_others = [template_dict.copy() for _ in range(5000)]
results_test_others.append(template_dict.copy())

results_test_lasso = [template_dict.copy() for _ in range(5000)]
results_test_lasso.append(template_dict.copy())


smote = SMOTE(random_state=11)
 
k=0
u=0
n_folds=5

skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2)


In [None]:
for fold_idx, (train_index, val_index) in enumerate(skf.split(X_train_reduced, Y_train)):
    print("Starting with fold:", fold_idx)

    x_train_reduced, X_val_reduced = X_train_reduced[train_index], X_train_reduced[val_index]
    y_train, y_val = Y_train[train_index], Y_train[val_index]

    x_train_reduced, y_train = smote.fit_resample(x_train_reduced, y_train)

    #X_train_reduced, Y_train = smote.fit_resample(X_train_reduced, Y_train)


    for i, classifier in enumerate(classifiers):
            print("Starting with classifier:", classifier)
            for j, selector in enumerate(selectors):
                print("Starting with selector:", selector)

                if(selector=='lasso'):

                    for alpha in alpha_values:
                        #print("Doing alpha ", alpha )
                        classi= classifierinitialization(classifier)
                        best_case_val= classification_method(selector, classi, alpha, x_train_reduced, y_train, X_val_reduced, y_val, 0, mode="Val", selected_features=[0], thresholds=0.5)
                        
                        if(best_case_val==0):
                            continue
                        
                        results_val_lasso[k] = {
                                            'fold': fold_idx,
                                            'classifier': classifier,
                                            'selector': selector,
                                            'alpha': alpha,
                                            'num_features': best_case_val['num_features'],
                                            'selected_features': best_case_val['selected_features'],
                                            'pr_auc': best_case_val['pr_auc'],
                                            'roc_auc': best_case_val['roc_auc'],
                                            'f1': best_case_val['f1'],
                                            'accuracy': best_case_val['accuracy'],
                                            'confusion_matrix': best_case_val['confusion_matrix'],
                                            'balanced accuracy': best_case_val['balanced accuracy'],
                                            }

                        #print(best_case_val['num_features'])
                        

                        if(fold_idx==0):
                            classi= classifierinitialization(classifier)
                            best_case_test= classification_method(selector, classi, alpha, X_train_reduced, Y_train, X_test_reduced, y_test, 0, mode="Val", selected_features=[0], thresholds=0.5)

                            if(best_case_test==0):
                                continue
                            results_test_lasso[u] = {
                                                'classifier': classifier,
                                                'selector': selector,
                                                'alpha': alpha,
                                                'num_features': best_case_test['num_features'],
                                                'selected_features': best_case_test['selected_features'],
                                                'pr_auc': best_case_test['pr_auc'],
                                                'roc_auc': best_case_test['roc_auc'],
                                                'f1': best_case_test['f1'],
                                                'accuracy': best_case_test['accuracy'],
                                                'confusion_matrix': best_case_test['confusion_matrix'],
                                                'balanced accuracy': best_case_test['balanced accuracy'],
                                                }
                            u=u+1
                        k = k + 1

                else:
                    #limit=len(x_train_reduced[0]) + 1
                    limit=30
                    for t in range(1, limit):
                            classi= classifierinitialization(classifier)

                            best_case_val= classification_method(selector, classi, 0, x_train_reduced, y_train, X_val_reduced, y_val, t, mode="Val", selected_features=[0], thresholds=0.5)
                    
                                
                            results_val_others[k] = {
                                                'fold': fold_idx,
                                                'classifier': classifier,
                                                'selector': selector,
                                                'alpha': 0,
                                                'num_features': t,
                                                'selected_features': best_case_val['selected_features'],
                                                'pr_auc': best_case_val['pr_auc'],
                                                'roc_auc': best_case_val['roc_auc'],
                                                'f1': best_case_val['f1'],
                                                'accuracy': best_case_val['accuracy'],
                                                'confusion_matrix': best_case_val['confusion_matrix'],
                                                'balanced accuracy': best_case_val['balanced accuracy'],
                                                }
                            #print(results_val_others[k]['f1'])

                            if(fold_idx==0):
                                classi= classifierinitialization(classifier)
                                best_case_test= classification_method(selector, classi, 0, X_train_reduced, Y_train, X_test_reduced, y_test, t, mode="Val", selected_features=[0], thresholds=0.5)
                                
                                results_test_others[u] = {
                                                    'classifier': classifier,
                                                    'selector': selector,
                                                    'alpha': 0,
                                                    'num_features': t,
                                                    'selected_features': best_case_test['selected_features'],
                                                    'pr_auc': best_case_test['pr_auc'],
                                                    'roc_auc': best_case_test['roc_auc'],
                                                    'f1': best_case_test['f1'],
                                                    'accuracy': best_case_test['accuracy'],
                                                    'confusion_matrix': best_case_test['confusion_matrix'],
                                                    'balanced accuracy': best_case_test['balanced accuracy'],
                                                    }
                                u=u+1

                            k = k + 1

In [36]:
results_test_lasso = [entry for entry in results_test_lasso if entry['classifier'] is not None]
print(f"Filtered results_test: {len(results_test_lasso)} entries remaining")
results_val_lasso= [entry for entry in results_val_lasso if entry['classifier'] is not None]
print(f"Filtered results_test: {len(results_val_lasso)} entries remaining")
results_test_others = [entry for entry in results_test_others if entry['classifier'] is not None]
print(f"Filtered results_test: {len(results_test_others)} entries remaining")
results_val_others = [entry for entry in results_val_others if entry['classifier'] is not None]
print(f"Filtered results_test: {len(results_val_others)} entries remaining")

Filtered results_test: 120 entries remaining
Filtered results_test: 600 entries remaining
Filtered results_test: 348 entries remaining
Filtered results_test: 1740 entries remaining


### sorting per val

In [37]:
#num_features_range = list(range(1, (len(x_train_reduced[0]) + 1)))

num_features_range = list(range(1, 30))

grid_results_others = {}
grid_results_lasso = {}

selectors = ['mrmr', 'rf', 'logistic']

# Itera su tutte le combinazioni di parametri (classifier, selector, num_features, threshold)
for classifier in classifiers:
    #print(f"Sto iniziando classifier {classifier}")
    for selector in selectors:
            #print(f"Sto iniziando selector {selector}")
            for num_features in num_features_range:
                    
                    # Filtra i risultati che corrispondono a questa combinazione di parametri
                    filtered_results=[]
                    for res in results_val_others:
                        ## qui filtro per num_features
                        if (res['classifier'] == classifier and res['selector'] == selector and res['num_features'] == num_features):
                            filtered_results.append(res)
                
                    if filtered_results:
                        f1_values = [res['f1'] for res in filtered_results]
                        balaccuracy_values = [res['balanced accuracy'] for res in filtered_results]
                        roc_values=[res['roc_auc'] for res in filtered_results]

                        # Calcola le medie delle metriche
                        avg_f1 = sum(f1_values) / len(f1_values)
                        avg_balaccuracy = sum(balaccuracy_values) / len(balaccuracy_values)
                        avg_roc = sum(roc_values) / len(roc_values)

                        # Calcola la deviazione standard delle metriche
                        std_f1 = statistics.stdev(f1_values) if len(f1_values) > 1 else 0
                        std_balaccuracy = statistics.stdev(balaccuracy_values) if len(balaccuracy_values) > 1 else 0
                        std_roc_auc = statistics.stdev(roc_values) if len(roc_values) > 1 else 0

                        # Memorizza i risultati medi e la deviazione standard di questa combinazione
                        grid_results_others[(classifier, selector, num_features)] = {
                            'avg_f1': avg_f1,
                            'std_f1': std_f1,
                            'avg_balaccuracy': avg_balaccuracy,
                            'std_balaccuracy': std_balaccuracy,
                            'avg_roc_auc': avg_roc,
                            'std_roc_auc': std_roc_auc
                        }



## ORA PER LASSO
selectors = ['lasso']
for classifier in classifiers:
    #print(f"Sto iniziando classifier {classifier}")
    for selector in selectors:
        #print(f"Sto iniziando selector {selector}")
        for alpha in alpha_values:
                filtered_results = []
                for res in results_val_lasso:
                    ## qui filtro per alpha
                    if (res['classifier'] == classifier and res['selector'] == selector and res['alpha'] == alpha):
                        filtered_results.append(res)

                if filtered_results:
                        f1_values = [res['f1'] for res in filtered_results]
                        balaccuracy_values = [res['balanced accuracy'] for res in filtered_results]
                        roc_values=[res['roc_auc'] for res in filtered_results]

                        # Calcola le medie delle metriche
                        avg_f1 = sum(f1_values) / len(f1_values)
                        avg_balaccuracy = sum(balaccuracy_values) / len(balaccuracy_values)
                        avg_roc = sum(roc_values) / len(roc_values)

                        # Calcola la deviazione standard delle metriche
                        std_f1 = statistics.stdev(f1_values) if len(f1_values) > 1 else 0
                        std_balaccuracy = statistics.stdev(balaccuracy_values) if len(balaccuracy_values) > 1 else 0
                        std_roc_auc = statistics.stdev(roc_values) if len(roc_values) > 1 else 0

                        # Memorizza i risultati medi e la deviazione standard di questa combinazione
                        grid_results_lasso[(classifier, selector, alpha)] = {
                            'avg_f1': avg_f1,
                            'std_f1': std_f1,
                            'avg_balaccuracy': avg_balaccuracy,
                            'std_balaccuracy': std_balaccuracy,
                            'avg_roc_auc': avg_roc,
                            'std_roc_auc': std_roc_auc
                        }


# Ordina le combinazioni per 'avg_f1', e in caso di parità, per 'avg_pr_auc'
sorted_results_others = sorted(grid_results_others.items(), key=lambda x: (x[1]['avg_balaccuracy'], x[1]['avg_roc_auc']),reverse=True)
sorted_results_lasso = sorted(grid_results_lasso.items(), key=lambda x: (x[1]['avg_balaccuracy'], x[1]['avg_roc_auc']), reverse=True)

#sorted_results_others = sorted(grid_results_others.items(), key=lambda x: (x[1]['avg_roc_auc'], x[1]['avg_balaccuracy']),reverse=True)
#sorted_results_lasso = sorted(grid_results_lasso.items(), key=lambda x: (x[1]['avg_roc_auc'], x[1]['avg_balaccuracy']), reverse=True)
# Combina i risultati di entrambi i grid search
sorted_results = sorted_results_others + sorted_results_lasso

sorted_results = sorted(sorted_results, key=lambda x: (x[1]['avg_balaccuracy'], x[1]['avg_roc_auc']), reverse=True)
#sorted_results = sorted(sorted_results, key=lambda x: (x[1]['avg_roc_auc'], x[1]['avg_balaccuracy']), reverse=True)



### mostro migliori combo

In [None]:
n=10
best_combinations = sorted_results[:n] ## mostrando le n migliori configurazioni

print(f"Migliori {n} combinazioni di parametri:")
for i, (params, metrics) in enumerate(best_combinations, start=1):

    print(f"\n#{i}:")
    print(f"Classifier: {params[0]}")
    print(f"Selector: {params[1]}")
    if (params[1]=='lasso'):
        print(f"Alpha: {params[2]}")
    else:
        print(f"Num_features: {params[2]}")

    print(f"Performance medie sul val set: \nROC AUC = {metrics['avg_roc_auc']} (std = {metrics['std_roc_auc']}), "f"Balanced Accuracy = {metrics['avg_balaccuracy']} (std = {metrics['std_balaccuracy']})")



    for p in range (0, len(results_test_others)):
            if(params[1]=='lasso'):
                if(results_test_lasso[p]['classifier']==params[0] and results_test_lasso[p]['alpha']==params[2]):
                        best_case=results_test_lasso[p]
                        break
            else:     
                if(results_test_others[p]['classifier']==params[0] and results_test_others[p]['selector']==params[1] and results_test_others[p]['num_features']==params[2]):
                        best_case=results_test_others[p]
                        break

    
    print("Metrics on the TEST set:")

    print(f"Selected Features: {best_case['selected_features']}")
    print(f"ROC AUC: {best_case['roc_auc']}")
    print(f"F1 Score: {best_case['f1']}")
    print(f"Accuracy: {best_case['accuracy']}")
    print(f"Balanced Accuracy: {best_case['balanced accuracy']}")
    print(f"Confusion Matrix: \n{np.array(best_case['confusion_matrix'])}")


## FINETUNAMENTO SEEDs


In [40]:

seeds = list(range(1, 8)) 
print(seeds)


#alpha_1 = np.linspace(0.01, 0.6, 30).tolist() ## RANGE PER RESNET

alpha_1 = np.linspace(0.005, 0.5, 30).tolist() ## RANGE PER VGG

#alpha_2 = np.linspace(0, 0.005, 21).tolist()

alpha_values=alpha_1 


thresholds=[0.5]

#selectors=['lasso', 'mrmr','rf', 'logistic']

#classifiers=['XgBoost',  'SVM', 'ensemble','RandomForest', 'Logistic', 'MLP']

classifiers=['SVM', 'ensemble','RandomForest', 'Logistic']
selectors=['mrmr','rf', 'logistic', 'lasso']


[1, 2, 3, 4, 5, 6, 7]


In [41]:
template_dict = {
                'seedSmote':None,
                'seedKFold': None,
                'fold': None,
                'classifier': None,
                'selector': None,
                'alpha': None,
                'num_features': None,
                'pr_auc': None,
                'roc_auc': None,
                'f1': None,
                'accuracy': None,
                'confusion_matrix': [],
                'selected_features': [],
                'balanced accuracy': None
                }


results_val_others = [template_dict.copy() for _ in range(900000)]
results_val_others.append(template_dict.copy())

results_val_lasso = [template_dict.copy() for _ in range(900000)]
results_val_lasso.append(template_dict.copy())

results_test_others = [template_dict.copy() for _ in range(900000)]
results_test_others.append(template_dict.copy())

results_test_lasso = [template_dict.copy() for _ in range(900000)]
results_test_lasso.append(template_dict.copy())

k=0
u=0
n_folds=5

for seed1 in seeds:
     print("Starting with seed ", seed1)
     
     for seed2 in seeds:
        
        smote = SMOTE(random_state=seed1)

        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=seed2)

        for fold_idx, (train_index, val_index) in enumerate(skf.split(X_train_reduced, Y_train)):
            #print("Starting with fold:", fold_idx)

            x_train_reduced, X_val_reduced = X_train_reduced[train_index], X_train_reduced[val_index]
            y_train, y_val = Y_train[train_index], Y_train[val_index]

            x_train_reduced, y_train = smote.fit_resample(x_train_reduced, y_train)

            #X_train_reduced, Y_train = smote.fit_resample(X_train_reduced, Y_train)


            for i, classifier in enumerate(classifiers):
                    #print("Starting with classifier:", classifier)
                    for j, selector in enumerate(selectors):
                        #print("Starting with selector:", selector)

                        if(selector=='lasso'):

                            for alpha in alpha_values:
                                #print("Doing alpha ", alpha )
                                classi= classifierinitialization(classifier)
                                best_case_val= classification_method(selector, classi, alpha, x_train_reduced, y_train, X_val_reduced, y_val, 0, mode="Val", selected_features=[0], thresholds=0.5)
                                
                                if(best_case_val==0):
                                    continue
                                
                                results_val_lasso[k] = {
                                                    'seedSmote': seed1,
                                                    'seedKFold': seed2,
                                                    'fold': fold_idx,
                                                    'classifier': classifier,
                                                    'selector': selector,
                                                    'alpha': alpha,
                                                    'num_features': best_case_val['num_features'],
                                                    'selected_features': best_case_val['selected_features'],
                                                    'pr_auc': best_case_val['pr_auc'],
                                                    'roc_auc': best_case_val['roc_auc'],
                                                    'f1': best_case_val['f1'],
                                                    'accuracy': best_case_val['accuracy'],
                                                    'confusion_matrix': best_case_val['confusion_matrix'],
                                                    'balanced accuracy': best_case_val['balanced accuracy'],
                                                    }

                                #print(best_case_val['num_features'])
                                

                                if(fold_idx==0):
                                    classi= classifierinitialization(classifier)
                                    best_case_test= classification_method(selector, classi, alpha, X_train_reduced, Y_train, X_test_reduced, y_test, 0, mode="Val", selected_features=[0], thresholds=0.5)

                                    if(best_case_test==0):
                                        continue
                                    results_test_lasso[u] = {
                                                        'seedSmote': seed1,
                                                        'seedKFold': seed2,
                                                        'classifier': classifier,
                                                        'selector': selector,
                                                        'alpha': alpha,
                                                        'num_features': best_case_test['num_features'],
                                                        'selected_features': best_case_test['selected_features'],
                                                        'pr_auc': best_case_test['pr_auc'],
                                                        'roc_auc': best_case_test['roc_auc'],
                                                        'f1': best_case_test['f1'],
                                                        'accuracy': best_case_test['accuracy'],
                                                        'confusion_matrix': best_case_test['confusion_matrix'],
                                                        'balanced accuracy': best_case_test['balanced accuracy'],
                                                        }
                                    u=u+1
                                k = k + 1

                        else:
                            #limit=len(x_train_reduced[0]) + 1
                            limit=30
                            for t in range(1, limit):
                                    classi= classifierinitialization(classifier)

                                    best_case_val= classification_method(selector, classi, 0, x_train_reduced, y_train, X_val_reduced, y_val, t, mode="Val", selected_features=[0], thresholds=0.5)
                            
                                        
                                    results_val_others[k] = {
                                                        'seedSmote': seed1,
                                                        'seedKFold': seed2,
                                                        'fold': fold_idx,
                                                        'classifier': classifier,
                                                        'selector': selector,
                                                        'alpha': 0,
                                                        'num_features': t,
                                                        'selected_features': best_case_val['selected_features'],
                                                        'pr_auc': best_case_val['pr_auc'],
                                                        'roc_auc': best_case_val['roc_auc'],
                                                        'f1': best_case_val['f1'],
                                                        'accuracy': best_case_val['accuracy'],
                                                        'confusion_matrix': best_case_val['confusion_matrix'],
                                                        'balanced accuracy': best_case_val['balanced accuracy'],
                                                        }
                                    #print(results_val_others[k]['f1'])

                                    if(fold_idx==0):
                                        classi= classifierinitialization(classifier)
                                        best_case_test= classification_method(selector, classi, 0, X_train_reduced, Y_train, X_test_reduced, y_test, t, mode="Val", selected_features=[0], thresholds=0.5)
                                        
                                        results_test_others[u] = {
                                                            'seedSmote': seed1,
                                                            'seedKFold': seed2,
                                                            'classifier': classifier,
                                                            'selector': selector,
                                                            'alpha': 0,
                                                            'num_features': t,
                                                            'selected_features': best_case_test['selected_features'],
                                                            'pr_auc': best_case_test['pr_auc'],
                                                            'roc_auc': best_case_test['roc_auc'],
                                                            'f1': best_case_test['f1'],
                                                            'accuracy': best_case_test['accuracy'],
                                                            'confusion_matrix': best_case_test['confusion_matrix'],
                                                            'balanced accuracy': best_case_test['balanced accuracy'],
                                                            }
                                        u=u+1

                                    k = k + 1

Starting with seed  1


In [11]:
results_test_lasso = [entry for entry in results_test_lasso if entry['classifier'] is not None]
results_val_lasso= [entry for entry in results_val_lasso if entry['classifier'] is not None]
results_test_others = [entry for entry in results_test_others if entry['classifier'] is not None]
results_val_others = [entry for entry in results_val_others if entry['classifier'] is not None]

In [13]:
num_features_range= range(1, limit)
grid_results_others = {}
grid_results_lasso = {}

selectors = ['mrmr', 'rf', 'logistic']

for seed1 in seeds:
     for seed2 in seeds:
        # Itera su tutte le combinazioni di parametri (classifier, selector, num_features, threshold)
        for classifier in classifiers:
                    #print(f"Sto iniziando classifier {classifier}")
                    for selector in selectors:
                            #print(f"Sto iniziando selector {selector}")
                            for num_features in num_features_range:
                                    
                                    # Filtra i risultati che corrispondono a questa combinazione di parametri
                                    filtered_results=[]
                                    for res in results_val_others:
                                        ## qui filtro per num_features
                                        if (res['classifier'] == classifier and res['selector'] == selector and res['num_features'] == num_features and res['seedKFold']== seed2 and res['seedSmote']== seed1):
                                            filtered_results.append(res)
                                
                                    if filtered_results:
                                        f1_values = [res['f1'] for res in filtered_results]
                                        balaccuracy_values = [res['balanced accuracy'] for res in filtered_results]
                                        roc_values=[res['roc_auc'] for res in filtered_results]

                                        # Calcola le medie delle metriche
                                        avg_f1 = sum(f1_values) / len(f1_values)
                                        avg_balaccuracy = sum(balaccuracy_values) / len(balaccuracy_values)
                                        avg_roc = sum(roc_values) / len(roc_values)

                                        # Calcola la deviazione standard delle metriche
                                        std_f1 = statistics.stdev(f1_values) if len(f1_values) > 1 else 0
                                        std_balaccuracy = statistics.stdev(balaccuracy_values) if len(balaccuracy_values) > 1 else 0
                                        std_roc_auc = statistics.stdev(roc_values) if len(roc_values) > 1 else 0

                                        # Memorizza i risultati medi e la deviazione standard di questa combinazione
                                        grid_results_others[(classifier, selector, num_features, seed1, seed2)] = {
                                            'avg_f1': avg_f1,
                                            'std_f1': std_f1,
                                            'avg_balaccuracy': avg_balaccuracy,
                                            'std_balaccuracy': std_balaccuracy,
                                            'avg_roc_auc': avg_roc,
                                            'std_roc_auc': std_roc_auc,
                                            'seedSmote': seed1,
                                            'seedKFold': seed2
                                        }



## ORA PER LASSO
selectors = ['lasso']
for seed1 in seeds:
     for seed2 in seeds:
        for classifier in classifiers:
                    #print(f"Sto iniziando classifier {classifier}")
                    for selector in selectors:
                        #print(f"Sto iniziando selector {selector}")
                        for alpha in alpha_values:
                                filtered_results = []
                                for res in results_val_lasso:
                                    ## qui filtro per alpha
                                    if (res['classifier'] == classifier and res['selector'] == selector and res['alpha'] == alpha and res['seedKFold']== seed2 and res['seedSmote']== seed1):
                                        filtered_results.append(res)

                                if filtered_results:
                                        f1_values = [res['f1'] for res in filtered_results]
                                        balaccuracy_values = [res['balanced accuracy'] for res in filtered_results]
                                        roc_values=[res['roc_auc'] for res in filtered_results]

                                        # Calcola le medie delle metriche
                                        avg_f1 = sum(f1_values) / len(f1_values)
                                        avg_balaccuracy = sum(balaccuracy_values) / len(balaccuracy_values)
                                        avg_roc = sum(roc_values) / len(roc_values)

                                        # Calcola la deviazione standard delle metriche
                                        std_f1 = statistics.stdev(f1_values) if len(f1_values) > 1 else 0
                                        std_balaccuracy = statistics.stdev(balaccuracy_values) if len(balaccuracy_values) > 1 else 0
                                        std_roc_auc = statistics.stdev(roc_values) if len(roc_values) > 1 else 0

                                        # Memorizza i risultati medi e la deviazione standard di questa combinazione
                                        grid_results_lasso[(classifier, selector, alpha, seed1, seed2)] = {
                                            'avg_f1': avg_f1,
                                            'std_f1': std_f1,
                                            'avg_balaccuracy': avg_balaccuracy,
                                            'std_balaccuracy': std_balaccuracy,
                                            'avg_roc_auc': avg_roc,
                                            'std_roc_auc': std_roc_auc,
                                            'seedSmote': seed1,
                                            'seedKFold': seed2
                                        }

In [14]:
# Liste per memorizzare i migliori risultati, con seed inclusi nei valori
best_results_others = []
best_results_lasso = []

# Funzione di confronto che tiene conto prima di "balanced accuracy" e poi di "roc_auc"
def is_better(result1, result2):
    if result1['avg_balaccuracy'] > result2['avg_balaccuracy']:
        return True
    elif result1['avg_balaccuracy'] == result2['avg_balaccuracy']:
        return result1['avg_roc_auc'] > result2['avg_roc_auc']
    return False

# Filtraggio per grid_results_others: scegli il migliore per ogni coppia (seed1, seed2)
for key, result in grid_results_others.items():
    classifier, selector, num_features, seed1, seed2 = key
    seed_pair = (seed1, seed2)
    
    # Cerca se esiste già una combinazione con la stessa coppia di seed
    found = False
    for entry in best_results_others:
        if (entry['seed1'] == seed1 and entry['seed2'] == seed2):
            found = True
            # Se esiste, confronta le metriche e tieni il migliore
            if is_better(result, entry):
                entry.update({
                    'classifier': classifier,
                    'selector': selector,
                    'num_features': num_features,
                    'avg_f1': result['avg_f1'],
                    'std_f1': result['std_f1'],
                    'avg_balaccuracy': result['avg_balaccuracy'],
                    'std_balaccuracy': result['std_balaccuracy'],
                    'avg_roc_auc': result['avg_roc_auc'],
                    'std_roc_auc': result['std_roc_auc'],
                    'seed1': seed1,
                    'seed2': seed2
                })
            break
    
    # Se non esiste, aggiungi la nuova combinazione
    if not found:
        best_results_others.append({
            'classifier': classifier,
            'selector': selector,
            'num_features': num_features,
            'avg_f1': result['avg_f1'],
            'std_f1': result['std_f1'],
            'avg_balaccuracy': result['avg_balaccuracy'],
            'std_balaccuracy': result['std_balaccuracy'],
            'avg_roc_auc': result['avg_roc_auc'],
            'std_roc_auc': result['std_roc_auc'],
            'seed1': seed1,
            'seed2': seed2
        })

# Filtraggio per grid_results_lasso: scegli il migliore per ogni coppia (seed1, seed2)
for key, result in grid_results_lasso.items():
    classifier, selector, alpha, seed1, seed2 = key
    seed_pair = (seed1, seed2)
    
    # Cerca se esiste già una combinazione con la stessa coppia di seed
    found = False
    for entry in best_results_lasso:
        if (entry['seed1'] == seed1 and entry['seed2'] == seed2):
            found = True
            # Se esiste, confronta le metriche e tieni il migliore
            if is_better(result, entry):
                entry.update({
                    'classifier': classifier,
                    'selector': selector,
                    'alpha': alpha,
                    'avg_f1': result['avg_f1'],
                    'std_f1': result['std_f1'],
                    'avg_balaccuracy': result['avg_balaccuracy'],
                    'std_balaccuracy': result['std_balaccuracy'],
                    'avg_roc_auc': result['avg_roc_auc'],
                    'std_roc_auc': result['std_roc_auc'],
                    'seed1': seed1,
                    'seed2': seed2
                })
            break
    
    # Se non esiste, aggiungi la nuova combinazione
    if not found:
        best_results_lasso.append({
            'classifier': classifier,
            'selector': selector,
            'alpha': alpha,
            'avg_f1': result['avg_f1'],
            'std_f1': result['std_f1'],
            'avg_balaccuracy': result['avg_balaccuracy'],
            'std_balaccuracy': result['std_balaccuracy'],
            'avg_roc_auc': result['avg_roc_auc'],
            'std_roc_auc': result['std_roc_auc'],
            'seed1': seed1,
            'seed2': seed2
        })

# Ora best_results_others e best_results_lasso contengono solo un elemento per ogni coppia di seed (seed1, seed2).


In [24]:
s=0

print(f"Combinazioni con balanced accuracy > 0.7 sul test set:")

# Itera su tutte le migliori combinazioni trovate nei risultati di validazione di 'others'
for i, result in enumerate(best_results_others, start=1):
    best_case = None  # Inizializza best_case
    # Cerca la combinazione corrispondente nei risultati del test set 'others'
    for p in range(len(results_test_others)):
        if (results_test_others[p]['classifier'] == result['classifier'] and
            results_test_others[p]['selector'] == result['selector'] and
            results_test_others[p]['num_features'] == result['num_features'] and
            results_test_others[p]['seedKFold'] == result['seed2'] and
            results_test_others[p]['seedSmote'] == result['seed1']):
            
            best_case = results_test_others[p]
            break
    
    # Controlla se la balanced accuracy sul test è maggiore di 0.7
    if best_case and best_case['balanced accuracy'] > 0.71:
        s=s+1
        print(f"\n#{s}:")
        print(f"Classifier: {result['classifier']}")
        print(f"Selector: {result['selector']}")
        print(f"Num_features: {result['num_features']}")
        print(f"Seed Kfold: {result['seed2']}")
        print(f"Seed Smote: {result['seed1']}")
        
        # Stampa le performance medie dal validation set
        print(f"Performance medie sul validation set: \nROC AUC = {result['avg_roc_auc']} (std = {result['std_roc_auc']}), "
              f"Balanced Accuracy = {result['avg_balaccuracy']} (std = {result['std_balaccuracy']})")
        
        # Stampa le metriche sul test set
        print("Metrics on the TEST set:")
        print(f"Selected Features: {best_case['selected_features']}")
        print(f"ROC AUC: {best_case['roc_auc']}")
        print(f"F1 Score: {best_case['f1']}")
        print(f"Accuracy: {best_case['accuracy']}")
        print(f"Balanced Accuracy: {best_case['balanced accuracy']}")
        print(f"Confusion Matrix: \n{np.array(best_case['confusion_matrix'])}")



Combinazioni con balanced accuracy > 0.7 sul test set:

#1:
Classifier: SVM
Selector: logistic
Num_features: 6
Seed Kfold: 1
Seed Smote: 11
Performance medie sul validation set: 
ROC AUC = 0.6860493827160493 (std = 0.0995209815742899), Balanced Accuracy = 0.7133333333333334 (std = 0.0967079653159736)
Metrics on the TEST set:
Selected Features: [12 21  8 19 18 22]
ROC AUC: 0.7037037037037036
F1 Score: 0.6060606060606061
Accuracy: 0.6666666666666666
Balanced Accuracy: 0.712962962962963
Confusion Matrix: 
[[16 11]
 [ 2 10]]

#2:
Classifier: SVM
Selector: logistic
Num_features: 6
Seed Kfold: 4
Seed Smote: 10
Performance medie sul validation set: 
ROC AUC = 0.7204012345679013 (std = 0.07241317283625524), Balanced Accuracy = 0.7030555555555555 (std = 0.05207314715206859)
Metrics on the TEST set:
Selected Features: [12 21  8 19 18 22]
ROC AUC: 0.7037037037037036
F1 Score: 0.6060606060606061
Accuracy: 0.6666666666666666
Balanced Accuracy: 0.712962962962963
Confusion Matrix: 
[[16 11]
 [ 2 10]]

In [22]:

# Itera su tutte le migliori combinazioni trovate nei risultati di validazione di 'lasso'
for i, result in enumerate(best_results_lasso, start=len(best_results_others) + 1):
    best_case = None  # Inizializza best_case
    # Cerca la combinazione corrispondente nei risultati del test set 'lasso'
    for p in range(len(results_test_lasso)):
        if (results_test_lasso[p]['classifier'] == result['classifier'] and
            results_test_lasso[p]['alpha'] == result['alpha'] and
            results_test_lasso[p]['seedKFold'] == result['seed1'] and
            results_test_lasso[p]['seedSmote'] == result['seed2']):
            
            best_case = results_test_lasso[p]
            break
    
    # Controlla se la balanced accuracy sul test è maggiore di 0.7
    if best_case and best_case['balanced accuracy'] > 0.7:
        s=s+1
        print(f"\n#{s}:")
        print(f"Classifier: {result['classifier']}")
        print(f"Selector: {result['selector']}")
        print(f"Alpha: {result['alpha']}")
        print(f"Seed Kfold: {result['seed1']}")
        print(f"Seed Smote: {result['seed2']}")
        
        # Stampa le performance medie dal validation set
        print(f"Performance medie sul validation set: \nROC AUC = {result['avg_roc_auc']} (std = {result['std_roc_auc']}), "
              f"Balanced Accuracy = {result['avg_balaccuracy']} (std = {result['std_balaccuracy']})")
        
        # Stampa le metriche sul test set
        print("Metrics on the TEST set:")
        print(f"Selected Features: {best_case['selected_features']}")
        print(f"ROC AUC: {best_case['roc_auc']}")
        print(f"F1 Score: {best_case['f1']}")
        print(f"Accuracy: {best_case['accuracy']}")
        print(f"Balanced Accuracy: {best_case['balanced accuracy']}")
        print(f"Confusion Matrix: \n{np.array(best_case['confusion_matrix'])}")



#16:
Classifier: ensemble
Selector: lasso
Alpha: 0.19275862068965519
Seed Kfold: 1
Seed Smote: 1
Performance medie sul validation set: 
ROC AUC = 0.7151851851851851 (std = 0.1769195260280399), Balanced Accuracy = 0.6852777777777778 (std = 0.14257808668399616)
Metrics on the TEST set:
Selected Features: [ 0  3  4  5  6  7  8  9 10 11 13 15 16 17 18 19 20 21 22 23 25 28 29 32
 33 35 36 37 38 39 40 41]
ROC AUC: 0.6666666666666667
F1 Score: 0.6
Accuracy: 0.6923076923076923
Balanced Accuracy: 0.7083333333333333
Confusion Matrix: 
[[18  9]
 [ 3  9]]

#17:
Classifier: RandomForest
Selector: lasso
Alpha: 0.4658620689655173
Seed Kfold: 1
Seed Smote: 7
Performance medie sul validation set: 
ROC AUC = 0.7743827160493826 (std = 0.08894705837908318), Balanced Accuracy = 0.7419444444444444 (std = 0.08923759726388927)
Metrics on the TEST set:
Selected Features: [ 0  4  5  8 11 13 15 17 18 19 20 21 22 23 25 29 30 33 34 37 38 39 40 41]
ROC AUC: 0.6635802469135803
F1 Score: 0.6
Accuracy: 0.692307692307

In [17]:
import pickle

# Salva i risultati in un file pickle
with open('/Users/alessiamenozzi/Desktop/best_results_others_VGG.pkl', 'wb') as f:
    pickle.dump(best_results_others, f)

with open('/Users/alessiamenozzi/Desktop/best_results_lasso_VGG.pkl', 'wb') as f:
    pickle.dump(best_results_lasso, f)


## Fine Tuning Threshold su risultati migliori

In [18]:
n_folds=5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=11)

thresholds=np.arange(0.3, 0.71, 0.01) 
thresholds=[0.5]

classifier='SVM'
selector='logistic'
num_features=16

template_dict = {
                'fold': None,
                'classifier': None,
                'selector': None,
                'alpha': None,
                'num_features': None,
                'pr_auc': None,
                'roc_auc': None,
                'f1': None,
                'accuracy': None,
                'confusion_matrix': [],
                'threshold': None,
                'selected_features': [],
                'balanced accuracy': None
            }

## creazione di dizionari vuoti (range con numero elevato casuale)
results_val_2_others = [template_dict.copy() for _ in range(3000)]
results_val_2_others.append(template_dict.copy())


smote = SMOTE(random_state=7)

k=0
old_count = 0
for fold_idx, (train_index, val_index) in enumerate(skf.split(X_train_reduced, Y_train)):
    print("Starting with fold:", fold_idx)

    x_train_reduced, X_val_reduced = X_train_reduced[train_index], X_train_reduced[val_index]
    y_train, y_val = Y_train[train_index], Y_train[val_index]

    x_train_reduced, y_train = smote.fit_resample(x_train_reduced, y_train)

    classi=classifierinitialization(classifier)
                    
    y_proba_test, number_features, selected_features = classification_method_withoutThreshold(selector, classi, 0, x_train_reduced, y_train, X_val_reduced, y_val, num_features=num_features, mode="Val", selected_features=[0])

    best_f1_val = -1
    best_threshold_val = 0
    best_case_val = None
                        
    for threshold in thresholds:
            
        best_case_val = classification_threshold(y_proba_test, y_val, threshold, 0, num_features, selected_features)
                                
        if best_case_val:
            results_val_2_others[k] = {
                                    'fold': fold_idx,
                                    'alpha': 0,
                                    'num_features': num_features,
                                    'selected_features': best_case_val['selected_features'],
                                    'roc_auc': best_case_val['roc_auc'],
                                    'f1': best_case_val['f1'],
                                    'accuracy': best_case_val['accuracy'],
                                    'confusion_matrix': best_case_val['confusion_matrix'],
                                    'threshold': threshold, 
                                    'balanced accuracy': best_case_val['balanced accuracy']
                                }
            k = k + 1
                        

Starting with fold: 0
Starting with fold: 1
Starting with fold: 2
Starting with fold: 3
Starting with fold: 4


In [19]:
num_features_range = list(range(1, (len(x_train_reduced[0]) + 1)))
num_features_range=list(range(1,30))

grid_results_others = {}
grid_results_lasso = {}



for threshold in thresholds:
                    # Filtra i risultati che corrispondono a questa combinazione di parametri
                    filtered_results=[]
                    for res in results_val_2_others:
                        if (res['threshold'] == threshold):
                            filtered_results.append(res)
                
                    if filtered_results:
                        f1_values = [res['f1'] for res in filtered_results]
                        balaccuracy_values = [res['balanced accuracy'] for res in filtered_results]
                        roc_values=[res['roc_auc'] for res in filtered_results]

                        # Calcola le medie delle metriche
                        avg_f1 = sum(f1_values) / len(f1_values)
                        avg_balaccuracy = sum(balaccuracy_values) / len(balaccuracy_values)
                        avg_roc = sum(roc_values) / len(roc_values)

                        # Calcola la deviazione standard delle metriche
                        std_f1 = statistics.stdev(f1_values) if len(f1_values) > 1 else 0
                        std_balaccuracy = statistics.stdev(balaccuracy_values) if len(balaccuracy_values) > 1 else 0
                        std_roc_auc = statistics.stdev(roc_values) if len(roc_values) > 1 else 0

                        # Memorizza i risultati medi e la deviazione standard di questa combinazione
                        grid_results_others[(threshold)] = {
                            'avg_f1': avg_f1,
                            'std_f1': std_f1,
                            'avg_balaccuracy': avg_balaccuracy,
                            'std_balaccuracy': std_balaccuracy,
                            'avg_roc_auc': avg_roc,
                            'std_roc_auc': std_roc_auc
                        }

# Ordina le combinazioni per 'avg_f1', e in caso di parità, per 'avg_pr_auc'
sorted_results_2 = sorted(grid_results_others.items(), key=lambda x: (x[1]['avg_balaccuracy'], x[1]['avg_roc_auc']),reverse=True)


In [21]:
sorted_results_2

[(0.5,
  {'avg_f1': 0.6534866578210231,
   'std_f1': 0.04685519921994917,
   'avg_balaccuracy': 0.6683333333333332,
   'std_balaccuracy': 0.05896974271230588,
   'avg_roc_auc': 0.7007716049382716,
   'std_roc_auc': 0.10522649008962921})]