## PREPROCESSING

In [1]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve, balanced_accuracy_score

import pandas as pd
import numpy as np  # Needed for NaN check
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from scipy.stats import ttest_ind
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import roc_auc_score, precision_recall_curve, auc, f1_score, balanced_accuracy_score


In [2]:
# Clinical table: source of the outcome labels and of the patient identifiers.
file_path = "C:\\Users\\bsbar\\Desktop\\Tesi\\ThesisPlaques\\CSV\\data_rad_clin_DEF.csv"
data = pd.read_csv(file_path)

# The 'label' column is cast to int and stored as a NumPy array.
labels_column = data['label']
labels = np.array(labels_column.astype(int).tolist())

# Keep the first run of digits found in each 'IDs_new' entry as the patient number.
id_digits = data['IDs_new'].str.extract(r'(\d+)')
loaded_patients = id_digits.astype(int).squeeze().tolist()

print("Labels:", labels)
print("Number of labels:", len(labels))
print("Patient Names: ", loaded_patients )

# Radiomics table. Note: `file_path` and `data` are rebound here and no longer
# refer to the clinical CSV read above.
file_path = "C:\\Users\\bsbar\\Desktop\\Radiomica_2_5D.csv"
data = pd.read_csv(file_path)

# Names of the columns whose header begins with 'original'.
filtered_columns = list(
    filter(lambda column_name: column_name.startswith('original'), data.columns)
)



# patients[k] is the list of NaN-free slices (each a list of feature values)
# belonging to loaded_patients[k]; a patient without valid slices gets [].
patients = []

for patient_id in loaded_patients:
    # Rows of the radiomics table that belong to this patient
    patient_rows = data.loc[data['Paziente'] == patient_id]

    clean_slices = []
    for _, slice_row in patient_rows.iterrows():
        # Restrict the row to the selected feature columns
        slice_features = slice_row[filtered_columns].tolist()

        # Discard the whole slice as soon as one of its features is NaN
        if any(np.isnan(value) for value in slice_features):
            continue
        clean_slices.append(slice_features)

    patients.append(clean_slices)

# Sanity pass over the collected data: every retained slice must be NaN-free.
for i, patient in enumerate(patients):
    for j, slice_features in enumerate(patient):
        nan_free = all(not np.isnan(value) for value in slice_features)
        assert nan_free, f"NaN found in patient {i}, slice {j}"

print("All slices with NaN values have been removed.")


Labels: [0 1 1 0 0 0 0 0 1 0 0 1 1 0 0 1 1 0 1 1 0 1 1 0 1 1 0 1 0 0 0 1 1 1 0 1 0
 0 0 1 1 0 0 1 0 1 1 1 1 1 0 0 0 1 0 0 0 1 0 0 0 0 1 1 1 0 1 0 1 0 0 1 0 0
 0 1 0 1 0 1 1 0 1 0 1 0 0 0 1 0 0 1 0 0 0 0 0 0 1 1 1 0 0 1 0 0 1 0 0 0 0
 0 1 1 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1]
Number of labels: 129
Patient Names:  [5, 12, 15, 16, 17, 19, 22, 23, 24, 25, 26, 27, 29, 30, 31, 33, 35, 36, 38, 39, 40, 41, 42, 43, 44, 46, 47, 48, 50, 52, 53, 54, 56, 57, 58, 59, 60, 61, 62, 64, 65, 68, 69, 70, 71, 74, 75, 76, 78, 79, 81, 82, 84, 85, 86, 87, 88, 89, 90, 91, 92, 94, 95, 96, 98, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 123, 124, 126, 127, 128, 129, 133, 135, 136, 137, 138, 139, 141, 142, 144, 146, 147, 149, 150, 153, 155, 158, 159, 161, 163, 166, 168, 169, 170, 171, 175, 176, 178, 182, 183, 188, 189, 190, 193, 197, 199, 200, 205]
All slices with NaN values have been removed.


In [3]:
def continue_array(filtered_patients, labels, patient_ids=None):
    """Flatten per-patient slice features into slice-level arrays.

    Parameters
    ----------
    filtered_patients : sequence
        One entry per patient; each entry is a sequence of slices and each
        slice is a sequence of feature values.
    labels : sequence
        ``labels[i]`` is the label of ``filtered_patients[i]``; it is repeated
        once per slice of that patient.
    patient_ids : sequence, optional
        ``patient_ids[i]`` is the identifier of ``filtered_patients[i]``.
        Pass it whenever ``filtered_patients`` is a subset or a shuffled view
        of the cohort (train / validation / test splits). When omitted, the
        function falls back to the notebook-level ``loaded_patients`` list
        indexed by position, which is only aligned when ``filtered_patients``
        is the full cohort in its original order.

    Returns
    -------
    tuple of numpy.ndarray
        ``(features, labels, patient_ids)``, each with one entry per slice.

    Raises
    ------
    ValueError
        If ``patient_ids`` is given but its length differs from the number of
        patients.
    """
    if patient_ids is None:
        # Legacy fallback kept so existing two-argument calls behave as before.
        patient_ids = loaded_patients
    elif len(patient_ids) != len(filtered_patients):
        raise ValueError(
            f"patient_ids has {len(patient_ids)} entries but "
            f"{len(filtered_patients)} patients were given"
        )

    # One row per slice, patients concatenated in order.
    all_features = [
        image_features
        for patient in filtered_patients
        for image_features in patient
    ]

    expanded_labels = []
    expanded_patient_ids = []
    for i in range(len(filtered_patients)):
        num_images = len(filtered_patients[i])
        # Repeat the patient-level label / id once per slice.
        expanded_labels.extend([labels[i]] * num_images)
        expanded_patient_ids.extend([patient_ids[i]] * num_images)

    return (
        np.array(all_features),
        np.array(expanded_labels),
        np.array(expanded_patient_ids),
    )


## Helper functions for correlation-based feature filtering
def filter_highly_correlated_features(df, corr, threshold=0.85):
    """List the columns of ``df`` that are redundant with an earlier column.

    The upper triangle of ``corr`` is scanned row by row. A column is flagged
    the first time its correlation with a lower-indexed column is greater than
    or equal to ``threshold``. The comparison is on the signed value, so
    strongly negative correlations are not flagged.

    Parameters
    ----------
    df : pandas.DataFrame
        Only ``df.columns`` is used, to translate positions into names.
    corr : pandas.DataFrame
        Square correlation matrix, indexed positionally via ``.iloc``.
    threshold : float, default 0.85
        Minimum correlation for a column to be flagged.

    Returns
    -------
    list
        Names of the flagged columns, in the order they were flagged.
    """
    n_features = corr.shape[0]
    still_kept = [True] * n_features
    dropped = []

    for row in range(n_features):
        for col in range(row + 1, n_features):
            if not still_kept[col]:
                # Already flagged by an earlier row; do not list it twice.
                continue
            if corr.iloc[row, col] >= threshold:
                still_kept[col] = False
                dropped.append(df.columns[col])

    return dropped


def perform_correlation(z_train, y_train, numero = 32, threshold = 0.85):
    """Find the features to drop because they correlate with an earlier one.

    All slices of all patients in ``z_train`` are stacked into one table whose
    columns are named ``feature_0 .. feature_{numero-1}``; the pairwise
    correlation of those columns is then filtered with
    ``filter_highly_correlated_features``.

    Parameters
    ----------
    z_train : sequence
        Per-patient slice features, as accepted by ``continue_array``.
    y_train : sequence
        Per-patient labels (only forwarded to ``continue_array``).
    numero : int, default 32
        Number of feature columns; must equal the width of the stacked table.
    threshold : float, default 0.85
        Correlation threshold forwarded to the filter.

    Returns
    -------
    list of str
        Names (``'feature_<index>'``) of the columns to remove.
    """
    slice_matrix, _labels, _ids = continue_array(z_train, y_train)

    column_names = [f'feature_{i}' for i in range(numero)]
    feature_frame = pd.DataFrame(slice_matrix, columns=column_names)

    return filter_highly_correlated_features(
        feature_frame, feature_frame.corr(), threshold
    )

def select_features_by_p_value(x_train_expanded, y_train_expanded, p_value_threshold=0.05):
    """Keep the columns whose class means differ significantly.

    For every column, a two-sample t-test with ``equal_var=False`` compares
    the rows labelled 0 against the rows labelled 1. Columns with a p-value
    strictly below ``p_value_threshold`` are kept and ordered by increasing
    p-value.

    Parameters
    ----------
    x_train_expanded : numpy.ndarray
        2-D array, one row per sample and one column per feature.
    y_train_expanded : numpy.ndarray
        Label of each row (0 or 1).
    p_value_threshold : float, default 0.05
        Exclusive upper bound on the p-value of a kept column.

    Returns
    -------
    tuple
        ``(reduced_array, sorted_indices)`` where ``sorted_indices`` are the
        kept column positions and ``reduced_array`` is
        ``x_train_expanded[:, sorted_indices]``.
    """
    def column_p_value(column):
        negatives = column[y_train_expanded == 0]
        positives = column[y_train_expanded == 1]
        _, p_val = ttest_ind(negatives, positives, equal_var=False)
        return p_val

    p_values = np.array(
        [column_p_value(x_train_expanded[:, k]) for k in range(x_train_expanded.shape[1])]
    )

    significant = np.where(p_values < p_value_threshold)[0]
    # Most significant feature first.
    sorted_indices = significant[np.argsort(p_values[significant])]

    return x_train_expanded[:, sorted_indices], sorted_indices



## Function that removes specific features (given by name) from every slice
def remove_features_from_patients(patients, features_to_remove):
    """Drop the named features from every slice of every patient.

    Parameters
    ----------
    patients : sequence
        One entry per patient, each a sequence of 1-D feature vectors.
    features_to_remove : iterable of str
        Names of the form ``'<prefix>_<index>'``; the integer after the first
        underscore is the position removed from each feature vector.

    Returns
    -------
    list of numpy.ndarray
        One array per patient, stacking that patient's reduced slices.
    """
    drop_positions = [int(name.split('_')[1]) for name in features_to_remove]

    def strip_patient(slices):
        # Remove the same positions from each slice, then stack the slices.
        return np.array(
            [np.delete(one_slice, drop_positions, axis=0) for one_slice in slices]
        )

    return [strip_patient(slices) for slices in patients]

## Function that keeps only the given feature indices in every slice (list of per-patient arrays)
def keep_features_in_patients(patients, features_to_keep):
    """Keep only the given feature positions in every slice of every patient.

    Parameters
    ----------
    patients : sequence
        One entry per patient, each a sequence of 1-D feature vectors.
    features_to_keep : iterable
        Positions to retain (each is converted with ``int``); the output
        columns follow the order given here.

    Returns
    -------
    list of numpy.ndarray
        One array per patient, stacking that patient's reduced slices.
    """
    keep_positions = [int(position) for position in features_to_keep]

    def project_patient(slices):
        # Pick the same positions from each slice, then stack the slices.
        return np.array(
            [np.take(one_slice, keep_positions, axis=0) for one_slice in slices]
        )

    return [project_patient(slices) for slices in patients]

In [4]:

# patients_* hold the patient numbers (5, 12, etc.)
# y_* hold the corresponding labels
# features_* hold, for each patient, a list of slices (images), each slice being a list of n feature values

# First split: 30% of the patients are held out as the test set. shuffle=False, so the
# split follows the original patient order and is not stratified.
patients_train1, patients_test, y_train1, y_test, features_train1, features_test= train_test_split(loaded_patients, labels, patients, test_size=0.3, shuffle=False, random_state=1)
# Second split: the remaining patients (the *_train1 variables) are divided into train and
# validation with a shuffled split stratified on the labels.
patients_train, patients_val, y_train, y_val, features_train, features_val= train_test_split(patients_train1, y_train1, features_train1, test_size=0.3, shuffle=True, stratify=y_train1, random_state=3)
# Alternative, unshuffled train/validation split (disabled):
#patients_train, patients_val, y_train, y_val, features_train, features_val= train_test_split(patients_train1, y_train1, features_train1, test_size=0.3, shuffle=False, random_state=1)

print("Number of train patients: ", len(features_train))
print("Number of test patients: ", len(features_test))
print("Number of val patients: ", len(features_val))

# Feature count is read from the first slice of the first training patient.
print("Number of features for every image: ", len(features_train[0][0]) )


Number of train patients:  63
Number of test patients:  39
Number of val patients:  27
Number of features for every image:  102


In [5]:
# Step 1 - correlation filter, fitted on the training patients only (threshold 0.8).
# `features` holds the names ('feature_<index>') of the columns to drop.
starting_features = len(features_train[0][0])
features=perform_correlation(features_train, y_train, starting_features, 0.8)

# Apply the same column removal to every split (train1 = train + validation patients).
final_patients_train=remove_features_from_patients(features_train, features)
final_patients_test=remove_features_from_patients(features_test, features)
final_patients_val=remove_features_from_patients(features_val, features)
final_patients_train1=remove_features_from_patients(features_train1, features)
print(final_patients_train1[0].shape)
# Step 2 - p-value selection (threshold 0.01) on the slice-level training data.
# `sf` holds the kept column positions, relative to the correlation-filtered features.
x_train_expanded, y_train_expanded, _ = continue_array(final_patients_train, y_train)
x_train_expanded, sf= select_features_by_p_value(x_train_expanded, y_train_expanded, 0.01)
print("x_train_expanded", x_train_expanded.shape)


# Project the other splits onto the selected columns. final_patients_train is not
# projected here; its slice-level version x_train_expanded was already reduced above.
final_patients_test=keep_features_in_patients(final_patients_test, sf)
final_patients_val=keep_features_in_patients(final_patients_val, sf)
final_patients_train1=keep_features_in_patients(final_patients_train1, sf)
print("final_patients_val", final_patients_val[0].shape)
print("final_patients_test", final_patients_test[0].shape)
print("final_patients_train1", final_patients_train1[0].shape)
print(f"Scelte {len(sf)} features\n")


# Slice-level arrays for the train1 patients (train + validation).
x_train_expanded1, y_train_expanded1, _ = continue_array(final_patients_train1, y_train1)
print("x_train_expanded1", x_train_expanded1.shape)

(39, 24)
x_train_expanded (2553, 11)
final_patients_val (39, 11)
final_patients_test (41, 11)
final_patients_train1 (39, 11)
Scelte 11 features

x_train_expanded1 (3517, 11)


## RICERCA

In [6]:
def calculate_metrics_with_voting(y_true, predictions_slices, probabilities_slices, voting_type='majority'):
    """Aggregate slice-level outputs per patient and score them.

    Parameters
    ----------
    y_true : sequence
        One true label per patient.
    predictions_slices : sequence
        Per patient, the predicted class of each slice (used by 'majority').
    probabilities_slices : sequence of numpy.ndarray
        Per patient, a 2-D array of class probabilities, one row per slice;
        column 1 is used as the positive-class score.
    voting_type : {'majority', 'mean'}, default 'majority'
        How slice outputs are turned into one predicted class per patient.

    Returns
    -------
    tuple
        ``(f1, roc_auc, pr_auc, balanced_accuracy)`` at patient level.

    Raises
    ------
    ValueError
        If ``voting_type`` is neither 'majority' nor 'mean'.
    """
    if voting_type not in ('majority', 'mean'):
        raise ValueError("Voting type not supported.")

    if voting_type == 'majority':
        patient_pred = majority_voting(predictions_slices)
    else:
        patient_pred = mean_voting(probabilities_slices)

    # Patient score = mean positive-class probability over the patient's slices.
    patient_score = np.array(
        [np.mean(slice_probs[:, 1]) for slice_probs in probabilities_slices]
    )

    # Class-based metrics use the voted label; ranking metrics use the mean score.
    f1 = f1_score(y_true, patient_pred)
    roc_auc = roc_auc_score(y_true, patient_score)
    pr_precision, pr_recall, _ = precision_recall_curve(y_true, patient_score)
    pr_auc = auc(pr_recall, pr_precision)
    balanced_acc = balanced_accuracy_score(y_true, patient_pred)

    return f1, roc_auc, pr_auc, balanced_acc

# Classificatori supportati
def get_classifier(name, params):
    """Instantiate one of the supported classifiers with ``random_state=42``.

    Parameters
    ----------
    name : str
        One of 'random_forest', 'logistic', 'mlp', 'xgboost', 'svm'.
    params : dict
        Keyword arguments forwarded to the estimator constructor.

    Returns
    -------
    estimator
        A new, unfitted estimator. The SVM is created with
        ``probability=True``.

    Raises
    ------
    ValueError
        If ``name`` is not one of the supported classifiers.
    """
    # Lazy builders: only the requested estimator class is touched.
    builders = (
        ('random_forest', lambda: RandomForestClassifier(**params, random_state=42)),
        ('logistic', lambda: LogisticRegression(**params, random_state=42)),
        ('mlp', lambda: MLPClassifier(**params, random_state=42)),
        ('xgboost', lambda: XGBClassifier(**params, random_state=42)),
        ('svm', lambda: SVC(probability=True, **params, random_state=42)),
    )

    for supported_name, build in builders:
        if name == supported_name:
            return build()

    raise ValueError(f"Classifier {name} not supported.")
    
def majority_voting(predictions):
    """Return, for each patient, the most frequent predicted class.

    ``predictions`` holds one array of non-negative integer class labels per
    patient. Ties resolve to the smallest label, because ``argmax`` returns
    the first maximum of the bin counts.
    """
    winners = []
    for slice_votes in predictions:
        counts = np.bincount(slice_votes)
        winners.append(counts.argmax())
    return np.array(winners)

def mean_voting(predictions_prob):
    """Return, for each patient, the class with the highest mean probability.

    ``predictions_prob`` holds one 2-D array per patient (one row per slice,
    one column per class); the rows are averaged before taking the argmax.
    """
    winners = []
    for slice_probs in predictions_prob:
        mean_per_class = np.mean(slice_probs, axis=0)
        winners.append(mean_per_class.argmax())
    return np.array(winners)

In [7]:
def find_best_classifier(X_train, y_train, X_val, y_val, classifier_name, param_grid):
    """Grid-search hyper-parameters on the pooled train + validation slices.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Slice-level training features and labels.
    X_val, y_val : sequence
        Per-patient validation features and labels; they are flattened to
        slice level with ``continue_array`` before being pooled with train.
    classifier_name : str
        Name understood by ``get_classifier``.
    param_grid : dict or list of dict
        Grid handed to ``GridSearchCV``.

    Returns
    -------
    tuple
        ``(best_estimator, best_params)`` from the search.

    Notes
    -----
    The 3-fold cross-validation runs on slices, not patients, so slices of
    one patient can appear in both the fitting and the scoring part of a fold.
    """
    # Bring the validation patients into the same slice-level layout as train.
    val_features, val_labels, _ = continue_array(X_val, y_val)

    grid_search = GridSearchCV(
        get_classifier(classifier_name, {}),
        param_grid,
        cv=3,
        scoring='f1',
        verbose=3,
        n_jobs=-1,
    )
    grid_search.fit(
        np.vstack([X_train, val_features]),
        np.hstack([y_train, val_labels]),
    )

    print(f"Best params for {classifier_name}: {grid_search.best_params_}")

    return grid_search.best_estimator_, grid_search.best_params_

def test_with_voting(classifier, X_data, y_data, voting_type='majority'):
    """Score a fitted classifier at patient level and print the metrics.

    Parameters
    ----------
    classifier : estimator
        Fitted object exposing ``predict`` and ``predict_proba``.
    X_data : iterable
        One 2-D slice-feature array per patient.
    y_data : sequence
        One true label per patient.
    voting_type : str, default 'majority'
        Aggregation rule forwarded to ``calculate_metrics_with_voting``.

    Returns
    -------
    tuple
        ``(f1, roc_auc, pr_auc, balanced_accuracy)``.
    """
    # Class predictions and probabilities for the slices of each patient.
    per_patient = [
        (classifier.predict(patient_slices), classifier.predict_proba(patient_slices))
        for patient_slices in X_data
    ]
    predictions_slices = [pred for pred, _ in per_patient]
    probabilities_slices = [proba for _, proba in per_patient]

    f1, roc_auc, pr_auc, balanced_acc = calculate_metrics_with_voting(
        y_data, predictions_slices, probabilities_slices, voting_type
    )

    print(f"Metrics with {voting_type} voting: F1 = {f1}, ROC AUC = {roc_auc}, PR AUC = {pr_auc}, Balanced Accuracy = {balanced_acc}")

    return f1, roc_auc, pr_auc, balanced_acc

def retrain_and_test(classifier, X_train, y_train, X_val, y_val, X_test, y_test, voting_type='majority'):
    """Refit ``classifier`` on train + validation slices and report test metrics.

    Parameters
    ----------
    classifier : estimator
        Object exposing ``fit``, ``predict`` and ``predict_proba``; it is
        refitted in place.
    X_train, y_train : numpy.ndarray
        Slice-level training features and labels.
    X_val, y_val : sequence
        Per-patient validation features and labels; flattened to slice level
        with ``continue_array`` before being pooled with train.
    X_test, y_test : sequence
        Per-patient test features and labels, scored with ``test_with_voting``.
    voting_type : str, default 'majority'
        Aggregation rule forwarded to ``test_with_voting``.

    Returns
    -------
    None
        The metrics are printed by ``test_with_voting``.
    """
    # continue_array returns (features, labels, patient_ids); unpacking only two
    # values raised ValueError. The ids are not needed here.
    X_val_slices, y_val_slices, _ = continue_array(X_val, y_val)

    # Pool train and validation slices.
    X_train_combined = np.vstack([X_train, X_val_slices])
    y_train_combined = np.hstack([y_train, y_val_slices])

    # Shuffle rows and labels with one fixed-seed permutation so they stay
    # aligned. Done with NumPy because no `shuffle` helper is imported in
    # this notebook (the previous call raised NameError).
    order = np.random.RandomState(42).permutation(len(y_train_combined))
    X_train_combined = X_train_combined[order]
    y_train_combined = y_train_combined[order]

    # Refit on the pooled data.
    classifier.fit(X_train_combined, y_train_combined)

    # Evaluate on the held-out test patients.
    print("Test metrics:")
    test_with_voting(classifier, X_test, y_test, voting_type)

In [8]:
def run_experiment(X_train, y_train, X_val, y_val, X_test, y_test, classifier_name, voting_type='majority'):
    """Tune one classifier by grid search and report its test metrics.

    Parameters
    ----------
    X_train, y_train : numpy.ndarray
        Slice-level training features and labels.
    X_val, y_val : sequence
        Per-patient validation features and labels.
    X_test, y_test : sequence
        Per-patient test features and labels.
    classifier_name : str
        Key selecting both the estimator and its hyper-parameter grid:
        'random_forest', 'logistic', 'mlp', 'xgboost' or 'svm'.
    voting_type : str, default 'majority'
        Slice-to-patient aggregation rule used when scoring the test set.

    Returns
    -------
    dict
        Best hyper-parameters found by the grid search.

    Raises
    ------
    KeyError
        If ``classifier_name`` has no grid defined here.
    """
    forest_grid = dict(
        n_estimators=[50, 100, 200],
        max_depth=[10, 15, 20, None],
        min_samples_split=[2, 5, 10],
        min_samples_leaf=[1, 2, 4],
        bootstrap=[True, False],
    )

    # Logistic regression needs one sub-grid per solver because the supported
    # penalties differ.
    logistic_grids = [
        dict(
            C=[0.01, 0.1, 1, 10, 100],
            penalty=['l1', 'l2'],
            solver=['liblinear'],
            max_iter=[5000, 7000, 10000],
        ),
        # 'saga' solver: supports 'l1', 'l2' and 'elasticnet' penalties
        dict(
            C=[0.01, 0.1, 1, 10, 100],
            penalty=['l1', 'l2', 'elasticnet'],
            solver=['saga'],
            l1_ratio=[0.5],  # Only relevant for 'elasticnet'
            max_iter=[5000, 7000, 10000],
        ),
    ]

    mlp_grid = dict(
        hidden_layer_sizes=[(64,), (128,), (128, 64), (128, 64, 32)],
        activation=['tanh', 'relu'],
        solver=['adam', 'sgd'],
        alpha=[0.0001, 0.001, 0.01],
        learning_rate=['constant', 'adaptive'],
        learning_rate_init=[0.001, 0.01, 0.1],
        max_iter=[300, 500, 1000],
    )

    xgboost_grid = dict(
        n_estimators=[100, 200, 300],
        learning_rate=[0.01, 0.05, 0.1],
        max_depth=[3, 6, 9],
        subsample=[0.7, 0.8, 1.0],
        colsample_bytree=[0.7, 0.8, 1.0],
        gamma=[0, 0.1, 0.3],
        min_child_weight=[1, 3, 5],
    )

    svm_grid = dict(
        C=[0.1, 1, 10, 100],
        kernel=['linear', 'rbf', 'poly', 'sigmoid'],
        gamma=['scale', 'auto'],
    )

    grids_by_classifier = {
        'random_forest': forest_grid,
        'logistic': logistic_grids,
        'mlp': mlp_grid,
        'xgboost': xgboost_grid,
        'svm': svm_grid,
    }
    selected_grid = grids_by_classifier[classifier_name]

    # Search on the pooled train + validation data.
    best_clf, best_params = find_best_classifier(
        X_train, y_train, X_val, y_val, classifier_name, selected_grid
    )

    # Score the winning estimator on the test patients.
    print("Test metrics:")
    test_with_voting(best_clf, X_test, y_test, voting_type)

    return best_params

In [9]:
# Launch one experiment: the classifier and the voting rule are picked by
# position from the two lists below.
classificatori = ['random_forest', 'mlp', 'svm', 'xgboost', 'logistic']
voting = ['majority', 'mean']

best_params = run_experiment(
    x_train_expanded, y_train_expanded,
    final_patients_val, y_val,
    final_patients_test, y_test,
    classificatori[2],  # 'svm'
    voting[1],          # 'mean'
)

Fitting 3 folds for each of 32 candidates, totalling 96 fits


KeyboardInterrupt: 