## IMPORTS and UTILS

In [1]:
# Debug output only — presumably a quick check that the notebook kernel runs.
print("hi")

import os
import pandas as pd
import subprocess
import re
import numpy as np
import json
import csv

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

import matplotlib.pyplot as plt

from pptx import Presentation
from pptx.util import Inches
import os

from scripts import combat_info
from scripts import combat_quick_apply
from scripts import combat_quick_QC


# Data file passed as the first positional argument to the combat_* scripts
# and to combat_quick_QC.QC.
CAMCAN = "./DONNES/CamCAN.md.raw.csv.gz"
# Data file that is split into train/test sets and used as the comparison
# baseline in compare_with_compilation().
COMPILATION = "./DONNES/adni_compilation.csv.gz"

# Root folder under which generate_sites() writes the synthetic sites.
SYNTHETIC_SITES = "ROBUST/SYNTHETIC_SITES"

# Root folder of the harmonization results (one sub-folder per robust method).
MAINFOLDER = "ROBUST"

# NOTE(review): not referenced in the visible part of the file.
RAWFOLDER = "RAW"

# Sub-folder (inside MAINFOLDER/<robust_method>) receiving analysis outputs.
ANALYSISFOLDER = "ANALYSIS"

hi


In [2]:
def get_info(mov_data_file):
    """Return subject counts for a site data file.

    The counts are read from the structure returned by ``combat_info.info``:
    the number of healthy controls is parsed from the ``HC(n=<count>`` part of
    its ``["DetailInfos"]["Disease"]`` text, and the total comes from
    ``["DetailInfos"]["Number of Subject"]``.

    Parameters:
    mov_data_file (str): Path of the data file handed to ``combat_info.info``.

    Returns:
    list: ``[nb_total, nb_hc, nb_sick]`` where ``nb_sick = nb_total - nb_hc``.

    Raises:
    IndexError: If the disease text contains no ``HC(n=<count>`` entry.
    """
    # The second returned element (bundles) is not needed here.
    details, _bundles = combat_info.info(mov_data_file)
    detail_infos = details["DetailInfos"]
    # Raw string: "\(" and "\d" are invalid escapes in a regular literal and
    # trigger a warning on recent Python versions.
    nb_hc = int(re.findall(r'HC\(n=(\d+)', detail_infos["Disease"])[0])
    nb_total = detail_infos["Number of Subject"]
    nb_sick = nb_total - nb_hc
    return [nb_total, nb_hc, nb_sick]

In [3]:
def get_bundles(mov_data_file):
    """Return whatever ``combat_info.get_bundles`` reports for *mov_data_file*."""
    bundles = combat_info.get_bundles(mov_data_file)
    return bundles

In [4]:
def robust_text(x):
    """Return the file-name label for a robust setting.

    ``'No'`` is mapped to ``"NoRobust"``; any other value is returned as is.
    """
    if x == 'No':
        return "NoRobust"
    return x

def rwp_text(x):
    """Return ``"RWP"`` when *x* is truthy, ``"NoRWP"`` otherwise."""
    if x:
        return "RWP"
    return "NoRWP"
def get_site(mov_data_file):
    """Return the first site label found in a data CSV.

    Parameters:
    mov_data_file (str): Path to a CSV file that has a ``site`` column.

    Returns:
    The first distinct value of the ``site`` column.

    Raises:
    IndexError: If the file has no data rows.
    """
    # Only the 'site' column is needed, so skip parsing all the other columns.
    site_column = pd.read_csv(mov_data_file, usecols=["site"])["site"]
    return site_column.unique()[0]


In [5]:
def add_nb_patients_and_diseased(df):
    """Add patient-count columns parsed from the ``site`` label, in place.

    The ``site`` values are expected to contain ``<N>_patients`` and
    ``<P>_percent``. Three columns are written to *df*:
    ``num_patients`` (N), ``disease_ratio`` (P, as an integer percentage) and
    ``num_diseased`` (N * P / 100, truncated to an integer).

    Parameters:
    df (pd.DataFrame): Frame with a string ``site`` column; modified in place.

    Returns:
    pd.DataFrame: The same *df* object.
    """
    sites = df['site']
    patient_counts = sites.str.extract(r'(\d+)_patients')[0]
    df['num_patients'] = patient_counts.astype(int)
    percentages = sites.str.extract(r'(\d+)_percent')[0]
    df['disease_ratio'] = percentages.astype(int)
    diseased = df['num_patients'] * df['disease_ratio']/100
    df['num_diseased'] = diseased.astype(int)
    return df

In [6]:
def scatter(df1,df2, title, bundle='mni_MCP'):
    """Show an age-vs-mean scatter plot of one bundle for two data sets.

    Parameters:
    df1 (pd.DataFrame): Plotted in green with the legend label 'Train'.
    df2 (pd.DataFrame): Plotted in red with the legend label 'Test'.
    title (str): Figure title.
    bundle (str): Value of the ``bundle`` column to keep in both frames.
    """
    # Filter both frames before opening the figure.
    series = [
        (df1[df1['bundle'] == bundle], 'Train', 'green'),
        (df2[df2['bundle'] == bundle], 'Test', 'red'),
    ]

    plt.figure(figsize=(10, 5))
    for subset, label, color in series:
        plt.scatter(subset['age'], subset['mean'], label=label, alpha=0.5, color=color)
    plt.xlabel('Age')
    plt.ylabel('Mean')
    plt.title(title)
    plt.legend()
    plt.show()

## SITE GENERATION

In [7]:
def split_train_test(file_path, test_size=0.2, random_state=None):
    """
    Split a CSV file into train and test DataFrames at the subject level.

    One row per ``sid`` is built (first non-null value of every column) and
    those subjects are split with ``train_test_split``, stratified on their
    ``disease`` value. Every row of a given ``sid`` therefore lands in the
    same output frame.

    Parameters:
    file_path (str): The path to the CSV file to split.
    test_size (float): The proportion of subjects to put in the test split.
    random_state (int): Random seed for reproducibility.

    Returns:
    pd.DataFrame: Training set.
    pd.DataFrame: Testing set.
    """
    data = pd.read_csv(file_path)

    # One representative row per subject, used only to drive the split.
    subjects = data.groupby('sid').first().reset_index()

    train_subjects, test_subjects = train_test_split(
        subjects,
        test_size=test_size,
        random_state=random_state,
        stratify=subjects['disease'],
    )

    # Map the subject-level split back onto the full table.
    in_train = data['sid'].isin(train_subjects['sid'])
    in_test = data['sid'].isin(test_subjects['sid'])
    return data[in_train], data[in_test]

In [8]:
def sample_patients(df, num_patients, disease_ratio, index, random_state=None):
    """Draw a random set of patients with a given share of diseased subjects.

    Patients are drawn by ``sid``; every row of a selected patient is kept.
    The previous ``groupby('sid').sample(frac=1).head(...)`` only shuffled
    rows inside each patient and then kept the first patients in key order,
    so the same patients were returned on every call.

    Parameters:
    df (pd.DataFrame): Rows with at least ``sid`` and ``disease`` columns;
        ``disease == 'HC'`` marks healthy patients.
    num_patients (int): Total number of patients to draw.
    disease_ratio (float): Fraction of diseased patients (0 to 1).
    index: Repetition identifier appended to the generated site label.
    random_state: Optional seed or random state forwarded to
        ``pd.Series.sample``. ``None`` uses NumPy's global random state.

    Returns:
    pd.DataFrame: Rows of the selected patients, with ``site`` set to
    ``"<num_patients>_patients_<percent>_percent_<index>"``.

    Raises:
    ValueError: If there are not enough healthy or diseased patients.
    """
    # Number of diseased and healthy patients to draw.
    num_diseased = int(num_patients * disease_ratio)
    num_healthy = num_patients - num_diseased

    # Split rows into healthy (HC) and diseased.
    healthy_patients = df[df['disease'] == 'HC']
    diseased_patients = df[df['disease'] != 'HC']

    healthy_sids = pd.Series(healthy_patients['sid'].unique())
    diseased_sids = pd.Series(diseased_patients['sid'].unique())

    # Make sure each category has enough distinct patients.
    if len(healthy_sids) < num_healthy or len(diseased_sids) < num_diseased:
        raise ValueError("Nombre insuffisant de patients en santé ou malades pour l'échantillon demandé.")

    def pick(sids, count):
        # Nothing to draw: return an empty selection without sampling.
        if count == 0:
            return sids.iloc[:0]
        return sids.sample(n=count, random_state=random_state)

    chosen_healthy = pick(healthy_sids, num_healthy)
    chosen_diseased = pick(diseased_sids, num_diseased)

    # Keep every row of the selected patients.
    sampled_df = pd.concat([
        healthy_patients[healthy_patients['sid'].isin(chosen_healthy)],
        diseased_patients[diseased_patients['sid'].isin(chosen_diseased)],
    ])
    # Label all rows with the synthetic site name.
    sampled_df['site'] = f"{num_patients}_patients_{int(disease_ratio*100)}_percent_{index}"

    return sampled_df

In [9]:
def generate_biaised_data(df1, df2, 
                additive_uniform_low=-3, additive_uniform_high=3, 
                multiplicative_uniform_low=0.5, multiplicative_uniform_high=2, 
                additive_std_low=0.01, additive_std_high=0.1, 
                multiplicative_std_low=0.01, multiplicative_std_high=0.1):
    """
    Draw per-bundle additive and multiplicative biases and apply them to df1 and df2.

    One bias mean and one bias standard deviation are drawn from uniform
    distributions for each bias type. For every bundle of df1, an additive and
    a multiplicative bias are then drawn from normal distributions with those
    parameters. The two frames are concatenated, biased with ``apply_bias``
    and split again by ``sid``.

    Parameters:
    - df1, df2 (pd.DataFrame): Frames to bias; biases are generated for the
      bundles present in df1.
    - additive_uniform_low, additive_uniform_high: range of the additive bias mean.
    - multiplicative_uniform_low, multiplicative_uniform_high: range of the
      multiplicative bias mean.
    - additive_std_low, additive_std_high: range of the additive bias standard deviation.
    - multiplicative_std_low, multiplicative_std_high: range of the
      multiplicative bias standard deviation.

    Returns:
    - tuple: (biased df1 rows, biased df2 rows, additive bias per bundle,
      multiplicative bias per bundle, dict of the drawn means and standard deviations).
    """
    # Shared distribution parameters: means first, then standard deviations.
    additive_mean = np.random.uniform(low=additive_uniform_low, high=additive_uniform_high)
    multiplicative_mean = np.random.uniform(low=multiplicative_uniform_low, high=multiplicative_uniform_high)
    additive_std = np.random.uniform(low=additive_std_low, high=additive_std_high)
    multiplicative_std = np.random.uniform(low=multiplicative_std_low, high=multiplicative_std_high)

    # Per-bundle biases; the two draws are interleaved bundle by bundle.
    gammas, deltas = {}, {}
    for bundle_name in df1['bundle'].unique():
        gammas[bundle_name] = np.random.normal(loc=additive_mean, scale=additive_std)
        deltas[bundle_name] = np.random.normal(loc=multiplicative_mean, scale=multiplicative_std)

    # Bias both frames together, then separate them again by subject id.
    biased_all = apply_bias(pd.concat([df1, df2], ignore_index=True), gammas, deltas)
    biased_first = biased_all[biased_all['sid'].isin(df1['sid'])]
    biased_second = biased_all[biased_all['sid'].isin(df2['sid'])]

    bias_parameters = {
        'additive_mean': additive_mean,
        'multiplicative_mean': multiplicative_mean,
        'additive_std': additive_std,
        'multiplicative_std': multiplicative_std
    }
    return biased_first, biased_second, gammas, deltas, bias_parameters

def apply_bias(dataframe, additive_bias_per_bundle, multiplicative_bias_per_bundle):
    """Return a copy of *dataframe* whose ``mean`` column is biased per bundle.

    For each bundle, ``mean`` is regressed on ``age``, ``sex`` and
    ``handedness``. The residuals are multiplied by the bundle's
    multiplicative bias and shifted by the additive bias times the residual
    standard deviation, then the regression prediction is added back.

    Parameters:
    dataframe (pd.DataFrame): Rows with ``bundle``, ``age``, ``sex``,
        ``handedness`` and ``mean`` columns.
    additive_bias_per_bundle (dict): Additive bias keyed by bundle.
    multiplicative_bias_per_bundle (dict): Multiplicative bias keyed by bundle.

    Returns:
    pd.DataFrame: Biased copy; the input frame is left untouched.

    Raises:
    KeyError: If a bundle of *dataframe* is missing from a bias dictionary.
    """
    result = dataframe.copy()

    for bundle_name in result['bundle'].unique():
        in_bundle = result['bundle'] == bundle_name
        rows = result[in_bundle]

        # Covariate effects are estimated per bundle.
        covariates = rows[['age', 'sex', 'handedness']]
        observed = rows['mean']
        regression = LinearRegression().fit(covariates, observed)
        fitted = regression.predict(covariates)
        residuals = observed - fitted

        shift = additive_bias_per_bundle[bundle_name] * np.std(residuals)
        scale = multiplicative_bias_per_bundle[bundle_name]

        # Bias the residuals, then restore the covariate effects.
        result.loc[in_bundle, 'mean'] = residuals * scale + shift + fitted

    return result


In [10]:
#GENERATE SITES
def generate_sites(sample_sizes, disease_ratios, num_tests, SYNTHETIC_SITES_VERSION):
    """Generate biased synthetic train/test sites and their visualizations.

    COMPILATION is split once into train and test sets (fixed seed 42). For
    every (sample size, disease ratio, repetition), a new bias is drawn and
    applied to both sets, and patients are sampled from the biased train set.
    The outputs are written under
    ``SYNTHETIC_SITES/<version>/<size>_<percent>/<repetition>/``:
    the train CSV, the full biased test CSV, ``parameters.json`` with the
    drawn biases, and ``VIZ`` / ``VIZ_TEST`` from the visualization script.

    Parameters:
    sample_sizes (iterable of int): Numbers of patients per synthetic site.
    disease_ratios (iterable of float): Fractions of diseased patients.
    num_tests (int): Number of repetitions per combination.
    SYNTHETIC_SITES_VERSION (str): Sub-folder name under SYNTHETIC_SITES.
    """
    directory = os.path.join(SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION)
    train_df, test_df = split_train_test(COMPILATION, test_size=0.2, random_state=42)

    for sample_size in sample_sizes:
        for disease_ratio in disease_ratios:
            tag = f"{sample_size}_{int(disease_ratio*100)}"
            size_dir = os.path.join(directory, tag)
            for i in range(num_tests):
                run_dir = os.path.join(size_dir, f"{i}")
                os.makedirs(run_dir, exist_ok=True)

                # A new bias is drawn for every synthetic site.
                train_biased, test_biased, gammas, deltas, parameters = generate_biaised_data(train_df, test_df)
                sampled_train = sample_patients(train_biased, sample_size, disease_ratio, i)

                train_file = os.path.join(run_dir, f"train_{tag}_{i}.csv")
                sampled_train.to_csv(train_file, index=False)

                test_file = os.path.join(run_dir, f"test_{tag}_{i}.csv")
                test_biased.to_csv(test_file, index=False)

                # Keep the drawn biases next to the data they produced.
                with open(os.path.join(run_dir, 'parameters.json'), 'w') as file:
                    json.dump({'parameters': parameters, 'gammas': gammas, 'deltas': deltas}, file, indent=4)

                _run_visualize_data(train_file, os.path.join(run_dir, "VIZ"))
                _run_visualize_data(test_file, os.path.join(run_dir, "VIZ_TEST"))


def _run_visualize_data(data_file, out_dir):
    """Run scripts/combat_visualize_data.py for *data_file* and return its exit code."""
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    return_code = subprocess.call([
        "scripts/combat_visualize_data.py",
        COMPILATION,
        data_file,
        "--out_dir", out_dir,
        "-f",
        "--bundles", "all",
    ])
    if return_code != 0:
        print(f"WARNING: combat_visualize_data.py exited with code {return_code} for {data_file}")
    return return_code

## HARMONIZATION

In [11]:
def fit(mov_data_file, robust, rwp, directory, hc,):
    """Fit a harmonization model by running scripts/combat_quick_fit.py.

    The script receives CAMCAN and *mov_data_file* as its two positional
    arguments. The model file name is
    ``<site>.<metric>.<method>.<robust label>.<rwp label>.model.csv``, where
    ``metric`` and ``method`` are module-level globals that must be defined
    before calling.

    Parameters:
    mov_data_file (str): CSV of the site to fit; its first ``site`` value
        names the model.
    robust (str): Value passed to ``--robust`` (``"No"`` gives the label
        ``NoRobust``).
    rwp (bool): Adds ``--rwp`` when truthy.
    directory (str): Value passed to ``--out_dir``.
    hc (bool): Adds ``--hc`` when truthy.

    Returns:
    str: The model file name, without the directory.
    """
    output_model_filename = ".".join([
        get_site(mov_data_file),
        metric,
        method,
        robust_text(robust),
        rwp_text(rwp),
    ]) + ".model.csv"

    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    cmd = [
        "scripts/combat_quick_fit.py",
        CAMCAN,
        mov_data_file,
        "--out_dir", directory,
        "--output_model_filename", output_model_filename,
        "--method", method,
        "--robust", robust,
        "-f",
    ]
    if rwp:
        cmd.append("--rwp")
    if hc:
        cmd.append("--hc")

    return_code = subprocess.call(cmd)
    if return_code != 0:
        print(f"WARNING: combat_quick_fit.py exited with code {return_code} for {mov_data_file}")
    return output_model_filename

In [12]:
def apply(mov_data_file, model_filename, robust, rwp, directory):
    """Apply a fitted model to a data file with ``combat_quick_apply.apply``.

    The output is written to
    ``<directory>/<site>.<metric>.<method>.<robust label>.<rwp label>.csv``,
    where ``metric`` and ``method`` are module-level globals.

    Returns:
    str: Path of the output file.
    """
    name_parts = [
        get_site(mov_data_file),
        metric,
        method,
        robust_text(robust),
        rwp_text(rwp),
    ]
    output_filename = os.path.join(directory, ".".join(name_parts) + ".csv")
    combat_quick_apply.apply(mov_data_file, model_filename, output_filename)
    return output_filename

In [13]:
def visualize_harmonization(f, new_f, directory):
    """Run scripts/combat_visualize_harmonization.py on a before/after pair.

    Parameters:
    f (str): Data file before harmonization.
    new_f (str): Data file after harmonization.
    directory (str): Value passed to ``--out_dir``.

    Returns:
    None
    """
    # An argument list (no shell) keeps paths with spaces or shell
    # metacharacters intact.
    cmd = [
        "scripts/combat_visualize_harmonization.py",
        CAMCAN,
        f,
        new_f,
        "--out_dir", directory,
        # "--bundles", "all",  # left disabled, as in the original command
        "-f",
    ]
    return_code = subprocess.call(cmd)
    if return_code != 0:
        print(f"WARNING: combat_visualize_harmonization.py exited with code {return_code} for {new_f}")

In [14]:
def QC(output_filename, output_model_filename):
    """Return the result of ``combat_quick_QC.QC`` for CAMCAN, the harmonized file and the model."""
    qc_result = combat_quick_QC.QC(CAMCAN, output_filename, output_model_filename)
    return qc_result

In [15]:
def compare_with_compilation(df):
    """Mean absolute difference of ``mean`` between *df* and COMPILATION, per bundle.

    Rows are matched on ``(sid, bundle)``. All bundles are merged at once and
    averaged with a group-by. The previous column-by-column assignment
    aligned every bundle on the first bundle's row index, which truncated or
    NaN-padded bundles with a different number of matched rows.

    Parameters:
    df (pd.DataFrame): Rows with ``sid``, ``bundle`` and ``mean`` columns.

    Returns:
    pd.DataFrame: One row, one column per bundle of *df* (in order of first
    appearance). A bundle with no matching row in COMPILATION gets NaN.
    """
    compilation_df = pd.read_csv(COMPILATION)
    bundles = df['bundle'].unique()

    # The inner merge keeps only (sid, bundle) pairs present in both tables.
    merged = pd.merge(
        df[['sid', 'bundle', 'mean']],
        compilation_df[['sid', 'bundle', 'mean']],
        on=['sid', 'bundle'],
        suffixes=('_df', '_compilation'),
    )
    abs_diff = (merged['mean_df'] - merged['mean_compilation']).abs()

    # reindex restores bundles that had no match (as NaN) and keeps df's order.
    mean_abs_diff = abs_diff.groupby(merged['bundle'], sort=False).mean().reindex(bundles)

    return pd.DataFrame([mean_abs_diff.to_numpy()], columns=list(bundles))

In [16]:
def create_presentation(directory):
    """Build a PowerPoint comparing the four harmonization outputs image by image.

    The PNG files of ``<directory>/hc`` whose name contains the module-level
    ``method`` string define the slides. Each slide shows the same-named image
    from ``hc``, ``NoRobust``, ``robust`` and ``robust_rwp`` in a 2x2 grid,
    each with its sub-folder name as caption. The result is saved as
    ``<directory>/harmonization_results.pptx``.
    """
    prs = Presentation()

    subdirs = ["hc", "NoRobust", "robust", "robust_rwp"]
    # The first sub-folder decides which images get a slide.
    reference_dir = os.path.join(directory, subdirs[0])
    images = [name for name in os.listdir(reference_dir)
              if method in name and name.endswith('.png')]

    for img in images:
        # Layout index 5 of the default template.
        slide = prs.slides.add_slide(prs.slide_layouts[5])

        for position, subdir in enumerate(subdirs):
            # Two columns, two rows.
            row, column = divmod(position, 2)
            left = Inches(0.5 + column * 4.5)
            top = Inches(0.2 + row * 3.5)

            # Caption above the picture.
            caption = slide.shapes.add_textbox(left, top, width=Inches(4), height=Inches(0.5))
            caption.text_frame.text = subdir

            picture_path = os.path.join(directory, subdir, img)
            slide.shapes.add_picture(picture_path, left, top + Inches(0.5), width=Inches(4))

    prs.save(os.path.join(directory, 'harmonization_results.pptx'))


In [17]:
def compare_distances(directory, site, hc_dists, no_robust_dists, robust_dists, robust_rwp_dists):
    """Summarize how three harmonizations compare with the non-robust one.

    Each distance list is converted to a percentage change relative to
    *no_robust_dists*. For each comparison, the summary holds the count, mean
    and median of the negative and of the non-negative changes, plus the
    overall mean. It is written to ``<directory>/<site>_comparison_results.csv``.

    Returns:
    pd.DataFrame: One row per comparison.
    """
    def percent_change(dists):
        return (np.array(dists) - np.array(no_robust_dists))/np.array(no_robust_dists)*100

    comparison_results = {
        "hc_vs_no_robust": percent_change(hc_dists),
        "robust_vs_no_robust": percent_change(robust_dists),
        "robust_rwp_vs_no_robust": percent_change(robust_rwp_dists)
    }
    df = pd.DataFrame(comparison_results)

    results = []
    for comparison in comparison_results:
        values = df[comparison]
        negatives = values[values < 0]
        positives = values[values >= 0]
        has_negatives = len(negatives) > 0
        has_positives = len(positives) > 0

        # Empty groups report 0 instead of NaN.
        results.append({
            "site": site,
            "comparaison": comparison,
            "Nb comp. nég.": len(negatives),
            "Nb comp. pos.": len(positives),
            "Moy. tot.": values.mean(),
            "Moy. val. nég.": negatives.mean() if has_negatives else 0,
            "Moy. val. pos.": positives.mean() if has_positives else 0,
            "Méd. val. nég.": negatives.median() if has_negatives else 0,
            "Méd. val. pos.": positives.median() if has_positives else 0
        })

    results_df = pd.DataFrame(results)
    results_df.to_csv(os.path.join(directory, f"{site}_comparison_results.csv"), index=False)
    return results_df


In [18]:
def harmonize(f_train, f_test, directory, robust, rwp,hc):
    """Fit on the train file, apply to the test file and collect the results.

    Steps: ``fit`` -> ``apply`` -> ``QC`` -> ``visualize_harmonization`` ->
    ``compare_with_compilation``. The site label of *f_train* is read once and
    reused everywhere it is needed.

    Parameters:
    f_train (str): CSV used to fit the model.
    f_test (str): CSV to harmonize with the fitted model.
    directory (str): Output folder (created if missing).
    robust (str): Robust setting; ``"No"`` disables the metrics/outliers loading.
    rwp (bool): Forwarded to ``fit`` and used in file names.
    hc (bool): Forwarded to ``fit``.

    Returns:
    list: ``[dists_df, mea, metrics, outliers]``. ``dists_df`` is the one-row
    frame of QC distances per bundle, ``mea`` is the one-row frame from
    ``compare_with_compilation``; both carry a ``site`` column. ``metrics``
    and ``outliers`` are loaded from
    ``metrics_<site>_<robust>_<rwp>.csv`` / ``outliers_<site>_<robust>_<rwp>.csv``
    in *directory* when ``robust != "No"``, and are ``None`` otherwise.
    """
    os.makedirs(directory, exist_ok=True)
    print(f_train)

    # Fit the model, then apply it to the test file.
    output_model_filename = os.path.join(directory, fit(f_train, robust, rwp, directory, hc))
    output_filename = apply(f_test, output_model_filename, robust, rwp, directory)

    # Quality control distances, one column per bundle.
    dists, bundle_names = QC(output_filename, output_model_filename)

    # Read the training CSV once instead of once per use.
    site = get_site(f_train)

    dists_df = pd.DataFrame([dists], columns=bundle_names)
    dists_df['site'] = site

    visualize_harmonization(f_test, output_filename, directory)

    mea = compare_with_compilation(pd.read_csv(output_filename))
    mea['site'] = site

    if robust == "No":
        return [dists_df, mea, None, None]

    # NOTE(review): these files are presumably written by the fit script in
    # robust mode — confirm.
    suffix = f"{site}_{robust_text(robust)}_{rwp_text(rwp)}.csv"
    loaded_metrics = pd.read_csv(os.path.join(directory, f"metrics_{suffix}"), index_col=0)
    loaded_outliers_df = pd.read_csv(os.path.join(directory, f"outliers_{suffix}"), index_col=0)

    return [dists_df, mea, loaded_metrics, loaded_outliers_df]

In [19]:
def analyse_site(f_train,f_test, robust, directory):
    """Run the four harmonization variants for one site and combine their results.

    The variants are run in this order, each in its own sub-folder of
    *directory*: ``hc`` (no robust, hc flag), ``NoRobust``, ``robust`` and
    ``robust_rwp``. A comparison presentation is then created.

    Returns:
    tuple: (distances of the four variants, MEA of the four variants, metrics
    of the ``robust`` variant, outliers of the ``robust`` variant). Both
    combined frames carry a ``method`` column.
    """
    # (sub-folder, robust, rwp, hc) for each variant.
    variants = [
        ("hc", "No", False, True),
        ("NoRobust", "No", False, False),
        ("robust", robust, False, False),
        ("robust_rwp", robust, True, False),
    ]
    runs = [
        harmonize(f_train, f_test, os.path.join(directory, subdir), robust_arg, rwp, hc)
        for subdir, robust_arg, rwp, hc in variants
    ]

    create_presentation(directory)

    method_labels = ['hc', 'no_robust', 'robust', 'robust_rwp']

    # One row per variant, in the order they were run.
    distances_combined = pd.concat([run[0] for run in runs], ignore_index=True)
    distances_combined['method'] = list(method_labels)

    mea_combined = pd.concat([run[1] for run in runs], ignore_index=True)
    mea_combined['method'] = list(method_labels)

    # TODO: bundles and outlier analysis
    robust_run = runs[2]
    return distances_combined, mea_combined, robust_run[2], robust_run[3]

In [None]:
    
#Analyse Method
def analyse_method(sample_sizes, disease_ratios, num_tests, robust_method, SYNTHETIC_SITES_VERSION):
    """Harmonize every generated synthetic site and compile the results.

    For each (sample size, disease ratio, repetition), the train and test
    CSVs are read from ``SYNTHETIC_SITES/<version>/<size>_<percent>/<i>/``
    and copied into ``MAINFOLDER/<robust_method>/<size>_<percent>/<i>/``,
    where ``analyse_site`` writes its outputs. The per-site results are
    concatenated once at the end and saved as ``metrics_compilation.csv``,
    ``dists_compilation.csv``, ``mea_compilation.csv`` and
    ``outliers_compilation.csv`` in ``MAINFOLDER/<robust_method>``.

    Parameters:
    sample_sizes (iterable of int): Numbers of patients per synthetic site.
    disease_ratios (iterable of float): Fractions of diseased patients.
    num_tests (int): Number of repetitions per combination.
    robust_method (str): Robust setting; also names the results folder.
    SYNTHETIC_SITES_VERSION (str): Sub-folder name under SYNTHETIC_SITES.
    """
    directory = os.path.join(MAINFOLDER, robust_method)
    directory_site = os.path.join(SYNTHETIC_SITES, SYNTHETIC_SITES_VERSION)

    # Per-site results, concatenated once at the end (insertion order is the
    # order in which the files are written).
    collected = {"metrics": [], "dists": [], "mea": [], "outliers": []}

    for sample_size in sample_sizes:
        for disease_ratio in disease_ratios:
            tag = f"{sample_size}_{int(disease_ratio*100)}"
            for i in range(num_tests):
                run_dir = os.path.join(directory, tag, f"{i}")
                site_dir = os.path.join(directory_site, tag, f"{i}")
                os.makedirs(run_dir, exist_ok=True)

                train_file_name = f"train_{tag}_{i}.csv"
                test_file_name = f"test_{tag}_{i}.csv"

                # Copy the generated site files next to the results.
                train_file = os.path.join(site_dir, train_file_name)
                pd.read_csv(train_file).to_csv(os.path.join(run_dir, train_file_name), index=False)

                test_file = os.path.join(site_dir, test_file_name)
                pd.read_csv(test_file).to_csv(os.path.join(run_dir, test_file_name), index=False)

                dists_analyze, mea_analyze, metrics, outliers = analyse_site(
                    train_file, test_file, robust_method, run_dir)
                collected["metrics"].append(metrics)
                collected["dists"].append(dists_analyze)
                collected["mea"].append(mea_analyze)
                collected["outliers"].append(outliers)

    for name, frames in collected.items():
        # metrics/outliers are None when no robust method is used.
        frames = [frame for frame in frames if frame is not None]
        compiled = pd.concat(frames) if frames else pd.DataFrame()
        compiled.to_csv(os.path.join(directory, f"{name}_compilation.csv"), index=False)

## EXECUTOR

In [21]:
# Run configuration used by the cells below.
site_group = 'ADNI'
robust_method = 'IQR'
# `metric` and `method` are read as module-level globals by fit(), apply()
# and create_presentation().
metric = "md"
method= "classic"

SYNTHETIC_SITES_VERSION = "v1"


sample_sizes = [30, 50, 100, 150, 200,300]  # Different sample sizes
disease_ratios = [0.1, 0.3, 0.5, 0.7]  # Different fractions of diseased patients
#sample_sizes = [150, 300]  # Different sample sizes
#disease_ratios = [0.3]  # Different fractions of diseased patients
num_tests = 10  # Number of tests to run for each combination

#generate_sites(sample_sizes, disease_ratios, num_tests, SYNTHETIC_SITES_VERSION)

#analyse_method(sample_sizes, disease_ratios, num_tests, robust_method, SYNTHETIC_SITES_VERSION)

## ANALYSIS

In [22]:
# Mean per site
# Nothing really interesting so far
directory = os.path.join(MAINFOLDER, robust_method)
dists_compilation, metrics_compilation = (
    pd.read_csv(os.path.join(directory, compilation_name))
    for compilation_name in ("dists_compilation.csv", "metrics_compilation.csv")
)
# From here on, `directory` is the analysis output folder.
directory = os.path.join(directory, ANALYSISFOLDER)
os.makedirs(directory, exist_ok=True)

# Drop the trailing "_<index>" of the site label so repetitions share one label.
for compilation in (dists_compilation, metrics_compilation):
    compilation['site'] = compilation['site'].str.rsplit('_', n=1).str[0]

# Means by site
dists_means_by_site = dists_compilation.groupby(['site','method']).mean().reset_index()
metrics_means_by_site = metrics_compilation.groupby(['site', 'metric']).mean().reset_index()

for averaged, output_name in (
    (metrics_means_by_site, "metrics_compilation_mean.csv"),
    (dists_means_by_site, "dists_compilation_mean.csv"),
):
    averaged.to_csv(os.path.join(directory, output_name), index=False)
print("FINI")

FINI


In [None]:
# ANALYZE BEST BUNDLES for F1, precision etc
def calculate_precision_by_bundle(df):
    """
    Average every bundle column per metric.

    Parameters:
    df (pd.DataFrame): Frame with a 'site' column, a 'metric' column and one
        numeric column per bundle. The columns 'num_patients',
        'disease_ratio' and 'num_diseased' are added to it in place.

    Returns:
    pd.DataFrame: One row per bundle (index) and one column per distinct
    'metric' value, holding the mean of that bundle for that metric.
    """
    # Kept for compatibility: it adds the three count columns to the caller's
    # frame; they are excluded from the averages below.
    df = add_nb_patients_and_diseased(df)

    non_bundle_columns = ['site', 'num_patients', 'disease_ratio', 'num_diseased']
    bundle_values = df.drop(columns=non_bundle_columns)

    # One group-by over all bundle columns replaces the per-column group-by
    # and repeated concatenation.
    return bundle_values.groupby('metric').mean().T
# Example usage: rank the bundles by their mean 'precision' and save the table.
precision_df = calculate_precision_by_bundle(pd.read_csv(os.path.join(MAINFOLDER, robust_method, "metrics_compilation.csv")))
precision_df = precision_df.sort_values(by='precision', ascending=False)
# The index holds the bundle names; writing it keeps each row identifiable
# (index=False dropped it).
precision_df.to_csv(os.path.join(directory, "metrics_per_bundle.csv"), index_label="bundle")

In [None]:
# Show the five rows with the highest 'precision' value.
precision_df = precision_df.sort_values(by='precision', ascending=False)
precision_df.head()

In [None]:
# Show the five rows with the highest 'f1_score' value.
precision_df = precision_df.sort_values(by='f1_score', ascending=False)
precision_df.head()

In [None]:
# COUNT BUNDLES PER OUTLIERS
def count_bundles_per_outliers(df):
    """
    Distribution of the number of rows per subject, for sick and healthy subjects.

    Rows are counted per ('sid', 'site', 'is_malade') combination. For each
    group (``is_malade == 1`` and ``is_malade == 0``), the function computes
    the percentage of combinations having each row count.

    Parameters:
    df (pd.DataFrame): Frame containing 'sid', 'site' and 'is_malade' columns.

    Returns:
    pd.DataFrame: Columns 'count_bundle', 'prct_occurence_sick' and
    'prct_occurence_healthy', sorted by 'count_bundle'. A count seen in only
    one group is kept, with 0 for the other group.
    """
    # Number of rows for each (sid, site, is_malade) combination.
    sid_counts = df.groupby(['sid', 'site', 'is_malade']).size().reset_index(name='count_bundle')

    def occurrence_percentages(group_counts):
        # Share (in %) of the combinations having each row count.
        counts = group_counts.groupby(['count_bundle']).size().reset_index(name='prct_occurence')
        counts['prct_occurence'] = counts['prct_occurence']/counts['prct_occurence'].sum()*100
        return counts

    sick_counts = occurrence_percentages(sid_counts[sid_counts['is_malade'] == 1])
    healthy_counts = occurrence_percentages(sid_counts[sid_counts['is_malade'] == 0])

    # Outer join: an inner join would silently drop the counts that appear in
    # only one of the two groups.
    total = pd.merge(sick_counts, healthy_counts, on=['count_bundle'], how='outer',
                     suffixes=('_sick', '_healthy'))
    percentage_columns = ['prct_occurence_sick', 'prct_occurence_healthy']
    total[percentage_columns] = total[percentage_columns].fillna(0)

    return total.sort_values('count_bundle').reset_index(drop=True)

# Example usage: load the compiled outliers and display the first ten rows of the summary.
bundles_per_outliers = count_bundles_per_outliers(pd.read_csv(os.path.join(MAINFOLDER, robust_method, "outliers_compilation.csv")))
bundles_per_outliers.head(10)

In [None]:
# CREATION BOX PLOT POUR DISTANCES
def plot_bundle(df, prct, directory):
    """
    Save one box-plot figure per bundle column for a given disease ratio.

    The X axis lists the distinct numbers of patients; for each of them, one
    box per method ("hc", "no_robust", "robust", "robust_rwp") shows the
    values of the bundle column. Figures are written to
    ``<directory>/DISTANCES_PLOTS/<prct>/<bundle>_boxplot.png``.

    Parameters:
    df (pd.DataFrame): Frame with 'site', 'method', 'num_patients',
        'disease_ratio' and 'num_diseased' columns plus one column per bundle.
    prct (float): Disease ratio as a fraction (e.g. 0.3); rows are selected
        where 'disease_ratio' equals the matching integer percentage.
    directory (str): Root output folder.
    """
    # Same conversion as the one used to build the site labels. Comparing
    # with the float prct * 100 fails for 0.3 and 0.7 (30.000000000000004,
    # 70.00000000000001) and left those plots empty.
    percent = int(prct * 100)

    directory = os.path.join(directory, "DISTANCES_PLOTS", str(prct))
    df = df[df['disease_ratio'] == percent]
    os.makedirs(directory, exist_ok=True)

    info_columns = ['site','method','num_patients','disease_ratio','num_diseased']
    methods = ["hc", "no_robust", "robust", "robust_rwp"]
    colors = ['blue', 'green', 'red', 'purple']
    width = 0.2  # horizontal shift between the boxes of two methods

    patient_counts = df['num_patients'].unique()
    x = np.arange(len(patient_counts))  # the label locations

    for bundle_column in df.columns:
        if bundle_column in info_columns:
            continue # Skip non-bundle columns

        # A single figure per bundle, closed explicitly below (the previous
        # extra plt.figure() call was never closed and leaked one figure per
        # bundle).
        fig, ax = plt.subplots(figsize=(10, 5))

        for i, (method, color) in enumerate(zip(methods, colors)):
            method_df = df[df['method'] == method]
            data = [method_df.loc[method_df['num_patients'] == patients, bundle_column].values
                    for patients in patient_counts]

            # Draw only when at least one patient count has values.
            if any(len(d) > 0 for d in data):
                positions = x + i * width  # Shift positions for each method
                ax.boxplot(data, positions=positions, widths=0.15, patch_artist=True,
                        boxprops=dict(facecolor=color, color=color),
                        medianprops=dict(color='black'))

        ax.set_xlabel('Nombre de patients')
        ax.set_ylabel('Valeurs')
        ax.set_title(f'Boxplots pour le bundle: {bundle_column} avec {percent}% de malades')
        ax.set_xticks(x + width * (len(methods) - 1) / 2)
        ax.set_xticklabels(patient_counts)
        ax.legend(handles=[plt.Line2D([0], [0], color=color, lw=4, label=f'Method: {method}') for method, color in zip(methods, colors)])
        fig.savefig(os.path.join(directory, f'{bundle_column}_boxplot.png'))
        plt.close(fig)

# Example usage: one set of distance box plots per disease ratio.
distances_df = pd.read_csv(os.path.join(MAINFOLDER, robust_method, "dists_compilation.csv"))
# Adds the num_patients / disease_ratio / num_diseased columns to distances_df in place.
add_nb_patients_and_diseased(distances_df)
disease_ratios = [0.1, 0.3, 0.5, 0.7]
for disease_ratio in disease_ratios:
    plot_bundle(distances_df, disease_ratio, os.path.join(MAINFOLDER, robust_method, ANALYSISFOLDER))

In [None]:
# CREATION BOX PLOT POUR MEA
def plot_bundle(df, prct, directory):
    """
    Save one grouped box plot per bundle column for a given disease ratio.

    Only the rows of ``df`` whose ``disease_ratio`` equals ``prct * 100`` are
    used. For every column that is not one of the metadata columns
    (``site``, ``method``, ``num_patients``, ``disease_ratio``,
    ``num_diseased``), a figure is drawn with the number of patients on the
    X axis and, for each method in ``hc``, ``no_robust``, ``robust`` and
    ``robust_rwp``, one box per patient count showing that column's values.
    Each figure is written to
    ``<directory>/MEA_PLOTS/<prct>/<column>_boxplot.png``.

    Parameters:
    df (pd.DataFrame): Data holding the metadata columns listed above plus
        one column per bundle.
    prct (float): Disease ratio expressed as a fraction (e.g. 0.3 for 30%).
    directory (str): Base output directory; the ``MEA_PLOTS/<prct>``
        sub-directory is created when missing.

    Returns:
    None
    """
    meta_columns = ['site', 'method', 'num_patients', 'disease_ratio', 'num_diseased']
    methods = ["hc", "no_robust", "robust", "robust_rwp"]
    colors = ['blue', 'green', 'red', 'purple']
    width = 0.2  # horizontal offset between the boxes of two consecutive methods

    # Strip floating-point noise from the percentage (0.07 * 100 evaluates to
    # 7.000000000000001) so the equality test against the stored ratio cannot
    # silently select zero rows.
    percent = round(prct * 100, 10)

    directory = os.path.join(directory, "MEA_PLOTS", str(prct))
    df = df[df['disease_ratio'] == percent]
    os.makedirs(directory, exist_ok=True)

    # The patient counts (in order of appearance) and the tick locations are
    # the same for every bundle, so compute them once.
    patient_counts = df['num_patients'].unique()
    x = np.arange(len(patient_counts))  # the label locations

    for bundle_column in df.columns:
        if bundle_column in meta_columns:
            continue  # Metadata columns are not plotted

        # A single figure per bundle: no stray plt.figure() left open.
        fig, ax = plt.subplots(figsize=(10, 5))

        for i, (method, color) in enumerate(zip(methods, colors)):
            method_df = df[df['method'] == method]
            data = [
                method_df.loc[method_df['num_patients'] == patients, bundle_column].values
                for patients in patient_counts
            ]

            # Draw this method only if it has values for at least one patient count
            if any(len(d) > 0 for d in data):
                positions = x + i * width  # Shift positions for each method
                ax.boxplot(data, positions=positions, widths=0.15, patch_artist=True,
                           boxprops=dict(facecolor=color, color=color),
                           medianprops=dict(color='black'))

        ax.set_xlabel('Nombre de patients')
        ax.set_ylabel('Valeurs')
        ax.set_title(f'Boxplots pour le bundle: {bundle_column} avec {percent}% de malades')
        # Centre each tick under its group of per-method boxes
        ax.set_xticks(x + width * (len(methods) - 1) / 2)
        ax.set_xticklabels(patient_counts)
        ax.legend(handles=[plt.Line2D([0], [0], color=color, lw=4, label=f'Method: {method}')
                           for method, color in zip(methods, colors)])
        fig.savefig(os.path.join(directory, f'{bundle_column}_boxplot.png'))
        plt.close(fig)  # Release the figure explicitly so none accumulate across bundles

# Example usage: load the compiled MEA values, derive the patient/disease
# columns from the site name, then draw the box plots for each disease ratio.
mea_df = add_nb_patients_and_diseased(
    pd.read_csv(os.path.join(MAINFOLDER, robust_method, "mea_compilation.csv"))
)
disease_ratios = [0.1, 0.3, 0.5, 0.7]
for disease_ratio in disease_ratios:
    plot_bundle(
        mea_df,
        disease_ratio,
        os.path.join(MAINFOLDER, robust_method, ANALYSISFOLDER),
    )

## TESTS

In [None]:
# # # TEST ADD BIAIS
# # Split the data into training and testing sets
# directory = os.path.join(MAINFOLDER, "testBiais")
# os.makedirs(directory, exist_ok=True)
# train_df, test_df = split_train_test(CAMCAN, test_size=0.2, random_state=42)

# # Generate biased data
# # Save the original non-biased data to temporary files
# temp_train_file_original = os.path.join(directory, "temp_train_original.csv")
# temp_test_file_original = os.path.join(directory, "temp_test_original.csv")
# train_df.to_csv(temp_train_file_original, index=False)
# test_df.to_csv(temp_test_file_original, index=False)

# # Generate biased data
# sampled_df_biaied, test_df_biaised, gammas,deltas, ruffles= generate_biaised_data(train_df, test_df)

# # Save the biased data to temporary files
# temp_train_file = os.path.join(directory, "temp_train_biased.csv")
# temp_test_file = os.path.join(directory, "temp_test_biased.csv")
# sampled_df_biaied.to_csv(temp_train_file, index=False)
# test_df_biaised.to_csv(temp_test_file, index=False)

# # Run the combat_visualize_data script
# outname_train = os.path.join("visualize_train")
# cmd = (
#     "scripts/combat_visualize_data.py"
#     + " "
#     + temp_train_file_original
#     + " "
#     + temp_train_file
#     + " --out_dir "
#     + directory
#     + " --outname "
#     + outname_train
#     + " -f"
#     + " --bundles all"
# )
# subprocess.call(cmd, shell=True)

# # Display gammas and deltas along with their mean and standard deviation
# print("Gammas:", gammas)
# print("Deltas:", deltas)
# gammas = list(gammas.values())
# deltas = list(deltas.values())
# print("\nGamma Statistics:")
# print(f"Mean: {np.mean(gammas)}, Std: {np.std(gammas)}")

# print("\nDelta Statistics:")
# print(f"Mean: {np.mean(deltas)}, Std: {np.std(deltas)}")
# print("Ruffles:", ruffles)


In [None]:
# # TEST Powerpoint generation
# d  = os.path.join(MAINFOLDER, robust_method, "adni_100_Philips_3T")
# create_presentation(d)

In [None]:
# # TEST the sample_patients function with compilation data
# sampled_df = sample_patients(COMPILATION, num_patients=100, disease_ratio=0.5)
# print(sampled_df)


In [None]:
# load_metrics("ROBUST/IQR/50_30/0/", "50_patients_30_percent_0")

In [None]:
# # Load the dists_compilation and metrics_compilation CSV files
# dists_compilation_path = os.path.join(directory, "dists_compilation.csv")
# metrics_compilation_path = os.path.join(directory, "metrics_compilation.csv")

# dists_compilation = pd.read_csv(dists_compilation_path)
# metrics_compilation = pd.read_csv(metrics_compilation_path)

# # Change the site column
# dists_compilation['site'] = dists_compilation['site'].str.rsplit('_', n=1).str[0]
# metrics_compilation['site'] = metrics_compilation['site'].str.rsplit('_', n=1).str[0]

# # Display the means by site
# dists_means_by_site = dists_compilation.groupby(['site','comparaison']).mean()
# metrics_means_by_site = metrics_compilation.groupby('site').mean()

# print(dists_means_by_site)
# print(metrics_means_by_site)

In [None]:
# # FIX METRICS COMPILATION
# directory = os.path.join(MAINFOLDER, robust_method)
# df = pd.read_csv(os.path.join(directory, "metrics_compilation.csv"))

# # Group by the site
# grouped = df.groupby('site')

# # Process each site
# cleaned_dfs = []
# for site, group in grouped:
#     # Reset index for easier manipulation
#     group = group.reset_index(drop=True)
    
#     # # The first row is the "bundle row" (new column names)
#     # new_columns = group.iloc[0].values  # Extract column names from the first row
#     # new_columns[-1] = 'site'
#     # group = group.iloc[1:]  # Remove the first row
    
#     # # Assign new column names
#     # group.columns = new_columns
    
#     # # Sort the columns alphabetically (excluding 'site')
#     # sorted = group.sort_index(axis=1)
#     # Add a new column 'metric' with the value indicating the metric for each row
#     metrics = ['tp', 'fp', 'tn', 'fn', 'precision', 'recall', 'taux_faux_positifs', 'f1_score']
#     group['metric'] = metrics
    
#     # # Append the cleaned DataFrame for this site
#     cleaned_dfs.append(group)

# # Concatenate all cleaned DataFrames
# final_df = pd.concat(cleaned_dfs, ignore_index=True)

# # Save or display the result
# final_df.to_csv(os.path.join(directory, "metrics_compilation.csv"), index=False)


In [None]:
# REAL SITES
# directory = os.path.join(MAINFOLDER, robust_method)
# raw_directory = os.path.join(RAWFOLDER, site_group)
# for filename in sorted(os.listdir(raw_directory)):
#     f = os.path.join(raw_directory, filename)
#     # checking if it is a file
#     if os.path.isfile(f):
#         analyse_site(f, robust_method, directory)
        
