In [1]:
# Prints a greeting and has no effect on the analysis below.
# NOTE(review): presumably a leftover kernel sanity check — confirm and remove.
print("hi")

import os
import pandas as pd
import subprocess
import re
import numpy as np
import json
import csv

from pptx import Presentation
from pptx.util import Inches
import os

from scripts import combat_info
from scripts import combat_quick_apply
from scripts import combat_quick_QC


# Data file handed as the first positional argument to the fit and
# visualization scripts and to the QC helper.
CAMCAN = "./DONNES/CamCAN.md.raw.csv.gz"

# Root folder for results; joined with `robust_method` to form the output directory.
MAINFOLDER = "ROBUST"

# Root folder for inputs; joined with `site_group` to form the directory that is scanned.
RAWFOLDER = "RAW"

# Sub-folder of RAWFOLDER whose files are processed.
site_group = "ADNI"
# Value forwarded as the `robust` argument of analyse_site().
robust_method = "IQR"
# Embedded in every output file name.
metric = "md"
# Embedded in output file names, passed to the fit script as --method, and
# used to filter the images collected for the presentation.
method = "vanilla"

hi


In [2]:
def get_info(mov_data_file):
    """Return subject counts for a data file.

    Calls ``combat_info.info`` on ``mov_data_file`` and reads two entries of
    the ``DetailInfos`` section of its first return value: the ``Disease``
    string, from which the healthy-control count is parsed (the digits that
    follow ``HC(n=``), and ``Number of Subject``.

    Args:
        mov_data_file: Path forwarded unchanged to ``combat_info.info``.

    Returns:
        list: ``[nb_total, nb_hc, nb_sick]`` where ``nb_sick`` is
        ``nb_total - nb_hc``.

    Raises:
        IndexError: If the ``Disease`` string contains no ``HC(n=<digits>``.
    """
    # The second element returned by combat_info.info is not needed here.
    df, _ = combat_info.info(mov_data_file)
    details = df["DetailInfos"]
    # Raw string: "\(" and "\d" are invalid escape sequences in a plain
    # literal and trigger a SyntaxWarning on recent Python versions.
    nb_hc = int(re.findall(r'HC\(n=(\d+)', details["Disease"])[0])
    nb_total = details["Number of Subject"]
    nb_sick = nb_total - nb_hc
    return [nb_total, nb_hc, nb_sick]

In [3]:
def get_bundles(mov_data_file):
    """Return the result of ``combat_info.get_bundles`` for ``mov_data_file``."""
    bundles = combat_info.get_bundles(mov_data_file)
    return bundles

In [4]:
def robust_text(x):
    """Return the file-name label for a robust setting.

    ``'No'`` maps to ``"NoRobust"``; any other value is returned unchanged.
    """
    if x == 'No':
        return "NoRobust"
    return x

def rwp_text(x):
    """Return ``"RWP"`` when ``x`` is truthy, otherwise ``"NoRWP"``."""
    if not x:
        return "NoRWP"
    return "RWP"
def get_site(mov_data_file):
    """Return the first distinct value of the ``site`` column of a CSV file.

    The file is loaded with ``pandas.read_csv``; only the first unique value
    is returned even if the column holds several different sites.
    """
    frame = pd.read_csv(mov_data_file)
    distinct_sites = frame.site.unique()
    return distinct_sites[0]


In [5]:
def fit(mov_data_file, robust, rwp, directory, hc,):
    """Run ``scripts/combat_quick_fit.py`` and return the model file name.

    The model file name is
    ``<site>.<metric>.<method>.<robust label>.<rwp label>.model.csv`` where
    ``<site>`` comes from ``get_site(mov_data_file)`` and the labels from
    ``robust_text`` / ``rwp_text``. The script is invoked through the shell
    with ``CAMCAN`` and ``mov_data_file`` as positional arguments followed by
    ``--out_dir``, ``--output_model_filename``, ``--method``, ``--robust`` and
    ``-f``; ``--rwp`` and ``--hc`` are appended when the matching argument is
    truthy.

    Args:
        mov_data_file: Path of the data file to fit.
        robust: Value passed to ``--robust`` (``"No"`` is labelled
            ``NoRobust`` in the file name).
        rwp: When truthy, ``--rwp`` is added to the command.
        directory: Value passed to ``--out_dir``.
        hc: When truthy, ``--hc`` is added to the command.

    Returns:
        str: The bare model file name (not joined with ``directory``).
    """
    # Function-scope imports keep this cell runnable on its own.
    import shlex
    import warnings

    output_model_filename = ".".join(
        [
            get_site(mov_data_file),
            metric,
            method,
            robust_text(robust),
            rwp_text(rwp),
        ]
    ) + ".model.csv"

    args = [
        "scripts/combat_quick_fit.py",
        CAMCAN,
        mov_data_file,
        "--out_dir",
        directory,
        "--output_model_filename",
        output_model_filename,
        "--method",
        method,
        "--robust",
        robust,
        "-f",
    ]
    if rwp:
        args.append("--rwp")
    if hc:
        args.append("--hc")

    # Quote every token so that paths containing spaces or shell
    # metacharacters reach the script as single, literal arguments.
    # shlex.quote leaves already-safe tokens unchanged.
    cmd = " ".join(shlex.quote(arg) for arg in args)
    returncode = subprocess.call(cmd, shell=True)
    if returncode != 0:
        # Surface the failure instead of discarding it: a model file left
        # over from an earlier run could otherwise be picked up unnoticed.
        warnings.warn(
            f"combat_quick_fit.py exited with status {returncode} for {mov_data_file}",
            RuntimeWarning,
        )
    return output_model_filename

In [6]:
def apply(mov_data_file, model_filename, robust, rwp, directory):
    """Apply a fitted model to a data file via ``combat_quick_apply.apply``.

    The output path is ``directory`` joined with
    ``<site>.<metric>.<method>.<robust label>.<rwp label>.csv``.

    Returns:
        tuple: ``(output_filename, result)`` where ``result`` is whatever
        ``combat_quick_apply.apply`` returns.
    """
    name_parts = [
        get_site(mov_data_file),
        metric,
        method,
        robust_text(robust),
        rwp_text(rwp),
    ]
    output_filename = os.path.join(directory, ".".join(name_parts) + ".csv")
    harmonized = combat_quick_apply.apply(mov_data_file, model_filename, output_filename)
    return output_filename, harmonized

In [7]:
def visualize_harmonization(f, new_f, directory):
    """Run ``scripts/combat_visualize_harmonization.py`` through the shell.

    The script receives ``CAMCAN``, ``f`` and ``new_f`` as positional
    arguments, followed by ``--out_dir directory`` and ``-f``.

    Args:
        f: Path of the original data file.
        new_f: Path of the second data file (callers pass the output of
            ``apply``).
        directory: Value passed to ``--out_dir``.
    """
    # Function-scope imports keep this cell runnable on its own.
    import shlex
    import warnings

    args = [
        "scripts/combat_visualize_harmonization.py",
        CAMCAN,
        f,
        new_f,
        "--out_dir",
        directory,
        # "--bundles", "all",  # disabled in the original command
        "-f",
    ]
    # Quote every token so that paths containing spaces or shell
    # metacharacters reach the script as single, literal arguments.
    cmd = " ".join(shlex.quote(arg) for arg in args)
    returncode = subprocess.call(cmd, shell=True)
    if returncode != 0:
        # Report the failure rather than silently continuing without figures.
        warnings.warn(
            f"combat_visualize_harmonization.py exited with status {returncode} for {f}",
            RuntimeWarning,
        )

In [8]:
def QC(output_filename, output_model_filename):
    """Run ``combat_quick_QC.QC`` with ``CAMCAN`` as its first argument.

    Returns whatever ``combat_quick_QC.QC`` returns; callers in this notebook
    treat it as a sequence of distances.
    """
    distances = combat_quick_QC.QC(CAMCAN, output_filename, output_model_filename)
    return distances

In [9]:
def create_presentation(directory):
    """Build ``harmonization_results.pptx`` comparing the four variants.

    One slide is created per ``.png`` file found in ``<directory>/hc`` whose
    name contains the module-level ``method``. Each slide shows, in a 2x2
    grid, the image with that same file name from the ``hc``, ``NoRobust``,
    ``robust`` and ``robust_rwp`` sub-directories, each captioned with the
    sub-directory name. The deck is saved inside ``directory``.

    Args:
        directory: Folder containing the four sub-directories.
    """
    prs = Presentation()

    subdirs = ["hc", "NoRobust", "robust", "robust_rwp"]
    # The first sub-directory defines the list of images. Sorting makes the
    # slide order reproducible; os.listdir returns entries in arbitrary order.
    images = sorted(
        img
        for img in os.listdir(os.path.join(directory, subdirs[0]))
        if method in img and img.endswith('.png')
    )

    for img in images:
        # NOTE(review): in python-pptx's default template index 5 is the
        # "Title Only" layout (index 6 is the fully blank one) — confirm
        # which one is wanted; the index is kept as in the original.
        slide_layout = prs.slide_layouts[5]
        slide = prs.slides.add_slide(slide_layout)

        for i, subdir in enumerate(subdirs):
            img_path = os.path.join(directory, subdir, img)
            left = Inches(0.5 + (i % 2) * 4.5)  # two columns
            top = Inches(0.2 + (i // 2) * 3.5)  # two rows

            # Caption above the image.
            text_box = slide.shapes.add_textbox(left, top, width=Inches(4), height=Inches(0.5))
            text_box.text_frame.text = subdir

            # A variant may lack this image (e.g. an interrupted run); keep
            # the caption and skip the picture instead of aborting the deck.
            if not os.path.isfile(img_path):
                print(f"create_presentation: missing image {img_path}, skipped")
                continue

            slide.shapes.add_picture(img_path, left, top + Inches(0.5), width=Inches(4))

    prs.save(os.path.join(directory, 'harmonization_results.pptx'))


In [10]:
def compare_and_display_distances(hc_dists, no_robust_dists, robust_dists, robust_rwp_dists):
    """Summarize percentage changes of three distance sets against a baseline.

    For each of ``hc_dists``, ``robust_dists`` and ``robust_rwp_dists`` the
    element-wise relative change ``(d - baseline) / baseline * 100`` is
    computed with ``no_robust_dists`` as baseline. For every comparison the
    summary reports the number of negative and non-negative changes (zeros
    count as "positive"), the overall mean, and the mean and median of each
    group (0 when a group is empty).

    The summary table is printed and also returned.

    Args:
        hc_dists, no_robust_dists, robust_dists, robust_rwp_dists:
            Equal-length sequences of numbers. A zero in ``no_robust_dists``
            yields inf/nan (NumPy emits a RuntimeWarning) rather than raising.

    Returns:
        pandas.DataFrame: One row per comparison.
    """
    # Convert the baseline once instead of once per use.
    baseline = np.array(no_robust_dists)

    def pct_change(dists):
        return (np.array(dists) - baseline) / baseline * 100

    comparison_results = {
        "hc_vs_no_robust": pct_change(hc_dists),
        "robust_vs_no_robust": pct_change(robust_dists),
        "robust_rwp_vs_no_robust": pct_change(robust_rwp_dists),
    }
    df = pd.DataFrame(comparison_results)

    # Count negative and non-negative comparisons, with their means and medians.
    results = []
    # `comparison` (not `method`) avoids shadowing the module-level `method`.
    for comparison in comparison_results:
        changes = df[comparison]
        negative_values = changes[changes < 0]
        positive_values = changes[changes >= 0]

        num_negative = len(negative_values)
        num_positive = len(positive_values)

        results.append({
            "Méthode": comparison,
            "Nb comp. nég.": num_negative,
            "Nb comp. pos.": num_positive,
            "Moy. tot.": changes.mean(),
            "Moy. val. nég.": negative_values.mean() if num_negative > 0 else 0,
            "Moy. val. pos.": positive_values.mean() if num_positive > 0 else 0,
            "Méd. val. nég.": negative_values.median() if num_negative > 0 else 0,
            "Méd. val. pos.": positive_values.median() if num_positive > 0 else 0,
        })
    results_df = pd.DataFrame(results)
    print(results_df)
    return results_df


In [11]:
def harmonize(f, directory, robust, rwp,hc):
    """Fit, apply, quality-check and visualize one harmonization variant.

    Creates ``directory`` if needed, prints ``f``, then chains ``fit``,
    ``apply``, ``QC`` and ``visualize_harmonization``. When ``robust`` is not
    ``"No"``, the files ``metrics_<site>_<robust label>_<rwp label>.json`` and
    ``outliers_<site>_<robust label>_<rwp label>.csv`` are also read from
    ``directory``.

    Returns:
        list: ``[dists, metrics, outliers]``; ``metrics`` and ``outliers``
        are ``None`` when ``robust`` is ``"No"``.
    """
    os.makedirs(directory, exist_ok=True)
    print(f)

    # fit() returns a bare file name; anchor it in the output directory.
    model_path = os.path.join(directory, fit(f, robust, rwp, directory, hc))

    # Only the output path is needed; the second value returned by apply() is unused.
    harmonized_path, _ = apply(f, model_path, robust, rwp, directory)

    dists = QC(harmonized_path, model_path)

    visualize_harmonization(f, harmonized_path, directory)

    # Without a robust setting there are no metrics/outliers files to load.
    if robust == "No":
        return [dists, None, None]

    metrics_filename = os.path.join(directory, f"metrics_{get_site(f)}_{robust_text(robust)}_{rwp_text(rwp)}.json")
    outliers_filename = os.path.join(directory, f"outliers_{get_site(f)}_{robust_text(robust)}_{rwp_text(rwp)}.csv")

    with open(metrics_filename, 'r') as json_file:
        loaded_metrics = json.load(json_file)

    loaded_outliers_df = pd.read_csv(outliers_filename, index_col=0)

    return [dists, loaded_metrics, loaded_outliers_df]

In [12]:
def analyse_site(f, robust, directory):
    """Run the four harmonization variants for one file and compare them.

    Results go under ``directory/<site>``, one sub-directory per variant
    (``hc``, ``NoRobust``, ``robust``, ``robust_rwp``). Afterwards the
    presentation is built and the distances of the four runs are compared.
    """
    site_dir = os.path.join(directory, get_site(f))

    # (sub-directory, robust, rwp, hc) for each variant, in execution order.
    variants = (
        ("hc", "No", False, True),
        ("NoRobust", "No", False, False),
        ("robust", robust, False, False),
        ("robust_rwp", robust, True, False),
    )
    # Keep only the distances (first element) of each harmonize() result.
    dists_per_variant = [
        harmonize(f, os.path.join(site_dir, subdir), robust_arg, rwp_arg, hc_arg)[0]
        for subdir, robust_arg, rwp_arg, hc_arg in variants
    ]

    create_presentation(site_dir)

    compare_and_display_distances(*dists_per_variant)

    # TODO: add the "bat" distances (presumably Bhattacharyya — confirm)
    # TODO: make the 4 visualizations and put them in a PowerPoint
    # TODO: collect metrics

In [13]:

# Results go to MAINFOLDER/<robust_method>; inputs are read from RAWFOLDER/<site_group>.
directory = os.path.join(MAINFOLDER, robust_method)
raw_directory = os.path.join(RAWFOLDER, site_group)

# Process the input files in sorted (deterministic) order.
for filename in sorted(os.listdir(raw_directory)):
    f = os.path.join(raw_directory, filename)
    # Skip anything that is not a regular file (e.g. sub-directories).
    if not os.path.isfile(f):
        continue
    analyse_site(f, robust_method, directory)
        


RAW/ADNI/adni_100_Philips_3T.md.raw.csv.gz
      Mean Bhattacharrya distance: 0.014315 (min: 0.000022, max: 0.059056)
RAW/ADNI/adni_100_Philips_3T.md.raw.csv.gz
      Mean Bhattacharrya distance: 0.012078 (min: 0.000418, max: 0.044460)
RAW/ADNI/adni_100_Philips_3T.md.raw.csv.gz
Précision : 1 / 5 = 0.200
Rappel (Recall) :1 / 3 = 0.333
Taux de faux positifs : 4 / 9 = 0.444
F1 score : 0.250
Test OK: The metrics and outliers match.
      Mean Bhattacharrya distance: 0.028119 (min: 0.000197, max: 0.165114)
RAW/ADNI/adni_100_Philips_3T.md.raw.csv.gz
Précision : 1 / 5 = 0.200
Rappel (Recall) :1 / 3 = 0.333
Taux de faux positifs : 4 / 9 = 0.444
F1 score : 0.250
Test OK: The metrics and outliers match.




      Mean Bhattacharrya distance: 0.066590 (min: 0.000146, max: 0.207728)
                   Méthode  Nb comp. nég.  Nb comp. pos.    Moy. tot.  \
0          hc_vs_no_robust             15             30   112.263636   
1      robust_vs_no_robust             13             32   309.967964   
2  robust_rwp_vs_no_robust              6             39  1287.625766   

   Moy. val. nég.  Moy. val. pos.  Méd. val. nég.  Méd. val. pos.  
0      -85.410555      211.100732      -83.841923      125.943233  
1      -79.699204      468.270251      -82.327620      321.097487  
2      -77.363061     1497.624047      -76.403654      860.917545  
RAW/ADNI/adni_114_Siemens_3T.md.raw.csv.gz




      Mean Bhattacharrya distance: 0.046116 (min: 0.000017, max: 0.140915)
RAW/ADNI/adni_114_Siemens_3T.md.raw.csv.gz




      Mean Bhattacharrya distance: 0.184544 (min: 0.017435, max: 0.410976)
RAW/ADNI/adni_114_Siemens_3T.md.raw.csv.gz




Précision : 4 / 7 = 0.571
Rappel (Recall) :4 / 8 = 0.500
Taux de faux positifs : 3 / 6 = 0.500
F1 score : 0.533
Test OK: The metrics and outliers match.




      Mean Bhattacharrya distance: 0.156108 (min: 0.011485, max: 0.410094)
RAW/ADNI/adni_114_Siemens_3T.md.raw.csv.gz




Précision : 4 / 7 = 0.571
Rappel (Recall) :4 / 8 = 0.500
Taux de faux positifs : 3 / 6 = 0.500
F1 score : 0.533
Test OK: The metrics and outliers match.




      Mean Bhattacharrya distance: 0.187314 (min: 0.039446, max: 0.332923)
                   Méthode  Nb comp. nég.  Nb comp. pos.  Moy. tot.  \
0          hc_vs_no_robust             38              7 -16.862573   
1      robust_vs_no_robust             26             19   6.404353   
2  robust_rwp_vs_no_robust             25             20  95.581371   

   Moy. val. nég.  Moy. val. pos.  Méd. val. nég.  Méd. val. pos.  
0      -77.177345      310.560472      -84.815595      299.021903  
1      -30.687014       57.160959      -27.205327        4.121199  
2      -30.375039      253.026884      -22.618151      108.923467  
RAW/ADNI/adni_116_Siemens_3T.md.raw.csv.gz




      Mean Bhattacharrya distance: 0.052740 (min: 0.001497, max: 0.075169)
RAW/ADNI/adni_116_Siemens_3T.md.raw.csv.gz




      Mean Bhattacharrya distance: 0.036132 (min: 0.002422, max: 0.061454)
RAW/ADNI/adni_116_Siemens_3T.md.raw.csv.gz




Précision : 2 / 7 = 0.286
Rappel (Recall) :2 / 8 = 0.250
Taux de faux positifs : 5 / 11 = 0.455
F1 score : 0.267
Test OK: The metrics and outliers match.




      Mean Bhattacharrya distance: 0.272303 (min: 0.001994, max: 0.567521)
RAW/ADNI/adni_116_Siemens_3T.md.raw.csv.gz




Précision : 2 / 7 = 0.286
Rappel (Recall) :2 / 8 = 0.250
Taux de faux positifs : 5 / 11 = 0.455
F1 score : 0.267
Test OK: The metrics and outliers match.




      Mean Bhattacharrya distance: 0.392813 (min: 0.022461, max: 0.666784)
                   Méthode  Nb comp. nég.  Nb comp. pos.    Moy. tot.  \
0          hc_vs_no_robust              6             39   122.316333   
1      robust_vs_no_robust              1             44  1068.092436   
2  robust_rwp_vs_no_robust              1             44  1526.734067   

   Moy. val. nég.  Moy. val. pos.  Méd. val. nég.  Méd. val. pos.  
0      -64.750091      151.095783      -70.758172       45.542727  
1      -93.494115     1094.492131      -93.494115      623.297463  
2      -26.717048     1562.039774      -26.717048     1045.665035  
RAW/ADNI/adni_11_Siemens_3T.md.raw.csv.gz




      Mean Bhattacharrya distance: 0.025999 (min: 0.001154, max: 0.112095)
RAW/ADNI/adni_11_Siemens_3T.md.raw.csv.gz




      Mean Bhattacharrya distance: 0.143524 (min: 0.055698, max: 0.253601)
RAW/ADNI/adni_11_Siemens_3T.md.raw.csv.gz




Précision : 1 / 3 = 0.333
Rappel (Recall) :1 / 7 = 0.143
Taux de faux positifs : 2 / 7 = 0.286
F1 score : 0.200
Test OK: The metrics and outliers match.




      Mean Bhattacharrya distance: 0.144776 (min: 0.059422, max: 0.257646)
RAW/ADNI/adni_11_Siemens_3T.md.raw.csv.gz




Précision : 1 / 3 = 0.333
Rappel (Recall) :1 / 7 = 0.143
Taux de faux positifs : 2 / 7 = 0.286
F1 score : 0.200
Test OK: The metrics and outliers match.




      Mean Bhattacharrya distance: 0.256270 (min: 0.119516, max: 0.504812)
                   Méthode  Nb comp. nég.  Nb comp. pos.  Moy. tot.  \
0          hc_vs_no_robust             45              0 -80.222734   
1      robust_vs_no_robust              8             37   1.390313   
2  robust_rwp_vs_no_robust              3             42  84.419158   

   Moy. val. nég.  Moy. val. pos.  Méd. val. nég.  Méd. val. pos.  
0      -80.222734        0.000000      -85.994557        0.000000  
1       -3.874638        2.528681       -1.354086        2.168391  
2      -20.298625       91.898999      -16.157926       87.775264  
RAW/ADNI/adni_127_GE_3T.md.raw.csv.gz
      Mean Bhattacharrya distance: 0.006235 (min: 0.000357, max: 0.067493)
RAW/ADNI/adni_127_GE_3T.md.raw.csv.gz
      Mean Bhattacharrya distance: 0.248680 (min: 0.128598, max: 0.398275)
RAW/ADNI/adni_127_GE_3T.md.raw.csv.gz
Précision : 24 / 27 = 0.889
Rappel (Recall) :24 / 79 = 0.304
Taux de faux positifs : 3 / 73 = 0.041
F1 s

KeyboardInterrupt: 

In [35]:
# Rebuild the presentation for a single, hard-coded site folder.
# NOTE(review): this cell's execution count (35) is out of sequence with the
# cells above; it relies on the output folders already existing on disk.
d  = os.path.join(MAINFOLDER, robust_method, "adni_100_Philips_3T")
create_presentation(d)