# Data check
We will take a look at data sent in January 2024 (loaded into unprocessed); then look at it again after processing is applied

## The notebooks in this folder deal with new datasets. This is a temporary notebook to discuss the data we have

## There are unprocessed (except for dropping two deep-WM columns) harmonized datasets that we will look at

### Load libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import sys
import glob
from functools import reduce

import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
import ipywidgets as widgets
from ipywidgets import interactive
import seaborn as sns

sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config
import cvasl.harmony as har


In [None]:
# Unprocessed datasets for this work: one folder per cohort, each holding a
# csv with the same file name.
EDIS_path = 'our_datasets_unprocessed/EDIS/'
# Previously misspelled `HELIUS_pat`, which did not match the `HELIUS_path`
# name used by the (currently disabled) HELIUS lines below.
HELIUS_path = 'our_datasets_unprocessed/HELIUS/'
Insight46_path = 'our_datasets_unprocessed/Insight46/'
SABRE_path = 'our_datasets_unprocessed/SABRE/'
MRI_path = 'our_datasets_unprocessed/StrokeMRI/'
TOP_path = 'our_datasets_unprocessed/TOP/'
file_name = 'TrainingDataComplete.csv'

# Full path to each cohort's csv.
TOP_file = os.path.join(TOP_path, file_name)
MRI_file = os.path.join(MRI_path, file_name)
EDIS_file = os.path.join(EDIS_path, file_name)
# NOTE(review): HELIUS is not loaded here; the reason is not stated in this
# notebook -- confirm before re-enabling.
#HELIUS_file = os.path.join(HELIUS_path, file_name)
Insight46_file = os.path.join(Insight46_path, file_name)
SABRE_file = os.path.join(SABRE_path, file_name)

# Load each cohort into its own dataframe (default read_csv settings, so the
# first csv column is kept as a regular column rather than used as the index).
EDIS = pd.read_csv(EDIS_file)
#HELIUS = pd.read_csv(HELIUS_file)
Insight46 = pd.read_csv(Insight46_file)
SABRE = pd.read_csv(SABRE_file)
TOP = pd.read_csv(TOP_file)
MRI = pd.read_csv(MRI_file)

In [None]:
# #example old datasets
# filepath_mri_old = '../open_work/internal_results/cleaned_pvc2s/' 
# filename_mri_old = os.path.join(filepath_mri_old,'StrokeMRI_pvc2c.csv') 
# filepath_top_old = '../open_work/internal_results/cleaned_pvc2s/' 
# filename_top_old = os.path.join(filepath_top_old,'TOP_pvc2c.csv') 
# TOP_old = pd.read_csv(filename_top_old)
# StrokeMRI_old = pd.read_csv(filename_mri_old)
# TOP_old = TOP_old.drop(TOP_old.columns[0],axis=1)
# StrokeMRI_old = StrokeMRI_old.drop(StrokeMRI_old.columns[0],axis=1)

In [None]:
# Show the dimensions of the TOP dataframe as a (rows, columns) tuple.
TOP.shape

In [None]:
# List the column labels of the TOP dataframe so the naming/casing can be inspected.
TOP.columns

## One minor note: this was not what we agreed to. We agreed that everything would be lower case, but this was already fixed with the harmonizations

In [None]:
# we must discuss this with scientists

# Examine which harmonization outcomes are negative

In [None]:
# Call the project helper har.negative_harm_outcomes on the
# 'harmonizations/harm_results' folder with the 'csv' extension, passing the
# lower-case column names listed below as `number_columns`.
# NOTE(review): presumably this reports negative values found in the
# harmonized outputs (per the heading above) -- confirm against cvasl.harmony.
negs = har.negative_harm_outcomes(
    'harmonizations/harm_results',
    'csv',
    number_columns=[
        'sex',
        'gm_vol',
        'wm_vol',
        'csf_vol',
        'gm_icvratio',
        'gmwm_icvratio',
        'wmhvol_wmvol',
        'wmh_count',
        # deep-WM CoV is left out; the notebook intro notes that two deep-WM
        # columns were dropped from these datasets.
        #'deepwm_b_cov',
        'aca_b_cov',
        'mca_b_cov',
        'pca_b_cov',
        'totalgm_b_cov',
        # deep-WM CBF is left out for the same reason.
        #'deepwm_b_cbf',
        'aca_b_cbf',
        'mca_b_cbf',
        'pca_b_cbf',
        'totalgm_b_cbf',]
) 
# Display of the result is disabled; uncomment to inspect it.
#negs

In [None]:
# negs

# we were asked to preprocess before performing the harmonization.

Therefore we will create preprocessed copies of the files in our_data and deposit them into the folder named holder_preprocessed

In [None]:
def preprocess(
    folder,
    file_extension,
    outcome_folder,
    log_columns=None,
    plus_one_log_columns=None,
):
    """
    Write log-transformed copies of every matching file under a folder.

    The given folder is searched recursively for files with the given
    extension. Each file is read as csv (first column used as the index),
    the requested columns are transformed, and the result is written to
    ``outcome_folder/<name of the file's parent directory>/<file name>``.
    The source files themselves are not modified.

    :param folder: directory that is searched recursively
    :param file_extension: extension to look for, without the dot (e.g. 'csv')
    :param outcome_folder: directory receiving the processed copies;
        created if it does not exist
    :param log_columns: column labels replaced by their natural log;
        None (the default) or an empty list means no columns
    :param plus_one_log_columns: column labels replaced by log(x + 1);
        None (the default) or an empty list means no columns.
        This step runs before the ``log_columns`` step.

    :returns: list of the paths of the files written, in the order
        in which glob found the source files
    """
    # None replaces the former mutable `[]` defaults; behaviour is the same.
    if log_columns is None:
        log_columns = []
    if plus_one_log_columns is None:
        plus_one_log_columns = []

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(outcome_folder, exist_ok=True)
    pattern = os.path.join(folder, '**/*.' + file_extension)

    written = []
    for file in glob.glob(pattern, recursive=True):
        frame = pd.read_csv(file, index_col=0)

        # Mirror only the immediate parent directory name in the output tree.
        parent_name = os.path.basename(os.path.dirname(file))
        target_folder = os.path.join(outcome_folder, parent_name)
        os.makedirs(target_folder, exist_ok=True)
        target = os.path.join(target_folder, os.path.basename(file))

        # Vectorized element-wise transforms; skipped entirely when no
        # columns were requested. np.log gives -inf for 0 and NaN for
        # negative inputs, and those values are written out unchanged.
        if len(plus_one_log_columns) > 0:
            frame[plus_one_log_columns] = np.log(
                frame[plus_one_log_columns] + 1
            )
        if len(log_columns) > 0:
            frame[log_columns] = np.log(frame[log_columns])

        frame.to_csv(target)
        written.append(target)
    return written

In [None]:
# Run the packaged preprocessing helper (cvasl.seperated.preprocess) on the
# 'our_datasets_unprocessed' folder for 'csv' files, with 'our_datasets_A' as
# the third argument (presumably the output folder -- confirm).
# NOTE(review): this call uses the keyword `log_cols`, whereas the
# notebook-local `preprocess` defined in this notebook names that parameter
# `log_columns` -- confirm which name sep.preprocess accepts.
sep.preprocess('our_datasets_unprocessed',
           'csv', 'our_datasets_A',
           log_cols=['ACA_B_CoV','MCA_B_CoV','PCA_B_CoV','TotalGM_B_CoV'],
           plus_one_log_columns = ['WMH_count','WMHvol_WMvol'])

In [None]:
# Load one processed EDIS file (first csv column as the index) and display it.
# NOTE(review): this reads from 'outcometrial2/EDIS', while the preprocessing
# call in this notebook passes 'our_datasets_A' -- confirm the intended folder.
outcome_folder = 'outcometrial2/EDIS'
show_sample = os.path.join(outcome_folder,'TrainingDataComplete.csv')
show = pd.read_csv(show_sample, index_col=0)
show

In [None]:
# Count missing (NaN) values per column of the loaded sample.
show.isna().sum()

In [None]:
# Load the csv stored in the 'special' folder and summarise its CoV columns.
# NOTE(review): presumably this is the untransformed data, kept for
# comparison with the processed sample -- confirm.
# The path is built with os.path.join: the former literal
# 'special\TrainingDataComplete.csv' contained the invalid escape '\T' and
# only resolved to the intended file on Windows.
origin = pd.read_csv(os.path.join('special', 'TrainingDataComplete.csv'))
origin[['ACA_B_CoV','MCA_B_CoV','PCA_B_CoV','TotalGM_B_CoV']].describe()