# Assembling the TOP dataset: partial volume corrected only

This notebook assembles the TOP dataset; however, it uses only partial volume corrected values for certain ASL values.
This is because the file set we were given contained no uncorrected values.

### Import libraries

In [None]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt
import glob
import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config

## Read data into pandas dataframe

How do we define which files should be used together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, we will use the first option, approved by Mathijs on 26th June 2023

In [None]:
# Sub-folder (inside the configured data directory) that is searched for TSV files.
experiment_folder = 'TOP'

# We use a config file, you do not have to...

In [None]:
# this is the alternative block to block 3 to run if you have no config file

In [None]:
# parsed_here = 'C:/your_path/not_pushed/data_anonymized/' # example of a hard-coded path you would feed
# config = Config.no_file({ 'bids': parsed_here})
# root_directory = config.get_directory('bids')
# if os.path.isdir(os.path.join(root_directory, experiment_folder)):
#     print("this folder exists, we will take tsv from here")
# else: 
#     print("this folder does not seem to exist, try typing again")

In [None]:

# Load the project configuration and look up the raw-data directory in it.
config = Config.from_file()
root_directory = config.get_directory('raw_data')

# Tell the user whether the experiment folder is present before any TSV is read.
folder_found = os.path.isdir(os.path.join(root_directory, experiment_folder))
print(
    "this folder exists, we will take tsv from here"
    if folder_found
    else "this folder does not seem to exist, try typing again"
)

In [None]:
# Folder that holds the TSV files of this experiment.
tsv_path = os.path.join(root_directory, experiment_folder)

# Full path of every .tsv file in that folder, in the order os.listdir returns
# them (that order is not guaranteed to be sorted).
tsv_files = [
    os.path.join(tsv_path, name)
    for name in os.listdir(tsv_path)
    if name.endswith('.tsv')
]
tsv_files

## we must add code here to handle PVC0 
if and only if we later want to cover a set with uncorrected volume (PVC0) files.

In [None]:
# Split the TSV files in two groups: names starting with 'CoV' go to
# tsv_files_for_cov, every other .tsv goes to basics.
# The listing order is kept exactly as os.listdir returns it.
tsv_files_for_cov = []
basics = []
for name in os.listdir(tsv_path):
    if not name.endswith('.tsv'):
        continue
    group = tsv_files_for_cov if name.startswith('CoV') else basics
    group.append(os.path.join(tsv_path, name))
basics

In [None]:
# Load every CoV TSV (tab separated, first row as header) into its own dataframe.
cov_dataframes = [
    pd.read_csv(cov_path, sep='\t', header=[0])
    for cov_path in tsv_files_for_cov
]
# Keep the third CoV table as a sample to look at.
# NOTE(review): the hard-coded index needs at least three CoV files — confirm.
sample_cov_df = cov_dataframes[2]
cols_cov = sample_cov_df.columns
sample_cov_df

In [None]:
len(sample_cov_df)

In [None]:
# Load every non-CoV ("basic") TSV (tab separated, first row as header) into its own dataframe.
basic_dataframes = [
    pd.read_csv(basic_path, sep='\t', header=[0])
    for basic_path in basics
]
# Keep the third basic table as a sample; it is a basic file, not a CoV file.
# NOTE(review): the hard-coded index needs at least three basic files — confirm.
sample_basic_df = basic_dataframes[2]
cols_basic = sample_basic_df.columns
sample_basic_df

In [None]:
# Load ALL TSV files of the folder (CoV and basic alike) into dataframes.
dataframes = [
    pd.read_csv(tsv_file, sep='\t', header=[0])
    for tsv_file in tsv_files
]
# Third table of the full list, kept as a sample.
sample_df = dataframes[2]
cols = sample_df.columns
#sample_df

## Below we can see the different columns that should be in our final dataframe, except sex and age, which we add from another frame.
## These lists have some overlap, as some columns in the CoV files have the same names as columns in the basic files

In [None]:
# Every column name that occurs in the basic tables, listed once each,
# in the order in which the names are first encountered.
list_all_basic_column = list(
    dict.fromkeys(
        column
        for basic_table in basic_dataframes
        for column in basic_table.columns.to_list()
    )
)
#list_all_basic_column

In [None]:
# Every column name that occurs in the CoV tables, listed once each,
# in the order in which the names are first encountered.
list_all_cov_column = list(
    dict.fromkeys(
        column
        for cov_table in cov_dataframes
        for column in cov_table.columns.to_list()
    )
)
#list_all_cov_column

# Below a function for COV columns to add _cov to name if units are SD/mean

# need to get all cov_columns?

In [None]:
# Show the column names of each CoV table; all of them need the _cov renaming.
for cov_table in cov_dataframes:
    print(cov_table.columns)

In [None]:
# Columns that the CoV tables share with the other tables (identifiers and
# volumetric values). They keep their names; every other column in a CoV table
# holds a CoV measurement and gets the '_cov' suffix.
shared_columns = (
    'participant_id',
    'session',
    'LongitudinalTimePoint',
    'SubjectNList',
    'Site',
    'GM_vol',
    'WM_vol',
    'CSF_vol',
    'GM_ICVRatio',
    'GMWM_ICVRatio',
    'WMH_vol',
    'WMH_count',
)

# The rename map is derived from each table's own columns instead of being
# applied by list position: the order of cov_dataframes follows os.listdir,
# which is arbitrary, so positional dictionaries could silently miss their table.
for cov_table in cov_dataframes:
    cov_renames = {
        column: column + '_cov'
        for column in cov_table.columns
        # the endswith guard keeps the cell safe to re-run (no '_cov_cov')
        if column not in shared_columns and not str(column).endswith('_cov')
    }
    cov_table.rename(columns=cov_renames, inplace=True)
cov_dataframes[0]

### Below we take all the dataframes and put them into tsv files in a specific folder

In [None]:
#cov_dataframes[3].columns

In [None]:
# Write every CoV table to <filepath>/<n>.tsv, numbering the files from 1.
filepath = '../open_work/internal_results/stitchy/cov'
# Create the output folder once, before the loop; exist_ok makes the separate
# existence check unnecessary.
os.makedirs(filepath, exist_ok=True)
for numbr, frame in enumerate(cov_dataframes, start=1):
    filename = os.path.join(filepath, str(numbr))
    frame.to_csv(filename + '.tsv', sep="\t")

In [None]:
# Write every basic table to <filepath>/<n>.tsv, numbering the files from 1.
filepath = '../open_work/internal_results/stitchy/basics'
# Create the output folder once, before the loop; exist_ok makes the separate
# existence check unnecessary.
os.makedirs(filepath, exist_ok=True)
for numbr, frame in enumerate(basic_dataframes, start=1):
    filename = os.path.join(filepath, str(numbr))
    frame.to_csv(filename + '.tsv', sep="\t")

In [None]:
# Folder the CoV tables were exported to.
cov_tsv_folder_made = '../open_work/internal_results/stitchy/cov'
# NOTE(review): presumably returns the columns whose contents are identical in
# every TSV of the folder — verify against cvasl.seperated.check_identical_columns.
cov_identical_columns = sep.check_identical_columns(cov_tsv_folder_made)
cov_identical_columns

In [None]:
# Folder the basic tables were exported to.
basics_tsv_folder_made = '../open_work/internal_results/stitchy/basics'
# NOTE(review): presumably returns the columns whose contents are identical in
# every TSV of the folder — verify against cvasl.seperated.check_identical_columns.
basics_identical_columns = sep.check_identical_columns(basics_tsv_folder_made)
basics_identical_columns

In [None]:
# One combined list: all basic tables first, then all CoV tables.
super_dataframes = [*basic_dataframes, *cov_dataframes]
len(super_dataframes)

## Copy identical columns from any file

we could also read it from the data, but if it's always the same, we can just define it here

In [None]:
# tsv_folder_made = '../open_work/internal_results/stitchy'
# Only the basics folder is checked here; the result selects the starting
# columns of the stitched frame.
identical_columns = sep.check_identical_columns(basics_tsv_folder_made)

In [None]:
#identical_columns

In [None]:
#sample_df.columns

In [None]:
# Start the stitched frame from the shared columns of the sample basic table;
# .copy() keeps later column assignments from touching sample_basic_df.
stitched = sample_basic_df[identical_columns].copy()

In [None]:
len(stitched)

In [None]:
# Number of shared columns the stitched frame starts with.
n_identical = len(stitched.columns)

In [None]:
stitched

### Here is where we add the different parts to stitched

In [None]:
# Copy every column after the first n_identical ones, from every table, into `stitched`.
# NOTE(review): columns are skipped by position, so this assumes each table
# starts with the same n_identical shared columns — confirm.
# NOTE(review): pandas aligns the assigned column on the row index, not on
# participant_id; presumably all tables list participants in the same row order — verify.
for df in super_dataframes:
    for col in df.columns[n_identical:]:
        # A column name seen in an earlier table is overwritten by the later table.
        stitched[col] = df[col]

stitched.columns

In [None]:
len(stitched.columns)

## Here we should get rid of second visits, but what we see is that session was not in the common columns. We will NOT get rid of all second time points, and people ending in _2. And mention to scientists... turns out to be a moot point

In [None]:
#stitched.columns

In [None]:
# The first row is a second header line rather than data, so keep everything after it.
stitched = stitched.iloc[1:]
#stitched

In [None]:
stitched['session'].unique()

In [None]:
stitched['LongitudinalTimePoint'].unique()

### So due to the fact that they are all on first visit, first session, we can write this into the patient ID LATER

## So in this dataset we have one longitudinal timepoint, and one type of session. no need to filter down away from this

In [None]:
stitched.columns

In [None]:
# Read the age/sex table, using its first column as the index.
sexage_path = os.path.join(root_directory, 'age_data', 'Age_sex_TOP.csv')
sexage_df = pd.read_csv(sexage_path, index_col=0)
print(len(sexage_df))
# Copy the index labels, converted to text, into 'renumber'.
sexage_df['renumber'] = [str(label) for label in sexage_df.index]
sexage_df

## Here we take the patient ID and align it with our other frame's index

In [None]:
# Renumber the rows from 0; drop=False keeps the previous index as an extra column.
stitched = stitched.reset_index(drop=False)
stitched

In [None]:
len(stitched.participant_id.unique())

In [None]:
# Copy the participant ID into 'renumber', the key column used for the merge with the age/sex table.
stitched['renumber'] = stitched['participant_id']

In [None]:
# Replace the index by a plain 0..n-1 range; its labels were already copied to 'renumber'.
sexage_df = sexage_df.reset_index(drop=True)
sexage_df

In [None]:
#stitched.columns

In [None]:
# Join the stitched values with the age/sex table on 'renumber'.
# merge defaults to an inner join, so rows without a match on the other side are dropped.
# NOTE(review): compare len(result) with len(stitched) to confirm no participant is lost.
result = stitched.merge(sexage_df, on="renumber")
result

In [None]:
result

In [None]:
# Keep only the first occurrence of every column name, in case a name appears twice.
is_first_occurrence = ~result.columns.duplicated()
result = result.loc[:, is_first_occurrence].copy()

In [None]:
result

## Conform file to new standard

### example of new standard (from M.D. on 23/08/2023)

In [None]:
# Example file whose columns define the new standard layout.
standard_path = '../researcher_interface/sample_sep_values/showable_standard.csv'


In [None]:
# Load the example of the standard and collect its column names as a set.
standard = pd.read_csv(standard_path)
set_standard = set(standard.columns)

In [None]:
# Lower-case all column names of the result.
result.columns = result.columns.str.lower()
set_results = set(result.columns)

# Column names that the result already shares with the standard.
z = set_results & set_standard
#z

In [None]:
for f in result.columns:
    print(f)

In [None]:
# Take participant_id out of the frame and put it back as the first column,
# with the fixed '_ses-1_run-1' suffix appended to every ID.
first_column = result.pop('participant_id')
result.insert(0, 'participant_id', first_column + '_ses-1_run-1')
#result

In [None]:
#standard.columns

In [None]:
# Copy 'session' into 'session_id' and move that new column to position 1,
# directly after participant_id.
result['session_id'] = result['session']
second_column = result.pop('session_id')
result.insert(1, 'session_id', second_column)
# Every row gets the same site label.
result['site'] = "TOP"
# run_id takes the values of the longitudinal time point column.
result['run_id'] = result['longitudinaltimepoint']
#result

In [None]:
#standard.columns

In [None]:
# Add the columns named after the standard layout. The statement order is kept
# because later cells select columns by position.
# The pvc0 columns have no source data in this file set; they are filled with
# real missing values (np.nan) instead of the text 'NaN', so the columns are
# numeric from the start rather than object-typed strings.
result['gm_icv_ratio'] = result['gm_icvratio']
result['gmwm_icv_ratio'] = result['gmwm_icvratio']
result['cbf_gm_pvc0'] = np.nan
result['cbf_gm_pvc2'] = result['totalgm_b']
result['cbf_wm_pvc0'] = np.nan
result['cbf_wm_pvc2'] = result['deepwm_b']
result['cbf_aca_pvc0'] = np.nan
result['cbf_mca_pvc0'] = np.nan
result['cbf_pca_pvc0'] = np.nan
result['cbf_aca_pvc2'] = result['aca_b']
result['cbf_mca_pvc2'] = result['mca_b']
result['cbf_pca_pvc2'] = result['pca_b']

In [None]:
# CoV columns of the standard layout - need to check with student and scientists!
# The pvc0 columns do not exist in this file set; they are filled with real
# missing values (np.nan) instead of the text 'NaN'.
# The statement order is kept because later cells select columns by position.
result['cov_gm_pvc0'] = np.nan  # does not exist
# result['cov_gm_pvc2']  = result[]
result['cov_wm_pvc0'] = np.nan  # does not exist
# result['cov_wm_pvc2']  = result[]
result['cov_aca_pvc0'] = np.nan  # does not exist
result['cov_mca_pvc0'] = np.nan  # does not exist
result['cov_pca_pvc0'] = np.nan  # does not exist
result['cov_aca_pvc2'] = result['aca_b_cov']
result['cov_mca_pvc2'] = result['mca_b_cov']
result['cov_pca_pvc2'] = result['pca_b_cov']


     # 'ACA_proximal_B_cov',
     #   'ACA_proximal_L_cov', 'ACA_proximal_R_cov', 'ACA_intermediate_B_cov',
     #   'ACA_intermediate_L_cov', 'ACA_intermediate_R_cov', 'ACA_distal_B_cov',
     #   'ACA_distal_R_cov', 'MCA_proximal_B_cov', 'MCA_proximal_L_cov',
     #   'MCA_proximal_R_cov', 'MCA_intermediate_B_cov',
     #   'MCA_intermediate_L_cov', 'MCA_intermediate_R_cov', 'MCA_distal_B_cov',
     #   'MCA_distal_L_cov', 'MCA_distal_R_cov', 'PCA_proximal_B_cov',
     #   'PCA_proximal_L_cov', 'PCA_proximal_R_cov', 'PCA_intermediate_B_cov',
     #   'PCA_intermediate_L_cov', 'PCA_intermediate_R_cov', 'PCA_distal_B_cov',
     #   'PCA_distal_L_cov', 'PCA_distal_R_cov'

In [None]:
for f in result.columns:
    print(f)


In [None]:
# we added a bunch of columns 
len(result.columns)

In [None]:
# set_results= set(result.columns.to_list())

# z = set_results.intersection(set_standard) 
# new_result_columns = []
# for listable in z:
#     new_result_columns.append(listable)
# new_results = result[new_result_columns]
# new_results.columns

In [None]:
# standard.columns[:33]

In [None]:
# new_results = new_results[standard.columns[:33]]
# new_results

In [None]:
# Recode sex from numbers to letters: 0 -> 'F', 1 -> 'M'.
# Series.map turns any value that is not a key of the mapping into NaN.
sex_mapping = {0: 'F', 1: 'M'}
recoded_sex = result['sex'].map(sex_mapping)
# assign returns a new frame, so `result` itself is left unchanged.
results = result.assign(sex=recoded_sex)
results

## here need to reorder the columns again

In [None]:
standard.columns

In [None]:
#results.columns

In [None]:
len(results.columns)

In [None]:
# Columns to place directly after participant_id and session_id, as
# (column to take out, name to insert it under), in their final order.
# NOTE(review): the two ratio columns are taken out as '*_icv_ratio' but put back
# as '*_ivc_ratio'; the spelling is kept as it was — confirm which one the standard expects.
leading_columns = [
    ('run_id', 'run_id'),
    ('age', 'age'),
    ('sex', 'sex'),
    ('site', 'site'),
    ('gm_vol', 'gm_vol'),
    ('wm_vol', 'wm_vol'),
    ('csf_vol', 'csf_vol'),
    ('gm_icv_ratio', 'gm_ivc_ratio'),
    ('gmwm_icv_ratio', 'gmwm_ivc_ratio'),
    ('wmh_vol', 'wmh_vol'),
    ('wmh_count', 'wmh_count'),
]

# First take all of them out of the frame ...
taken_out = [(target, results.pop(source)) for source, target in leading_columns]

# ... then put them back one by one, starting at position 2.
for position, (target, values) in enumerate(taken_out, start=2):
    results.insert(position, target, values)

results

In [None]:
# Columns that are removed from the conformed frame.
obsolete_columns = [
    'index',
    'subjectnlist',
    'session',
    'longitudinaltimepoint',
    'tp',
    'gmwm_icvratio',
    'gm_icvratio',
]
results = results.drop(columns=obsolete_columns)

In [None]:
results

In [None]:
# for column in results.columns:
#     print(column, type(results[column][9]))
standard.columns

In [None]:
for f in results.columns:
    print(f)

In [None]:
len(results.columns)

## now take last 20 columns, and put them after wmh count
## generally reorder columns!
not done yet

In [None]:
# Split the column names by position into three consecutive groups.
# NOTE(review): 13 and 97 are hard-coded positions that depend on the exact
# number of columns at this point; if the inputs change, tail_columns may be
# empty or hold the wrong columns — confirm against len(results.columns).
head_columns = results.columns[:13]
middle_columns = results.columns[13:97]
tail_columns = results.columns[97:]

In [None]:
# Rebuild the frame with the tail group moved directly after the head group,
# followed by the middle group.
f_results = pd.concat([results[head_columns],results[tail_columns],results[middle_columns]], axis=1)
f_results.columns

In [None]:
for column in f_results.columns:
    print(column)

In [None]:
# Convert every column from position 6 onwards to numbers;
# errors='coerce' turns values that cannot be parsed into NaN.
for name in f_results.columns[6:]:
    f_results[name] = pd.to_numeric(f_results[name], errors='coerce')


In [None]:
f_results

In [None]:
sep.check_sex_dimorph_expectations(f_results)

In [None]:
len(f_results.columns.to_list())

In [None]:
# # maybe examine bigger graphs?
# numeric_results = results.iloc[:, 2:55]
# numeric_results = numeric_results.drop('sex', axis= 1) 
# numeric_results = numeric_results.drop('site', axis= 1) 
# #numeric_results = numeric_results.drop('renumber', axis= 1) 
# #numeric_results = numeric_results.dropna(axis=0) 
# sep.relate_columns_graphs(numeric_results, 'age')

In [None]:
sep.relate_columns_graphs_numeric(f_results, 'age')

In [None]:
# Look for outliers in every column from position 6 onwards.
# NOTE(review): the meaning of the last argument (2) is set by
# sep.find_outliers_by_list — presumably a threshold; verify against cvasl.seperated.
doubled_outliers =sep.find_outliers_by_list(f_results, f_results.columns.to_list()[6:], 2)

In [None]:
doubled_outliers

## Save outlier for M.

In [None]:
# Save the possible outliers as CSV in the internal results folder.
filepath = '../open_work/internal_results'
filename = os.path.join(filepath, 'more_possible_outliers.csv')
# exist_ok=True creates the folder only when it is missing, without a separate check.
os.makedirs(filepath, exist_ok=True)
doubled_outliers.to_csv(filename)

## Save off file of data

In [None]:
# Save the stitched, conformed dataset as CSV in the internal results folder.
filepath = '../open_work/internal_results'
filename = os.path.join(filepath, 'top_stitched_conformed.csv')
# exist_ok=True creates the folder only when it is missing, without a separate check.
os.makedirs(filepath, exist_ok=True)
f_results.to_csv(filename)

In [None]:
# for f in f_results.columns: 
#     print(f)

In [None]:
# f_results[['gm_ivc_ratio','gmwm_ivc_ratio']]