In [2]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)

import sys
sys.path.insert(0, '../') # path to functions
from cvasl.file_handler import Config

## Read data into pandas dataframe

How do we define which files should be stitched together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, I will use the first option

In [4]:
# Identify files
experiment_folder = 'TOP'


config = Config()
root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

tsv_files = [os.path.join(tsv_path, file) for file in os.listdir(tsv_path) if file.endswith('.tsv')]
tsv_files


['C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\CoV_qCBF_StandardSpace_DeepWM_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\CoV_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\CoV_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_Prox_Med_Dist_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\CoV_qCBF_StandardSpace_TotalGM_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\mean_qCBF_StandardSpace_DeepWM_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\mean_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\mean_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_Prox_Med_Dist_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/

In [11]:
# Read files into dataframes

dataframes = [pd.read_csv(file, sep='\t', header=[0,1], index_col=0) for file in tsv_files]
sample_df = dataframes[0]
cols = sample_df.columns
sample_df


participant_id,session,LongitudinalTimePoint,SubjectNList,Site,GM_vol,WM_vol,CSF_vol,GM_ICVRatio,GMWM_ICVRatio,WMH_vol,WMH_count,DeepWM_B,DeepWM_L,DeepWM_R
StudyID,...,integer,integer,integer,Liter,Liter,Liter,ratio GM/ICV,ratio (GM+WM)/ICV,mL,n lesions (integer),SD/mean,SD/mean,SD/mean
sub-0001_1,ASL_1,1,1,1,0.71736,0.52803,0.31812,0.45881,0.79653,1.743,24.0,8.7620,8.5880,8.9388
sub-0002_1,ASL_1,1,2,1,0.72383,0.62394,0.25673,0.45112,0.83999,1.629,23.0,9.0749,7.6862,10.2840
sub-0005_1,ASL_1,1,3,1,0.71919,0.44499,0.39444,0.46143,0.74693,,,5.0065,5.1959,4.7791
sub-0006_1,ASL_1,1,4,1,0.64079,0.52942,0.27159,0.44444,0.81163,1.631,20.0,8.2278,8.2301,8.0645
sub-0007_1,ASL_1,1,5,1,0.70341,0.52337,0.31289,0.45686,0.79678,0.681,9.0,7.9812,7.7579,8.1308
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sub-1165_1,ASL_1,1,891,1,0.71537,0.48601,0.27123,0.48578,0.81582,1.425,18.0,6.6858,6.3732,7.0912
sub-1166_1,ASL_1,1,892,1,0.68938,0.52763,0.26509,0.46514,0.82114,0.723,18.0,9.9917,10.8378,9.0693
sub-1167_1,ASL_1,1,893,1,0.73473,0.53549,0.35985,0.45074,0.77924,3.935,20.0,4.5665,4.8245,4.3038
sub-1168_1,ASL_1,1,894,1,0.71094,0.60974,0.37480,0.41931,0.77894,4.170,26.0,9.5267,9.9459,9.0904


## Copy identical columns from any file

we could also read it from the data, but if it's always the same, we can just define it here

In [12]:
# how many identical columns are there in the files?
n_identical = 12  # columns A - L

stitched = sample_df[cols[:n_identical]].copy()
stitched


participant_id,session,LongitudinalTimePoint,SubjectNList,Site,GM_vol,WM_vol,CSF_vol,GM_ICVRatio,GMWM_ICVRatio,WMH_vol,WMH_count,DeepWM_B
StudyID,...,integer,integer,integer,Liter,Liter,Liter,ratio GM/ICV,ratio (GM+WM)/ICV,mL,n lesions (integer),SD/mean
sub-0001_1,ASL_1,1,1,1,0.71736,0.52803,0.31812,0.45881,0.79653,1.743,24.0,8.7620
sub-0002_1,ASL_1,1,2,1,0.72383,0.62394,0.25673,0.45112,0.83999,1.629,23.0,9.0749
sub-0005_1,ASL_1,1,3,1,0.71919,0.44499,0.39444,0.46143,0.74693,,,5.0065
sub-0006_1,ASL_1,1,4,1,0.64079,0.52942,0.27159,0.44444,0.81163,1.631,20.0,8.2278
sub-0007_1,ASL_1,1,5,1,0.70341,0.52337,0.31289,0.45686,0.79678,0.681,9.0,7.9812
...,...,...,...,...,...,...,...,...,...,...,...,...
sub-1165_1,ASL_1,1,891,1,0.71537,0.48601,0.27123,0.48578,0.81582,1.425,18.0,6.6858
sub-1166_1,ASL_1,1,892,1,0.68938,0.52763,0.26509,0.46514,0.82114,0.723,18.0,9.9917
sub-1167_1,ASL_1,1,893,1,0.73473,0.53549,0.35985,0.45074,0.77924,3.935,20.0,4.5665
sub-1168_1,ASL_1,1,894,1,0.71094,0.60974,0.37480,0.41931,0.77894,4.170,26.0,9.5267


In [None]:
# Would be nice to add a test here to double check that identical columns are actually indentical

## Add unique columns from files

In [13]:
for df in dataframes:
    for col in df.columns[n_identical:]:
        stitched[col] = df[col]

stitched

participant_id,session,LongitudinalTimePoint,SubjectNList,Site,GM_vol,WM_vol,CSF_vol,GM_ICVRatio,GMWM_ICVRatio,WMH_vol,...,PCA_proximal_L,PCA_proximal_R,PCA_intermediate_B,PCA_intermediate_L,PCA_intermediate_R,PCA_distal_B,PCA_distal_L,PCA_distal_R,TotalGM_L,TotalGM_R
StudyID,...,integer,integer,integer,Liter,Liter,Liter,ratio GM/ICV,ratio (GM+WM)/ICV,mL,...,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min
sub-0001_1,ASL_1,1,1,1,0.71736,0.52803,0.31812,0.45881,0.79653,1.743,...,41.9488,57.7411,65.9906,56.1429,76.2191,83.6050,71.4460,96.2028,57.9220,63.8339
sub-0002_1,ASL_1,1,2,1,0.72383,0.62394,0.25673,0.45112,0.83999,1.629,...,62.0036,60.4720,71.5105,73.7114,69.7295,90.1896,93.3760,85.1374,74.1927,68.7649
sub-0005_1,ASL_1,1,3,1,0.71919,0.44499,0.39444,0.46143,0.74693,,...,68.9295,72.2354,84.4929,86.6735,82.4922,100.5462,105.0117,94.6801,82.9234,81.9457
sub-0006_1,ASL_1,1,4,1,0.64079,0.52942,0.27159,0.44444,0.81163,1.631,...,59.2353,55.3051,61.0346,63.8390,59.1239,66.4437,68.8396,64.2212,65.3173,62.0529
sub-0007_1,ASL_1,1,5,1,0.70341,0.52337,0.31289,0.45686,0.79678,0.681,...,82.6400,78.5647,92.5715,91.9522,93.0762,113.2723,114.2614,112.1711,94.7247,91.2521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sub-1165_1,ASL_1,1,891,1,0.71537,0.48601,0.27123,0.48578,0.81582,1.425,...,61.6648,65.4647,74.8321,77.6994,72.8863,85.6300,92.9272,79.7285,76.9157,77.3044
sub-1166_1,ASL_1,1,892,1,0.68938,0.52763,0.26509,0.46514,0.82114,0.723,...,40.8445,42.1407,49.1094,48.9355,49.2011,58.2895,59.8466,56.0928,55.3724,53.5625
sub-1167_1,ASL_1,1,893,1,0.73473,0.53549,0.35985,0.45074,0.77924,3.935,...,39.8501,42.9568,49.3340,49.8061,48.8532,67.8209,68.9911,65.9914,54.3193,54.5318
sub-1168_1,ASL_1,1,894,1,0.71094,0.60974,0.37480,0.41931,0.77894,4.170,...,43.5299,38.3813,46.5101,52.5951,41.1785,49.5986,51.9290,47.0285,52.1897,50.5664


## Add sex and age data

In [22]:
sexage_path = os.path.join(root_directory, 'age_data', 'Age_Sex_TOP.csv')
sexage_df = pd.read_csv(sexage_path, index_col=0)
sexage_df




Unnamed: 0_level_0,TP,Sex,Age
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
sub-0001_1,1,1,43.49
sub-0002_1,1,0,38.30
sub-0019_1,1,1,32.30
sub-0020_1,1,0,21.97
sub-0022_1,1,0,37.52
...,...,...,...
sub-1163_1,1,0,19.06
sub-1165_1,1,0,33.86
sub-1167_1,1,1,33.92
sub-1168_1,1,1,45.31


In [26]:
for col in sexage_df[2:]:
    stitched[col] = sexage_df[col]

stitched[['GM_vol', 'Sex', 'Age']]

participant_id,GM_vol,Sex,Age
StudyID,Liter,Unnamed: 2_level_1,Unnamed: 3_level_1
sub-0001_1,0.71736,1.0,43.49
sub-0002_1,0.72383,0.0,38.30
sub-0005_1,0.71919,,
sub-0006_1,0.64079,,
sub-0007_1,0.70341,,
...,...,...,...
sub-1165_1,0.71537,0.0,33.86
sub-1166_1,0.68938,,
sub-1167_1,0.73473,1.0,33.92
sub-1168_1,0.71094,1.0,45.31
