In [2]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)

import sys
sys.path.insert(0, '../') # path to functions
from cvasl.file_handler import Config

## Read data into pandas dataframe

How do we define which files should be stitched together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check the first columns of each file to see whether they match

For now, I will use the first option

In [None]:
# Identify files
# Sub-folder (inside the configured raw-data directory) whose TSV files are collected.
experiment_folder = 'TOP'

# Look up the raw-data directory through the project Config and build the folder path.
config = Config()
root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

# os.listdir returns entries in arbitrary order; sorting makes the file order
# (and therefore which file ends up first in the list) reproducible across
# platforms and runs.
tsv_files = [
    os.path.join(tsv_path, file)
    for file in sorted(os.listdir(tsv_path))
    if file.endswith('.tsv')
]
tsv_files


['C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\CoV_qCBF_StandardSpace_DeepWM_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\CoV_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\CoV_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_Prox_Med_Dist_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\CoV_qCBF_StandardSpace_TotalGM_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\mean_qCBF_StandardSpace_DeepWM_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\mean_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/DaniBodor/MyCodes/brainspinner/cvasl/test_data/raw_data/TOP\\mean_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_Prox_Med_Dist_n=895_06-Feb-2023_PVC2.tsv',
 'C:/Users/

In [None]:
# Read files into dataframes
# Each TSV is tab-separated and its first two rows together form a two-level column header.
dataframes = [
    pd.read_csv(tsv_file, sep='\t', header=[0, 1])
    for tsv_file in tsv_files
]

# The first file serves as the reference for the columns.
sample_df = dataframes[0]
cols = sample_df.columns


## Copy identical columns from any file

We could also infer the number of identical columns from the data, but if it is always the same, we can simply define it here.

In [None]:
# Number of leading columns that are identical in every file.
n_identical = 12  # columns A - L

# Start the stitched table from an independent copy of those leading columns
# of the sample file, so later additions do not modify sample_df.
stitched = sample_df.loc[:, cols[:n_identical]].copy()
stitched


Unnamed: 0_level_0,participant_id,session,LongitudinalTimePoint,SubjectNList,Site,GM_vol,WM_vol,CSF_vol,GM_ICVRatio,GMWM_ICVRatio,WMH_vol,WMH_count
Unnamed: 0_level_1,StudyID,...,integer,integer,integer,Liter,Liter,Liter,ratio GM/ICV,ratio (GM+WM)/ICV,mL,n lesions (integer)
0,sub-0001_1,ASL_1,1,1,1,0.71736,0.52803,0.31812,0.45881,0.79653,1.743,24.0
1,sub-0002_1,ASL_1,1,2,1,0.72383,0.62394,0.25673,0.45112,0.83999,1.629,23.0
2,sub-0005_1,ASL_1,1,3,1,0.71919,0.44499,0.39444,0.46143,0.74693,,
3,sub-0006_1,ASL_1,1,4,1,0.64079,0.52942,0.27159,0.44444,0.81163,1.631,20.0
4,sub-0007_1,ASL_1,1,5,1,0.70341,0.52337,0.31289,0.45686,0.79678,0.681,9.0
...,...,...,...,...,...,...,...,...,...,...,...,...
890,sub-1165_1,ASL_1,1,891,1,0.71537,0.48601,0.27123,0.48578,0.81582,1.425,18.0
891,sub-1166_1,ASL_1,1,892,1,0.68938,0.52763,0.26509,0.46514,0.82114,0.723,18.0
892,sub-1167_1,ASL_1,1,893,1,0.73473,0.53549,0.35985,0.45074,0.77924,3.935,20.0
893,sub-1168_1,ASL_1,1,894,1,0.71094,0.60974,0.37480,0.41931,0.77894,4.170,26.0


In [None]:
# Double check that the supposedly identical columns are actually identical in every file.
# Columns are stitched together purely by row position, so a file whose leading
# columns differ (other participants, other order) would silently be misaligned.
reference = sample_df[cols[:n_identical]]
for path, df in zip(tsv_files, dataframes):
    candidate = df[df.columns[:n_identical]]
    # DataFrame.equals compares shape, labels and values; NaNs in the same position count as equal.
    if not candidate.equals(reference):
        raise ValueError(
            f"The first {n_identical} columns of {path} differ from those of {tsv_files[0]}"
        )

## Add unique columns from files

In [None]:
# Append every column after the shared leading ones, file by file.
# If a column label occurs in more than one file, the later file overwrites the earlier one.
for frame in dataframes:
    unique_columns = frame.columns[n_identical:]
    for column in unique_columns:
        stitched[column] = frame[column]

stitched

Unnamed: 0_level_0,participant_id,session,LongitudinalTimePoint,SubjectNList,Site,GM_vol,WM_vol,CSF_vol,GM_ICVRatio,GMWM_ICVRatio,...,PCA_proximal_R,PCA_intermediate_B,PCA_intermediate_L,PCA_intermediate_R,PCA_distal_B,PCA_distal_L,PCA_distal_R,TotalGM_B,TotalGM_L,TotalGM_R
Unnamed: 0_level_1,StudyID,...,integer,integer,integer,Liter,Liter,Liter,ratio GM/ICV,ratio (GM+WM)/ICV,...,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min
0,sub-0001_1,ASL_1,1,1,1,0.71736,0.52803,0.31812,0.45881,0.79653,...,57.7411,65.9906,56.1429,76.2191,83.6050,71.4460,96.2028,60.6981,57.9220,63.8339
1,sub-0002_1,ASL_1,1,2,1,0.72383,0.62394,0.25673,0.45112,0.83999,...,60.4720,71.5105,73.7114,69.7295,90.1896,93.3760,85.1374,71.6047,74.1927,68.7649
2,sub-0005_1,ASL_1,1,3,1,0.71919,0.44499,0.39444,0.46143,0.74693,...,72.2354,84.4929,86.6735,82.4922,100.5462,105.0117,94.6801,82.4797,82.9234,81.9457
3,sub-0006_1,ASL_1,1,4,1,0.64079,0.52942,0.27159,0.44444,0.81163,...,55.3051,61.0346,63.8390,59.1239,66.4437,68.8396,64.2212,63.7201,65.3173,62.0529
4,sub-0007_1,ASL_1,1,5,1,0.70341,0.52337,0.31289,0.45686,0.79678,...,78.5647,92.5715,91.9522,93.0762,113.2723,114.2614,112.1711,93.0743,94.7247,91.2521
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
890,sub-1165_1,ASL_1,1,891,1,0.71537,0.48601,0.27123,0.48578,0.81582,...,65.4647,74.8321,77.6994,72.8863,85.6300,92.9272,79.7285,77.1030,76.9157,77.3044
891,sub-1166_1,ASL_1,1,892,1,0.68938,0.52763,0.26509,0.46514,0.82114,...,42.1407,49.1094,48.9355,49.2011,58.2895,59.8466,56.0928,54.5250,55.3724,53.5625
892,sub-1167_1,ASL_1,1,893,1,0.73473,0.53549,0.35985,0.45074,0.77924,...,42.9568,49.3340,49.8061,48.8532,67.8209,68.9911,65.9914,54.4204,54.3193,54.5318
893,sub-1168_1,ASL_1,1,894,1,0.71094,0.60974,0.37480,0.41931,0.77894,...,38.3813,46.5101,52.5951,41.1785,49.5986,51.9290,47.0285,51.4417,52.1897,50.5664
