In [1]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config


## Read data into pandas dataframe

How do we define which files should be stitched together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, we will will use the first option

In [2]:
experiment_folder= 'StrokeMRI'

 StrokeMRI


In [7]:
config = Config()
root_directory = config.get_directory('raw_data')
if os.path.isdir(os.path.join(root_directory, experiment_folder)):
    print("this folder exists, we will take tsv from here")
else: 
    print("this folder does not seem to exist, try typing again")

this folder exists, we will take tsv from here


In [8]:
# Identify files
#experiment_folder = 'TOP'

root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

tsv_files = [os.path.join(tsv_path, file) for file in os.listdir(tsv_path) if file.endswith('.tsv')]
tsv_files

['C:/Projects/brainspin/not_pushed/data_anonymized/StrokeMRI\\CoV_qCBF_StandardSpace_DeepWM_n=589_13-Jan-2023_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/StrokeMRI\\CoV_qCBF_StandardSpace_TotalGM_n=589_13-Jan-2023_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/StrokeMRI\\mean_qCBF_StandardSpace_DeepWM_n=589_13-Jan-2023_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/StrokeMRI\\mean_qCBF_StandardSpace_TotalGM_n=589_13-Jan-2023_PVC2.tsv']

In [9]:
# Read files into dataframes
dataframes = [pd.read_csv(file, sep='\t', header=[0,1], index_col=0) for file in tsv_files]
sample_df = dataframes[0]
cols = sample_df.columns
sample_df

participant_id,session,LongitudinalTimePoint,SubjectNList,Site,GM_vol,WM_vol,CSF_vol,GM_ICVRatio,GMWM_ICVRatio,WMH_vol,WMH_count,DeepWM_B,DeepWM_L,DeepWM_R
StudyID,...,integer,integer,integer,Liter,Liter,Liter,ratio GM/ICV,ratio (GM+WM)/ICV,mL,n lesions (integer),SD/mean,SD/mean,SD/mean
sub-5908001_1,ASL_1,1,1,1,,,,,,,,0.0000,0.0000,0.0000
sub-5908201_1,ASL_1,1,2,1,0.64754,0.49441,0.31320,0.44500,0.78476,1.249,24.0,8.0434,8.6306,7.1949
sub-5908301_1,ASL_1,1,3,1,0.60517,0.48594,0.42304,0.39968,0.72061,14.597,25.0,3.7791,3.7875,3.6219
sub-5908401_1,ASL_1,1,4,1,0.62367,0.53915,0.43119,0.39126,0.72949,73.385,47.0,1.5444,1.5989,1.4925
sub-5908501_1,ASL_1,1,5,1,0.61724,0.53779,0.33692,0.41371,0.77417,6.341,30.0,5.1248,5.2018,4.9273
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sub-5944002_1,ASL_1,1,585,1,0.57528,0.50907,0.52840,0.35671,0.67236,22.915,32.0,2.3344,2.3700,1.9594
sub-5944101_1,ASL_1,1,586,1,0.58338,0.48623,0.41528,0.39288,0.72033,4.745,25.0,4.3768,4.3178,4.4395
sub-5944102_1,ASL_1,1,587,1,0.58675,0.47585,0.42899,0.39337,0.71239,5.249,30.0,3.8390,4.0910,3.5403
sub-5944201_1,ASL_1,1,588,1,0.63929,0.55235,0.39661,0.40251,0.75028,8.612,9.0,4.9303,4.4489,5.5673


## Copy identical columns from any file

we could also read it from the data, but if it's always the same, we can just define it here

In [14]:
# how many identical columns are there in the files?
n_identical = 11  # columns A - L

stitched = sample_df[cols[:n_identical]].copy()
stitched['renumber'] = stitched.index


In [22]:
#stitched.columns

In [None]:
# Would be nice to add a test here to double check that identical columns are actually indentical

## Add unique columns from files

In [15]:
for df in dataframes:
    for col in df.columns[n_identical:]:
        stitched[col] = df[col]

stitched.columns

MultiIndex([(              'session',                 '...'),
            ('LongitudinalTimePoint',             'integer'),
            (         'SubjectNList',             'integer'),
            (                 'Site',             'integer'),
            (               'GM_vol',               'Liter'),
            (               'WM_vol',               'Liter'),
            (              'CSF_vol',               'Liter'),
            (          'GM_ICVRatio',        'ratio GM/ICV'),
            (        'GMWM_ICVRatio',   'ratio (GM+WM)/ICV'),
            (              'WMH_vol',                  'mL'),
            (            'WMH_count', 'n lesions (integer)'),
            (             'renumber',                    ''),
            (             'DeepWM_B',             'SD/mean'),
            (             'DeepWM_L',             'SD/mean'),
            (             'DeepWM_R',             'SD/mean'),
            (            'TotalGM_B',             'SD/mean'),
        

## Add sex and age data

In [17]:
#sexage_df


In [18]:
sexage_path = os.path.join(root_directory, 'age_data', 'Age_Sex_StrokeMRI.csv')
sexage_df = pd.read_csv(sexage_path, index_col=0)
sexage_df['renumber'] = sexage_df.index
sexage_df['renumber'] = sexage_df['renumber'].apply(str)

#sexage_df['renumber']
# tp 2 then + '2_1' if 1 then 1_1

In [19]:
sexage_df.loc[sexage_df.TP == 1, 'add_column'] = "01_1"
sexage_df.loc[sexage_df.TP == 2, 'add_column'] = "02_1"
sexage_df['renumber'] = sexage_df['renumber'] + sexage_df['add_column']
sexage_df

Unnamed: 0_level_0,TP,Sex,Age,renumber,add_column
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
59080,1,1,29.819178,5908001_1,01_1
59082,1,1,43.172603,5908201_1,01_1
59083,1,1,66.367123,5908301_1,01_1
59084,1,0,65.852055,5908401_1,01_1
59085,1,1,55.838356,5908501_1,01_1
...,...,...,...,...,...
59440,2,0,74.769863,5944002_1,02_1
59441,1,0,73.608219,5944101_1,01_1
59441,2,0,74.512329,5944102_1,02_1
59442,1,0,67.526027,5944201_1,01_1


Now we need to reformat the participant ID

In [24]:
stitched = stitched.reset_index(drop=True)
sexage_df = sexage_df.reset_index(drop=True)
result = pd.concat([stitched, sexage_df], axis=1, join="inner")
result

Unnamed: 0,"(session, ...)","(LongitudinalTimePoint, integer)","(SubjectNList, integer)","(Site, integer)","(GM_vol, Liter)","(WM_vol, Liter)","(CSF_vol, Liter)","(GM_ICVRatio, ratio GM/ICV)","(GMWM_ICVRatio, ratio (GM+WM)/ICV)","(WMH_vol, mL)",...,"(DeepWM_L, mL/100g/min)","(DeepWM_R, mL/100g/min)","(TotalGM_B, mL/100g/min)","(TotalGM_L, mL/100g/min)","(TotalGM_R, mL/100g/min)",TP,Sex,Age,renumber,add_column
0,ASL_1,1,1,1,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,1,1,29.819178,5908001_1,01_1
1,ASL_1,1,2,1,0.64754,0.49441,0.31320,0.44500,0.78476,1.249,...,25.1854,28.4596,84.4527,85.0203,83.7856,1,1,43.172603,5908201_1,01_1
2,ASL_1,1,3,1,0.60517,0.48594,0.42304,0.39968,0.72061,14.597,...,26.1957,23.7556,67.1158,67.4091,66.8006,1,1,66.367123,5908301_1,01_1
3,ASL_1,1,4,1,0.62367,0.53915,0.43119,0.39126,0.72949,73.385,...,29.5016,28.5958,54.2586,52.4991,56.3248,1,0,65.852055,5908401_1,01_1
4,ASL_1,1,5,1,0.61724,0.53779,0.33692,0.41371,0.77417,6.341,...,28.3792,27.0853,64.3183,68.5788,59.9463,1,1,55.838356,5908501_1,01_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
584,ASL_1,1,585,1,0.57528,0.50907,0.52840,0.35671,0.67236,22.915,...,31.4615,26.0258,61.8818,65.3882,58.5500,2,0,74.769863,5944002_1,02_1
585,ASL_1,1,586,1,0.58338,0.48623,0.41528,0.39288,0.72033,4.745,...,25.0157,26.2155,53.5166,54.9420,52.3440,1,0,73.608219,5944101_1,01_1
586,ASL_1,1,587,1,0.58675,0.47585,0.42899,0.39337,0.71239,5.249,...,26.5315,27.7642,54.3541,54.6246,54.1232,2,0,74.512329,5944102_1,02_1
587,ASL_1,1,588,1,0.63929,0.55235,0.39661,0.40251,0.75028,8.612,...,24.3498,23.7347,59.2195,58.3677,60.1301,1,0,67.526027,5944201_1,01_1


In [27]:
# for col in sexage_df:
#     stitched[col] = sexage_df[col]

result.columns = [c[0]  for c in result.columns]
result.columns

Index(['session', 'LongitudinalTimePoint', 'SubjectNList', 'Site', 'GM_vol',
       'WM_vol', 'CSF_vol', 'GM_ICVRatio', 'GMWM_ICVRatio', 'WMH_vol',
       'WMH_count', 'renumber', 'DeepWM_B', 'DeepWM_L', 'DeepWM_R',
       'TotalGM_B', 'TotalGM_L', 'TotalGM_R', 'DeepWM_B', 'DeepWM_L',
       'DeepWM_R', 'TotalGM_B', 'TotalGM_L', 'TotalGM_R', 'T', 'S', 'A', 'r',
       'a'],
      dtype='object')

## save off file

In [28]:
 
filepath = '../open_work/internal_results/StrokeMRI_stitched.csv' 
result.to_csv(filepath)  

# Look at columns

In [29]:
result.columns

Index(['session', 'LongitudinalTimePoint', 'SubjectNList', 'Site', 'GM_vol',
       'WM_vol', 'CSF_vol', 'GM_ICVRatio', 'GMWM_ICVRatio', 'WMH_vol',
       'WMH_count', 'renumber', 'DeepWM_B', 'DeepWM_L', 'DeepWM_R',
       'TotalGM_B', 'TotalGM_L', 'TotalGM_R', 'DeepWM_B', 'DeepWM_L',
       'DeepWM_R', 'TotalGM_B', 'TotalGM_L', 'TotalGM_R', 'T', 'S', 'A', 'r',
       'a'],
      dtype='object')

In [None]:
stitched

In [None]:
def concat_double_header(dataframe_dub):
    dataframe = dataframe_dub.copy()
    dataframe.columns = [c[0] + "_" + c[1] for c in dataframe.columns]
    return dataframe

In [None]:
lo = concat_double_header(stitched)
lo.columns

In [None]:


dataframe = stitched.copy()
dataframe.columns = [c[0] + "_" + c[1] for c in dataframe.columns]

#dataframe.columns

In [None]:
# col=dataframe.columns.to_list()

# for i in col:
#     print(dataframe[i].shape)
#     print(dataframe[i])

In [None]:
# special_column = 'Age'
# all_columns = dataframe.columns
# notmain = dataframe.drop(special_column, axis=1)
# notmain_columns = notmain.columns

In [None]:
def relate_columns_graphs(dataframe, special_column_name):
    y = dataframe[special_column_name]
    col = dataframe.columns.to_list()
    a = len(col)  # number of rows
    b = 1  # number of columns
    c = 1  # initialize plot counter
    
    fig = plt.figure(figsize=(10,(len(col)*3)))
    for i in col:
        plt.subplot(a, b, c)
        plt.scatter(dataframe[i], y)
        plt.title('{}, subplot: {}{}{}'.format(i, a, b, c))
        plt.xlabel(i)
        c = c + 1
    plt.savefig(("versus"+special_column_name +".png")) 
                     


In [None]:
sep.relate_columns_graphs(dataframe, 'Age_')

In [32]:
topper =pd.read_csv('../open_work/internal_results/top_stitched.csv')
topper

Unnamed: 0,participant_id,session,LongitudinalTimePoint,SubjectNList,Site,GM_vol,WM_vol,CSF_vol,GM_ICVRatio,GMWM_ICVRatio,...,PCA_intermediate_R.1,PCA_distal_B.1,PCA_distal_L.1,PCA_distal_R.1,TotalGM_B.1,TotalGM_L.1,TotalGM_R.1,TP,Sex,Age
0,StudyID,...,integer,integer,integer,Liter,Liter,Liter,ratio GM/ICV,ratio (GM+WM)/ICV,...,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,mL/100g/min,,,
1,sub-0001_1,ASL_1,1,1,1,0.71736,0.52803,0.31812,0.45881,0.79653,...,76.2191,83.605,71.446,96.2028,60.6981,57.922,63.8339,1.0,1.0,43.49
2,sub-0002_1,ASL_1,1,2,1,0.72383,0.62394,0.25673,0.45112,0.83999,...,69.7295,90.1896,93.376,85.1374,71.6047,74.1927,68.7649,1.0,0.0,38.30
3,sub-0005_1,ASL_1,1,3,1,0.71919,0.44499,0.39444,0.46143,0.74693,...,82.4922,100.5462,105.0117,94.6801,82.4797,82.9234,81.9457,,,
4,sub-0006_1,ASL_1,1,4,1,0.64079,0.52942,0.27159,0.44444,0.81163,...,59.1239,66.4437,68.8396,64.2212,63.7201,65.3173,62.0529,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
891,sub-1165_1,ASL_1,1,891,1,0.71537,0.48601,0.27123,0.48578,0.81582,...,72.8863,85.63,92.9272,79.7285,77.103,76.9157,77.3044,1.0,0.0,33.86
892,sub-1166_1,ASL_1,1,892,1,0.68938,0.52763,0.26509,0.46514,0.82114,...,49.2011,58.2895,59.8466,56.0928,54.525,55.3724,53.5625,,,
893,sub-1167_1,ASL_1,1,893,1,0.73473,0.53549,0.35985,0.45074,0.77924,...,48.8532,67.8209,68.9911,65.9914,54.4204,54.3193,54.5318,1.0,1.0,33.92
894,sub-1168_1,ASL_1,1,894,1,0.71094,0.60974,0.3748,0.41931,0.77894,...,41.1785,49.5986,51.929,47.0285,51.4417,52.1897,50.5664,1.0,1.0,45.31
