# Assembling the Inight 46 dataset

### Import libraries

In [1]:
import os       # using operating system dependent functionality (folders)
import pandas as pd # data analysis and manipulation
import numpy as np    # numerical computing (manipulating and performing operations on arrays of data)
import matplotlib.pyplot as plt

import sys
sys.path.insert(0, '../') # path to functions
import cvasl.seperated as sep
from cvasl.file_handler import Config


## Read data into pandas dataframe

How do we define which files should be used together?
options:
- all files in folder
- based on suffix (e.g. "n=895_06-Feb-2023_PVC2.tsv")
- check first columns to see whether it matches

For now, we will will use the first option

In [2]:
experiment_folder= 'Insight46'

In [3]:
config = Config()
root_directory = config.get_directory('raw_data')
if os.path.isdir(os.path.join(root_directory, experiment_folder)):
    print("this folder exists, we will take tsv from here")
else: 
    print("this folder does not seem to exist, try typing again")

this folder exists, we will take tsv from here


In [4]:
root_directory = config.get_directory('raw_data')
tsv_path = os.path.join(root_directory, experiment_folder)

tsv_files = [os.path.join(tsv_path, file) for file in os.listdir(tsv_path) if file.endswith('.tsv')]
tsv_files

['C:/Projects/brainspin/not_pushed/data_anonymized/Insight46\\CoV_qCBF_StandardSpace_DeepWM_n=1670_29-Sep-2022_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/Insight46\\CoV_qCBF_StandardSpace_Hammers_n=1670_29-Sep-2022_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/Insight46\\CoV_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_n=1670_29-Sep-2022_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/Insight46\\CoV_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_Prox_Med_Dist_n=1670_29-Sep-2022_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/Insight46\\CoV_qCBF_StandardSpace_TotalGM_n=1670_29-Sep-2022_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/Insight46\\mean_qCBF_StandardSpace_DeepWM_n=1670_29-Sep-2022_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/Insight46\\mean_qCBF_StandardSpace_Hammers_n=1670_29-Sep-2022_PVC2.tsv',
 'C:/Projects/brainspin/not_pushed/data_anonymized/Insight46\\mean_qCBF_StandardSpace_Tatu_ACA_MCA_PCA_n=16

In [5]:
# Read files into dataframes
dataframes = [pd.read_csv(file, sep='\t', header=[0,1], index_col=0) for file in tsv_files]
sample_df = dataframes[0]
cols = sample_df.columns
sample_df

participant_id,session,LongitudinalTimePoint,SubjectNList,Site,GM_vol,WM_vol,CSF_vol,GM_ICVRatio,GMWM_ICVRatio,WMH_vol,WMH_count,MeanMotion,DeepWM_B,DeepWM_L,DeepWM_R
StudyID,...,integer,integer,integer,Liter,Liter,Liter,ratio GM/ICV,ratio (GM+WM)/ICV,mL,n lesions (integer),mm,SD/mean,SD/mean,SD/mean
sub-100151241_1,ASL_1,TimePoint_1,1,1,,,,,,,,,0.0000,0.0000,0.0000
sub-100151242_1,ASL_1,TimePoint_1,2,1,,,,,,,,,0.0000,0.0000,0.0000
sub-10015124_1,ASL_1,TimePoint_1,3,1,0.56794,0.47687,0.34052,0.40997,0.75420,4.889,16.0,0.16098,10.9918,10.8279,11.0443
sub-10015124_2,ASL_1,TimePoint_2,3,1,0.56664,0.47270,0.34156,0.41034,0.75265,7.444,19.0,0.17257,7.9018,9.0773,6.8846
sub-100248221_1,ASL_1,TimePoint_1,4,1,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
sub-19995714_2,ASL_1,TimePoint_2,1300,1,0.63999,0.54546,0.42336,0.39780,0.73685,1.735,23.0,0.09423,20.3281,16.4068,25.2544
sub-199968291_1,ASL_1,TimePoint_1,1301,1,,,,,,,,,,,
sub-199968292_1,ASL_1,TimePoint_1,1302,1,,,,,,,,,,,
sub-19996829_1,ASL_1,TimePoint_1,1303,1,0.52206,0.45926,0.40260,0.37723,0.70909,15.875,41.0,,,,


## Copy identical columns from any file

we could also read it from the data, but if it's always the same, we can just define it here

In [6]:
# how many identical columns are there in the files?
n_identical = 11  # columns A - L

stitched = sample_df[cols[:n_identical]].copy()
stitched['renumber'] = stitched.index

# Caution, scientists need to confirm the above steps are legitamate, until there is a check that these are the columns that are identical.

In [7]:
for df in dataframes:
    for col in df.columns[n_identical:]:
        stitched[col] = df[col]

stitched.columns

  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[col] = df[col]
  stitched[c

MultiIndex([(              'session',               '...'),
            ('LongitudinalTimePoint',           'integer'),
            (         'SubjectNList',           'integer'),
            (                 'Site',           'integer'),
            (               'GM_vol',             'Liter'),
            (               'WM_vol',             'Liter'),
            (              'CSF_vol',             'Liter'),
            (          'GM_ICVRatio',      'ratio GM/ICV'),
            (        'GMWM_ICVRatio', 'ratio (GM+WM)/ICV'),
            (              'WMH_vol',                'mL'),
            ...
            (       'PCA_proximal_R',       'mL/100g/min'),
            (   'PCA_intermediate_B',       'mL/100g/min'),
            (   'PCA_intermediate_L',       'mL/100g/min'),
            (   'PCA_intermediate_R',       'mL/100g/min'),
            (         'PCA_distal_B',       'mL/100g/min'),
            (         'PCA_distal_L',       'mL/100g/min'),
            (         'P

# Here we note that we have 391 columns...a hand check shows this is possible. Lots pf brain areas left, right and both. But needs discussion with scientists

In [9]:
sexage_path = os.path.join(root_directory, 'age_data', 'Age_sex_Insight46.csv')
sexage_df = pd.read_csv(sexage_path, index_col=0)
sexage_df['renumber'] = sexage_df.index
sexage_df['renumber'] = sexage_df['renumber'].apply(str)

#sexage_df['renumber']
# tp 2 then + '2_1' if 1 then 1_1
sexage_df

Unnamed: 0_level_0,sex,ageatscandate_i46p1,renumber
participant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10015124,2,69.733333,10015124
10024822,2,70.288889,10024822
10041211,1,71.430556,10041211
10075012,1,69.883333,10075012
10113912,1,69.866667,10113912
...,...,...,...
19902911,1,71.341667,19902911
19910919,1,70.741667,19910919
19978012,1,71.686111,19978012
19995714,1,70.944444,19995714


# Here we assume that we only have time point 1 on this dataset. This must be checked with scientists

In [10]:
sexage_df['renumber'] = "sub_" +sexage_df['renumber'] + "_1"
sexage_df

Unnamed: 0_level_0,sex,ageatscandate_i46p1,renumber
participant ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
10015124,2,69.733333,sub_10015124_1
10024822,2,70.288889,sub_10024822_1
10041211,1,71.430556,sub_10041211_1
10075012,1,69.883333,sub_10075012_1
10113912,1,69.866667,sub_10113912_1
...,...,...,...
19902911,1,71.341667,sub_19902911_1
19910919,1,70.741667,sub_19910919_1
19978012,1,71.686111,sub_19978012_1
19995714,1,70.944444,sub_19995714_1


In [11]:
stitched = stitched.reset_index(drop=True)
sexage_df = sexage_df.reset_index(drop=True)
result = pd.concat([stitched, sexage_df], axis=1, join="inner")
result

Unnamed: 0,"(session, ...)","(LongitudinalTimePoint, integer)","(SubjectNList, integer)","(Site, integer)","(GM_vol, Liter)","(WM_vol, Liter)","(CSF_vol, Liter)","(GM_ICVRatio, ratio GM/ICV)","(GMWM_ICVRatio, ratio (GM+WM)/ICV)","(WMH_vol, mL)",...,"(PCA_intermediate_R, mL/100g/min)","(PCA_distal_B, mL/100g/min)","(PCA_distal_L, mL/100g/min)","(PCA_distal_R, mL/100g/min)","(TotalGM_B, mL/100g/min)","(TotalGM_L, mL/100g/min)","(TotalGM_R, mL/100g/min)",sex,ageatscandate_i46p1,renumber
0,ASL_1,TimePoint_1,1,1,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2,69.733333,sub_10015124_1
1,ASL_1,TimePoint_1,2,1,,,,,,,...,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,0.0000,2,70.288889,sub_10024822_1
2,ASL_1,TimePoint_1,3,1,0.56794,0.47687,0.34052,0.40997,0.75420,4.889,...,77.5698,113.5901,120.0490,108.3283,82.9412,84.9229,81.0443,1,71.430556,sub_10041211_1
3,ASL_1,TimePoint_2,3,1,0.56664,0.47270,0.34156,0.41034,0.75265,7.444,...,67.4296,113.8901,116.3125,112.3273,71.9442,74.1086,69.8671,1,69.883333,sub_10075012_1
4,ASL_1,TimePoint_1,4,1,,,,,,,...,,,,,,,,1,69.866667,sub_10113912_1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
396,ASL_1,TimePoint_1,314,1,0.53013,0.40755,0.38581,0.40055,0.70849,0.537,...,115.9928,162.2286,167.8662,158.8206,97.6513,99.8699,95.6118,1,71.341667,sub_19902911_1
397,ASL_1,TimePoint_1,315,1,,,,,,,...,,,,,,,,1,70.741667,sub_19910919_1
398,ASL_1,TimePoint_1,316,1,,,,,,,...,,,,,,,,1,71.686111,sub_19978012_1
399,ASL_1,TimePoint_1,317,1,0.57667,0.45820,0.31145,0.42833,0.76867,1.527,...,84.4920,97.1203,99.6583,96.6426,73.8856,74.0166,73.7748,1,70.944444,sub_19995714_1


In [17]:
result.columns = [c[0]  for c in result.columns]
result.columns

Index(['session', 'LongitudinalTimePoint', 'SubjectNList', 'Site', 'GM_vol',
       'WM_vol', 'CSF_vol', 'GM_ICVRatio', 'GMWM_ICVRatio', 'WMH_vol',
       ...
       'PCA_intermediate_R', 'PCA_distal_B', 'PCA_distal_L', 'PCA_distal_R',
       'TotalGM_B', 'TotalGM_L', 'TotalGM_R', 's', 'a', 'r'],
      dtype='object', length=394)

In [18]:
for n in result.columns:
    print(n)

session
LongitudinalTimePoint
SubjectNList
Site
GM_vol
WM_vol
CSF_vol
GM_ICVRatio
GMWM_ICVRatio
WMH_vol
WMH_count
renumber
MeanMotion
DeepWM_B
DeepWM_L
DeepWM_R
TL_hippocampus_B
TL_hippocampus_L
TL_hippocampus_R
TL_amygdala_B
TL_amygdala_L
TL_amygdala_R
TL_anterior_temporal_lobe_medial_part_B
TL_anterior_temporal_lobe_medial_part_L
TL_anterior_temporal_lobe_medial_part_R
TL_anterior_temporal_lobe_lateral_part_B
TL_anterior_temporal_lobe_lateral_part_L
TL_anterior_temporal_lobe_lateral_part_R
TL_parahippocampal_and_ambient_gyrus_B
TL_parahippocampal_and_ambient_gyrus_L
TL_parahippocampal_and_ambient_gyrus_R
TL_superior_temporal_gyrus_middle_part_B
TL_superior_temporal_gyrus_middle_part_L
TL_superior_temporal_gyrus_middle_part_R
TL_middle_and_inferior_temporal_gyrus_B
TL_middle_and_inferior_temporal_gyrus_L
TL_middle_and_inferior_temporal_gyrus_R
TL_fusiform_gyrus_B
TL_fusiform_gyrus_L
TL_fusiform_gyrus_R
cerebellum_B
cerebellum_L
cerebellum_R
brainstem_excluding_substantia_nigra_B
brain

## Save off file

In [19]:
filepath = '../open_work/internal_results/Insight46_stitched.csv' 
result.to_csv(filepath)  

In [20]:
#result.columns

Index(['session', 'LongitudinalTimePoint', 'SubjectNList', 'Site', 'GM_vol',
       'WM_vol', 'CSF_vol', 'GM_ICVRatio', 'GMWM_ICVRatio', 'WMH_vol',
       ...
       'PCA_intermediate_R', 'PCA_distal_B', 'PCA_distal_L', 'PCA_distal_R',
       'TotalGM_B', 'TotalGM_L', 'TotalGM_R', 's', 'a', 'r'],
      dtype='object', length=394)