# data extraction

In [121]:
import os
import pandas as pd
import numpy as np

def load_data(file_path, filename) -> tuple[list[pd.DataFrame], list[str]]:
    # output: list of dataframes, list of `Image Data ID`
    dirs = os.listdir(file_path)
    data = []
    image_ids = []
    for dir in dirs:
        if dir.endswith(".nii"):
            path = os.path.join(file_path, dir, "stats", filename)
            if not os.path.exists(path):
                print(f"File {path} does not exist.")
                continue
            single_data = pd.read_csv(
                path,
                delim_whitespace=True,
                comment="#",
                header=None,
            )
            image_ids.append(dir[:-4])
            data.append(single_data)
    return data, image_ids

In [122]:
file_path = '/home/zqy/learningFile/PDSrepo/zqy/dataset'
filename = 'aseg+DKT.stats'
data_list, image_ids = load_data(file_path, filename)


  single_data = pd.read_csv(
  single_data = pd.read_csv(
  single_data = pd.read_csv(


In [123]:
# read manifest
manifest = pd.read_csv('../dataset/ADNI1_Screening_1.5T_1_29_2024.csv')
manifest.head(2)

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I62666,013_S_1275,MCI,F,79,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,2/22/2007,NiFTI,1/29/2024
1,I119268,121_S_1322,MCI,F,72,sc,MRI,MPR; ; N3; Scaled_2,Processed,3/02/2007,NiFTI,1/29/2024


In [124]:
headers = 'Index SegId NVoxels Volume_mm3 StructName normMean normStdDev normMin normMax normRange'.split()
headers


['Index',
 'SegId',
 'NVoxels',
 'Volume_mm3',
 'StructName',
 'normMean',
 'normStdDev',
 'normMin',
 'normMax',
 'normRange']

In [125]:
from copy import deepcopy

def add_headers(data_list: list[str], headers=headers):
    output = deepcopy(data_list)
    for data in output:
        data.columns = headers
    return output

In [126]:
data_list = add_headers(data_list, headers)

In [110]:
# show the first data in data_list
data_list[0]

Unnamed: 0,Index,SegId,NVoxels,Volume_mm3,StructName,normMean,normStdDev,normMin,normMax,normRange
0,1,2,222426,227010.218,Left-Cerebral-White-Matter,104.3496,8.7436,29.0,132.0,103.0
1,2,4,18886,19037.465,Left-Lateral-Ventricle,20.4963,11.4587,4.0,69.0,65.0
2,3,5,1221,1274.514,Left-Inf-Lat-Vent,40.3948,15.5353,9.0,85.0,76.0
3,4,7,12445,13125.106,Left-Cerebellum-White-Matter,109.2044,9.7331,53.0,135.0,82.0
4,5,8,48417,47573.122,Left-Cerebellum-Cortex,81.1562,14.3559,8.0,123.0,115.0
...,...,...,...,...,...,...,...,...,...,...
95,96,2029,7117,6672.153,ctx-rh-superiorparietal,74.5181,11.0458,45.0,100.0,55.0
96,97,2030,10247,9421.409,ctx-rh-superiortemporal,76.8637,11.1068,48.0,106.0,58.0
97,98,2031,4924,4484.066,ctx-rh-supramarginal,73.7264,10.9216,46.0,96.0,50.0
98,99,2034,641,575.272,ctx-rh-transversetemporal,80.3978,12.3712,50.0,105.0,55.0


In [111]:
def check_attribute(data_list: pd.DataFrame, attribute):
    # check if specific attribute are the same in all subjects
    failed = 0
    for data in data_list:
        if False in (data.loc[:, attribute] == data_list[0].loc[:, attribute]).values:
            failed += 1
    if failed == 0:
        print(f"All subjects have the same {attribute} attribute.")
    else:
        print(f"There are **{failed}** subjects with different {attribute} attribute.")


In [112]:
check_attribute(data_list, 'SegId')
check_attribute(data_list, 'StructName')

All subjects have the same SegId attribute.
All subjects have the same StructName attribute.


In [113]:
data_list[0].head()

Unnamed: 0,Index,SegId,NVoxels,Volume_mm3,StructName,normMean,normStdDev,normMin,normMax,normRange
0,1,2,222426,227010.218,Left-Cerebral-White-Matter,104.3496,8.7436,29.0,132.0,103.0
1,2,4,18886,19037.465,Left-Lateral-Ventricle,20.4963,11.4587,4.0,69.0,65.0
2,3,5,1221,1274.514,Left-Inf-Lat-Vent,40.3948,15.5353,9.0,85.0,76.0
3,4,7,12445,13125.106,Left-Cerebellum-White-Matter,109.2044,9.7331,53.0,135.0,82.0
4,5,8,48417,47573.122,Left-Cerebellum-Cortex,81.1562,14.3559,8.0,123.0,115.0


In [127]:
def drop_attribute(data_list):
    """
    drop the redundant and index attribute.
    """
    output = deepcopy(data_list)
    attribute_list = [
        'Index', 'SegId', 'NVoxels', 'StructName', 'normRange'
    ]
    for attribute in attribute_list:
        for data in output:
            if attribute in data.columns:
                data.drop(columns=[attribute], inplace=True)
    return output

data_list_drop = drop_attribute(data_list)


In [115]:
def flatten_combine(data_list_drop):
    flatten_data = []
    for data in data_list_drop:
        flatten_data.append(data.to_numpy().flatten())
    return np.vstack(flatten_data)

data_combine = flatten_combine(data_list_drop)

In [116]:
data_combine.shape, type(data_combine)

((3, 500), numpy.ndarray)

In [117]:
data_list_drop[0].head(2)

Unnamed: 0,Volume_mm3,normMean,normStdDev,normMin,normMax
0,227010.218,104.3496,8.7436,29.0,132.0
1,19037.465,20.4963,11.4587,4.0,69.0


In [159]:
def add_labels(data: np.ndarray) -> pd.DataFrame:
    index = image_ids
    columns = np.concatenate([['Volume_mm3'+str(i),
                               'normMean'+str(i),
                               'normStdDev'+str(i),
                               'normMin'+str(i),
                               'normMax'+str(i),
                               ] for i in range(data.shape[1]//5)])
    output = pd.DataFrame(data, columns=columns, index=index)
    return output

data_pd = add_labels(data_combine)

In [132]:
data_pd.head(2)

Unnamed: 0,Volume_mm30,normMean0,normStdDev0,normMin0,normMax0,Volume_mm31,normMean1,normStdDev1,normMin1,normMax1,...,Volume_mm398,normMean98,normStdDev98,normMin98,normMax98,Volume_mm399,normMean99,normStdDev99,normMin99,normMax99
I31143,227010.218,104.3496,8.7436,29.0,132.0,19037.465,20.4963,11.4587,4.0,69.0,...,575.272,80.3978,12.3712,50.0,105.0,5596.531,71.203,8.7248,43.0,97.0
I119400,275483.319,104.4471,9.3109,34.0,133.0,28434.358,20.2901,10.5809,6.0,73.0,...,498.024,77.3474,12.097,51.0,100.0,6005.433,71.6953,9.486,44.0,97.0


In [133]:
manifest.head(2)

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I62666,013_S_1275,MCI,F,79,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,2/22/2007,NiFTI,1/29/2024
1,I119268,121_S_1322,MCI,F,72,sc,MRI,MPR; ; N3; Scaled_2,Processed,3/02/2007,NiFTI,1/29/2024


In [137]:
def check_image_id(data_pd):
    isin_value = sum(data_pd.index.isin(manifest.loc[:, 'Image Data ID']).astype(int))
    if isin_value == data_pd.shape[0]:
        print("All image IDs are valid.")
    else:
        print("Some image IDs are invalid.")
        print(f"isin_value: {isin_value}, data_pd.shape[0]: {data_pd.shape[0]}")

check_image_id(data_pd)

All image IDs are valid.


In [170]:
def add_column(data: pd.DataFrame, col_name:str) -> pd.DataFrame:
    if col_name not in manifest.columns:
        raise ValueError(f"{col_name} not in manifest")
    columns = []
    for idx in data.index.to_list():
        if idx not in manifest.loc[:, 'Image Data ID'].to_list():
            print(f"{idx} is not in Image Data ID attribute!")
            columns.append(None)
        else:
            columns.append(
                manifest[manifest.loc[:, "Image Data ID"] == idx].loc[:, col_name].iloc[0]
            )
    data.insert(0, col_name, columns)

data_final = deepcopy(data_pd)
add_column(data_final, 'Sex')
add_column(data_final, 'Age')
add_column(data_final, 'Group')
    

In [171]:
data_final.head(2)

Unnamed: 0,Group,Age,Sex,Volume_mm30,normMean0,normStdDev0,normMin0,normMax0,Volume_mm31,normMean1,...,Volume_mm398,normMean98,normStdDev98,normMin98,normMax98,Volume_mm399,normMean99,normStdDev99,normMin99,normMax99
I31143,AD,73,M,227010.218,104.3496,8.7436,29.0,132.0,19037.465,20.4963,...,575.272,80.3978,12.3712,50.0,105.0,5596.531,71.203,8.7248,43.0,97.0
I119400,CN,70,M,275483.319,104.4471,9.3109,34.0,133.0,28434.358,20.2901,...,498.024,77.3474,12.097,51.0,100.0,6005.433,71.6953,9.486,44.0,97.0



# trivial
**we find same subject has different data, so we ignore the subject id.**

In [2]:
import pandas as pd

manifest = pd.read_csv('/home/zqy/learningFile/PDSrepo/zqy/dataset/ADNI1_Screening_1.5T_1_29_2024.csv')

In [3]:
manifest.head()

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I62666,013_S_1275,MCI,F,79,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,2/22/2007,NiFTI,1/29/2024
1,I119268,121_S_1322,MCI,F,72,sc,MRI,MPR; ; N3; Scaled_2,Processed,3/02/2007,NiFTI,1/29/2024
2,I59697,116_S_0649,MCI,M,87,sc,MRI,MPR; GradWarp; N3; Scaled,Processed,7/24/2006,NiFTI,1/29/2024
3,I68581,099_S_0880,MCI,M,84,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,10/05/2006,NiFTI,1/29/2024
4,I60760,029_S_1318,MCI,F,83,sc,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,2/17/2007,NiFTI,1/29/2024


In [25]:
import collections
counter = collections.Counter(manifest.loc[:, 'Subject'].values)


In [23]:
max(counter.values())

2

In [18]:
manifest

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
0,I62666,013_S_1275,MCI,F,79,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,2/22/2007,NiFTI,1/29/2024
1,I119268,121_S_1322,MCI,F,72,sc,MRI,MPR; ; N3; Scaled_2,Processed,3/02/2007,NiFTI,1/29/2024
2,I59697,116_S_0649,MCI,M,87,sc,MRI,MPR; GradWarp; N3; Scaled,Processed,7/24/2006,NiFTI,1/29/2024
3,I68581,099_S_0880,MCI,M,84,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,10/05/2006,NiFTI,1/29/2024
4,I60760,029_S_1318,MCI,F,83,sc,MRI,MPR-R; GradWarp; B1 Correction; N3; Scaled,Processed,2/17/2007,NiFTI,1/29/2024
...,...,...,...,...,...,...,...,...,...,...,...,...
1070,I50487,062_S_0730,AD,F,71,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,7/19/2006,NiFTI,1/29/2024
1071,I119802,018_S_0682,AD,M,68,sc,MRI,MPR; ; N3; Scaled_2,Processed,7/06/2006,NiFTI,1/29/2024
1072,I51542,013_S_1205,AD,M,83,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,1/11/2007,NiFTI,1/29/2024
1073,I40312,136_S_0299,AD,F,89,sc,MRI,MPR; GradWarp; B1 Correction; N3; Scaled,Processed,5/01/2006,NiFTI,1/09/2024


In [26]:
manifest[manifest.loc[:, 'Subject'] == '128_S_0188']

Unnamed: 0,Image Data ID,Subject,Group,Sex,Age,Visit,Modality,Description,Type,Acq Date,Format,Downloaded
6,I119369,128_S_0188,MCI,M,86,sc,MRI,MPR-R; GradWarp; N3; Scaled_2,Processed,2/06/2006,NiFTI,1/29/2024
465,I69639,128_S_0188,MCI,M,86,sc,MRI,MPR-R; GradWarp; N3; Scaled,Processed,2/06/2006,NiFTI,1/29/2024


In [None]:
data_final.to_csv(f'../{filename}980.csv', index=False, sep=',', header=True)