In [2]:
import pandas as pd
import numpy as np
from pathlib import Path

# Tab-separated table with one row per imaging series
# (see the Collection / PatientID / Modality / SeriesInstanceUID columns).
file_path = "../rawdata/AllSeries.tsv"
data = pd.read_csv(file_path, delimiter="\t")

# Preview the first rows to check that the columns parsed as expected.
data.head()

Unnamed: 0,Collection,PatientID,StudyInstanceUID,Modality,SeriesInstanceUID,SeriesNumber,SeriesDate,BodyPartExamined,ImageCount,TimeStamp,CollectionURI,FileSize,DateReleased
0,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.552215730027...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.189721824525...,507.0,1997-10-03 00:00:00.0,LUNG,50,2015-07-20 17:58:54.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:58:54.0
1,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.980344486630...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.336250251691...,507.0,1997-10-07 00:00:00.0,LUNG,50,2015-07-20 17:40:07.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:40:07.0
2,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.157653211810...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.227929163446...,503.0,1997-09-18 00:00:00.0,LUNG,50,2015-07-20 17:56:27.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:56:27.0
3,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.256783235670...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.925990093742...,502.0,1997-10-14 00:00:00.0,LUNG,50,2015-07-20 17:55:30.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:55:30.0
4,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.980344486630...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.139116724721...,501.0,1997-10-07 00:00:00.0,LUNG,50,2015-07-20 17:51:12.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:51:12.0


In [18]:
# Summarise a single collection from the series table loaded into `data`.
collection_name = 'Healthy-Total-Body-CTs'
filtered_data = data[data['Collection'] == collection_name]

# A name that matches no rows (typo, different capitalisation, collection not
# in the table) would otherwise produce an all-zero summary with no explanation.
if filtered_data.empty:
    print(
        f"Warning: no rows found for collection '{collection_name}'; "
        "check the name against data['Collection'].unique()"
    )

# sample size (unique PatientIDs)
sample_size = filtered_data['PatientID'].nunique()

# Unique series per modality, for every modality present in the collection.
# groupby returns the modalities in sorted order, so the printed list is stable
# between runs (iterating a set of strings is not), and the frame is scanned
# once instead of once per modality.
modality_counts = filtered_data.groupby('Modality')['SeriesInstanceUID'].nunique().to_dict()
modalities = list(modality_counts)

# number of patients that have at least one series of each segmentation modality
seg_modalities = ["RTSTRUCT", "SEG"]
seg_counts = {
    seg_modality: filtered_data.loc[filtered_data['Modality'] == seg_modality, 'PatientID'].nunique()
    for seg_modality in seg_modalities
}

# images / patient: the sorted distinct values of "unique SeriesInstanceUIDs per
# PatientID" (every distinct count is listed, not only the minimum and maximum)
images_per_patient = filtered_data.groupby('PatientID')['SeriesInstanceUID'].nunique().unique()
images_per_patient_range = np.sort(images_per_patient)

print(f'Sample size: {sample_size}')
for modality, count in modality_counts.items():
    print(f'{modality} count: {count}')
print(f'Images per patient range: {images_per_patient_range}')
print(f'Samples with segmentation data: {seg_counts}')

Sample size: 0
Images per patient range: []
Samples with segmentation data: {'RTSTRUCT': 0, 'SEG': 0}


# IBM Aspera Connect data processing
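File and segmentation counts in this section are derived from the `.sums` checksum files (one `md5 file_name` pair per line) stored in `../rawdata/aspera_files/`.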

In [41]:
# Dataset whose .sums checksum listing is loaded below.
dataset_name = "RSNA-ASNR-MICCAI-BraTS-2021"

# The listing is read as headerless, single-space-delimited text with two
# columns: the checksum and the file path.
aspera_sums_data = pd.read_csv(
    f"../rawdata/aspera_files/{dataset_name}.sums",
    delimiter=" ",
    names=["md5", "file_name"],
    header=None,
)

In [42]:
# Display the parsed .sums table (columns: md5, file_name) to check the load.
aspera_sums_data

Unnamed: 0,md5,file_name
0,623c8e748d6ef779e04c9def5fe8a52f,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
1,fdfa7813aa025fba014531f5b4c1e989,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
2,966c482d486e7a3fe41f8dff1d4ad55f,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
3,8d9af639f705937341c2c7ef8b653391,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
4,74e74fcd62aa74145bd6ad8fd6184807,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
...,...,...
407240,8a29d09e734e5f74e9b1ad7366f462b6,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Validati...
407241,9bb5c3d38248c702182a167080407d10,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Validati...
407242,8f5076820e0f8eccc95e8c7b589654a0,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
407243,940ab71634ac7843886fee5606a704c9,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...


In [55]:
# Rows whose file name contains "TrainingSet/TCGA-LGG" followed later by "seg".
aspera_sums_data.loc[
    aspera_sums_data["file_name"].str.contains(r"TrainingSet/TCGA-LGG.*seg.*")
]

Unnamed: 0,md5,file_name
22603,43842f85c51c403bad28099bd0796ae3,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
28831,f4475676f10bc536e69792d4df49d279,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
40736,6ac24699c02c44f45c27075386f07c92,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
53836,8a867017c8b7745aa13f95b508c6fea1,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
61695,3ebabd8adec975c2bc8a2fdb34916481,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
...,...,...
367745,c07fbafa4de5fd2c83e541257056396e,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
377866,b64967350af1178766bc8d02bea5815e,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
386888,193fcaef157cb10905487957b673e111,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...
388449,0a274f39e94c9910964b07a8e707d55b,RSNA-ASNR-MICCAI-BraTS-2021/BraTS2021_Training...


In [52]:
# Rows whose file name contains "ValidationSet/IvyGAP" followed later by "seg".
aspera_sums_data.loc[
    aspera_sums_data["file_name"].str.contains(r"ValidationSet/IvyGAP.*seg.*")
]

Unnamed: 0,md5,file_name


In [39]:
# Count the segmentation files listed in the .sums table and the number of
# distinct patients they belong to.
segmentation_identifier = "_segmentation"

# regex=False: the identifier is a literal substring, not a pattern.
# na=False: rows without a file name count as non-matching instead of putting
# NaN into the boolean mask.
segmentation_rows = aspera_sums_data[
    aspera_sums_data.file_name.str.contains(segmentation_identifier, regex=False, na=False)
]

# The patient ID is taken to be the name of the directory that directly
# contains each segmentation file (assumption - confirm against the dataset
# layout). `.name` keeps the full directory name, whereas `.stem` would cut it
# at the last "." if the directory name contained one.
patid_series = pd.Series(
    data=[Path(seg_file).parent.name for seg_file in segmentation_rows.file_name.to_list()]
)

print(f"Number of segmentation files: {len(segmentation_rows)}")
print(f"Number of unique patient IDs: {patid_series.nunique()}")

Number of segmentation files: 1503
Number of unique patient IDs: 501


In [40]:
# Number of rows in the .sums table that were not selected as segmentation files.
aspera_sums_data.shape[0] - segmentation_rows.shape[0]

10026

In [19]:
# Directory holding the AI segmentation files. It is built from the current
# user's home directory rather than a hard-coded /Users/<name>/ prefix, so the
# cell is not tied to one account; edit this if the data lives elsewhere.
ai_seg_dir = Path.home() / "Downloads" / "lung-ct" / "ai-segmentations-dcm"

# The patient ID is the leading part of each file stem; its length is that of
# an ID such as "ai_PD-1-Lung-00001" (18 characters).
pat_id_length = len("ai_PD-1-Lung-00001")

# One entry per matching file, in sorted path order (duplicates are kept here
# and collapsed by .unique() below).
pat_ids = [
    file.stem[:pat_id_length]
    for file in sorted(ai_seg_dir.glob("ai_PD-1-Lung*"))
]

pat_ids_series = pd.Series(pat_ids)
pat_ids_series.unique()

array(['ai_PD-1-Lung-00001', 'ai_PD-1-Lung-00002', 'ai_PD-1-Lung-00003',
       'ai_PD-1-Lung-00004', 'ai_PD-1-Lung-00005', 'ai_PD-1-Lung-00006',
       'ai_PD-1-Lung-00007', 'ai_PD-1-Lung-00008', 'ai_PD-1-Lung-00009',
       'ai_PD-1-Lung-00010', 'ai_PD-1-Lung-00011', 'ai_PD-1-Lung-00012',
       'ai_PD-1-Lung-00013', 'ai_PD-1-Lung-00014', 'ai_PD-1-Lung-00015',
       'ai_PD-1-Lung-00016', 'ai_PD-1-Lung-00017', 'ai_PD-1-Lung-00018',
       'ai_PD-1-Lung-00019', 'ai_PD-1-Lung-00020', 'ai_PD-1-Lung-00022',
       'ai_PD-1-Lung-00023', 'ai_PD-1-Lung-00025', 'ai_PD-1-Lung-00026',
       'ai_PD-1-Lung-00027', 'ai_PD-1-Lung-00028', 'ai_PD-1-Lung-00029',
       'ai_PD-1-Lung-00030', 'ai_PD-1-Lung-00031', 'ai_PD-1-Lung-00032',
       'ai_PD-1-Lung-00034', 'ai_PD-1-Lung-00035', 'ai_PD-1-Lung-00036',
       'ai_PD-1-Lung-00037', 'ai_PD-1-Lung-00038', 'ai_PD-1-Lung-00041',
       'ai_PD-1-Lung-00042', 'ai_PD-1-Lung-00043', 'ai_PD-1-Lung-00044',
       'ai_PD-1-Lung-00045', 'ai_PD-1-Lung-00046'],

In [20]:
# Number of distinct patient IDs found among the AI segmentation files.
pat_ids_series.nunique()

41