In [2]:
import pandas as pd

# Path to the AllSeries TSV file, relative to this notebook's directory.
file_path = '../rawdata/AllSeries.tsv'

# read_table uses a tab delimiter by default, so no explicit separator is needed.
data = pd.read_table(file_path)

# Preview the first five rows as the cell's displayed output.
data.head(n=5)

Unnamed: 0,Collection,PatientID,StudyInstanceUID,Modality,SeriesInstanceUID,SeriesNumber,SeriesDate,BodyPartExamined,ImageCount,TimeStamp,CollectionURI,FileSize,DateReleased
0,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.552215730027...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.189721824525...,507.0,1997-10-03 00:00:00.0,LUNG,50,2015-07-20 17:58:54.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:58:54.0
1,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.980344486630...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.336250251691...,507.0,1997-10-07 00:00:00.0,LUNG,50,2015-07-20 17:40:07.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:40:07.0
2,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.157653211810...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.227929163446...,503.0,1997-09-18 00:00:00.0,LUNG,50,2015-07-20 17:56:27.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:56:27.0
3,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.256783235670...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.925990093742...,502.0,1997-10-14 00:00:00.0,LUNG,50,2015-07-20 17:55:30.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:55:30.0
4,4D-Lung,100_HM10395,1.3.6.1.4.1.14519.5.2.1.6834.5010.980344486630...,CT,1.3.6.1.4.1.14519.5.2.1.6834.5010.139116724721...,501.0,1997-10-07 00:00:00.0,LUNG,50,2015-07-20 17:51:12.0,https://doi.org/10.7937/K9/TCIA.2016.ELN8YGLE,26405988,2015-07-20 17:51:12.0


In [3]:
# Collection to summarise; change this single constant to profile another one.
COLLECTION = 'HNSCC-3DCT-RT'

# Keep only the series rows that belong to the chosen collection.
filtered_data = data[data['Collection'] == COLLECTION]

# Sample size = number of distinct PatientIDs in the collection.
sample_size = filtered_data['PatientID'].nunique()

# Distinct SeriesInstanceUIDs per modality, for every modality present in the
# collection (not a fixed list). One groupby replaces a full boolean-mask scan
# per modality, and its sorted keys give a stable print order -- iterating a
# set of strings does not. Rows with a missing Modality are left out by groupby.
series_per_modality = filtered_data.groupby('Modality')['SeriesInstanceUID'].nunique()
modality_counts = {modality: int(count) for modality, count in series_per_modality.items()}
modalities = list(modality_counts)

# Number of distinct patients that have at least one series of each
# segmentation modality; a modality absent from the collection is reported as 0.
seg_modalities = ["RTSTRUCT", "SEG"]
patients_per_modality = filtered_data.groupby('Modality')['PatientID'].nunique()
seg_counts = {
    seg_modality: int(patients_per_modality.get(seg_modality, 0))
    for seg_modality in seg_modalities
}

# Series per patient: count distinct SeriesInstanceUIDs for each PatientID,
# then keep the distinct counts that occur. Sorted ascending, so the first and
# last entries are the minimum and maximum. Values are cast to plain int so the
# printed list shows numbers rather than NumPy scalar reprs.
images_per_patient = filtered_data.groupby('PatientID')['SeriesInstanceUID'].nunique().unique()
images_per_patient_range = sorted(int(count) for count in images_per_patient)

print(f'Sample size: {sample_size}')
for modality, count in modality_counts.items():
    print(f'{modality} count: {count}')
print(f'Images per patient range: {images_per_patient_range}')
print(f'Samples with segmentation data: {seg_counts}')

Sample size: 31
CT count: 93
RTDOSE count: 93
RTSTRUCT count: 185
Images per patient range: [np.int64(11), np.int64(12)]
Samples with segmentation data: {'RTSTRUCT': 31, 'SEG': 0}
