In [1]:
import pandas as pd
from pathlib import Path
import numpy as np

In [2]:
# Location of the NBIA digest spreadsheet that the next cell loads with pd.read_excel.
# NOTE(review): absolute, user-specific path; it has to be edited before the
# notebook can run on another machine.
digest_file_path = Path("/Users/katyscott/Downloads/TCIA_FDG-PET-CT-Lesions_v1-nbia-digest.xlsx")

In [3]:
# Load the digest: the columns used below are 'Patient ID', 'Modality' and
# 'Series Instance UID'.
filtered_data = pd.read_excel(digest_file_path)

# sample size (unique PatientIDs)
sample_size = filtered_data['Patient ID'].nunique()

# Number of distinct series per modality.
# A single groupby replaces one full-frame filter per modality. Its keys come
# back in sorted order and rows with a missing Modality are dropped, so the
# printed order is the same on every kernel restart (iterating a set of strings
# is not) and no meaningless "nan count: 0" entry can appear.
series_per_modality = filtered_data.groupby('Modality')['Series Instance UID'].nunique()
modality_counts = {modality: int(count) for modality, count in series_per_modality.items()}
modalities = list(modality_counts)

# number of samples with segmentations
# Modalities absent from the digest are reported as 0 instead of being omitted.
seg_modalities = ["RTSTRUCT", "SEG"]
patients_per_modality = filtered_data.groupby('Modality')['Patient ID'].nunique()
seg_counts = {seg_modality: int(patients_per_modality.get(seg_modality, 0)) for seg_modality in seg_modalities}

# images / patient: the distinct values taken by "number of unique
# SeriesInstanceUIDs per PatientID", sorted ascending (a set of observed
# counts, not a min-max pair).
images_per_patient = filtered_data.groupby('Patient ID')['Series Instance UID'].nunique().unique()
images_per_patient_range = np.sort(images_per_patient)

print(f'Sample size: {sample_size}')
for modality, count in modality_counts.items():
    print(f'{modality} count: {count}')
print(f'Images per patient range: {images_per_patient_range}')
print(f'Samples with segmentation data: {seg_counts}')

Sample size: 900
PT count: 1014
CT count: 1014
SEG count: 1014
Images per patient range: [ 3  6  9 12 15]
Samples with segmentation data: {'RTSTRUCT': 0, 'SEG': 900}


In [None]:
# Excel workbook of clinical data; the file name refers to a
# "Meningioma-SEG-Class" dataset.
# NOTE(review): absolute, user-specific path.
clinical_file_excel = Path("/Users/katyscott/Downloads/Meningioma-SEG-Class-Clinical-Data.xlsx")

# sheet_name=None loads every sheet, so clinical_data is a dict of DataFrames
# keyed by sheet name rather than a single DataFrame.
# NOTE(review): a later cell re-binds clinical_data to the DataFrame read from
# the FDG-PET-CT CSV, so the meaning of this name depends on which cell ran last.
clinical_data = pd.read_excel(clinical_file_excel, sheet_name=None)
# Display the sheet names found in the workbook.
clinical_data.keys()

In [None]:
# Number of distinct, non-null patients listed on the 'Clinical Data' sheet.
# Needs clinical_data to still be the dict of sheets loaded from the Excel workbook.
clinical_sheet = clinical_data['Clinical Data']
clinical_sheet['Patient'].nunique(dropna=True)

In [4]:
# CSV of per-series metadata with clinical fields for the FDG-PET-CT-Lesions data.
# NOTE(review): absolute, user-specific path.
clinical_file_csv = Path("/Users/katyscott/Downloads/Clinical-Metadata-FDG-PET_CT-Lesions.csv")
# Binds clinical_data to a single DataFrame; the cells below rely on its
# 'Subject ID', 'Study UID', 'Modality' and 'diagnosis' columns.
clinical_data = pd.read_csv(clinical_file_csv)
# Display the available column names.
clinical_data.columns

Index(['Series UID', 'Collection', '3rd Party Analysis',
       'Data Description URI', 'Subject ID', 'Study UID', 'Study Description',
       'Study Date', 'Series Description', 'Manufacturer', 'Modality',
       'SOP Class Name', 'SOP Class UID', 'Number of Images', 'File Size',
       'File Location', 'Download Timestamp', 'diagnosis', 'age', 'sex'],
      dtype='object')

In [17]:
# Patients with at least one row whose diagnosis is 'NEGATIVE'.
# Alternative selection kept for reference: ["LYMPHOMA", "MELANOMA", "LUNG_CANCER"]
has_target_diagnosis = clinical_data['diagnosis'].isin(['NEGATIVE'])
diag_patients = clinical_data.loc[has_target_diagnosis, 'Subject ID'].unique()

# For those patients, count how many distinct diagnoses appear across all of
# their rows (not only the rows that matched above).
in_cohort = clinical_data['Subject ID'].isin(diag_patients)
diag_pat_diagnosis_count = (
    clinical_data[in_cohort]
    .groupby("Subject ID")
    .agg({"diagnosis": "nunique"})
)

# Keep only the patients that carry exactly one distinct diagnosis.
only_one_diagnosis = diag_pat_diagnosis_count['diagnosis'] == 1
diag_pat_single_diagnosis = diag_pat_diagnosis_count[only_one_diagnosis]
diag_pat_single_diagnosis

Unnamed: 0_level_0,diagnosis
Subject ID,Unnamed: 1_level_1
PETCT_0143bab87a,1
PETCT_0225325b91,1
PETCT_0410759456,1
PETCT_048981112f,1
PETCT_05bed31780,1
...,...
PETCT_f8de0cde56,1
PETCT_fa45f610c4,1
PETCT_fc0389a486,1
PETCT_ff1451316e,1


In [18]:
# All metadata rows belonging to the single-diagnosis patients selected above.
single_diagnosis_ids = diag_pat_single_diagnosis.index.unique()
is_single_diagnosis_row = clinical_data["Subject ID"].isin(single_diagnosis_ids)
diag_patient_data = clinical_data[is_single_diagnosis_row]

# Row count per modality for that cohort.
diag_patient_data.groupby("Modality").size()

Modality
CT     471
PT     471
SEG    471
dtype: int64

In [20]:
# Number of distinct studies recorded for each patient in diag_patient_data.
studies_by_patient = diag_patient_data.groupby("Subject ID")
studies_by_patient[["Study UID"]].nunique()

Unnamed: 0_level_0,Study UID
Subject ID,Unnamed: 1_level_1
PETCT_0143bab87a,1
PETCT_0225325b91,1
PETCT_0410759456,2
PETCT_048981112f,1
PETCT_05bed31780,1
...,...
PETCT_f8de0cde56,1
PETCT_fa45f610c4,1
PETCT_fc0389a486,1
PETCT_ff1451316e,1


In [None]:
# Row count per diagnosis label; dropna=False also reports rows with no diagnosis.
# Counts are per metadata row, not per patient.
clinical_data['diagnosis'].value_counts(dropna=False)

In [None]:
# Display clinical_data for visual inspection.
# NOTE(review): what is shown depends on which load cell ran last, because
# clinical_data is bound both to the Excel sheet dict and to the CSV DataFrame.
clinical_data

In [None]:
# NOTE(review): lesion_data is not bound by any cell visible in this notebook;
# confirm it is loaded before this cell is run.
lesion_columns = ['unique_pt_id', 'Lesion #']
pats_and_lesions = lesion_data[lesion_columns]

# One row per patient id, with the number of rows for that patient in a 'size' column.
grp_pats = pats_and_lesions.groupby('unique_pt_id', as_index=False).size()

# Show only the patients that appear on more than one row.
has_multiple_rows = grp_pats['size'] > 1
grp_pats[has_multiple_rows]