In [7]:
import asyncio
import re
from collections import defaultdict
from pathlib import Path
from ast import literal_eval
import pandas as pd
from nbiatoolkit import NBIA_ENDPOINT
from nbiatoolkit.nbia import NBIAClient
from nbiatoolkit.settings import Settings
from rich import print
# Read the NBIA login fields from the project settings and hand them to the client,
# so no credential is written in the notebook itself.
settings = Settings()
nbia_login = settings.login
client = NBIAClient(username=nbia_login.nbia_username, password=nbia_login.nbia_password)

In [13]:
# The CSV is expected two directory levels above the current working directory.
rootdir = Path.cwd().parent.parent
csvpath = rootdir.joinpath("rtstructs_with_ref_modality.csv")
assert csvpath.exists(), f'File not found: {csvpath}'

df = pd.read_csv(csvpath)
# Each roi_names cell is read as text; literal_eval parses it back into a Python
# object (presumably a list of ROI names -- confirm against the CSV writer).
df["roi_names"] = df["roi_names"].map(literal_eval)

df.head()

Unnamed: 0,collection,seriesuid,ref_series,ref_study,roi_names,ReferenceSeriesModality
0,RADCURE,1.3.6.1.4.1.14519.5.2.1.2636579726319942239019...,1.3.6.1.4.1.14519.5.2.1.1508874225260709402875...,1.3.6.1.4.1.14519.5.2.1.2438917685611411186973...,"[GTVp, GTVn_[LRETRO], GTVn_[IVR2], GTVn_[IIIR2...",CT
1,RADCURE,1.3.6.1.4.1.14519.5.2.1.1896514473634735057343...,1.3.6.1.4.1.14519.5.2.1.2817221580900473231692...,1.3.6.1.4.1.14519.5.2.1.2346584974952187485363...,"[Brainstem, SpinalCord, Eye_R, Eye_L, Lens_R, ...",CT
2,RADCURE,1.3.6.1.4.1.14519.5.2.1.4768558638856469947398...,1.3.6.1.4.1.14519.5.2.1.7875051069417113297384...,1.3.6.1.4.1.14519.5.2.1.2861906910120462502384...,"[GTVp, GTVn_[R2], Brainstem, SpinalCord, Lips,...",CT
3,RADCURE,1.3.6.1.4.1.14519.5.2.1.1757804829758101191969...,1.3.6.1.4.1.14519.5.2.1.1195947805058344267549...,1.3.6.1.4.1.14519.5.2.1.2825314136035738098810...,"[GTVp, GTVn_[R2i], GTVn_[R2ii], GTVn_[R1b], GT...",CT
4,RADCURE,1.3.6.1.4.1.14519.5.2.1.2289610724896836261610...,1.3.6.1.4.1.14519.5.2.1.2649802235242390129121...,1.3.6.1.4.1.14519.5.2.1.3380478650395524096479...,"[Brainstem, SpinalCord, GTVn_[IIL2], GTVn_[IL2...",CT


In [10]:
# Keep only the rows whose ReferenceSeriesModality is CT or MR.
to_download = df.loc[df["ReferenceSeriesModality"].isin(["CT", "MR"])]
to_download.head()

Unnamed: 0,collection,seriesuid,ref_series,ref_study,roi_names,ReferenceSeriesModality
0,RADCURE,1.3.6.1.4.1.14519.5.2.1.2636579726319942239019...,1.3.6.1.4.1.14519.5.2.1.1508874225260709402875...,1.3.6.1.4.1.14519.5.2.1.2438917685611411186973...,"[GTVp, GTVn_[LRETRO], GTVn_[IVR2], GTVn_[IIIR2...",CT
1,RADCURE,1.3.6.1.4.1.14519.5.2.1.1896514473634735057343...,1.3.6.1.4.1.14519.5.2.1.2817221580900473231692...,1.3.6.1.4.1.14519.5.2.1.2346584974952187485363...,"[Brainstem, SpinalCord, Eye_R, Eye_L, Lens_R, ...",CT
2,RADCURE,1.3.6.1.4.1.14519.5.2.1.4768558638856469947398...,1.3.6.1.4.1.14519.5.2.1.7875051069417113297384...,1.3.6.1.4.1.14519.5.2.1.2861906910120462502384...,"[GTVp, GTVn_[R2], Brainstem, SpinalCord, Lips,...",CT
3,RADCURE,1.3.6.1.4.1.14519.5.2.1.1757804829758101191969...,1.3.6.1.4.1.14519.5.2.1.1195947805058344267549...,1.3.6.1.4.1.14519.5.2.1.2825314136035738098810...,"[GTVp, GTVn_[R2i], GTVn_[R2ii], GTVn_[R1b], GT...",CT
4,RADCURE,1.3.6.1.4.1.14519.5.2.1.2289610724896836261610...,1.3.6.1.4.1.14519.5.2.1.2649802235242390129121...,1.3.6.1.4.1.14519.5.2.1.3380478650395524096479...,"[Brainstem, SpinalCord, GTVn_[IIL2], GTVn_[IL2...",CT


In [11]:
# Number of retained rows per collection, largest first.
to_download["collection"].value_counts()

collection
RADCURE                    3337
CPTAC-CCRCC                 636
CPTAC-UCEC                  609
HNSCC                       606
CPTAC-HNSCC                 537
CPTAC-PDA                   534
Head-Neck-PET-CT            532
HNSCC-3DCT-RT               185
HEAD-NECK-RADIOMICS-HN1     137
Pancreatic-CT-CBCT-SEG      130
TCGA-HNSC                    20
Name: count, dtype: int64

In [16]:
import numpy as np

# NOTE: every .sample() call below passes random_state explicitly, so this global
# seed does not influence the selection made in this cell. It is kept in case a
# later cell draws from NumPy's global generator.
np.random.seed(42)

# Maximum number of rows to pick per collection (and per modality when a
# collection contains both MR and CT references).
num_cases_to_download = 10


def _sample_series_uids(frame, n, random_state=42):
    """Return the series UIDs to fetch for up to ``n`` rows sampled from ``frame``.

    Parameters
    ----------
    frame : pandas.DataFrame
        Rows to sample from; must have ``seriesuid`` and ``ref_series`` columns.
    n : int
        Requested number of rows. Clamped to ``len(frame)`` so that a small
        subset yields all of its rows instead of raising ``ValueError``.
    random_state : int
        Seed passed to ``DataFrame.sample`` so the pick is reproducible.

    Returns
    -------
    list
        The ``seriesuid`` values of the sampled rows followed by the
        ``ref_series`` values of those same rows. Duplicates are kept.
    """
    if frame.empty:
        return []
    # Sample once and read both columns from the same rows.
    picked = frame.sample(n=min(n, len(frame)), random_state=random_state)
    return [*picked.seriesuid, *picked.ref_series]


groups = to_download.groupby("collection")

# Maps collection name -> list of series UIDs selected for download.
col_dict = {}

for collection, group in groups:
    modalities = group.ReferenceSeriesModality.unique()

    if len(modalities) > 1:
        # Mixed collection: sample each modality separately, MR first, then CT.
        subsets = [
            group[group.ReferenceSeriesModality == modality]
            for modality in ("MR", "CT")
        ]
    else:
        subsets = [group]

    series_of_interest = []
    for subset in subsets:
        series_of_interest.extend(_sample_series_uids(subset, num_cases_to_download))

    col_dict[collection] = series_of_interest

col_dict

{'CPTAC-CCRCC': ['1.2.826.0.1.534147.667.2747872357.20234226564866.4',
  '1.2.826.0.1.534147.667.2747872357.2023422643392.4',
  '1.2.826.0.1.534147.667.2747872357.202341982216847.4',
  '1.2.826.0.1.534147.667.2747872357.20234266445593.4',
  '1.2.826.0.1.534147.667.2747872357.202342032857391.4',
  '1.2.826.0.1.534147.667.2747872357.202342651329961.4',
  '1.2.826.0.1.534147.756.812677238.202321595941149.4',
  '1.2.826.0.1.534147.756.812677238.202322372223526.4',
  '1.2.826.0.1.534147.667.2747872357.202342032019353.2',
  '1.2.826.0.1.534147.756.812677238.20232227247833.4',
  '1.3.6.1.4.1.14519.5.2.1.7085.2626.165364992023526946432733784560',
  '1.3.6.1.4.1.14519.5.2.1.7085.2626.275418299553191022863636314497',
  '1.3.6.1.4.1.14519.5.2.1.4801.5885.311496008284161524311375082502',
  '1.3.6.1.4.1.14519.5.2.1.7085.2626.221053926001264879672844257915',
  '1.3.6.1.4.1.14519.5.2.1.7085.2626.221053926001264879672844257915',
  '1.3.6.1.4.1.14519.5.2.1.3320.3273.132642653502802060220279546165',
  '

In [17]:
# Total number of series UIDs selected across all collections (duplicates included).
sum(len(series_uids) for series_uids in col_dict.values())

300