In [None]:
%load_ext autoreload
%autoreload 2
from rich import print # noqa
from rich.table import Table # noqa

from imgtools.crawler.crawl2 import (
    crawl_directory, # noqa
    find_dicoms,
    json, # noqa
    pathlib,
)
from pydicom import dcmread # noqa
from imgtools.crawler.parse_dicom import parse_dicom
from imgtools.dicom.input import (
    SEGRefSeries,
    SEGRefSOPs,
    SR_RefSeries,
    SR_RefSOPs,
    RTDOSERefPlanSOP,
    RTDOSERefStructSOP,
    RTPLANRefStructSOP,
    rtdose_reference_uids,
    rtplan_reference_uids,
    rtstruct_reference_uids,
    seg_reference_uids,
    sr_reference_uids,
)
from imgtools.logging import logger # noqa
from imgtools.utils.timer import timer # noqa

logger.setLevel("DEBUG")

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [3]:
testpath = pathlib.Path().cwd().parent / "testdata"
dicoms = find_dicoms(testpath, recursive=True, check_header=False, extension="dcm")
print(f'Found {len(dicoms)} dicoms in {testpath}')

[2m2025-02-27T16:33:22-0500[0m [[32m[1mdebug    [0m] [1mLooking for DICOM files       [0m [[0m[1m[34mimgtools[0m][0m [36mcall[0m=[35mutils.find_dicoms:161[0m [36mcheck_header[0m=[35mFalse[0m [36mdirectory[0m=[35mPosixPath('/home/gpudual/bhklab/radiomics/Projects/med-imagetools/testdata')[0m [36mlimit[0m=[35mNone[0m [36mrecursive[0m=[35mTrue[0m [36msearch_input[0m=[35mNone[0m [36msearch_pattern[0m=[35m*.dcm[0m


In [4]:
# filter groups by looking if the path has '{MODALITY}_Series' in it
from collections import defaultdict

groups = defaultdict(list)

for modality in ["CT", "MR", "PT", "SEG", "RTSTRUCT", "RTDOSE", "RTPLAN", "SR"]:
    groups[modality] = [d for d in dicoms if f"{modality}_Series" in d.as_posix()]

# print a summary of the groups
table = Table(title="Groups")
table.add_column("Modality")
table.add_column("Count")
for modality, group in groups.items():
    table.add_row(modality, str(len(group)))
print(table)

In [7]:
rtplan_files = groups["RTPLAN"]

# Create a table to list the number of UIDs
uid_table = Table(title="RTPLAN Reference UIDs")
uid_table.add_column("RTPLAN File")
uid_table.add_column("Number of Referenced Instance UIDs")

for rtplan in rtplan_files:
    uids = rtplan_reference_uids(rtplan)
    uid_table.add_row(str(rtplan.relative_to(testpath.parent)), '1')

print(uid_table)

In [8]:
seg_files = groups["SEG"]

seg_table = Table(title="SEG Reference UIDs")
seg_table.add_column("SEG File")
seg_table.add_column("# Ref Series UIDs")
seg_table.add_column("# Ref SOP UIDs")

for seg in seg_files:
    match seg_reference_uids(seg):
        case SEGRefSeries(ref_uid), SEGRefSOPs(ref_sops):
            seg_table.add_row(str(seg.relative_to(testpath.parent)), "1", str(len(ref_sops)))
        case SEGRefSOPs(ref_sops):
            seg_table.add_row(str(seg.relative_to(testpath.parent)), "0", str(len(ref_sops)))

print(seg_table)

In [9]:
sr_files = groups["SR"]

sr_table = Table(title="SR Reference UIDs")
sr_table.add_column("SR File")
sr_table.add_column("# Ref Series UIDs")
sr_table.add_column("# Ref SOP UIDs")

for sr in sr_files:
    match sr_reference_uids(sr):
        case SR_RefSeries(ref_uid), SR_RefSOPs(ref_sops):
            sr_table.add_row(str(sr.relative_to(testpath.parent)), str(len(ref_uid)), str(len(ref_sops)))
        case SR_RefSOPs(ref_sops):
            sr_table.add_row(str(sr.relative_to(testpath.parent)), "0", str(len(ref_sops)))
        
print(sr_table)

In [20]:
from tqdm.autonotebook import tqdm

weird = [*groups["RTDOSE"], *rtplan_files, *seg_files, *sr_files, *groups["RTSTRUCT"], *groups["CT"], *groups["MR"], *groups["PT"]]
weird_parsed = []
for w in tqdm(weird):
    try:
        weird_parsed.append(parse_dicom(w))
    except Exception as e:
        print(f"Error parsing {w}: {e}")


  0%|          | 0/84519 [00:00<?, ?it/s]

  1%|          | 693/84519 [00:17<34:54, 40.02it/s]  


KeyboardInterrupt: 

In [19]:
import pandas as pd
from pandas import json_normalize

# Flatten the dictionaries and create DataFrames
dfs = [json_normalize(w) for w in weird_parsed]
df = pd.concat(dfs, ignore_index=True)
# remove the filepath column
df = df.drop(columns=["filepath"])

df

Unnamed: 0,PatientID,StudyInstanceUID,SeriesInstanceUID,SOPInstanceUID,Modality,FrameOfReferenceUID,ReferencedSOPInstanceUID,RTDOSERefStructSOP,RTDOSERefPlanSOP,RTPLANRefStructSOP,ReferencedSeriesUID
0,HN-HGJ-072,1.3.6.1.4.1.14519.5.2.1.5168.2407.233545666306...,1.3.6.1.4.1.14519.5.2.1.5168.2407.269371969014...,1.3.6.1.4.1.14519.5.2.1.5168.2407.531562124805...,RTDOSE,1.3.6.1.4.1.14519.5.2.1.5168.2407.158839962192...,1.3.6.1.4.1.14519.5.2.1.5168.2407.921846643422...,1.3.6.1.4.1.14519.5.2.1.5168.2407.921846643422...,,,
1,HN-HGJ-072,1.3.6.1.4.1.14519.5.2.1.5168.2407.756484993547...,1.3.6.1.4.1.14519.5.2.1.5168.2407.334308688250...,1.3.6.1.4.1.14519.5.2.1.5168.2407.112329825176...,RTDOSE,1.3.6.1.4.1.14519.5.2.1.5168.2407.224471774536...,1.3.6.1.4.1.14519.5.2.1.5168.2407.106404316007...,1.3.6.1.4.1.14519.5.2.1.5168.2407.106404316007...,,,
2,VS-SEG-002,1.3.6.1.4.1.14519.5.2.1.2146429604232375654471...,1.3.6.1.4.1.14519.5.2.1.2312603692751891713589...,1.3.6.1.4.1.14519.5.2.1.1614181921545286225703...,RTDOSE,1.3.6.1.4.1.14519.5.2.1.3234962357111403548336...,1.3.6.1.4.1.14519.5.2.1.1295381050312110912875...,,1.3.6.1.4.1.14519.5.2.1.1295381050312110912875...,,
3,VS-SEG-002,1.3.6.1.4.1.14519.5.2.1.2146429604232375654471...,1.3.6.1.4.1.14519.5.2.1.2263372411356965457223...,1.3.6.1.4.1.14519.5.2.1.1558857347201472295276...,RTDOSE,1.3.6.1.4.1.14519.5.2.1.3234962357111403548336...,1.3.6.1.4.1.14519.5.2.1.1862471466198123065560...,,1.3.6.1.4.1.14519.5.2.1.1862471466198123065560...,,
4,VS-SEG-001,1.3.6.1.4.1.14519.5.2.1.2674248213846638137808...,1.3.6.1.4.1.14519.5.2.1.3111935591986827656438...,1.3.6.1.4.1.14519.5.2.1.1137257506625090799968...,RTDOSE,1.3.6.1.4.1.14519.5.2.1.3263657427692460239850...,1.3.6.1.4.1.14519.5.2.1.1203774555098766706658...,,1.3.6.1.4.1.14519.5.2.1.1203774555098766706658...,,
...,...,...,...,...,...,...,...,...,...,...,...
577,OCT-01-1303,1.3.6.1.4.1.12201.1109.13661133743850639572713...,1.2.752.243.1.1.20240903184430224.2200.65687.1,1.2.752.243.1.1.20240903184430224.2200.65687,RTSTRUCT,1.3.6.1.4.1.12201.1109.19362084419441258738557...,,,,,1.3.6.1.4.1.12201.1109.32518289154523966610386...
578,HNSCC-01-0005,1.3.6.1.4.1.14519.5.2.1.1706.8040.135679639275...,1.3.6.1.4.1.14519.5.2.1.1706.8040.458338964033...,1.3.6.1.4.1.14519.5.2.1.1706.8040.107238381189...,RTSTRUCT,,,,,,1.3.6.1.4.1.14519.5.2.1.1706.8040.218267768790...
579,HN1600,1.3.6.1.4.1.40744.29.2567103918185170854055100...,1.3.6.1.4.1.40744.29.8646651130754985590950727...,1.3.6.1.4.1.40744.29.3095632643069272941075671...,RTSTRUCT,,,,,,1.3.6.1.4.1.40744.29.4102576699412926455275059...
580,LUNG1-001,1.3.6.1.4.1.32722.99.99.2393413539117143687725...,1.3.6.1.4.1.32722.99.99.2279381215866080725084...,1.3.6.1.4.1.32722.99.99.6468474582136099606367...,RTSTRUCT,,,,,,1.3.6.1.4.1.32722.99.99.2989917765213423750108...
