# DICOM Part 4 Section B.5 Standard SOP Classes

In [5]:
import requests
import xml.etree.ElementTree as ET

# Take a peek at the XML content.
# URI of the DocBook XML source for DICOM Standard Part 4
xml_uri = 'https://dicom.nema.org/medical/dicom/current/source/docbook/part04/part04.xml'

# Download the XML content.
# The timeout keeps the cell from hanging indefinitely on a stalled connection,
# and raise_for_status() stops an HTTP error page from reaching the XML parser.
response = requests.get(xml_uri, timeout=60)
response.raise_for_status()

# Parse the XML content from the raw bytes, so the parser uses the encoding
# declared in the document itself rather than the charset that requests
# guesses from the HTTP headers when building response.text.
content = response.content
root = ET.fromstring(content)
root

<Element '{http://docbook.org/ns/docbook}book' at 0x000001BC7840AC00>

In [8]:
# Chapter B: the first direct child of the root whose label attribute is 'B'
chapter_b = next(
    (node for node in root if node.attrib.get('label') == 'B'),
    None,
)

# Section B.5: the first section (at any depth inside chapter B) labelled "B.5"
section_b5 = None
if chapter_b is not None:
    section_b5 = next(
        (
            sec
            for sec in chapter_b.findall('.//{http://docbook.org/ns/docbook}section')
            if sec.attrib.get('label') == "B.5"
        ),
        None,
    )

# Table B.5-1: the first table (at any depth inside section B.5) labelled 'B.5-1'
table_b51 = None
if section_b5 is not None:
    table_b51 = next(
        (
            tbl
            for tbl in section_b5.findall('.//{http://docbook.org/ns/docbook}table')
            if tbl.attrib.get('label') == 'B.5-1'
        ),
        None,
    )

In [9]:
# Dump the serialized XML of the table (only when it was found)
if table_b51 is not None:
    table_xml = ET.tostring(table_b51)
    print(table_xml.decode())

<ns0:table xmlns:ns0="http://docbook.org/ns/docbook" frame="box" label="B.5-1" rules="all" xml:id="table_B.5-1">
                <ns0:caption>Standard SOP Classes</ns0:caption>
                <ns0:thead>
                    <ns0:tr valign="top">
                        <ns0:th align="center" colspan="1" rowspan="1">
                            <ns0:para xml:id="para_d5ac09f7-fad0-454d-ae01-98de6af5e14f">SOP Class Name</ns0:para>
                        </ns0:th>
                        <ns0:th align="center" colspan="1" rowspan="1">
                            <ns0:para xml:id="para_873bad43-46ef-4569-afca-d49dafa57b07">SOP Class UID</ns0:para>
                        </ns0:th>
                        <ns0:th align="center" colspan="1" rowspan="1">
                            <ns0:para xml:id="para_faee907c-fdfc-4e5a-ba41-631a1b2e811f">
                                <ns0:emphasis role="bold">IOD Specification (defined in <ns0:olink targetdoc="PS3.3" targetptr="PS3.3" xrefstyle="sele

In [10]:
import pandas as pd

# Prefix -> URI mapping passed to find()/findall() so 'ns0:' resolves to DocBook
namespace = dict(ns0='http://docbook.org/ns/docbook')

def extract_text(element):
    """Return all text inside ``element`` with outer whitespace removed.

    The element's own text and the text of every descendant are concatenated
    in document order. Passing ``None`` yields an empty string.
    """
    if element is not None:
        pieces = list(element.itertext())
        return ''.join(pieces).strip()
    return ''

# Function to extract table data
def extract_table_data(table):
    """Extract the header labels and body rows of a DocBook table element.

    Args:
        table: The table element to read, or ``None`` when the table was not
            found upstream.

    Returns:
        A ``(headers, body_data)`` tuple. ``headers`` holds one string per
        ``th`` in the first ``tr`` of ``thead``. ``body_data`` holds one list
        per ``tr`` of ``tbody``, with one string per ``td``. Either list is
        empty when the corresponding part is missing, and both are empty when
        ``table`` is ``None``.
    """
    # A missing table is reported as "no data" instead of failing with
    # AttributeError, matching how extract_text() treats None.
    if table is None:
        return [], []

    def cell_text(cell):
        # Text of the cell's first direct para child ('' when it has none).
        return extract_text(cell.find("ns0:para", namespace))

    # Extract header
    headers = []
    thead = table.find("ns0:thead", namespace)
    # Guard against a thead that contains no tr.
    header_row = thead.find("ns0:tr", namespace) if thead is not None else None
    if header_row is not None:
        headers = [cell_text(th) for th in header_row.findall("ns0:th", namespace)]

    # Extract body
    body_data = []
    tbody = table.find("ns0:tbody", namespace)
    if tbody is not None:
        body_data = [
            [cell_text(td) for td in row.findall("ns0:td", namespace)]
            for row in tbody.findall("ns0:tr", namespace)
        ]

    return headers, body_data


In [11]:
# Pull the header labels and body rows out of the table
headers, body_data = extract_table_data(table_b51)

# Build the DataFrame only when both a header and at least one row were found
if not (headers and body_data):
    print("Table data could not be extracted.")
else:
    # Show the parsed headers so the column selection below can be checked
    print("Headers:", headers)

    # Keep only the first two columns (positions 0 and 1)
    selected_columns = [headers[0], headers[1]]
    filtered_body_data = [[cells[0], cells[1]] for cells in body_data]

    # Assemble the DataFrame from the reduced rows
    df = pd.DataFrame(data=filtered_body_data, columns=selected_columns)

Headers: ['SOP Class Name', 'SOP Class UID', 'IOD Specification (defined in )', 'Specialization']


In [13]:
# Function to cut the text off right after the first occurrence of "Image"
def truncate_image_storage(text):
    """Return ``text`` up to and including its first "Image", stripped.

    When "Image" does not occur, ``text`` is returned unchanged (and is not
    stripped).
    """
    head, marker, _tail = text.partition("Image")
    if not marker:
        return text
    return (head + marker).strip()

# Add a column holding the shortened form of each SOP Class Name
sop_class_names = df['SOP Class Name']
df['Truncated SOP Class Name'] = sop_class_names.apply(truncate_image_storage)
df

Unnamed: 0,SOP Class Name,SOP Class UID,Truncated SOP Class Name
0,Computed Radiography Image Storage,1.2.840.10008.5.1.4.1.1.1,Computed Radiography Image
1,Digital X-Ray Image Storage - For Presentation,1.2.840.10008.5.1.4.1.1.1.1,Digital X-Ray Image
2,Digital X-Ray Image Storage - For Processing,1.2.840.10008.5.1.4.1.1.1.1.1,Digital X-Ray Image
3,Digital Mammography X-Ray Image Storage - For ...,1.2.840.10008.5.1.4.1.1.1.2,Digital Mammography X-Ray Image
4,Digital Mammography X-Ray Image Storage - For ...,1.2.840.10008.5.1.4.1.1.1.2.1,Digital Mammography X-Ray Image
...,...,...,...
163,Enhanced RT Image Storage,1.2.840.10008.5.1.4.1.1.481.23,Enhanced RT Image
164,Enhanced Continuous RT Image Storage,1.2.840.10008.5.1.4.1.1.481.24,Enhanced Continuous RT Image
165,RT Patient Position Acquisition Instruction St...,1.2.840.10008.5.1.4.1.1.481.25,RT Patient Position Acquisition Instruction St...
166,RT Beams Delivery Instruction Storage,1.2.840.10008.5.1.4.34.7,RT Beams Delivery Instruction Storage


In [15]:
# Write the DataFrame to CSV, leaving out the row index
output_csv = '../files/part4_sop_class.csv'
df.to_csv(output_csv, index=False)