### 1. Loading and exploring the Dataset

In [1]:
import xml.etree.ElementTree as ET

def extract_roi_from_aim(aim_file_path):
    try:
        # Parse the AIM XML file
        tree = ET.parse(aim_file_path)
        root = tree.getroot()

        roi_list = []
        
        # Navigate through the AIM XML structure to find TwoDimensionGeometricShapeEntity
        shapes = root.findall(".//TwoDimensionGeometricShapeEntity")
        if not shapes:
            print("No TwoDimensionGeometricShapeEntity found in AIM file.")
            return None

        for shape in shapes:
            roi = {}
            
            # Extract the shape ID and label
            roi['shapeIdentifier'] = shape.findtext("shapeIdentifier", default="N/A")
            roi['label'] = shape.findtext("label", default="N/A")
            
            # Extract the spatial coordinates (x, y)
            coordinates = []
            for coordinate in shape.findall(".//TwoDimensionSpatialCoordinate"):
                x = coordinate.findtext("x")
                y = coordinate.findtext("y")
                if x is not None and y is not None:
                    coordinates.append((float(x), float(y)))
            
            if coordinates:
                roi['coordinates'] = coordinates
                roi_list.append(roi)
            else:
                print(f"No coordinates found for shape {roi['shapeIdentifier']}")

        return roi_list if roi_list else None
    except ET.ParseError as e:
        print(f"Error parsing AIM file: {e}")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [1]:
import pandas as pd
metadata=pd.read_csv("/kaggle/input/nsclc-radiogenomics-6-1-21-version-4/metadata.csv")

In [2]:
metadata.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1351 entries, 0 to 1350
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Series UID            1351 non-null   object
 1   Collection            1351 non-null   object
 2   3rd Party Analysis    17 non-null     object
 3   Data Description URI  1351 non-null   object
 4   Subject ID            1351 non-null   object
 5   Study UID             1351 non-null   object
 6   Study Description     1351 non-null   object
 7   Study Date            1351 non-null   object
 8   Series Description    1349 non-null   object
 9   Manufacturer          1342 non-null   object
 10  Modality              1351 non-null   object
 11  SOP Class Name        1351 non-null   object
 12  SOP Class UID         1351 non-null   object
 13  Number of Images      1351 non-null   int64 
 14  File Size             1351 non-null   object
 15  File Location         1351 non-null   

In [3]:
metadata['Series UID'].nunique()

1351

In [4]:
import xml.etree.ElementTree as ET

def extract_roi_and_image_info(xml_file_path):
    # Parse the XML file
    tree = ET.parse(xml_file_path)
    root = tree.getroot()
    
    # Define namespaces (adjust if necessary)
    namespaces = {
        '': 'gme://caCORE.caCORE/4.4/edu.northwestern.radiology.AIM',
        'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
        'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
    }
    
    # Extract ROI location
    roi_locations = []
    for markup_entity in root.findall('.//markupEntityCollection/*', namespaces):
        entity_type = markup_entity.attrib.get('{http://www.w3.org/2001/XMLSchema-instance}type')
        print(f"Markup Entity Type: {entity_type}")  # Debugging output
        
        if entity_type == 'TwoDimensionCircle':
            coordinates = markup_entity.findall('.//twoDimensionSpatialCoordinateCollection/TwoDimensionSpatialCoordinate', namespaces)
            coords = []
            for coord in coordinates:
                x = coord.find('x', namespaces)
                y = coord.find('y', namespaces)
                if x is not None and y is not None:
                    x_value = x.attrib.get('value')
                    y_value = y.attrib.get('value')
                    print(f"Coordinate: x={x_value}, y={y_value}")  # Debugging output
                    if x_value is not None and y_value is not None:
                        try:
                            coords.append((float(x_value), float(y_value)))
                        except ValueError:
                            print(f"Error converting coordinates: x={x_value}, y={y_value}")
            if coords:
                roi_locations.append(coords)
    
    # Extract image study and image series
    image_study_uid = None
    image_series_uid = None
    study_start_date = None
    study_start_time = None
    
    for image_ref in root.findall('.//imageReferenceEntityCollection/ImageReferenceEntity', namespaces):
        image_study = image_ref.find('imageStudy', namespaces)
        if image_study is not None:
            instance_uid_element = image_study.find('instanceUid', namespaces)
            if instance_uid_element is not None:
                image_study_uid = instance_uid_element.attrib.get('root')
            
            start_date = image_study.find('startDate', namespaces)
            if start_date is not None:
                study_start_date = start_date.attrib.get('value')
            
            start_time = image_study.find('startTime', namespaces)
            if start_time is not None:
                study_start_time = start_time.attrib.get('value')
            
            image_series = image_study.find('imageSeries', namespaces)
            if image_series is not None:
                instance_uid_element = image_series.find('instanceUid', namespaces)
                if instance_uid_element is not None:
                    image_series_uid = instance_uid_element.attrib.get('root')
    
    return roi_locations, image_study_uid, image_series_uid, study_start_date, study_start_time

# Specify the path to your XML file
xml_file_path = '/kaggle/input/nsclc-radiogenomics-6-1-21-version-4/AIM_files_updated-11-10-2020/AIM_files_updated-11-10-2020/AMC-005.xml'

roi_locations, image_study_uid, image_series_uid, study_start_date, study_start_time = extract_roi_and_image_info(xml_file_path)
print('ROI Locations:', roi_locations)
print('Image Study UID:', image_study_uid)
print('Image Series UID:', image_series_uid)
print('Study Start Date:', study_start_date)
print('Study Start Time:', study_start_time)

Markup Entity Type: TwoDimensionCircle
Coordinate: x=373.1063829787234, y=345.87234042553195
Coordinate: x=374.7404255319149, y=351.3191489361702
ROI Locations: [[(373.1063829787234, 345.87234042553195), (374.7404255319149, 351.3191489361702)]]
Image Study UID: 1.3.6.1.4.1.14519.5.2.1.4334.1501.501311459388146011620266039484
Image Series UID: 1.3.6.1.4.1.14519.5.2.1.4334.1501.162938158768722859985597214287
Study Start Date: 1994-07-21T08:53:22
Study Start Time: 08:53:22


In [5]:
# UID values to filter
study_uid = '1.3.6.1.4.1.14519.5.2.1.4334.1501.501311459388146011620266039484'
series_uid = '1.3.6.1.4.1.14519.5.2.1.4334.1501.162938158768722859985597214287'

# Apply filter
metadata[
    (metadata['Study UID'] == study_uid) &
    (metadata['Series UID'] == series_uid)
]

Unnamed: 0,Series UID,Collection,3rd Party Analysis,Data Description URI,Subject ID,Study UID,Study Description,Study Date,Series Description,Manufacturer,Modality,SOP Class Name,SOP Class UID,Number of Images,File Size,File Location,Download Timestamp
26,1.3.6.1.4.1.14519.5.2.1.4334.1501.162938158768...,NSCLC Radiogenomics,,https://doi.org/10.7937/K9/TCIA.2017.7hs46erv,AMC-005,1.3.6.1.4.1.14519.5.2.1.4334.1501.501311459388...,ThoraxCHESTNONCONTRAST Adult,07-21-1994,Chest 1.0 B45f,SIEMENS,CT,CT Image Storage,1.2.840.10008.5.1.4.1.1.2,343,180.90 MB,.\NSCLC Radiogenomics\AMC-005\07-21-1994-NA-Th...,2024-08-26T12:14:04.744


In [6]:
import os
import pandas as pd
import xml.etree.ElementTree as ET
import logging

# Set up logging for debugging
logging.basicConfig(level=logging.DEBUG)

def extract_roi_and_image_info(xml_file_path):
    try:
        # Parse the XML file
        tree = ET.parse(xml_file_path)
        root = tree.getroot()

        # Define namespaces (adjust if necessary)
        namespaces = {
            '': 'gme://caCORE.caCORE/4.4/edu.northwestern.radiology.AIM',
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance'
        }

        # Extract ROI location and imageReferenceUid
        roi_locations = []
        image_ref_uid_value = None

        for markup_entity in root.findall('.//markupEntityCollection/*', namespaces):
            entity_type = markup_entity.attrib.get('{http://www.w3.org/2001/XMLSchema-instance}type')
            logging.debug(f"Markup Entity Type: {entity_type}")

            if entity_type == 'TwoDimensionCircle':
                coordinates = markup_entity.findall('.//twoDimensionSpatialCoordinateCollection/TwoDimensionSpatialCoordinate', namespaces)
                coords = []
                for coord in coordinates:
                    x = coord.find('x', namespaces)
                    y = coord.find('y', namespaces)
                    if x is not None and y is not None:
                        x_value = x.attrib.get('value')
                        y_value = y.attrib.get('value')
                        logging.debug(f"Coordinate: x={x_value}, y={y_value}")
                        if x_value is not None and y_value is not None:
                            try:
                                coords.append((float(x_value), float(y_value)))
                            except ValueError:
                                logging.error(f"Error converting coordinates: x={x_value}, y={y_value}")
                if coords:
                    roi_locations.append(coords)

                # Extract imageReferenceUid
                image_ref_uid = markup_entity.find('imageReferenceUid', namespaces)
                if image_ref_uid is not None:
                    image_ref_uid_value = image_ref_uid.attrib.get('root')
                    logging.debug(f"Image Reference UID: {image_ref_uid_value}")
                else:
                    logging.debug("imageReferenceUid not found in MarkupEntity")

        # Extract image study and image series
        image_study_uid = None
        image_series_uid = None
        study_start_date = None
        study_start_time = None

        for image_ref in root.findall('.//imageReferenceEntityCollection/ImageReferenceEntity', namespaces):
            image_study = image_ref.find('imageStudy', namespaces)
            if image_study is not None:
                instance_uid_element = image_study.find('instanceUid', namespaces)
                if instance_uid_element is not None:
                    image_study_uid = instance_uid_element.attrib.get('root')

                start_date = image_study.find('startDate', namespaces)
                if start_date is not None:
                    study_start_date = start_date.attrib.get('value')

                start_time = image_study.find('startTime', namespaces)
                if start_time is not None:
                    study_start_time = start_time.attrib.get('value')

                image_series = image_study.find('imageSeries', namespaces)
                if image_series is not None:
                    instance_uid_element = image_series.find('instanceUid', namespaces)
                    if instance_uid_element is not None:
                        image_series_uid = instance_uid_element.attrib.get('root')

        return roi_locations, image_study_uid, image_series_uid, study_start_date, study_start_time, image_ref_uid_value
    
    except ET.ParseError as e:
        logging.error(f"Error parsing XML file {xml_file_path}: {e}")
        return None, None, None, None, None, None

def process_xml_files(directory_path):
    # List to hold extracted data
    data = []

    # Iterate over each XML file in the directory
    for filename in os.listdir(directory_path):
        if filename.endswith('.xml'):
            xml_file_path = os.path.join(directory_path, filename)
            roi_locations, study_uid, series_uid, start_date, start_time, image_ref_uid = extract_roi_and_image_info(xml_file_path)
            
            # Append extracted data to the list
            if study_uid is not None and series_uid is not None:  # Only add if data is valid
                data.append({
                    'file_name': filename,
                    'Study UID': study_uid,
                    'Series UID': series_uid,
                    'roi_locations': roi_locations,
                    'start_date': start_date,
                    'start_time': start_time,
                    'Image Reference UID': image_ref_uid
                })

    # Create DataFrame from the collected data
    df = pd.DataFrame(data)
    return df

# Specify the path to your directory containing XML files
directory_path = '/kaggle/input/nsclc-radiogenomics-6-1-21-version-4/AIM_files_updated-11-10-2020/AIM_files_updated-11-10-2020'

# Process XML files and create DataFrame
xml_df = process_xml_files(directory_path)

In [7]:
xml_df

Unnamed: 0,file_name,Study UID,Series UID,roi_locations,start_date,start_time,Image Reference UID
0,R01-076.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.137912259338...,1.3.6.1.4.1.14519.5.2.1.4334.1501.201809319317...,"[[(355.00854700854705, 270.22222222222223), (3...",19940423121242,12:12:42,1.3.6.1.4.1.14519.5.2.1.4334.1501.223865377877...
1,R01-104.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.250426352468...,1.3.6.1.4.1.14519.5.2.1.4334.1501.184465376893...,"[[(177.94456289978677, 341.1513859275053), (18...",19950412162405,16:24:05,1.3.6.1.4.1.14519.5.2.1.4334.1501.480526200249...
2,R01-109.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.223070940126...,1.3.6.1.4.1.14519.5.2.1.4334.1501.139977685997...,"[[(86.17855698426973, 303.1207850126064), (88....",19940730121054,12:10:54,1.3.6.1.4.1.14519.5.2.1.4334.1501.212967023555...
3,R01-011.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.357040377405...,1.3.6.1.4.1.14519.5.2.1.4334.1501.108960293210...,"[[(328.4528301886793, 360.1174004192872), (330...",19900904093753,09:37:53,1.3.6.1.4.1.14519.5.2.1.4334.1501.820030585451...
4,AMC-006.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.296461180370...,1.3.6.1.4.1.14519.5.2.1.4334.1501.127565890510...,"[[(209.70212765957447, 168.3063829787234), (21...",1991-08-09T10:09:05,10:09:05,1.3.6.1.4.1.14519.5.2.1.4334.1501.155999600049...
...,...,...,...,...,...,...,...
185,AMC-030.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.286648222242...,1.3.6.1.4.1.14519.5.2.1.4334.1501.977011071235...,"[[(137.2939953166097, 285.39386782636325), (13...",1994-08-20T10:43:50,10:43:50,1.3.6.1.4.1.14519.5.2.1.4334.1501.133564673823...
186,AMC-014.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.244443454975...,1.3.6.1.4.1.14519.5.2.1.4334.1501.245990063601...,"[[(135.7699680511182, 333.1544195953142), (137...",1992-11-21T11:09:43,11:09:43,1.3.6.1.4.1.14519.5.2.1.4334.1501.266612763782...
187,AMC-048.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.238974246939...,1.3.6.1.4.1.14519.5.2.1.4334.1501.151248250772...,"[[(162.66098081023455, 313.8592750533049), (16...",1993-08-28T15:18:11,15:18:11,1.3.6.1.4.1.14519.5.2.1.4334.1501.153624812818...
188,R01-151.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.838323959306...,1.3.6.1.4.1.14519.5.2.1.4334.1501.334197940603...,"[[(325.5682062298604, 322.26852846401715), (32...",19940930080020,08:00:20,1.3.6.1.4.1.14519.5.2.1.4334.1501.178904155970...


In [8]:
xml_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 190 entries, 0 to 189
Data columns (total 7 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   file_name            190 non-null    object
 1   Study UID            190 non-null    object
 2   Series UID           190 non-null    object
 3   roi_locations        190 non-null    object
 4   start_date           190 non-null    object
 5   start_time           190 non-null    object
 6   Image Reference UID  190 non-null    object
dtypes: object(7)
memory usage: 10.5+ KB


In [9]:
xml_df

Unnamed: 0,file_name,Study UID,Series UID,roi_locations,start_date,start_time,Image Reference UID
0,R01-076.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.137912259338...,1.3.6.1.4.1.14519.5.2.1.4334.1501.201809319317...,"[[(355.00854700854705, 270.22222222222223), (3...",19940423121242,12:12:42,1.3.6.1.4.1.14519.5.2.1.4334.1501.223865377877...
1,R01-104.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.250426352468...,1.3.6.1.4.1.14519.5.2.1.4334.1501.184465376893...,"[[(177.94456289978677, 341.1513859275053), (18...",19950412162405,16:24:05,1.3.6.1.4.1.14519.5.2.1.4334.1501.480526200249...
2,R01-109.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.223070940126...,1.3.6.1.4.1.14519.5.2.1.4334.1501.139977685997...,"[[(86.17855698426973, 303.1207850126064), (88....",19940730121054,12:10:54,1.3.6.1.4.1.14519.5.2.1.4334.1501.212967023555...
3,R01-011.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.357040377405...,1.3.6.1.4.1.14519.5.2.1.4334.1501.108960293210...,"[[(328.4528301886793, 360.1174004192872), (330...",19900904093753,09:37:53,1.3.6.1.4.1.14519.5.2.1.4334.1501.820030585451...
4,AMC-006.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.296461180370...,1.3.6.1.4.1.14519.5.2.1.4334.1501.127565890510...,"[[(209.70212765957447, 168.3063829787234), (21...",1991-08-09T10:09:05,10:09:05,1.3.6.1.4.1.14519.5.2.1.4334.1501.155999600049...
...,...,...,...,...,...,...,...
185,AMC-030.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.286648222242...,1.3.6.1.4.1.14519.5.2.1.4334.1501.977011071235...,"[[(137.2939953166097, 285.39386782636325), (13...",1994-08-20T10:43:50,10:43:50,1.3.6.1.4.1.14519.5.2.1.4334.1501.133564673823...
186,AMC-014.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.244443454975...,1.3.6.1.4.1.14519.5.2.1.4334.1501.245990063601...,"[[(135.7699680511182, 333.1544195953142), (137...",1992-11-21T11:09:43,11:09:43,1.3.6.1.4.1.14519.5.2.1.4334.1501.266612763782...
187,AMC-048.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.238974246939...,1.3.6.1.4.1.14519.5.2.1.4334.1501.151248250772...,"[[(162.66098081023455, 313.8592750533049), (16...",1993-08-28T15:18:11,15:18:11,1.3.6.1.4.1.14519.5.2.1.4334.1501.153624812818...
188,R01-151.xml,1.3.6.1.4.1.14519.5.2.1.4334.1501.838323959306...,1.3.6.1.4.1.14519.5.2.1.4334.1501.334197940603...,"[[(325.5682062298604, 322.26852846401715), (32...",19940930080020,08:00:20,1.3.6.1.4.1.14519.5.2.1.4334.1501.178904155970...


**[1.2]** Building Path DataFrame from Text Files

xml_df

In [10]:
import pandas as pd
df_label=pd.read_csv("/kaggle/input/nsclc-radiogenomics-6-1-21-version-4/NSCLCR01Radiogenomic_DATA_LABELS_2018-05-22_1500-shifted.csv")

In [11]:
df_label = df_label[pd.notna(df_label['Case ID'])]

In [12]:
df_label['Recurrence'].value_counts()

Recurrence
no               156
yes               54
Not collected      1
Name: count, dtype: int64

In [13]:
import pandas as pd

# Filter rows where 'Recurrence' is 'Not collected'
not_collected_df = df_label[df_label['Recurrence'] == 'Not collected']

# Extract 'Case ID' column from the filtered DataFrame
not_collected_id = not_collected_df['Case ID']

# Convert to a list if needed
not_collected_id_list = not_collected_id.tolist()

print(not_collected_id_list)

['AMC-049']


In [14]:
# Filter out rows where 'Case ID' is in the not_collected_id_list
df_label= df_label[~df_label['Case ID'].isin(not_collected_id_list)]

In [15]:
import os
import pandas as pd

def gather_image_paths(root_folder):
    image_paths = []
    
    # Walk through the directory
    for subdir, _, files in os.walk(root_folder):
        for file in files:
            # Check if the file is an image (you can add more extensions as needed)
            if file.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif','.dcm')):
                # Create the full path to the image
                full_path = os.path.join(subdir, file)
                image_paths.append(full_path)
                
    return image_paths

In [16]:
root_folder = '/kaggle/input/nsclc-radiogenomics-6-1-21-version-4'
image_paths = gather_image_paths(root_folder)

# Create a DataFrame
df_images = pd.DataFrame(image_paths, columns=['image_path'])

In [17]:
# Function to extract label
def extract_label(path):
    parts = path.split('/')
    return parts[-4]  # The 4th element from the right

# Apply the function to the DataFrame
df_images['Patient ID'] = df_images['image_path'].apply(extract_label)

In [18]:
# Filter out rows where 'Patient ID' is in the not_collected_id_list
df_images= df_images[~df_images['Patient ID'].isin(not_collected_id_list)]

In [19]:
df_images = df_images.reset_index(drop=True)

In [20]:
df_images['Patient ID'].nunique()

210

In [21]:
df_label.rename(columns={'Case ID': 'Patient ID'}, inplace=True)

In [22]:
df_label['Patient ID'].nunique()

210

In [23]:
df_label.columns

Index(['Patient ID', 'Patient affiliation', 'Age at Histological Diagnosis',
       'Weight (lbs)', 'Gender', 'Ethnicity', 'Smoking status', 'Pack Years',
       'Quit Smoking Year', '%GG', 'Tumor Location (choice=RUL)',
       'Tumor Location (choice=RML)', 'Tumor Location (choice=RLL)',
       'Tumor Location (choice=LUL)', 'Tumor Location (choice=LLL)',
       'Tumor Location (choice=L Lingula)', 'Tumor Location (choice=Unknown)',
       'Histology ', 'Pathological T stage', 'Pathological N stage',
       'Pathological M stage', 'Histopathological Grade',
       'Lymphovascular invasion',
       'Pleural invasion (elastic, visceral, or parietal)',
       'EGFR mutation status', 'KRAS mutation status',
       'ALK translocation status', 'Adjuvant Treatment', 'Chemotherapy',
       'Radiation', 'Recurrence', 'Recurrence Location', 'Date of Recurrence',
       'Date of Last Known Alive', 'Survival Status', 'Date of Death',
       'Time to Death (days)', 'CT Date', 'Days between CT and 

In [24]:
df_images

Unnamed: 0,image_path,Patient ID
0,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-081
1,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-081
2,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-081
3,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-081
4,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-081
...,...,...
285088,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-007
285089,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-007
285090,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-007
285091,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-007


In [25]:
df = pd.merge(df_images, df_label, on='Patient ID', how='outer')

In [26]:
df

Unnamed: 0,image_path,Patient ID,Patient affiliation,Age at Histological Diagnosis,Weight (lbs),Gender,Ethnicity,Smoking status,Pack Years,Quit Smoking Year,...,Recurrence,Recurrence Location,Date of Recurrence,Date of Last Known Alive,Survival Status,Date of Death,Time to Death (days),CT Date,Days between CT and surgery,PET Date
0,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,AMC-001,Stanford,34,Not Collected,Male,Not Recorded In Database,Nonsmoker,,,...,yes,distant,10/7/1994,1/7/1997,Dead,1/7/1997,872.0,8/10/1994,9,Not Collected
1,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,AMC-001,Stanford,34,Not Collected,Male,Not Recorded In Database,Nonsmoker,,,...,yes,distant,10/7/1994,1/7/1997,Dead,1/7/1997,872.0,8/10/1994,9,Not Collected
2,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,AMC-001,Stanford,34,Not Collected,Male,Not Recorded In Database,Nonsmoker,,,...,yes,distant,10/7/1994,1/7/1997,Dead,1/7/1997,872.0,8/10/1994,9,Not Collected
3,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,AMC-001,Stanford,34,Not Collected,Male,Not Recorded In Database,Nonsmoker,,,...,yes,distant,10/7/1994,1/7/1997,Dead,1/7/1997,872.0,8/10/1994,9,Not Collected
4,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,AMC-001,Stanford,34,Not Collected,Male,Not Recorded In Database,Nonsmoker,,,...,yes,distant,10/7/1994,1/7/1997,Dead,1/7/1997,872.0,8/10/1994,9,Not Collected
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
285088,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-163,VA,68,229,Male,Caucasian,Current,30,,...,yes,distant,2/15/1996,1/11/1997,Dead,1/11/1997,462.0,8/17/1995,51,7/12/1995
285089,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-163,VA,68,229,Male,Caucasian,Current,30,,...,yes,distant,2/15/1996,1/11/1997,Dead,1/11/1997,462.0,8/17/1995,51,7/12/1995
285090,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-163,VA,68,229,Male,Caucasian,Current,30,,...,yes,distant,2/15/1996,1/11/1997,Dead,1/11/1997,462.0,8/17/1995,51,7/12/1995
285091,/kaggle/input/nsclc-radiogenomics-6-1-21-versi...,R01-163,VA,68,229,Male,Caucasian,Current,30,,...,yes,distant,2/15/1996,1/11/1997,Dead,1/11/1997,462.0,8/17/1995,51,7/12/1995


In [27]:
pip install tqdm

Note: you may need to restart the kernel to use updated packages.


In [28]:
import pandas as pd
import pydicom
from tqdm import tqdm

# Function to extract DICOM UIDs
def extract_dicom_uids(dicom_path):
    try:
        # Read the DICOM file
        dicom_data = pydicom.dcmread(dicom_path, force=True)  # force=True ensures reading even if incomplete

        # Extract UIDs
        study_uid = dicom_data.StudyInstanceUID if hasattr(dicom_data, 'StudyInstanceUID') else None
        series_uid = dicom_data.SeriesInstanceUID if hasattr(dicom_data, 'SeriesInstanceUID') else None
        sop_uid = dicom_data.SOPInstanceUID if hasattr(dicom_data, 'SOPInstanceUID') else None

        return study_uid, series_uid, sop_uid

    except Exception as e:
        print(f"Error reading {dicom_path}: {e}")
        return None, None, None


tqdm.pandas()  # This registers tqdm with pandas

# Apply the function with a progress bar
df[['StudyInstanceUID', 'SeriesInstanceUID', 'SOPInstanceUID']] = df['image_path'].progress_apply(
    lambda path: pd.Series(extract_dicom_uids(path))
)

100%|██████████| 285093/285093 [48:30<00:00, 97.95it/s]  


In [31]:
df['StudyInstanceUID'].nunique()

393

In [30]:
df.to_csv("/kaggle/working/processed_df.csv")