### Dataloader_New ###

In [2]:
import os
import pandas as pd
import glob
import torch
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
import pydicom
from tqdm import tqdm

In [3]:
# Define directories
train_image_dir = './Project/train_images'
train_descrip_dir = './Project/train_series_descriptions.csv'
train_image_coor_dir = "./Project/train_label_coordinates.csv"
train_csv = './Project/train.csv'

In [4]:
# Load metadata
train = pd.read_csv(train_csv)
df_meta_f = pd.read_csv(train_descrip_dir)
df_coor = pd.read_csv(train_image_coor_dir)

In [None]:
# Step 1: Metadata processing
part_1 = os.listdir(train_image_dir)
part_1 = list(filter(lambda x: x.find('.DS') == -1, part_1))
p1 = [(x, os.path.join(train_image_dir, x)) for x in part_1]
meta_obj = {
    p[0]: {'folder_path': p[1], 'SeriesInstanceUIDs': []}
    for p in p1
}

for m in meta_obj:
    meta_obj[m]['SeriesInstanceUIDs'] = list(
        filter(lambda x: x.find('.DS') == -1,
               os.listdir(meta_obj[m]['folder_path']))
    )

# Get series descriptions
for k in tqdm(meta_obj):
    for s in meta_obj[k]['SeriesInstanceUIDs']:
        if 'SeriesDescriptions' not in meta_obj[k]:
            meta_obj[k]['SeriesDescriptions'] = []
        try:
            meta_obj[k]['SeriesDescriptions'].append(
                df_meta_f[(df_meta_f['study_id'] == int(k)) &
                          (df_meta_f['series_id'] == int(s))]['series_description'].iloc[0])
        except:
            print("Failed on", s, k)

im_list_dcm = {}

for study_id in tqdm(meta_obj.keys()):
    study_data = meta_obj[study_id]
    
    for idx, series_id in enumerate(study_data['SeriesInstanceUIDs']):
        if series_id not in im_list_dcm:
            im_list_dcm[series_id] = {'images': [], 'description': study_data['SeriesDescriptions'][idx]}
        
        series_path = os.path.join(study_data['folder_path'], series_id)
        images = glob.glob(f"{series_path}/*.dcm")
        
        for img_path in sorted(images, key=lambda x: int(os.path.basename(x).replace('.dcm', ''))):
            im_list_dcm[series_id]['images'].append({
                'SOPInstanceUID': os.path.basename(img_path).replace('.dcm', ''),
                'dicom': pydicom.dcmread(img_path)
            })

100%|██████████| 1975/1975 [00:00<00:00, 2745.89it/s]
100%|██████████| 1975/1975 [07:28<00:00,  4.40it/s]


In [27]:
print(df_coor['condition'].unique())
print(df_coor['level'].unique())

['Spinal Canal Stenosis' 'Right Neural Foraminal Narrowing'
 'Left Neural Foraminal Narrowing' 'Left Subarticular Stenosis'
 'Right Subarticular Stenosis']
['L1/L2' 'L2/L3' 'L3/L4' 'L4/L5' 'L5/S1']


In [28]:
# Step 2: Define mappings for labels
condition_mapping = {
    "Spinal Canal Stenosis": 0,
    "Left Neural Foraminal Narrowing": 1,
    "Right Neural Foraminal Narrowing": 2,
    "Left Subarticular Stenosis": 3,
    "Right Subarticular Stenosis": 4
}

spinal_level_mapping = {
    "L1/L2": 0,
    "L1/L2": 1,
    "L3/L4": 2,
    "L4/L5": 3,
    "L5/S1": 4
}

grade_mapping = {
    "normal_mild": 0,
    "moderate": 1,
    "severe": 2
}

In [20]:
print(im_list_dcm.keys())


dict_keys(['1012284084', '1792451510', '2092806862', '1252873726', '801316590', '866293114', '1709080005', '2526352865', '992525108', '2539455828', '2720025375', '3775545364', '1705522953', '2883858173', '3088482668', '4018190332', '1243755365', '1870630737', '3461716915', '352098527', '4014890929', '588002243', '2391548363', '482346415', '598943280', '2460967246', '821987258', '995943005', '1049505285', '1131788901', '3398516088', '3675524442', '4193900495', '2116282832', '3087919501', '4056780644', '1523561649', '3995675145', '4236155943', '1603739483', '2418620709', '3119430323', '1958018915', '247942748', '2914428894', '1199603355', '1477339972', '1779061941', '81905111', '1305038229', '2484927966', '2727057862', '1410507520', '2773479263', '3261685527', '2377168492', '3170465859', '3941342785', '1737958872', '43128600', '814821691', '1685444328', '2264877107', '99892732', '2118341625', '2327425347', '3561285461', '1585900432', '3138717770', '875364015', '1484938409', '3383000137',

In [33]:
# Step 3: Define PyTorch Dataset class
class SpinalDataset(Dataset):
    def __init__(self, label_data, image_data, condition_map, level_map, grade_map, severity_data, transform=None):
        self.label_data = label_data
        self.image_data = image_data
        self.condition_map = condition_map
        self.level_map = level_map
        self.grade_map = grade_map
        self.severity_data = severity_data  # Severity data from train.csv
        self.transform = transform

    def __len__(self):
        return len(self.label_data)

    def __getitem__(self, idx):
        row = self.label_data.iloc[idx]
        series_id = str(row['series_id'])
        instance_number = int(row['instance_number'])

        # Retrieve the DICOM image
        image_info = self.image_data[series_id]['images'][instance_number]
        dicom_image = image_info['dicom'].pixel_array  # NumPy array from DICOM

        # Labels
        condition = self.condition_map[row['condition']]
        level = self.level_map[row['level']]

        # Extract severity grade
        severity_column = f"{row['condition'].lower()}_{row['level'].lower()}"
        grade = grade_mapping[self.severity_data.loc[self.severity_data['study_id'] == row['study_id'], severity_column].values[0]]

        # Apply transformations if any
        if self.transform:
            dicom_image = self.transform(dicom_image)

        return torch.tensor(dicom_image, dtype=torch.float32), torch.tensor([condition, level, grade], dtype=torch.long)

In [34]:
# Step 4: Instantiate Dataset and DataLoader
transform = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5], std=[0.5])
])

dataset = SpinalDataset(label_data=df_coor, image_data=im_list_dcm,
                        condition_map=condition_mapping, level_map=spinal_level_mapping,
                        grade_map=grade_mapping, severity_data=train, transform=transform)

dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Test DataLoader
# DataLoader remains unchanged
dataloader = DataLoader(dataset, batch_size=16, shuffle=True)

# Test DataLoader
for images, labels in dataloader:
    print(f"Images batch shape: {images.shape}")
    print(f"Labels batch shape: {labels.shape}")
    break


KeyError: 'left neural foraminal narrowing_l4/l5'