In [98]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import pydicom
import shelve
import os
import cv2 as cv

## Data Format 1
This will store just the pixel data of each `.dcm` file, not any of the metadata. This is not ideal for a final product, but we're starting off simple before adding complexity. 
The data will be stored as follows. We're also only using the first 3 series from each study; this should be mostly fine for the first model, as most of the studies only have 3 series anyway. 
```json
"study-id": {
	"X": {
		"series-1": ["img-1", "img-2", ...],
		"series-2": ["img-1", "img-2", ...],
		"series-3": ["img-1", "img-2", ...]
	},
	"Y": {
		"LNFN": [[1, 0, 0], [1, 0, 0], ...],
		"RNFN": [[1, 0, 0], [1, 0, 0], ...],
		"LSS": [[1, 0, 0], [1, 0, 0], ...],
		"RSS": [[1, 0, 0], [1, 0, 0], ...],
		"SCS": [[1, 0, 0], [1, 0, 0], ...]
	}
}
```
Note that the labels for each of the conditions have the shape `(5, 3)`: a one-hot encoded vector over the three severity classes for each of the 5 spinal levels (L1/L2 through L5/S1).

In [None]:
# Read the training labels, key the rows by study_id, and replace every
# NaN entry with 'Normal/Mild'.
df = (
    pd.read_csv('data/rsna-2024/train.csv')
    .set_index('study_id')
    .fillna('Normal/Mild')
)

df

In [None]:
# Build the "Data Format 1" shelf: for every study directory, store the
# (resized) pixel data of its first three series together with one-hot
# severity labels for each condition.

# Size passed to cv.resize for every slice.
IMAGE_SIZE = (224, 224)

train_imgs_path = 'data/rsna-2024/train_images'
shelve_path = 'data/processed-data/data-1/db'
# File holding the index (into the sorted study list) of the last study
# that was written to the shelf.
checkpoint_path = 'study_idx.txt'

# One-hot encoding of the three severity classes.
label_mappings = {
    'Normal/Mild': np.array([1, 0, 0]),
    'Moderate': np.array([0, 1, 0]),
    'Severe': np.array([0, 0, 1]),
}

# Spinal levels; their order is the row order of each (5, 3) label array.
vert = ['l1_l2', 'l2_l3', 'l3_l4', 'l4_l5', 'l5_s1']
# (column-name prefix in the label frame, short key used under sample["Y"])
conditions = [
    ('left_neural_foraminal_narrowing', 'LNFN'),
    ('right_neural_foraminal_narrowing', 'RNFN'),
    ('left_subarticular_stenosis', 'LSS'),
    ('right_subarticular_stenosis', 'RSS'),
    ('spinal_canal_stenosis', 'SCS'),
]

# Resume support. start_idx is the index of the last study already stored;
# -1 means "nothing stored yet", so the study at index 0 is processed on a
# fresh run instead of being skipped by the `<=` check below.
start_idx = -1
if os.path.isfile(checkpoint_path):
    with open(checkpoint_path, 'r') as f:
        start_idx = int(f.read().strip())


with shelve.open(shelve_path) as db:
    # Study directories are named by their integer id; sort numerically so
    # the checkpoint index refers to the same study on every run.
    study_lst = os.listdir(train_imgs_path)
    study_lst.sort(key=lambda x: int(x))
    for (study_idx, study_id) in enumerate(study_lst):
        # Skip everything up to and including the checkpointed study.
        if study_idx <= start_idx:
            continue

        study_id = int(study_id)
        sample = {
            "X": {
                "series-1": [],
                "series-2": [],
                "series-3": [],
            },
            "Y": {
                "LNFN": None,
                "RNFN": None,
                "LSS": None,
                "RSS": None,
                "SCS": None,
            }
        }

        # Populate labels: one (5, 3) one-hot array per condition.
        label_series = df.loc[study_id]
        for l_cond, s_cond in conditions:
            sample["Y"][s_cond] = np.stack([
                label_mappings[label_series[f"{l_cond}_{v}"]].copy()
                for v in vert
            ])

        # Populate images
        study_id = str(study_id)
        study_dir = os.path.join(train_imgs_path, study_id)
        series_lst = os.listdir(study_dir)
        series_lst.sort(key=lambda x: int(x))

        # Only the first three series (in numeric order) are kept.
        for i, series_id in enumerate(series_lst[:3]):
            series_dir = os.path.join(study_dir, series_id)
            instance_lst = os.listdir(series_dir)
            # File names are "<instance number>.<ext>"; sort by the number.
            instance_lst.sort(key=lambda x: int(x.partition('.')[0]))

            img_lst = []
            for instance in instance_lst:
                img = pydicom.dcmread(os.path.join(series_dir, instance))
                # Store the resized slice, not the raw pixel array.
                img_lst.append(cv.resize(img.pixel_array, IMAGE_SIZE))

            sample['X'][f'series-{i+1}'] = img_lst

        db[study_id] = sample
        # Flush the shelf before advancing the checkpoint so the checkpoint
        # never points past data that has not been written out.
        db.sync()
        print(f"Finished: {study_idx}/{len(study_lst)}")

        with open(checkpoint_path, 'w') as f:
            f.write(str(study_idx))


In [103]:
# Resize every image already stored in the shelf to IMAGE_SIZE.
# Safe to re-run: images that already have the target size are left alone
# and records with no changes are not rewritten.
IMAGE_SIZE = (224, 224)

# cv.resize takes (width, height); NumPy shapes are (rows, cols).
target_shape = (IMAGE_SIZE[1], IMAGE_SIZE[0])

with shelve.open(shelve_path) as db:
    # Snapshot the keys once: the loop writes back into the shelf, and
    # len() on a shelf can be costly on some dbm backends, so neither the
    # key view nor the count is re-evaluated per iteration.
    keys = list(db.keys())
    total = len(keys)
    for idx, k in enumerate(keys):
        v = db[k]
        changed = False
        for series in ['series-1', 'series-2', 'series-3']:
            imgs = v["X"][series]
            for i, img in enumerate(imgs):
                if tuple(img.shape[:2]) != target_shape:
                    imgs[i] = cv.resize(img, IMAGE_SIZE)
                    changed = True
        # Reassigning the key is what persists the mutated record.
        if changed:
            db[k] = v
        print(f"Finished: {idx}/{total}")


Finished: 0/1974
Finished: 1/1974
Finished: 2/1974
Finished: 3/1974
Finished: 4/1974
Finished: 5/1974
Finished: 6/1974
Finished: 7/1974
Finished: 8/1974
Finished: 9/1974
Finished: 10/1974
Finished: 11/1974
Finished: 12/1974
Finished: 13/1974
Finished: 14/1974
Finished: 15/1974
Finished: 16/1974
Finished: 17/1974
Finished: 18/1974
Finished: 19/1974
Finished: 20/1974
Finished: 21/1974
Finished: 22/1974
Finished: 23/1974
Finished: 24/1974
Finished: 25/1974
Finished: 26/1974
Finished: 27/1974
Finished: 28/1974
Finished: 29/1974
Finished: 30/1974
Finished: 31/1974
Finished: 32/1974
Finished: 33/1974
Finished: 34/1974
Finished: 35/1974
Finished: 36/1974
Finished: 37/1974
Finished: 38/1974
Finished: 39/1974
Finished: 40/1974
Finished: 41/1974
Finished: 42/1974
Finished: 43/1974
Finished: 44/1974
Finished: 45/1974
Finished: 46/1974
Finished: 47/1974
Finished: 48/1974
Finished: 49/1974
Finished: 50/1974
Finished: 51/1974
Finished: 52/1974
Finished: 53/1974
Finished: 54/1974
Finished: 55/1974
Fi