In [1]:
from fastai2.vision.all import *
import matplotlib.pyplot as plt
from pydicom import dcmread
from skimage.segmentation import clear_border
from skimage.measure import label, regionprops
from scipy import ndimage

In [2]:
# Root folder holding the OSIC competition files.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path = Path('/home/azaidi/Desktop/fastai/nbs/kaggle/osic')
# presumably makes displayed paths relative to this root (the listing below shows bare names)
Path.BASE_PATH = path
# show what is inside the data folder
path.ls()

(#6) [Path('test.csv'),Path('train'),Path('train.csv'),Path('test'),Path('osic-pulmonary-fibrosis-progression.zip'),Path('sample_submission.csv')]

In [3]:
# sub-folders of the data root
train_path = path/'train'
test_path = path/'test'
# the three CSV files stored next to those folders, each read into a DataFrame
train_df = pd.read_csv(path/'train.csv')
test_df = pd.read_csv(path/'test.csv')
sample_sub = pd.read_csv(path/'sample_submission.csv')

In [4]:
#reads every dicom in a patient folder and hands them back ordered by slice number
def dcm_sort(patient_folder):
    """Read all DICOMs in `patient_folder` and return them as a sorted list.

    Every entry returned by `patient_folder.ls()` is parsed with `dcmread`.
    The parsed datasets are ordered by their integer `InstanceNumber`,
    largest first.
    """
    #lazily parse each file; `sorted` consumes the generator and builds the list
    parsed = (dcmread(dcm_file) for dcm_file in patient_folder.ls())
    #descending order: the highest InstanceNumber ends up at index 0
    return sorted(parsed, key=lambda ds: int(ds.InstanceNumber), reverse=True)

#stacks the slices of one scan and shifts them by the rescale intercept (hounsfield scale)
def ct_transformed_hu(dcm_sorted, threshold=-1000, replace=-1000):
    """Build one int16 volume from a list of DICOM slices.

    Parameters
    ----------
    dcm_sorted : sequence of DICOM datasets
        Each item must expose `pixel_array`; only the FIRST item's
        `RescaleIntercept` is read and it is applied to the whole stack.
    threshold : int
        Stored values less than or equal to this are overwritten. The
        comparison is made on the stored values, before the intercept is added.
    replace : int
        Value written in place of the thresholded entries.

    Returns
    -------
    numpy.ndarray
        A new `np.int16` array with the slices stacked along axis 0.

    Notes
    -----
    Only the intercept is applied; `RescaleSlope` is not used.
    """
    #looked up first so an empty input fails here, before any stacking
    first_intercept = dcm_sorted[0].RescaleIntercept

    #astype returns a copy, so the datasets' own pixel arrays stay untouched
    volume = np.stack([ds.pixel_array for ds in dcm_sorted]).astype(np.int16)

    #overwrite everything at or below the threshold with the replacement value
    at_or_below = volume <= threshold
    volume[at_or_below] = replace

    #shift the whole stack by the intercept, staying in int16
    volume += np.int16(first_intercept)

    return np.array(volume, dtype=np.int16)

In [6]:
#transforms all slices in a scan in hounsfield units
#(kept as a separate name for existing callers; the logic lives in ct_transformed_hu)
def ct_transformed(dcm_sorted, threshold=-1000, replace=-1000):
    """Stack the slices of a scan into one int16 volume shifted by the rescale intercept.

    This function used to be a line-for-line copy of `ct_transformed_hu`
    (defined in an earlier cell). It now delegates to it so the two can
    never drift apart; the result is the same as before.

    Parameters
    ----------
    dcm_sorted : sequence of DICOM datasets
        Each item must expose `pixel_array`; the first item's
        `RescaleIntercept` is applied to the whole stack.
    threshold : int
        Stored values less than or equal to this are overwritten, before the
        intercept is added.
    replace : int
        Value written in place of the thresholded entries.

    Returns
    -------
    numpy.ndarray
        A new `np.int16` array with the slices stacked along axis 0.
    """
    return ct_transformed_hu(dcm_sorted, threshold=threshold, replace=replace)

In [13]:
#build a volume from the folder at position 19 of the train listing;
#threshold=0 means every stored value <= 0 is replaced before the intercept shift
one_scan = ct_transformed(
    dcm_sort(train_path.ls()[19]),
    threshold=0,
)
#slice count of the volume, and the shape of the window covering slices 20..29
len(one_scan), one_scan[20:30].shape

(64, (10, 512, 512))

In [14]:
#a ten-slice window (indices 20 through 29) of the scan loaded above
ten_slices = one_scan[20:30]

In [23]:
def plot_slices(slices, rows, columns, figsize=(20,10)):
    """Show `slices` on a `rows` x `columns` grid of matplotlib axes.

    Panels are filled row by row: the panel at row `n`, column `m` shows
    `slices[n*columns + m]`.

    Parameters
    ----------
    slices : indexable collection of 2-D images (anything `imshow` accepts)
    rows, columns : int
        Grid dimensions.
    figsize : tuple
        Figure size forwarded to `plt.subplots`.

    Fixes over the previous version:
    * draws from the `slices` argument instead of the global `ten_slices`;
    * the row stride is `columns` rather than a hard-coded 5;
    * works for single-row / single-column grids;
    * panels with no slice left to show are blanked instead of raising
      an IndexError.
    """
    #squeeze=False keeps `ax` two-dimensional even when rows == 1 or columns == 1
    fig, ax = plt.subplots(rows, columns, figsize=figsize, squeeze=False)
    for n in range(rows):
        for m in range(columns):
            idx = n*columns + m
            if idx < len(slices):
                ax[n, m].imshow(slices[idx])
            else:
                #fewer slices than panels: leave the remaining panels empty
                ax[n, m].axis('off')

In [25]:
#list the entries of the train folder
train_path.ls()

(#176) [Path('train/ID00232637202260377586117'),Path('train/ID00134637202223873059688'),Path('train/ID00093637202205278167493'),Path('train/ID00307637202282126172865'),Path('train/ID00364637202296074419422'),Path('train/ID00172637202238316925179'),Path('train/ID00335637202286784464927'),Path('train/ID00213637202257692916109'),Path('train/ID00392637202302319160044'),Path('train/ID00030637202181211009029')...]

Querying each DICOM file every time we need a piece of information is cumbersome, so let's pull the fields we care about out of the DICOMs and into a single dataframe.

In [27]:
%%time
#one sorted list of dicoms per entry of the train folder
#NOTE(review): the recorded run took ~45 s wall time -- consider caching if re-run often
scans = [dcm_sort(folder) for folder in train_path.ls()]

CPU times: user 21.1 s, sys: 11.4 s, total: 32.4 s
Wall time: 45 s


In [33]:
#sanity check: `scans` is a list holding one list of dicoms per train folder (176 in the recorded run)
len(scans)

176

Let's build a dataframe of the right size: 176 rows (one per scan) by however many columns we decide to keep. First, let's remind ourselves what's in a DICOM header.

In [39]:
#first slice of the first scan; displaying it prints its dicom header fields
dcm = scans[0][0]
dcm

Dataset.file_meta -------------------------------
(0002, 0000) File Meta Information Group Length  UL: 200
(0002, 0001) File Meta Information Version       OB: b'\x00\x01'
(0002, 0002) Media Storage SOP Class UID         UI: CT Image Storage
(0002, 0003) Media Storage SOP Instance UID      UI: 2.25.121598162386088989868087504905267208491
(0002, 0010) Transfer Syntax UID                 UI: Explicit VR Little Endian
(0002, 0012) Implementation Class UID            UI: 1.2.276.0.7230010.3.0.3.6.1
(0002, 0013) Implementation Version Name         SH: 'OSIRIX_361'
(0002, 0016) Source Application Entity Title     AE: 'ANONYMOUS'
-------------------------------------------------
(0008, 0008) Image Type                          CS: ['ORIGINAL', 'PRIMARY', 'AXIAL']
(0008, 0018) SOP Instance UID                    UI: 2.25.121598162386088989868087504905267208491
(0008, 0060) Modality                            CS: 'CT'
(0008, 0070) Manufacturer                        LO: 'TOSHIBA'
(0008, 1090) M

In [69]:
#number of dicoms (slices) in the first scan
#NOTE(review): `meow` is read by the `scan_info` cell below; rename both together if renaming
meow = len(scans[0])
meow

67

In [70]:
#prototype of one metadata row: slice count, image rows/columns, slice thickness,
#the two pixel-spacing values and the manufacturer
#assumes `dcm` is still scans[0][0] from the earlier cell -- TODO confirm when re-running out of order
scan_info = [meow, dcm.Rows, dcm.Columns, dcm.SliceThickness,
             dcm.PixelSpacing[0], dcm.PixelSpacing[1],
             dcm.Manufacturer]

In [73]:
#display the prototype row
scan_info

[67, 512, 512, "5.0", "0.683", "0.683", 'TOSHIBA']

In [93]:
%%time
#collect one metadata row per scan, in the same field order as `scan_info`
holder = []
for dcms in scans:
    slice_count = len(dcms)
    #only the first dicom of each scan is consulted
    #assumes these fields are the same for every slice of a scan -- TODO confirm
    #note: this rebinds the notebook-level `dcm` variable on every iteration
    dcm = dcms[0]
    holder.append([slice_count, dcm.Rows, dcm.Columns, 
                   dcm.SliceThickness, dcm.PixelSpacing[0], 
                   dcm.PixelSpacing[1], dcm.Manufacturer])

CPU times: user 4.57 ms, sys: 240 µs, total: 4.81 ms
Wall time: 4.77 ms


In [94]:
#number of rows collected, and the first row as a spot check
len(holder), holder[0]

(176, [67, 512, 512, "5.0", "0.683", "0.683", 'TOSHIBA'])

In [101]:
#turn the collected rows into a dataframe; column order matches the order used in `holder`
#NOTE(review): 'slice_Thickness' is the only column name with a capital letter -- kept as-is
#because other cells may refer to it
metadata = pd.DataFrame(holder, columns=['num_slices', 'rows', 'columns', 
                              'slice_Thickness', 'spacing_x', 
                              'spacing_y', 'manufacturer'])

In [108]:
#check the python type of one numeric-looking value and one text value in the dataframe
type(metadata['spacing_y'][0]), type(metadata['manufacturer'][0])

(numpy.float64, str)

Pandas did the work for us: values that look numeric were cast to numeric dtypes (e.g. `spacing_y` is now `numpy.float64`), while text such as the manufacturer stayed `str`! :)