In [130]:
import pydicom as dicom
import os
import numpy as np
from PIL import Image
from matplotlib import pyplot as plt, cm
from skimage import transform, io

In [131]:
def read_dcm(path):
    """Read the DICOM file at ``path`` with pydicom and return its ``pixel_array``."""
    return dicom.dcmread(path).pixel_array

def read_png(path):
    """Read the image file at ``path`` with ``skimage.io.imread`` and return the result."""
    image = io.imread(path)
    return image

def resize(img, dims):
    """Resize ``img`` to the output shape ``dims`` with ``skimage.transform.resize``.

    Borders are handled with ``mode='reflect'``; every other option is left at
    the scikit-image default.
    """
    resized = transform.resize(img, dims, mode='reflect')
    return resized

In [132]:
def load(path, dims=None):
    """Load the image stored at ``path`` and optionally resize it.

    The reader is chosen from the (case-insensitive) file extension:
    ``.dcm`` goes through ``read_dcm`` and ``.png`` through ``read_png``.
    Any other extension yields ``None``.

    When ``dims`` is truthy the pixels are passed through ``resize`` before
    being returned; otherwise they are returned exactly as read.
    """
    suffix = os.path.splitext(path)[1].lower()

    if suffix == '.dcm':
        image = read_dcm(path)
    elif suffix == '.png':
        image = read_png(path)
    else:
        # Unsupported file type: signal it to the caller with None.
        return None

    return resize(image, dims) if dims else image

In [133]:
def display(pixels):
    """Open a new matplotlib figure and draw ``pixels`` in it with ``plt.imshow``."""
    plt.figure()
    plt.imshow(pixels)

In [134]:
def get_dicoms(path):
    """Recursively collect the paths of all DICOM files found below ``path``.

    Every directory reachable through ``os.walk(path)`` is visited and each
    file whose extension is ``.dcm`` (compared case-insensitively) is
    reported as ``os.path.join(directory, filename)``. Returns a list, in
    walk order.
    """
    return [
        os.path.join(root, filename)
        for root, _subdirs, filenames in os.walk(path)
        for filename in filenames
        if os.path.splitext(filename)[1].lower() == '.dcm'
    ]

In [135]:
import csv

def csv_to_dict(path, key_header, val_header, name_key):
    """Build a ``{image name: value}`` lookup from a CSV file with a header row.

    Parameters
    ----------
    path : str
        Location of the CSV file. Its first row must contain the headers.
    key_header : str
        Header of the column holding an image file path. The dictionary key
        is ``get_img_name(<that path>, name_key)``.
    val_header : str
        Header of the column whose cell becomes the dictionary value
        (kept as the raw string read from the file).
    name_key : str
        Substring forwarded to ``get_img_name`` to pick the path component
        used as the key.

    Returns
    -------
    dict
        One entry per data row; if several rows produce the same key the
        last one wins.

    Raises
    ------
    ValueError
        If ``key_header`` or ``val_header`` is not present in the header row.
    """
    # `with` guarantees the handle is closed (the reader used to keep it open
    # until garbage collection); newline='' is the mode the csv module asks
    # for so that newlines embedded in quoted fields are parsed correctly.
    with open(path, newline='') as csv_file:
        reader = csv.reader(csv_file)
        headers = next(reader)
        key_idx, val_idx = headers.index(key_header), headers.index(val_header)

        # csv.reader yields [] for blank lines; skip them instead of failing
        # with an IndexError on e.g. a trailing empty line.
        return {
            get_img_name(row[key_idx], name_key): row[val_idx]
            for row in reader
            if row
        }
    

# Substrings handed to convert_data() as `name_key`; get_img_name() uses them
# to pick the matching component out of an image path.
train_key = 'Mass-Training'
test_key = 'Mass-Test'

# Default CSV column headers used by convert_data(): the column with the
# image path (dictionary key) and the column with the label (dictionary value).
path_header = 'image file path'
density_header = 'breast_density'

def get_img_name(path, key):
    """Return the first ``'/'``-separated component of ``path`` containing ``key``.

    Components are examined left to right; ``None`` is returned when none of
    them contains ``key``.
    """
    return next((part for part in path.split('/') if key in part), None)

In [136]:
def combine_and_save(out_folder, dcm_lst, label_dict, name_key, dims):
    """
    Loads every image in dcm_lst, pairs it with its label and saves both.

    Two arrays are written into out_folder through save():
      * '<name_key>-Examples' - the stacked pixel data, scaled to [0, 1]
      * '<name_key>-Labels'   - the labels, in the same order as the images

    out_folder: directory handed to save()
    dcm_lst:    list of image paths; each is read with load(path, dims)
    label_dict: maps get_img_name(path, name_key) to that image's label
    name_key:   substring identifying the image-name component of a path,
                also used as prefix of the two output names
    dims:       output shape forwarded to load(); falsy means "no resize"

    Raises KeyError when an image name has no entry in label_dict.

    NOTE(review): labels are stored exactly as found in label_dict. They are
    NOT binarised here (e.g. density 1-2 -> 0 'not dense', 3-4 -> 1 'dense');
    if that mapping is required it has to be applied by the consumer - confirm.
    """

    example_list, label_list = [], []
    total = len(dcm_lst)

    # `dcm_path` rather than `dicom`, so the pydicom module alias is not shadowed.
    for count, dcm_path in enumerate(dcm_lst, start=1):
        name = get_img_name(dcm_path, name_key)

        example_list.append(load(dcm_path, dims))
        label_list.append(label_dict[name])

        print("{0} out of {1} processed. {2}% complete".format(count, total, int(100 * count / total)))

    examples = np.array(example_list)
    if not dims:
        # Raw (un-resized) pixel data is still on the 16-bit integer scale.
        # Resized images must not be divided again: load() resizes through
        # skimage.transform.resize with the default preserve_range=False,
        # which already converts integer input to floats in [0, 1].
        examples = examples / UINT16_MAX_VALUE

    save(out_folder, '{}-Examples'.format(name_key), examples)
    save(out_folder, '{}-Labels'.format(name_key), np.array(label_list))

def save(folder, name, arr):
    """Store ``arr`` with ``np.savez`` under the target ``'<folder>/<name>'``.

    ``arr`` is passed positionally, so it is saved under numpy's default
    positional key.
    """
    target = '/'.join((folder, name))
    np.savez(target, arr)

In [137]:
# Default output shape that convert_data() forwards as `dims` for resizing.
RESNET_INPUT_SIZE = (224, 224)
# Largest value of an unsigned 16-bit integer; divisor used by
# combine_and_save() when scaling pixel data.
UINT16_MAX_VALUE = 65535
# Not referenced by the code visible in this notebook.
NUM_CLASSES = 2
SPLIT = 2

def convert_data(images_path, labels_path, out_folder, name_key,
                 key_header=path_header, val_header=density_header, dims=RESNET_INPUT_SIZE):
    """Convert a folder of DICOM images plus a label CSV into saved arrays.

    Steps, in order:
      1. find every DICOM file below ``images_path`` (``get_dicoms``),
      2. read the label lookup from ``labels_path`` (``csv_to_dict``) using
         the ``key_header`` / ``val_header`` columns and ``name_key``,
      3. load, label and store everything in ``out_folder``
         (``combine_and_save``), resizing images to ``dims``.
    """
    dicom_paths = get_dicoms(images_path)
    labels_by_name = csv_to_dict(labels_path, key_header, val_header, name_key)
    combine_and_save(
        out_folder,
        dicom_paths,
        labels_by_name,
        name_key,
        dims,
    )

In [138]:
# Folders that are searched for DICOM images.
train_img_path = 'Train'
test_img_path = 'Test'

# CSV files providing the label for each image.
train_labels_path = 'Mass-Training-Description.csv'
test_labels_path = 'Mass-Testing-Description.csv'

# Folder that receives the saved example/label arrays.
out = 'processed'
os.makedirs(out, exist_ok=True)

convert_data(train_img_path, train_labels_path, out, train_key)
print("Done with train")

convert_data(test_img_path, test_labels_path, out, test_key)
print("Done with test")

1 out of 354 processed. 0% complete
2 out of 354 processed. 0% complete
3 out of 354 processed. 0% complete
4 out of 354 processed. 1% complete
5 out of 354 processed. 1% complete
6 out of 354 processed. 1% complete
7 out of 354 processed. 1% complete
8 out of 354 processed. 2% complete
9 out of 354 processed. 2% complete
10 out of 354 processed. 2% complete
11 out of 354 processed. 3% complete
12 out of 354 processed. 3% complete
13 out of 354 processed. 3% complete
14 out of 354 processed. 3% complete
15 out of 354 processed. 4% complete
16 out of 354 processed. 4% complete
17 out of 354 processed. 4% complete
18 out of 354 processed. 5% complete
19 out of 354 processed. 5% complete
20 out of 354 processed. 5% complete
21 out of 354 processed. 5% complete
22 out of 354 processed. 6% complete
23 out of 354 processed. 6% complete
24 out of 354 processed. 6% complete
25 out of 354 processed. 7% complete
26 out of 354 processed. 7% complete
27 out of 354 processed. 7% complete
28 out of 

215 out of 354 processed. 60% complete
216 out of 354 processed. 61% complete
217 out of 354 processed. 61% complete
218 out of 354 processed. 61% complete
219 out of 354 processed. 61% complete
220 out of 354 processed. 62% complete
221 out of 354 processed. 62% complete
222 out of 354 processed. 62% complete
223 out of 354 processed. 62% complete
224 out of 354 processed. 63% complete
225 out of 354 processed. 63% complete
226 out of 354 processed. 63% complete
227 out of 354 processed. 64% complete
228 out of 354 processed. 64% complete
229 out of 354 processed. 64% complete
230 out of 354 processed. 64% complete
231 out of 354 processed. 65% complete
232 out of 354 processed. 65% complete
233 out of 354 processed. 65% complete
234 out of 354 processed. 66% complete
235 out of 354 processed. 66% complete
236 out of 354 processed. 66% complete
237 out of 354 processed. 66% complete
238 out of 354 processed. 67% complete
239 out of 354 processed. 67% complete
240 out of 354 processed.

74 out of 156 processed. 47% complete
75 out of 156 processed. 48% complete
76 out of 156 processed. 48% complete
77 out of 156 processed. 49% complete
78 out of 156 processed. 50% complete
79 out of 156 processed. 50% complete
80 out of 156 processed. 51% complete
81 out of 156 processed. 51% complete
82 out of 156 processed. 52% complete
83 out of 156 processed. 53% complete
84 out of 156 processed. 53% complete
85 out of 156 processed. 54% complete
86 out of 156 processed. 55% complete
87 out of 156 processed. 55% complete
88 out of 156 processed. 56% complete
89 out of 156 processed. 57% complete
90 out of 156 processed. 57% complete
91 out of 156 processed. 58% complete
92 out of 156 processed. 58% complete
93 out of 156 processed. 59% complete
94 out of 156 processed. 60% complete
95 out of 156 processed. 60% complete
96 out of 156 processed. 61% complete
97 out of 156 processed. 62% complete
98 out of 156 processed. 62% complete
99 out of 156 processed. 63% complete
100 out of 1