In [1]:
import dicom
from functools import reduce
import json
from matplotlib import pyplot as plt
from multiprocessing import Pool
import numpy as np
from operator import add
from os import path
import pandas as pd
import _pickle
import re
import requests
from scipy.misc import imresize
import sys

%matplotlib inline

First we need to download image descriptions from openi.

In [2]:
data_list = []

In [3]:
i = 0

while True:
    begin = i * 30 + 1
    end = (i + 1) * 30
    resp = requests.get("https://openi.nlm.nih.gov/retrieve.php?it=x&coll=iu&fields=msh&m={}&n={}".format(begin, end))
    tmp_list = json.loads(resp.text)["list"]
    if len(tmp_list) < 1:
        break
    data_list.extend(tmp_list)
    
    i += 1
    
    if i % 100 == 0:
        print(begin, end)

2971 3000
5971 6000


Add filename to dict of image.

In [4]:
pattern = re.compile(r"(.*?)([0-9]+)_IM-([^\.]*)\.(.*)")

for data in data_list:
    data["localFilename"] = re.sub(pattern, r"\2/\2_IM-\3.dcm", data["imgLarge"])

Check whether filenames are correct

In [5]:
dicom_filepaths = !ls /home/a.kondyukov/data/Indianapolis_dicom/**/*
filenames_set = set(dicom_filepaths)

for data in data_list:
    var = "/home/a.kondyukov/data/Indianapolis_dicom/" + data["localFilename"]
    assert var in filenames_set
        
        
new_filenames_set = set(["/home/a.kondyukov/data/Indianapolis_dicom/" + data["localFilename"] for data in data_list])
for cpath in dicom_filepaths:
    assert cpath in new_filenames_set, cpath

Fold majors list so as to find most common cases.

In [6]:
majors = [data["MeSH"]["major"] for data in data_list]
localFilenames = [data["localFilename"] for data in data_list]

In [7]:
majors_flat = reduce(add, majors)

In [8]:
cases = [
    "hypoinflation", 
    "hyperdistention", 
    "cardiomegaly", 
    "tortuous aorta", 
    "vertebrae degenerative",
    "spine degenerative",
    "granulomatous",
    "atherosclerosis",
    "calcinosis aorta",
    "emphysema"
        ]

In [9]:
cases_df = pd.Series(majors_flat).value_counts()
cases_df.index = pd.Series(cases_df.index).apply(lambda s: s.strip()).values

Several dicts useful for mapping from disease to index in target array and vice versa.

In [10]:
case_sets = dict()

for case in cases:
    case_sets[case] = set()
    for major, count in cases_df.items():
        if all([w in major.lower() for w in case.split(" ")]):
            if count >= 10:
                case_sets[case].add(major)

In [11]:
reverse_case_dict = dict()

for case in case_sets:
    for cur_case in case_sets[case]:
        reverse_case_dict[cur_case] = case

In [12]:
case_num_dict = dict()

for i, c in enumerate(case_sets):
    case_num_dict[c] = i

In [13]:
Y_width = len(case_sets)

filename_case_dict = dict()

for img_desc, major in zip(data_list, majors):
    cur_res = np.zeros((Y_width, ), dtype=np.bool)
    for i, cur_case in enumerate(major):
        if cur_case in reverse_case_dict:
            cur_res[case_num_dict[reverse_case_dict[cur_case]]] = True
    filename_case_dict[img_desc["localFilename"]] = cur_res.tolist()

In [14]:
json.dump(reverse_case_dict, open("../pickles/case_group.json", "w"))
json.dump(case_num_dict, open("../pickles/group_num.json", "w"))
json.dump(filename_case_dict, open("../pickles/filename_arr.json", "w"))

In [22]:
IMAGE_SIZE = 1024

Load images, resize them and obtain np.array; prepare target variables.

In [43]:
num_examples = len(filename_case_dict)
X = np.zeros((num_examples, IMAGE_SIZE, IMAGE_SIZE, 1))
y = np.zeros((num_examples,), dtype=[
        ("emphysema", "i1"), 
        ("hypoinflation", "i1"), 
        ("athersclerosis", "i1"),
        ("granulomatous", "i1"),
        ("calcinosis aorta", "i1"),
        ("spine degenerative", "i1"),
        ("tortuous aorta", "i1"),
        ("cardiomegaly", "i1"),
        ("vertebrae degenerative", "i1"),
        ("hyperdistention", "i1")
    ])

def fun(args):
    k, v = args
    cur_dcm = dicom.read_file(path.join("/home/a.kondyukov/data/Indianapolis_dicom/", k))
    return imresize(cur_dcm.pixel_array, [IMAGE_SIZE, IMAGE_SIZE]), tuple(v)

rX = []
rY = []

with Pool(20) as p:
    for rx, ry in tqdm_notebook(
        p.imap(fun, filename_case_dict.items(), chunksize=5), 
        total=len(filename_case_dict) // 5):
        
        rX.append(rx)
        rY.append(ry)

          6341it [17:14,  6.13it/s]

In [48]:
X = np.array(rX)

for i, ry in enumerate(rY):
    y[i] = ry

In [61]:
y

array([(0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 0, 0, 0, 1, 0, 0, 0),
       (0, 0, 0, 1, 0, 0, 1, 0, 0, 0), ..., (0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
       (0, 0, 0, 0, 0, 0, 0, 0, 0, 0), (0, 0, 0, 1, 0, 0, 0, 0, 1, 0)], 
      dtype=[('emphysema', 'i1'), ('hypoinflation', 'i1'), ('athersclerosis', 'i1'), ('granulomatous', 'i1'), ('calcinosis aorta', 'i1'), ('spine degenerative', 'i1'), ('tortuous aorta', 'i1'), ('cardiomegaly', 'i1'), ('vertebrae degenerative', 'i1'), ('hyperdistention', 'i1')])

In [70]:
np.savez_compressed("../pickles/data.npz", X=X, y=y)