In [169]:
import dicom
from functools import reduce
import numpy as np
from operator import add
from os import path
import pandas as pd
from PIL import Image
import random
import numpy as np
import _pickle

In [69]:
# Capture the shell listing of the .dcm files under Indianapolis_dicom as a list of path strings.
filenames = !ls /home/a.kondyukov/data/Indianapolis_dicom/**/*.dcm

So, now let's choose several categories of images for further processing.

In [76]:
# Load the pickled list of per-image records.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open("../pickles/data_list", "rb") as pickle_file:
    data_list = _pickle.load(pickle_file)

In [80]:
# Flatten the per-record lists of major MeSH labels into one list.
# A single comprehension avoids the repeated list copying of reduce(add, ...) and
# yields an empty list (instead of raising TypeError) when data_list is empty.
major_flat = [label for data in data_list for label in data["MeSH"]["major"]]

In [83]:
# Count how often each major MeSH label occurs; value_counts lists the most frequent first.
pd.Series(major_flat).value_counts()

normal                                                                2696
Lung/hypoinflation                                                     467
Lung/hyperdistention                                                   318
Cardiomegaly                                                           269
Cardiomegaly/mild                                                      245
Aorta/tortuous                                                         225
Thoracic Vertebrae/degenerative                                        212
Spine/degenerative                                                     208
Granulomatous Disease                                                  185
Technical Quality of Image Unsatisfactory                              183
Thoracic Vertebrae/degenerative/mild                                   172
No Indexing                                                            172
Atherosclerosis/aorta                                                  172
Markings/bronchovascular 

The most obvious (and still very useful) case is cardiomegaly (CM), so let's start by selecting the images labelled with it.

In [155]:
# Indices of the records whose major MeSH labels mention cardiomegaly (case-insensitive).
cardiomegaly_indices = {
    i for i, data in enumerate(data_list)
    if any("cardiomegaly" in s.lower() for s in data["MeSH"]["major"])
}

# Report the size of the set built above; the filename list is only created in a
# later cell, so referring to it here fails on a fresh kernel.
print(len(cardiomegaly_indices))

655


The next case we can try to detect is a tortuous aorta.

In [156]:
# Indices of the records with a major MeSH label that mentions both "tortuous"
# and "aorta" (case-insensitive).
tort_aort_indices = {
    i for i, data in enumerate(data_list)
    if any("tortuous" in s.lower() and "aorta" in s.lower()
           for s in data["MeSH"]["major"])
}

# Report the size of the set built above; the filename list is only created in a
# later cell, so referring to it here fails on a fresh kernel.
print(len(tort_aort_indices))

453


Atherosclerosis.

In [157]:
# Indices of the records whose major MeSH labels mention atherosclerosis (case-insensitive).
atheros_indices = {
    i for i, data in enumerate(data_list)
    if any("atherosclerosis" in s.lower() for s in data["MeSH"]["major"])
}

# Report the size of the set built above; the filename list is only created in a
# later cell, so referring to it here fails on a fresh kernel.
print(len(atheros_indices))

233


Pleural effusion.

In [158]:
# Indices of the records whose major MeSH labels mention an effusion (case-insensitive).
effusion_indices = {
    i for i, data in enumerate(data_list)
    if any("effusion" in s.lower() for s in data["MeSH"]["major"])
}

# Report the size of the set built above; the filename list is only created in a
# later cell, so referring to it here fails on a fresh kernel.
print(len(effusion_indices))

294


Here we keep only the examples that have exactly one of the four diagnoses selected above.

In [159]:
# Tally, for every record index, how many of the four diagnosis sets contain it.
chosen_indides = {}

for index_set in (cardiomegaly_indices, tort_aort_indices, atheros_indices, effusion_indices):
    for idx in index_set:
        chosen_indides[idx] = chosen_indides.get(idx, 0) + 1

# An index counted more than once belongs to several diagnosis sets.
duplicates = [idx for idx, hits in chosen_indides.items() if hits > 1]

In [160]:
# Remove the multi-diagnosis indices from every set; `-=` on a set updates it in
# place, so the four original sets themselves are shrunk.
for index_set in (cardiomegaly_indices, tort_aort_indices, atheros_indices, effusion_indices):
    index_set -= set(duplicates)

In [161]:
def _local_filenames(indices):
    """Return the "localFilename" entry of each data_list record at the given indices."""
    return [data_list[idx]["localFilename"] for idx in indices]


# One filename list per diagnosis, in the iteration order of the matching index set.
cardiomegaly_filenames = _local_filenames(cardiomegaly_indices)
tort_aort_filenames = _local_filenames(tort_aort_indices)
atheros_filenames = _local_filenames(atheros_indices)
effusion_filenames = _local_filenames(effusion_indices)

In [164]:
!mkdir /home/a.kondyukov/data/Indianapolis_tiff/cardiomegaly
!mkdir /home/a.kondyukov/data/Indianapolis_tiff/torture_aorta
!mkdir /home/a.kondyukov/data/Indianapolis_tiff/atherosclerosis
!mkdir /home/a.kondyukov/data/Indianapolis_tiff/pleural_effusion

In [173]:
for f in cardiomegaly_filenames:
    new_f = path.split(f)[1]
    !cp /home/a.kondyukov/data/Indianapolis_dicom/$f \
        /home/a.kondyukov/data/Indianapolis_sorted/cardiomegaly/$new_f

In [174]:
for f in tort_aort_filenames:
    new_f = path.split(f)[1]
    !cp /home/a.kondyukov/data/Indianapolis_dicom/$f \
        /home/a.kondyukov/data/Indianapolis_sorted/torture_aorta/$new_f

In [175]:
for f in atheros_filenames:
    new_f = path.split(f)[1]
    !cp /home/a.kondyukov/data/Indianapolis_dicom/$f \
        /home/a.kondyukov/data/Indianapolis_sorted/atherosclerosis/$new_f

In [176]:
for f in effusion_filenames:
    new_f = path.split(f)[1]
    !cp /home/a.kondyukov/data/Indianapolis_dicom/$f \
        /home/a.kondyukov/data/Indianapolis_sorted/pleural_effusion/$new_f

And finally, let's convert all images we need to TIFF format.

In [178]:
all_sets_filenames = !ls /home/a.kondyukov/data/Indianapolis_sorted/**/*.dcm

for filename in all_sets_filenames:
    img = Image.fromarray(dicom.read_file(filename).pixel_array)
    img.save(filename.replace("sorted", "sorted_tiff").replace("dcm", "tiff"))