In [1]:
from querymachine import QueryMachine
import matplotlib.pyplot as plt
import pydicom
import os
import pandas as pd
from tqdm import tqdm
pd.options.display.max_rows = 4000
# from sklearn.model_selection import train_test_split

In [2]:
qm = QueryMachine()
response = qm.getSeries()

In [3]:
savepath = '/data/CBIS-DDSM'
os.makedirs(savepath, exist_ok=True)
# b_dir = savepath + '/benign'
# m_dir = savepath + '/malignant'
# raw_dir = savepath + '/raw_dcm'
train_dir = savepath + '/train'
valid_dir = savepath + '/valid'
# os.makedirs(b_dir, exist_ok=True)
# os.makedirs(m_dir, exist_ok=True)
# os.makedirs(raw_dir, exist_ok=True)
os.makedirs(train_dir, exist_ok=True)
os.makedirs(valid_dir, exist_ok=True)
for pth in [train_dir, valid_dir]:
    os.makedirs(os.path.join(pth, 'benign'), exist_ok=True)
    os.makedirs(os.path.join(pth, 'malignant'), exist_ok=True)

In [4]:
calc = pd.read_csv('calc_case_description_train_set.csv')
mass = pd.read_csv('mass_case_description_train_set.csv')
labels_train = calc.append(mass, sort=False)
labels_train['set'] = 'train'
labels_train.set
calc = pd.read_csv('calc_case_description_test_set.csv')
mass = pd.read_csv('mass_case_description_test_set.csv')
labels_test = calc.append(mass, sort=False)
labels_test['set'] = 'test'
print(labels_train.shape, labels_test.shape)
labels = labels_train.append(labels_test)
labels.shape

(2864, 18) (704, 18)


(3568, 18)

In [5]:
labels.index = range(labels.shape[0])

In [6]:
cropped = [r['SeriesInstanceUID'] for r in response.json() if 'cropped' in r['SeriesDescription']]
ids = [r['SeriesInstanceUID'] for r in response.json() if 'full' in r['SeriesDescription']]
ids.extend(cropped)
len(ids), len(response.json())

(3210, 6775)

In [7]:
labs = {}
for f in ids:
    for row in labels.loc[:, [c for c in labels.columns if 'file path' in c]].itertuples(index=True, name=None):
        for name in row[1:]:
            if f in name.split('/'):
                label = labels.loc[row[0],'pathology'].split('_')[0].lower()
                img_set = labels.loc[row[0], 'set']
                labs[f] = (label, img_set)
len(labs)

3210

In [8]:
train_counter = 0
test_counter = 0
for k,v in labs.items():
    if v[1] == 'train':
        train_counter += 1
    else:
        test_counter += 1
train_counter, test_counter

(2552, 658)

In [9]:
train_counter + test_counter

3210

In [10]:
def downloadImages(response_list, labs, savepath):
    roi_counter = 0
    img_counter = 0
    failed_reads = 0
    for item in tqdm(response_list):
        series_id = item['SeriesInstanceUID']
        if series_id not in labs.keys():
            continue
#         if 'ROI' in item['SeriesDescription']:
#             roi_counter += int(item['ImageCount'])
#         else:
#             img_counter += int(item['ImageCount'])
        label, img_set = labs[series_id]
        qm.getSeriesImages(series_id, savepath=savepath)
        images = [f for f in os.listdir(savepath) if '.dcm' in f]
        for f in images:
            filepath = os.path.join(savepath, f)
            ds = pydicom.dcmread(filepath)
#             try:
#                 ds = pydicom.dcmread(filepath)
#             except:
#                 continue
            os.remove(filepath)
#             if not label:
#                 return ds
            if img_set == 'train':
                jpg_savepath = savepath + '/train'
            else:
                jpg_savepath = savepath + '/valid'
            jpg_savepath = jpg_savepath + '/' + label
            jpg_name = series_id + '_' + f
            jpg_filepath = os.path.join(jpg_savepath, jpg_name)
            jpg_filepath = jpg_filepath.replace('.dcm', '.jpg')
            ds.decompress()
            img = ds.pixel_array
            plt.imsave(jpg_filepath, ds.pixel_array,
                       vmin=0, vmax=2**16, format='jpg')
#     return roi_counter, img_counter

In [None]:
# roi_counter, img_counter = downloadImages(s.json(), labels_train, labels_test, savepath)
downloadImages(response.json(), labs, savepath)

 24%|██▍       | 1636/6775 [09:27<3:49:33,  2.68s/it]

In [6]:
def getLabel(ds, labels_train, labels_test):
    train = True
    label = None
    sid = ds.SeriesInstanceUID
    pid = ds.PatientID
    for c in labels_train.columns[1:]:
        for i, item in enumerate(labels_train[c]):
            if sid in item.split('/') or pid in item.split('/'):
                label = labels_train.pathology.iloc[i]
    if not label:
        train = False
        for c in labels_test.columns[1:]:
            for i, item in enumerate(labels_test[c]):
                if sid in item.split('/') or pid in item.split('/'):
                    label = labels_test.pathology.iloc[i]
    return train, label

In [51]:
s = set()
for d in response.json():
    s.add(d['SeriesInstanceUID'])
len(s)

6775

In [11]:
s.json()[0]

{'SeriesInstanceUID': '1.3.6.1.4.1.9590.100.1.2.117041576511324414842508325652101471266',
 'StudyInstanceUID': '1.3.6.1.4.1.9590.100.1.2.229361142710768138411679379233064924540',
 'Modality': 'MG',
 'SeriesDescription': 'ROI mask images',
 'BodyPartExamined': 'BREAST',
 'SeriesNumber': '1.000000',
 'Collection': 'CBIS-DDSM',
 'Visibility': '1',
 'ImageCount': 2}

In [68]:
rois = 0
imgs = 0
for d in response.json():
    if 'ROI' in d['SeriesDescription']:
        rois += d['ImageCount']
    else:
        imgs += d['ImageCount']
rois, imgs

(7026, 3213)

In [14]:
labels.columns

Index(['patient_id', 'breast density', 'left or right breast', 'image view',
       'abnormality id', 'abnormality type', 'calc type', 'calc distribution',
       'assessment', 'pathology', 'subtlety', 'image file path',
       'cropped image file path', 'ROI mask file path', 'breast_density',
       'mass shape', 'mass margins'],
      dtype='object')

In [22]:
labels['image file path'].unique().shape

(3103,)

In [4]:
labels_train = labels_train.loc[:, [c for c in labels_train.columns if 'image file path' in c or 'pathology' in c]]
labels_test = labels_test.loc[:, [c for c in labels_test.columns if 'image file path' in c or 'pathology' in c]]
labels = labels_train.append(labels_test)
labels.shape

(3568, 3)

In [5]:
labels.head()

Unnamed: 0,pathology,image file path,cropped image file path
0,MALIGNANT,Calc-Training_P_00005_RIGHT_CC/1.3.6.1.4.1.959...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...
1,MALIGNANT,Calc-Training_P_00005_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....
2,BENIGN,Calc-Training_P_00007_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...
3,BENIGN,Calc-Training_P_00007_LEFT_MLO/1.3.6.1.4.1.959...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...
4,BENIGN_WITHOUT_CALLBACK,Calc-Training_P_00008_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...


In [6]:
D = []
for d in os.listdir(raw_dir):
    filename = os.path.join(raw_dir, d)
    ds = pydicom.dcmread(filename)
    D.append(ds)
len(D)

600

In [23]:
ds = D[0]

In [24]:
ds

(0008, 0005) Specific Character Set              CS: 'ISO_IR 100'
(0008, 0016) SOP Class UID                       UI: Secondary Capture Image Storage
(0008, 0018) SOP Instance UID                    UI: 1.3.6.1.4.1.9590.100.1.2.171972048511494182814787636691927294819
(0008, 0020) Study Date                          DA: '20161004'
(0008, 0023) Content Date                        DA: '20160426'
(0008, 0030) Study Time                          TM: '151730'
(0008, 0033) Content Time                        TM: '123236.585000'
(0008, 0050) Accession Number                    SH: ''
(0008, 0060) Modality                            CS: 'MG'
(0008, 0064) Conversion Type                     CS: 'WSD'
(0008, 0090) Referring Physician's Name          PN: ''
(0008, 103e) Series Description                  LO: 'ROI mask images'
(0010, 0010) Patient's Name                      PN: 'Mass-Test_P_00932_LEFT_MLO_1'
(0010, 0020) Patient ID                          LO: 'Mass-Test_P_00932_LEFT_MLO_1'
(001

In [33]:
uid = '/'.join([ds.PatientID, ds.StudyInstanceUID, ds.SeriesInstanceUID])
# uid = '/'.join([ds.PatientID, ds.SeriesInstanceUID, ds.StudyInstanceUID])
uid

'Mass-Test_P_00932_LEFT_MLO_1/1.3.6.1.4.1.9590.100.1.2.387680469813552393102452998402059443767/1.3.6.1.4.1.9590.100.1.2.213455545211549072236204855191519693004'

In [34]:
uid in labels['image file path']

False

In [35]:
uid in labels['cropped image file path']

False

In [36]:
uid in labels['ROI mask file path']

False

In [40]:
pids = [l.split('/')[0] for l in labels['ROI mask file path']]
len(pids)

3568

In [41]:
pids[0]

'Calc-Training_P_00005_RIGHT_CC_1'

In [43]:
counter = 0
for p in pids:
    if ds.PatientID in p:
        counter += 1
counter

1

In [45]:
s = set()
for ds in D:
    s.add(ds.SeriesDescription)
s

{'ROI mask images', 'cropped images'}

In [None]:
ds.PatientID

In [6]:
def getLabel(ds, labels_train, labels_test):
    train = True
    label = None
    sid = ds.SeriesInstanceUID
    pid = ds.PatientID
    for c in labels_train.columns[1:]:
        for i, item in enumerate(labels_train[c]):
            if sid in item.split('/') or pid in item.split('/'):
                label = labels_train.pathology.iloc[i]
    if not label:
        train = False
        for c in labels_test.columns[1:]:
            for i, item in enumerate(labels_test[c]):
                if sid in item.split('/') or pid in item.split('/'):
                    label = labels_test.pathology.iloc[i]
    return train, label

In [7]:
def downloadImages(ids, labels_train, labels_test, savepath):
    roi_counter = 0
    img_counter = 0
    for item in tqdm(ids):
        series_id = item['SeriesInstanceUID']
        if 'ROI' in item['SeriesDescription']:
            roi_counter += int(item['ImageCount'])
        else:
            img_counter += int(item['ImageCount'])
        qm.getSeriesImages(series_id, savepath=savepath)
        images = [f for f in os.listdir(savepath) if '.dcm' in f]
        for f in images:
            filepath = os.path.join(savepath, f)
            try:
                ds = pydicom.dcmread(filepath)
            except:
                continue
            os.remove(filepath)
            train, label = getLabel(ds, labels_train, labels_test)
            if not label:
                return ds
            if train:
                jpg_savepath = savepath + '/train'
            else:
                jpg_savepath = savepath + '/valid'
            jpg_savepath = jpg_savepath + '/' + label.split('_')[0].lower()
            jpg_filepath = os.path.join(jpg_savepath, f)
            jpg_filepath = jpg_filepath.replace('.dcm', '.jpg')
            ds.decompress()
            img = ds.pixel_array
            plt.imsave(jpg_filepath, ds.pixel_array,
                       vmin=0, vmax=2**16, format='jpg')
    return roi_counter, img_counter

In [None]:
roi_counter, img_counter = downloadImages(s.json(), labels_train, labels_test, savepath)

  3%|▎         | 214/6775 [04:56<2:08:06,  1.17s/it]

In [24]:
counter = 0
for dirname, subdirs, filenames in os.walk(savepath):
    for f in filenames:
        counter += 1
counter

35

In [25]:
img_counter

2

In [14]:
counter

0

In [20]:
ds.PatientID

'Calc-Training_P_00474_LEFT_MLO_1'

In [8]:
downloadImages(s.json(), labels_train, labels_test, savepath)

 94%|█████████▍| 6384/6775 [2:08:50<10:54,  1.67s/it]  

AttributeError: 'NoneType' object has no attribute 'split'

 94%|█████████▍| 6384/6775 [2:09:10<10:54,  1.67s/it]

In [13]:
counter = 0
for dirname, subdirs, filenames in os.walk(savepath):
    for f in filenames:
        counter += 1
print(counter)

1044


In [24]:
counter = 0
for dirname, subdirs, filenames in os.walk(savepath):
    for f in filenames:
        counter += 1
print(counter)

1572


In [14]:
savepath

'/data/CBIS-DDSM'

In [25]:
sid = ds.SeriesInstanceUID
for c in labels.columns[1:]:
    for i, item in enumerate(labels[c]):
        if sid in item.split('/'):
            label = labels.pathology.iloc[i]

In [26]:
label

NameError: name 'label' is not defined

In [27]:
labels.shape

(2864, 3)

In [28]:
sid = 'Mass-Test_P_01595_LEFT_CC_1'
label = None
for c in labels.columns[1:]:
    for i, item in enumerate(labels[c]):
        if sid in item.split('/'):
            label = labels.pathology.iloc[i]
print(label)

None


In [10]:
downloadImages(test[:10], labels, savepath, train=False)

100%|██████████| 10/10 [00:03<00:00,  2.86it/s]


In [3]:
calc = pd.read_csv('calc_case_description_train_set.csv')
print(calc.shape)
mass = pd.read_csv('mass_case_description_train_set.csv')
df = calc.append(mass)
print(df.shape)

(1546, 14)
(2864, 17)


of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort,


In [6]:
dcm_files = [os.path.join(savepath, f) for f in os.listdir(savepath) if '.dcm' in f]
dcm_files

['/data/CBIS-DDSM/1-292.dcm',
 '/data/CBIS-DDSM/2-136.dcm',
 '/data/CBIS-DDSM/1-118.dcm',
 '/data/CBIS-DDSM/2-246.dcm',
 '/data/CBIS-DDSM/2-168.dcm',
 '/data/CBIS-DDSM/1-250.dcm',
 '/data/CBIS-DDSM/2-122.dcm',
 '/data/CBIS-DDSM/2-102.dcm',
 '/data/CBIS-DDSM/2-112.dcm']

In [18]:
img_files = df.loc[:, [c for c in calc.columns if 'image file path' in c or 'pathology' in c]]
img_files.head()

Unnamed: 0,pathology,image file path,cropped image file path
0,MALIGNANT,Calc-Training_P_00005_RIGHT_CC/1.3.6.1.4.1.959...,Calc-Training_P_00005_RIGHT_CC_1/1.3.6.1.4.1.9...
1,MALIGNANT,Calc-Training_P_00005_RIGHT_MLO/1.3.6.1.4.1.95...,Calc-Training_P_00005_RIGHT_MLO_1/1.3.6.1.4.1....
2,BENIGN,Calc-Training_P_00007_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00007_LEFT_CC_1/1.3.6.1.4.1.95...
3,BENIGN,Calc-Training_P_00007_LEFT_MLO/1.3.6.1.4.1.959...,Calc-Training_P_00007_LEFT_MLO_1/1.3.6.1.4.1.9...
4,BENIGN_WITHOUT_CALLBACK,Calc-Training_P_00008_LEFT_CC/1.3.6.1.4.1.9590...,Calc-Training_P_00008_LEFT_CC_1/1.3.6.1.4.1.95...


In [14]:
ds.SeriesInstanceUID

'1.3.6.1.4.1.9590.100.1.2.43873839610761788013224723323225482381'

In [16]:
for f in dcm_files:
    ds = pydicom.dcmread(f)
    print(ds.SeriesDescription)

cropped images
cropped images
cropped images
cropped images
cropped images
cropped images
cropped images
cropped images
cropped images


In [15]:
for f in dcm_files:
    ds = pydicom.dcmread(f)
    sop = ds.SeriesInstanceUID
    for c in img_files.columns:
        for i, item in enumerate(img_files[c]):
            if sop in item.split('/'):
                print(f, i, item, c, '\n')

/data/CBIS-DDSM/1-292.dcm 1591 Mass-Training_P_00061_RIGHT_MLO_1/1.3.6.1.4.1.9590.100.1.2.195593486612988388325770883972107282733/1.3.6.1.4.1.9590.100.1.2.43873839610761788013224723323225482381/000000.dcm cropped image file path 

/data/CBIS-DDSM/1-292.dcm 1591 Mass-Training_P_00061_RIGHT_MLO_1/1.3.6.1.4.1.9590.100.1.2.195593486612988388325770883972107282733/1.3.6.1.4.1.9590.100.1.2.43873839610761788013224723323225482381/000001.dcm
 ROI mask file path 

/data/CBIS-DDSM/2-136.dcm 1913 Mass-Training_P_00519_RIGHT_CC_1/1.3.6.1.4.1.9590.100.1.2.143790815011189157220543617630697462001/1.3.6.1.4.1.9590.100.1.2.245633900110007082034118990512969470333/000000.dcm cropped image file path 

/data/CBIS-DDSM/2-136.dcm 1913 Mass-Training_P_00519_RIGHT_CC_1/1.3.6.1.4.1.9590.100.1.2.143790815011189157220543617630697462001/1.3.6.1.4.1.9590.100.1.2.245633900110007082034118990512969470333/000001.dcm
 ROI mask file path 

/data/CBIS-DDSM/1-118.dcm 2421 Mass-Training_P_01250_RIGHT_MLO_1/1.3.6.1.4.1.9590.10

In [None]:
for item in img_files.iloc[1591, :]:
    