In [None]:
import pydicom as dcm

from Converters import parse_cvi42_xml

import pickle

from zipfile import ZipFile

import os

import glob

from IPython.display import clear_output

import re

from matplotlib.path import Path as mPath

import numpy as np

from itertools import compress

import warnings

import matplotlib.pyplot as plt


In [None]:
wsxDir = './data/pericardial/wsx_round2/'  # directory where finalised wsx files are kept.
pairedDir = os.path.join(wsxDir, 'paired')  # subdirectory for outputs.

# makedirs(exist_ok=True) creates missing parents too and does not fail if the
# directory already exists, so the cell can be re-run safely.
os.makedirs(pairedDir, exist_ok=True)

# sorted so the parse order (and therefore which file wins if two wsx files
# produce the same output name) does not depend on filesystem ordering.
wsxFiles = sorted(glob.glob(os.path.join(wsxDir, '*.cvi42wsx')))

# parse all the wsx files into pickles (parseFile is asked to write into pairedDir).
# A plain loop is used because only the side effect matters, not the return values.
for w in wsxFiles:
    parse_cvi42_xml.parseFile(w, output_dir=pairedDir)

# get only the pickle files referring to individual slice names - i.e. named using uids.
# Heuristic: the whole filename must consist of digits and dots followed by '.pickle'.
# Raw string + escaped dot + fullmatch make the pattern mean exactly that.
correctPickle = re.compile(r'[\d.]+\.pickle')
pickles = sorted(
    p for p in glob.glob(os.path.join(pairedDir, '*.pickle'))
    if correctPickle.fullmatch(os.path.basename(p))
)

In [None]:
def find_and_extract_relevant_dcm(picklePaths, outputDir = '.',zippedDataPath='data/imaging_by_participant',zipFilter=r'[\S\s]*'):
    '''Find and extract the dicom file corresponding to each pickle file.

    The expected dicom name is the pickle's basename with '.pickle' replaced by
    '.dcm'. Dicoms already present in outputDir are not searched for again; the
    rest are looked up (by exact archive member name) in the zip files found
    recursively under zippedDataPath.

    picklePaths: path, or list/array of paths, to pickle files (presumably created
        by parseFile() from a cvi42wsx file).
    outputDir: where to put the extracted dicom files.
    zippedDataPath: the top-level directory within which all zipped dicom files reside.
    zipFilter: a regex, matched against the start of each zip's basename, used to
        restrict the search to the zipfiles we care about.

    returns: list of expected output paths (outputDir/<name>.dcm), de-duplicated and
        in the order the names first appear in picklePaths. A path is returned
        whether or not the file was actually found, so callers should check
        os.path.isfile() on each entry.
    '''

    #if 1 file, make it a list
    if isinstance(picklePaths, str):
        picklePaths = [picklePaths]

    #use names of pickles to get names of their (expected) dicom file.
    #dict.fromkeys de-duplicates while keeping first-seen order, so the returned
    #paths line up with the input instead of coming back in arbitrary set order.
    dicomNames = list(dict.fromkeys(
        os.path.basename(p).replace('.pickle', '.dcm') for p in picklePaths
    ))

    #create list of the outputs!
    dicomPaths = [os.path.join(outputDir, d) for d in dicomNames]

    #check for dicom files in the output directory, so we can subset and avoid duplicated work
    alreadyThere = {os.path.basename(f) for f in glob.glob(os.path.join(outputDir, '*.dcm'))}
    missing = [d for d in dicomNames if d not in alreadyThere]

    if not missing:
        print('no work to do!!')
        return dicomPaths

    print('getting list of all zipfiles in path...')
    #get list of ALL zipfiles within top-level directory
    allZips = glob.glob(os.path.join(zippedDataPath, '**', '*.zip'), recursive=True)

    #filter names of zips using regex, and give some idea of how much this has achieved.
    nAllZips = len(allZips)
    zipPattern = re.compile(zipFilter)
    allZips = [z for z in allZips if zipPattern.match(os.path.basename(z))]
    print('regex filtering reduced ' + str(nAllZips) + ' zipfiles to ' + str(len(allZips)))

    total = len(dicomPaths)
    remaining = set(missing)
    for zipPath in allZips:
        if not remaining:
            break  #everything found, no need to open further archives
        #context manager guarantees the archive is closed even if extraction fails
        with ZipFile(zipPath) as zf:
            #set intersection avoids mutating a list while iterating over it
            #(which silently skipped entries) and avoids a linear scan per name.
            found = remaining.intersection(zf.namelist())
            for d in sorted(found):
                zf.extract(d, path=outputDir)
                remaining.discard(d)
                #give some indication of how much is done (share of all requested files)
                print(str(100 * (total - len(remaining)) / total) + '% found and extracted')

    if remaining:
        notFound = [d for d in missing if d in remaining]
        print('warning: not all dicoms found. consider broadening your regex. files not found:\n' + '\n'.join(notFound))

    return dicomPaths


In [None]:
# Extract the dicoms that belong to our pickles. Raw string so the regex escapes
# reach `re` untouched; the filter restricts the search to long axis zips.
find_and_extract_relevant_dcm(picklePaths=pickles,outputDir=pairedDir,zipFilter=r'[\S\s]*_longaxis') #as we are only looking for long axis images.

# Pair every pickle with ITS OWN expected dicom, using the same naming rule as
# find_and_extract_relevant_dcm (basename with '.pickle' -> '.dcm'). The list
# returned by that function is de-duplicated and not guaranteed to be in the
# same order as `pickles`, so it must not be used as a selector for `pickles`.
expectedDicoms = [
    os.path.join(pairedDir, os.path.basename(p).replace('.pickle', '.dcm'))
    for p in pickles
]

#subset for those with image...
dcmFound = [os.path.isfile(d) for d in expectedDicoms]

pickles = list(compress(pickles,dcmFound))
dicomPaths = list(compress(expectedDicoms,dcmFound))


In [None]:
#it is possible that dicom and pickle paths are not in the same order (or do not
#cover exactly the same files), so pair them explicitly by file stem - the part
#of the filename before the extension - rather than trusting two independent sorts.
dicomByStem = {os.path.splitext(os.path.basename(d))[0]: d for d in dicomPaths}

#keep only pickles that have a dicom with the same stem, in a deterministic order...
pickles = sorted(
    p for p in pickles
    if os.path.splitext(os.path.basename(p))[0] in dicomByStem
)
#...and line the dicoms up one-to-one with them.
dicomPaths = [dicomByStem[os.path.splitext(os.path.basename(p))[0]] for p in pickles]


In [None]:
def centered_slice(X, L):
    '''Return the central window of X whose extent along each axis is given by L.

    X: array to crop.
    L: sequence with one target length per axis of X; each entry must lie
       between 0 and the size of the corresponding axis (checked by assertion).
    When the surplus along an axis is odd, the extra element is dropped from the end.
    '''
    target = np.asarray(L)
    full = np.array(X.shape)

    # sanity checks: one length per axis, and nothing larger than the array itself
    assert target.shape == (X.ndim,)
    assert np.all((target >= 0) & (target <= full))

    # first index of the window along every axis
    lower = (full - target) // 2

    # build one slice object per axis and apply them together
    window = tuple(slice(lo, lo + n) for lo, n in zip(lower, target))
    return X[window]

def pad_voxels(voxels,pad_size):
    '''Zero-pad and/or centre-crop the first two axes of an array to pad_size.

    voxels: array with at least 2 dimensions; any axes after the first two
        (e.g. a per-contour channel axis) are left untouched.
    pad_size: sequence of two ints - the required size of axis 0 and axis 1.

    Axes that are too small are padded with zeros (constant mode), split evenly
    between both sides; when the shortfall is odd the extra element goes at the
    end. Axes that are too large are centre-cropped via centered_slice, and a
    warning is issued because pixels at the edges are lost.

    returns: array whose first two axes have exactly the sizes in pad_size.
    '''
    # Work with arrays so the comparisons below are element-wise; comparing two
    # Python lists with '>' is lexicographic and only looks at the first
    # differing element.
    target = np.asarray(pad_size, dtype=int)
    current = np.array(voxels.shape[:2])

    # How much each axis falls short (0 where the input is already big enough).
    shortfall = np.maximum(target - current, 0)
    # Split the shortfall between both sides. 'after' takes the remainder so an
    # odd shortfall is still fully covered (floor on one side, ceil on the other).
    before = shortfall // 2
    after = shortfall - before

    pad_width = [(int(b), int(a)) for b, a in zip(before, after)]
    pad_width += [(0, 0)] * (voxels.ndim - 2)  # never pad trailing (channel) axes

    voxels = np.pad(voxels, pad_width, 'constant')

    if np.any(current > target):
        warnings.warn('Image is larger than padding dimension you specified, so you are losing pixels at the edges')

        # crop only the first two axes; trailing axes keep their full length
        crop_to = [int(t) for t in target] + list(voxels.shape[2:])
        voxels = centered_slice(voxels, crop_to)

    return voxels



def load_image_and_mask(picklePath,dicomPath,pad_size = None, collapse=True,labelFilter=''):

    '''Load a dicom image and rasterise its matching contours into a boolean mask.

    picklePath: path to a pickle holding a dict of contours keyed by contour name
        (an output of parse cvi42wsx). Each value is passed to matplotlib's Path,
        so it is presumably an (N, 2) sequence of (x, y) vertices in pixel
        coordinates - TODO confirm against parseFile.
    dicomPath: path to the corresponding dicom file. Its pixel_array must be 2-D.
    pad_size: optional [rows, cols] size of the output image and mask; images are
        zero-padded or centre-cropped by pad_voxels. None leaves the size unchanged.
    collapse: whether the different contours are or-ed together (i.e. forcing a
        single-channel boolean mask). If False the mask has one channel per
        contour in the pickle, ordered by sorted contour name.
    labelFilter: regex matched against the start of each contour NAME; only
        matching contours are drawn. The default '' matches everything.

    WARNING - with collapse=False, contours rejected by labelFilter still occupy an
    (all-False) channel, and the channel count depends on the pickle's contents,
    so outputs for heterogeneous labels are not directly comparable.

    returns: (im, mask, pxSize) - the image normalised to the range 0-1 as floats
        (all zeros if the image is constant), the boolean mask, and the product of
        the dicom's PixelSpacing values (the area of one pixel).
    '''

    #load dicom image, and read the pixel data once rather than on every use.
    image = dcm.dcmread(dicomPath,stop_before_pixels=False)
    pixels = image.pixel_array

    #load the pickled contour
    #NOTE(security): pickle.load can execute arbitrary code - only use it on
    #pickles this project generated itself, never on files from untrusted sources.
    with open(picklePath,'rb') as f:
        contour = pickle.load(f)

    #consider case where there are >=1 contours per image
    nContours = len(contour)

    #get dimensions of image: axis 0 is rows (y), axis 1 is columns (x)
    nRows,nCols = pixels.shape

    #create indexers for filling in mask. x must run over the COLUMNS and y over
    #the ROWS, otherwise non-square images are indexed out of range / mis-sampled.
    cols,rows = np.meshgrid(np.arange(nCols),np.arange(nRows))
    cols = cols.ravel()
    rows = rows.ravel()
    xy = np.column_stack((cols,rows)) #one (x,y) grid point per pixel

    #create mask which can contain all contours.
    mask = np.zeros((nRows,nCols,nContours),dtype = bool)

    #if no filter specified, the default one will always match
    labelPattern = re.compile(labelFilter)

    for ind,name in enumerate(sorted(contour.keys())):
        #only draw contours whose name matches the regex
        if labelPattern.match(name):
            #get grid points inside contour
            inContour = mPath(contour[name]).contains_points(xy)
            #index into mask (row = y, column = x)
            mask[rows[inContour],cols[inContour],ind] = True

    #if specified, collapse the contour channels down to a single 2D mask.
    #any() is the boolean equivalent of max() but also copes with zero contours.
    if collapse:
        mask = mask.any(axis=2)

    #normalise the raw pixel values to 0-1. Convert to float first so the
    #subtraction cannot overflow integer pixel types, and guard the constant
    #image case to avoid dividing by zero.
    minVal = float(np.min(pixels))
    maxVal = float(np.max(pixels))
    valueRange = maxVal - minVal
    if valueRange > 0:
        im = (pixels.astype(float) - minVal) / valueRange
    else:
        im = np.zeros(pixels.shape,dtype = float)

    #get size of pixels(required for downstream analysis).
    #np.prod replaces the np.product alias, which newer numpy versions no longer provide.
    pxSize = np.prod(image.PixelSpacing)

    if pad_size is not None:

        im = pad_voxels(im,pad_size)
        if mask.ndim == 2:
            mask = pad_voxels(mask,pad_size)
        else:
            #uncollapsed mask: resize each contour channel on its own, then restack
            channels = [pad_voxels(mask[:,:,k],pad_size) for k in range(mask.shape[2])]
            if channels:
                mask = np.stack(channels,axis=2)
            else:
                mask = np.zeros((*im.shape,0),dtype = bool)

    return im,mask,pxSize

In [None]:
#load all files, and put into arrays of dimension (m,x,y)
PADSIZE = [192,208]

#the loop below pairs the two lists positionally, so they must be the same length
assert len(pickles) == len(dicomPaths), 'pickles and dicomPaths are not matched one-to-one'

m = len(pickles)
X = np.zeros((m,*PADSIZE))
Y = np.zeros((m,*PADSIZE),dtype='bool')
pxSize = np.zeros(m)

for ind,(p,d) in enumerate(zip(pickles,dicomPaths)):

    X[ind,:,:],Y[ind,:,:],pxSize[ind] = load_image_and_mask(p,d,PADSIZE,labelFilter='freeDraw')

#remove images without any contours.
use = Y.any(axis=(1,2))
X = X[use,:,:]
Y = Y[use,:,:]
#pxSize has one entry per image, so it must be filtered with the same selector;
#otherwise the saved pxSize.npy no longer lines up with X.npy / Y.npy.
pxSize = pxSize[use]
#keep m in step with the filtered arrays for the cells that follow
m = X.shape[0]

#also filter pickles and dicom paths for later, just in case
pickles = list(compress(pickles,use))
dicomPaths = list(compress(dicomPaths,use))

# save X and Y for use in the ML dev notebook
np.save(os.path.join(wsxDir,'X.npy'),X)
np.save(os.path.join(wsxDir,'Y.npy'),Y)
np.save(os.path.join(wsxDir,'pxSize.npy'),pxSize)

In [None]:
#summary statistics for the area of pcf....

#pxSize needs exactly one entry per image in Y. If it still has one entry per
#loaded image (i.e. it was not filtered when empty masks were dropped), apply
#the same `use` selector here so the multiplication lines up.
pxSizeKept = pxSize if len(pxSize) == len(Y) else pxSize[use]

#number of mask pixels * area of one pixel, divided by 100.
#Assuming PixelSpacing is in mm (as the DICOM standard specifies), pixel area is
#in mm^2 and the division by 100 converts the result to cm^2.
fatArea = np.sum(Y,axis=(1,2)) * pxSizeKept/100 #in cm^2

fig, ax = plt.subplots()
ax.hist(fatArea)
ax.set(xlabel='contoured area (cm^2)', ylabel='number of images', title='Contoured area per image');

In [None]:

#let's have a look: one row per image, image on the left and its mask on the right.
#Use the actual number of images in X rather than a count taken before filtering,
#so the loop cannot index past the end of the arrays.
nShown = X.shape[0]

if nShown == 0:
    print('no images to show')
else:
    #squeeze=False keeps `axes` 2-D even when there is only a single row
    fig, axes = plt.subplots(nShown, 2, figsize=(10, 5*nShown), squeeze=False)
    axes[0, 0].set_title('image')
    axes[0, 1].set_title('mask')
    for i in range(nShown):
        axes[i, 0].imshow(X[i,:,:])
        axes[i, 1].imshow(Y[i,:,:])