In [None]:
import pydicom as dcm

from Converters import parse_cvi42_xml

import pickle

from zipfile import ZipFile

import os

import glob

from IPython.display import clear_output

import re

from matplotlib.path import Path as mPath

import numpy as np

from itertools import compress

import warnings

import matplotlib.pyplot as plt

from mask_utils import load_image_and_mask


In [None]:
wsxDir = './data/pericardial/wsx_round2/'  # directory where finalised wsx files are kept.

pairedDir = os.path.join(wsxDir, 'paired')  # subdirectory for outputs.

# exist_ok=True replaces the separate isdir() check (no check-then-create race),
# and makedirs also creates any missing parent directory.
os.makedirs(pairedDir, exist_ok=True)

wsxFiles = glob.glob(os.path.join(wsxDir, '*.cvi42wsx'))

# parse all the wsx files into pickles.
# A plain loop is used because parseFile is called only for its side effect
# (presumably writing pickle files into pairedDir - see the glob below).
for wsxFile in wsxFiles:
    parse_cvi42_xml.parseFile(wsxFile, output_dir=pairedDir)

# get only the pickle files referring to individual slice names - i.e. named using uids.
# The per-workspace '*_contours_dct.pickle' files are excluded.
pickles = [
    p for p in glob.glob(os.path.join(pairedDir, '*.pickle'))
    if '_contours_dct.pickle' not in p
]

In [None]:
def find_and_extract_relevant_dcm(fileNames, outputDir = '.',zippedDataPath='data/imaging_by_participant',zipFilter=r'[\S\s]*'):
    '''Find the dicom file corresponding to each pickle file and extract it from the zipped data.

    The expected dicom name is the pickle's basename with '.pickle' replaced by '.dcm'.
    A dicom is only found if a zip archive has a member whose name is exactly that
    (i.e. the file sits at the root of the archive).

    fileNames: path, or list/array of paths, to pickle files (or corresponding dicom files) created by parseFile()
    outputDir: where to put the dicom files. Dicoms already present there are not searched for again.
    zippedDataPath: the top-level directory within which all zipped dicom files reside (searched recursively).
    zipFilter: a regex used to filter for only the zipfiles we care about. It is applied with
               re.match to the zip's basename, so it is anchored at the start of the name only.

    returns: sorted list of the expected output paths (outputDir/<name>.dcm) for every requested file,
             whether or not the dicom was actually found - check with os.path.isfile().
    '''

    # if 1 file (str or path-like), make it a list
    if isinstance(fileNames, (str, os.PathLike)):
        fileNames = [fileNames]

    # use names of pickles to get the (unique) names of their expected dicom files
    dicomNames = {os.path.basename(os.fspath(p).replace('.pickle', '.dcm')) for p in fileNames}

    # list of the expected outputs, sorted so the result is deterministic
    dicomPaths = [os.path.join(outputDir, d) for d in sorted(dicomNames)]

    # check for dicom files in the output directory, so we can subset and avoid duplicated work
    alreadyThere = {os.path.basename(f) for f in glob.glob(os.path.join(outputDir, '*.dcm'))}
    missing = dicomNames - alreadyThere

    if not missing:
        print('no work to do!!')
        return dicomPaths

    print('getting list of all zipfiles in path...')
    # get list of ALL zips within top-level directory (sorted for a reproducible search order)
    allZips = sorted(glob.glob(os.path.join(zippedDataPath, '**', '*.zip'), recursive=True))

    # filter names of zips using regex, and give some idea of how much this has achieved.
    nAllZips = len(allZips)
    zipPattern = re.compile(zipFilter)
    allZips = [z for z in allZips if zipPattern.match(os.path.basename(z))]
    nFilteredZips = len(allZips)
    print('regex filtering reduced ' + str(nAllZips) + ' zipfiles to ' + str(nFilteredZips) )

    for zipPath in allZips:
        if not missing:
            break  # everything found - no need to open further archives
        # the context manager closes the archive even if extraction raises
        with ZipFile(zipPath) as zf:
            # compute the hits first, then update `missing`: removing entries from the
            # collection while iterating over it skips elements, so several dicoms
            # living in the same zip would not all be extracted.
            found = missing.intersection(zf.namelist())
            for d in found:
                zf.extract(d, path=outputDir)
            missing -= found

    if missing:
        print('warning: not all dicoms found. consider broadening your regex. files not found:\n' + '\n'.join(sorted(missing)))

    return dicomPaths


In [None]:
# extract the dicoms matching our pickles into pairedDir.
# the (raw-string) regex restricts the search to zips whose name contains '_longaxis',
# as we are only looking for long axis images.
find_and_extract_relevant_dcm(fileNames=pickles,outputDir=pairedDir,zipFilter=r'[\S\s]*_longaxis')

# pair every pickle with its expected dicom explicitly, rather than sorting two lists
# independently and hoping they line up. The naming rule mirrors the one used inside
# find_and_extract_relevant_dcm: <outputDir>/<pickle basename with .pickle -> .dcm>.
pickles = sorted(pickles)
dicomPaths = [os.path.join(pairedDir, os.path.basename(p.replace('.pickle', '.dcm'))) for p in pickles]

# subset for those with image...
dcmFound = [os.path.isfile(d) for d in dicomPaths]

pickles = list(compress(pickles, dcmFound))
dicomPaths = list(compress(dicomPaths, dcmFound))


In [None]:
# load all files, and put into arrays of dimension (m,x,y)
PADSIZE = [210,210]

m = len(pickles)
X = np.zeros((m,*PADSIZE))
Y = np.zeros((m,*PADSIZE),dtype='bool')
pxSize = np.zeros(m)

for ind,(p,d) in enumerate(zip(pickles,dicomPaths)):
    # load_image_and_mask returns (image, mask, pixel size) for one pickle/dicom pair;
    # only contours labelled 'freeDraw' are requested.
    X[ind,:,:],Y[ind,:,:],pxSize[ind] = load_image_and_mask(p,d,PADSIZE,labelFilter='freeDraw')

# remove images without any contours.
use = np.max(Y, axis=(1, 2)) > 0
X = X[use,:,:]
Y = Y[use,:,:]
# pxSize must be filtered with the same mask: otherwise it no longer lines up with the
# rows of X/Y (the saved pxSize.npy would be misaligned, and any element-wise
# combination of Y and pxSize fails with a shape mismatch once an image is dropped).
pxSize = pxSize[use]

# NOTE: pickles and dicomPaths are not filtered here, so they still describe every
# loaded pair; use list(compress(pickles, use)) if per-row file names are needed.

# save X, Y and pxSize for use in the ML dev notebook
np.save(os.path.join(wsxDir,'X.npy'),X)
np.save(os.path.join(wsxDir,'Y.npy'),Y)
np.save(os.path.join(wsxDir,'pxSize.npy'),pxSize)

In [None]:
# summary statistics for the area of pcf: one value per image in Y.

# number of mask pixels per image, scaled by that image's pxSize and divided by 100.
# Labelled "in mm^2" by the original author.
# NOTE(review): confirm the units of pxSize and what the /100 factor converts.
fatArea = Y.sum(axis=(1, 2)) * pxSize / 100

plt.hist(fatArea)

In [None]:

# let's have a look: one row per image, image on the left and its mask on the right.
# Use the current number of rows in X rather than m: X and Y may have been filtered
# (images without contours removed) after m was computed, and indexing up to m would
# then raise an IndexError.
nImages = X.shape[0]

if nImages > 0:  # a zero-height figure is not valid, so skip plotting when there is nothing to show
    plt.figure(figsize = (10,2*nImages))
    for i in range(nImages):
        plt.subplot(nImages,2,i*2+1)
        plt.imshow(X[i,:,:])
        plt.subplot(nImages,2,i*2+2)
        plt.imshow(Y[i,:,:])