This notebook uses a trained model (created by `unet_dev.ipynb` or `mrunet_dev.ipynb`) to predict the area of pericardial fat in a much larger subsample of the UK Biobank dataset.

In [None]:
import pandas as pd

import numpy as np

from mask_utils import load_image

from tensorflow.keras.models import model_from_json

import os

from network_utils import gpu_memory_limit,predict_stochastic
from MultiResUNet.MultiResUNet import MultiResUnet

import pickle

import tempfile

import zipfile

import re

import glob

import tensorflow as tf

import pydicom as dcm

In [None]:
#GPU memory policy for this notebook.
#A fixed cap via network_utils.gpu_memory_limit is available but currently disabled:
# gpu_memory_limit(6000)
#Instead, memory growth is switched on for every GPU that TensorFlow can see.

gpus = tf.config.experimental.list_physical_devices('GPU')
try:
    #iterating an empty list does nothing, so no separate "are there GPUs?" check is needed
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as e:
    #report the problem and carry on
    print(e)

First, load the model

In [None]:
#load the model
modelBaseName = 'mrunet_bayesian_2020-07-13_13:40' 

#location of the actual saved model (same stem for architecture, weights and conversion model)
modelBaseName = os.path.join('data','models',modelBaseName)

modelParamFile = modelBaseName + '.h5'
modelArchitecture = modelBaseName + '.json'

#rebuild the network from its JSON description, then restore the saved weights
with open( modelArchitecture , 'r') as json_file:
    MODEL = model_from_json( json_file.read() )

MODEL.load_weights(modelParamFile)

#hyperparameter N, defined according to quantify_model_performance.ipynb
N = 15


accuracyModelPath = modelBaseName + '_prediction_conversion.pickle'
#NOTE: unpickling can execute arbitrary code - only load pickles produced by this project.
#The context manager closes the file handle, which a bare open() inside pickle.load() never did.
with open(accuracyModelPath,'rb') as accuracyModelFile:
    ACCURACYMODEL = pickle.load(accuracyModelFile)

#file for writing results
RESULTSFILE = os.path.join('data','UKB_pericardial_fat_predictions.csv')

Now, load the details for image preprocessing:


In [None]:
#preprocessing parameters, passed to load_image as padSize / desiredPxSpacing
#NOTE: unpickling can execute arbitrary code - only load pickles produced by this project.
with open(os.path.join('data','PADSIZE.pickle'),'rb') as padSizeFile:
    PADSIZE = pickle.load(padSizeFile)

with open(os.path.join('data','PXSPACING.pickle'),'rb') as pxSpacingFile:
    PXSPACING = pickle.load(pxSpacingFile)

#product of the pixel spacings, i.e. the area of a single pixel.
#np.prod replaces np.product, which was deprecated and then removed in NumPy 2.0.
PXAREA = np.prod(PXSPACING)

In [None]:
#get list of all LAX zipfiles.
#glob returns matches in a filesystem-dependent order, so sort them to make the
#processing order (and therefore the row order of the results file) reproducible.
allZips = sorted(glob.glob(os.path.join('data','imaging_by_participant','**','*_longaxis.zip'),recursive = True))

In [None]:
def get_manifest(zipfileObject):
    '''Read the manifest table stored inside an open zip archive.

    Parameters
    ----------
    zipfileObject : zipfile.ZipFile
        An open archive.

    Returns
    -------
    pandas.DataFrame or None
        The manifest parsed as CSV (with ``index_col=False``), or ``None`` when
        the archive does not contain exactly one member whose name starts with
        ``manifest``.
    '''
    #match on the name prefix only, so the extension of the manifest does not matter.
    #(the previous pattern 'manifest*' meant "manifes" + any number of "t")
    manifestFiles = [f for f in zipfileObject.namelist() if f.startswith('manifest')]

    if len(manifestFiles) != 1:
        if manifestFiles:
            print('more than one manifest found')
        else:
            print('no manifest found')
        return None

    #parse straight from the archive - no need to unpack into a temporary directory
    with zipfileObject.open(manifestFiles[0]) as manifestFile:
        return pd.read_csv(manifestFile, index_col=False)

In [None]:
def first_image_in_series(zipfileObject,listOfDicomFiles):
    '''Find the frame of a series whose DICOM TriggerTime is zero.

    Parameters
    ----------
    zipfileObject : zipfile.ZipFile
        Open archive containing the DICOM files.
    listOfDicomFiles : array-like of str
        Archive member names to search.

    Returns
    -------
    str or None
        Name of the first member (in sorted order) whose ``TriggerTime`` equals
        0.0, or ``None`` when the list is empty or no member qualifies.
    '''
    #first sort the list, as usually the one with the lowest trigger time is also the first one after sorting
    sortedList = np.sort(listOfDicomFiles)

    with tempfile.TemporaryDirectory() as tempDir:
        for dicom in sortedList:
            #extract() returns the path it actually wrote to
            extractedPath = zipfileObject.extract(dicom,path=tempDir)
            #only the header is needed here; dcmread replaces the deprecated read_file alias
            header = dcm.dcmread(extractedPath,stop_before_pixels=True)
            #a file without a TriggerTime tag is skipped rather than raising AttributeError
            if getattr(header,'TriggerTime',None) == 0.0:
                return dicom

    #nothing matched - make the fall-through explicit for callers
    return None

def extract_first_4Ch_image(zipfilePath):
    '''Load the first frame of the 4-chamber cine from a long-axis zip archive.

    The archive's manifest is searched for rows labelled
    ``CINE_segmented_LAX_4Ch``. If they form a single series, its first frame
    (as chosen by ``first_image_in_series``) is used. If there are several
    series, only those with exactly 50 images are considered and the one with
    the latest DICOM ``SeriesTime`` is used.

    Parameters
    ----------
    zipfilePath : str
        Path to the zip archive.

    Returns
    -------
    tuple
        ``(image, imagedDate)`` where ``image`` is the first element of what
        ``load_image`` returns for the chosen DICOM and ``imagedDate`` is the
        ``date`` entry of the first 4-chamber row of the manifest.
        ``(None, None)`` is returned when the archive has no usable manifest,
        no 4-chamber rows, no suitable first frame, or the image fails to load.
    '''
    seriesKeys = ['series discription','seriesid']

    #the context manager closes the archive on every return path
    with zipfile.ZipFile(zipfilePath) as zipfileObject:

        manifest = get_manifest(zipfileObject)
        if manifest is None:
            #get_manifest has already reported the problem
            return None,None

        #index for 4ch images: rows in which any column carries the 4Ch label
        Index4Ch = (manifest =='CINE_segmented_LAX_4Ch').any(axis=1)

        if not Index4Ch.any():
            #if nothing labelled as a 4-chamber image, return nothing
            return None,None

        with tempfile.TemporaryDirectory() as tempDir:
            #get only the 4Chamber ones
            manifest = manifest.loc[Index4Ch,:]
            #separate the series
            series = manifest.groupby(seriesKeys)
            #get the date
            imagedDate = manifest['date'].iloc[0]

            if series.count().shape[0] == 1:
                #if there is only one series used, then get the first image from that one.
                firstDicom = first_image_in_series(zipfileObject,manifest['filename'].values)
                if firstDicom is None:
                    dicomPath = None
                else:
                    dicomPath = zipfileObject.extract(firstDicom,path=tempDir)

            else:
                #if there is more than one series, do some logic

                #filter for number of images - should be exactly 50
                manifest  = series.filter(lambda x: x.count().max() == 50)

                #get the first dicom of every remaining series together with its series time
                candidates = []
                for _,seriesRows in manifest.groupby(seriesKeys):
                    candidate = first_image_in_series(zipfileObject,seriesRows['filename'])
                    if candidate is None:
                        #this series has no frame with a zero trigger time
                        continue
                    candidatePath = zipfileObject.extract(candidate,path=tempDir)
                    seriesTime = dcm.dcmread(candidatePath,stop_before_pixels=True).SeriesTime
                    candidates.append((seriesTime,candidatePath))

                #and take the latest one (assuming it will be better...);
                #max() keeps the first of equal maxima, like np.argmax did
                if candidates:
                    dicomPath = max(candidates,key=lambda c: c[0])[1]
                else:
                    dicomPath = None

            if dicomPath is None:
                #no series offered a usable first frame
                return None,None

            try:
                image = load_image(dicomPath=dicomPath,desiredPxSpacing=PXSPACING, padSize=PADSIZE)
                return image[0],imagedDate
            except Exception as err:
                #best effort: one unreadable image must not stop a long batch run,
                #but say which archive failed instead of hiding it
                print('could not load image from',zipfilePath,':',err)
                return None,None

In [None]:
#FIXME: keep this list in step with the output arguments of network_utils/predict_stochastic.
#An earlier, longer version of the list was:
# ['consensus','uncertainty','meanArea (mm2)','stdArea (mm2)','mpDsc','gDsc','mpIou','gIou']
RESNAMES = [
    'meanArea (cm2)',
    'stdArea (cm2)',
    'predicted DSC',
]

def get_feid(zipfilePath):
    '''Return the first seven characters of the file name in ``zipfilePath``.

    The directory part of the path is discarded. Used as the participant
    identifier (``f.eid``) in the results.
    '''
    _,fileName = os.path.split(zipfilePath)
    return fileName[0:7]

def quantify_fat(zipfilePath):
    '''Run the stochastic prediction for the archive at ``zipfilePath``.

    Returns a dict that always holds ``'f.eid'``. When an image could be
    extracted it also holds ``'date'`` and one entry per name in ``RESNAMES``,
    with the two area entries multiplied by ``PXAREA/100``.
    '''
    #start the record with the participant id
    resultDict = {'f.eid': get_feid(zipfilePath)}

    #extract the pixels for the image.
    im,imagedDate = extract_first_4Ch_image(zipfilePath)

    #no usable image: the record carries the id only
    if im is None:
        return resultDict

    resultDict['date'] = imagedDate

    #add a leading and a trailing singleton axis (presumably batch and channel - TODO confirm)
    im = im.reshape((1,*im.shape,1))
    res = predict_stochastic(MODEL,N,ACCURACYMODEL,im) #FIXME remove the unnecessary metrics

    #the first two outputs are skipped; the remaining ones are labelled by RESNAMES
    for name,value in zip(RESNAMES,res[2:]):
        resultDict[name] = value

    #ensure that units of area are correct (presumably mm2 per pixel -> cm2; TODO confirm)
    areaScale = PXAREA/100
    resultDict['meanArea (cm2)'] *= areaScale
    resultDict['stdArea (cm2)'] *= areaScale

    return resultDict
    
    

In [None]:
#collect one result dict per archive, then build the table in a single step.
#(growing a DataFrame with DataFrame.append copied the whole frame on every
# iteration, and the method no longer exists in pandas 2.0)
records = []

for zipfilePath in allZips:
    records.append(quantify_fat(zipfilePath))

#archives without a usable image contribute only 'f.eid'; their other columns are left empty
results = pd.DataFrame(records)

results.to_csv(RESULTSFILE)