This notebook takes the image/mask pairs sorted by extract_dcm_for_wsx, and:
 - loads them, gets the areas of fat shown in the manually-created masks
 - maps them back to the pairs created by matching_diabetics_healthy_controls
 - writes out a sensibly-formatted spreadsheet, with one healthy/diabetic pair per row
 - does some basic statistics allowing us to draw conclusions about whether or not we are measuring a sensible thing

In [None]:
import pandas as pd

import numpy as np

from mask_utils import load_image_and_mask

import glob

import os

import matplotlib.pyplot as plt

from scipy import stats

import pydicom as dcm

In [None]:
# Directory where the finalised wsx files are kept.
wsxDir = './data/pericardial/wsx_round2/'
# 'paired' subdirectory of wsxDir; the extracted dicoms and their mask pickles are read from here.
pairedDir = os.path.join(wsxDir, 'paired')

# CSV mapping UK Biobank patient names to f.eid (the f.eid values were stripped from the dicoms).
nameMapFile = './data/TT_804_32k_bridge_by_feid.csv'

# Directory for graph outputs.
graphDir = './graphs'

In [None]:
#load the patient name -> f.eid bridge table, then index it by patient name so rows can be looked up with .loc
nameMap = pd.read_csv(nameMapFile)
nameMap = nameMap.set_index('Patient_name')

In [None]:
#these are all the dicoms extracted by extract_dcm_for_wsx
#(sorted, because glob returns files in arbitrary order and a fixed order keeps reruns reproducible)
extractedDicoms = sorted(glob.glob(os.path.join(pairedDir,'*.dcm')))

#get the corresponding pickles containing masks - as the folder will contain many more pickles, using the dicoms is the simplest way
#only the extension is swapped: str.replace would also rewrite a '.dcm' appearing anywhere else in the path
maskPickles = [os.path.splitext(d)[0] + '.pickle' for d in extractedDicoms]

#every dicom must have a mask pickle - if this fails, something else has gone very wrong.
#an explicit raise is used because assert statements are skipped under python -O, and the message names the missing files
missingPickles = [p for p in maskPickles if not os.path.isfile(p)]
if missingPickles:
    raise FileNotFoundError('no mask pickle found for: ' + ', '.join(missingPickles))

In [None]:
def get_fat_area(picklePath,dicomPath):
    '''Return the area of fat (in cm^2) defined by the mask for one pickle/dicom pair.

    The mask and the pixel size come from load_image_and_mask; the image it also
    returns is not needed here. The area is the sum of the mask values multiplied
    by the pixel size, divided by 100.
    '''
    _, fatMask, pixelArea = load_image_and_mask(picklePath, dicomPath)

    #pixel size is in mm^2, so dividing by 100 gives cm^2
    maskTotal = np.sum(fatMask.flatten())
    return maskTotal * pixelArea / 100

def get_feid_from_dicom(dicomPath):
    '''Return the f.eid of the patient named in the header of the dicom at dicomPath.

    The PatientName is normalised to the format used by the index of nameMap and
    then looked up there.

    Raises:
        KeyError: if the normalised patient name has no entry in nameMap. The
            message names the patient and the dicom file that caused it.
    '''

    #load the dicom file (stop_before_pixels: only the header is needed for the name)
    d = dcm.dcmread(dicomPath,stop_before_pixels=True)
    #extract the name, and modify string to be in the same format as that of nameMap
    patientName = str(d.PatientName).replace('^Bio','').replace(' ','').upper()

    #FIXME THIS COULD POTENTIALLY USE PatientID field as well... but not a problem with my dataset.

    #there should never be missing values, but if there is one, say which file caused it rather than surfacing a bare pandas KeyError
    if patientName not in nameMap.index:
        raise KeyError('patient name ' + repr(patientName) + ' (from ' + str(dicomPath) + ') not found in nameMap')

    #look up the corresponding f.eid in nameMap
    return nameMap.loc[patientName,'f.eid']

#build the f.eid -> fat area lookup from the matching pickle/dicom lists; it can then be used with a dataframe
fatAreas = dict(
    (get_feid_from_dicom(dicomPath), get_fat_area(picklePath, dicomPath))
    for picklePath, dicomPath in zip(maskPickles, extractedDicoms)
)

In [None]:
#load the matched healthy/diabetic pairs; the 'healthy feid' and 'diabetic feid' columns are used below to attach fat areas
paired = pd.read_csv('./data/matched_diabetics_healthy_250.csv')

In [None]:
def get_fat(feid):
    '''Return the fat area recorded in fatAreas for feid, or NaN if there is none.'''
    try:
        return fatAreas[feid]
    except (KeyError, TypeError):
        #KeyError: no mask was drawn for this f.eid; TypeError: the value is unhashable.
        #anything else (e.g. fatAreas not being defined yet) is a real error and must not be
        #hidden, otherwise the whole column silently becomes NaN.
        return np.nan

In [None]:
#attach the fat area for each member of the pair; get_fat gives NaN where no area is available
for feidColumn, areaColumn in (
    ('healthy feid', 'healthy fat area (cm2)'),
    ('diabetic feid', 'diabetic fat area (cm2)'),
):
    paired.loc[:, areaColumn] = paired[feidColumn].apply(get_fat)

In [None]:

#scatter of healthy against diabetic fat area, one point per matched pair, with the line of unity for reference
fig, ax = plt.subplots(figsize=(5,5))

lims = [0,130]
ax.plot(lims, lims, label='line of unity', c='k')
ax.scatter(
    paired['healthy fat area (cm2)'],
    paired['diabetic fat area (cm2)'],
    label='age/sex-matched pairs',
    alpha=0.5,
)
ax.set_xlabel('healthy fat area (cm$^2$)')
ax.set_ylabel('diabetic fat area (cm$^2$)')
# ax.axis('equal') -- disabled in the original; the axis limits are set explicitly below
ax.set_xlim(lims)
ax.set_ylim(lims)

ax.legend()

#save both a vector and a raster copy
for scatterName in (
    'healthy_diabetic_paired_scatter.svg',
    'healthy_diabetic_paired_scatter.png',
):
    fig.savefig(os.path.join(graphDir, scatterName))

In [None]:
#normality test...
#scipy.stats.normaltest tests the null hypothesis that a sample comes from a normal distribution;
#nan_policy='omit' ignores the NaNs left where no fat area was available.
#only the p-values are kept (they are shown in the histogram legend below); stat is overwritten by the second call
stat,healthy_normal_p = stats.normaltest(paired['healthy fat area (cm2)'],nan_policy='omit')
stat,diabetic_normal_p = stats.normaltest(paired['diabetic fat area (cm2)'],nan_policy='omit')


In [None]:
#start a new figure: without this the histograms are drawn on whatever figure is current
#(e.g. the scatter plot) whenever the code is run outside an inline notebook backend
plt.figure()

bins = np.arange(0,130,5)

#drop NaNs per column, so that each histogram shows exactly the sample its normality p-value was computed on
#(a plain paired.dropna() would also discard rows because of NaNs in unrelated columns)
healthyAreas = paired['healthy fat area (cm2)'].dropna()
diabeticAreas = paired['diabetic fat area (cm2)'].dropna()

plt.hist(healthyAreas,density=True,alpha=0.5,bins = bins,label = 'healthy, p = ' + f'{healthy_normal_p:.3}')
plt.hist(diabeticAreas,density=True,alpha = 0.5,bins = bins,label = 'diabetic, p = '+ f'{diabetic_normal_p:.3}')

plt.xlim(0,130)

plt.legend()

plt.xlabel('pericardial fat area (cm$^2$)')
plt.ylabel('probability density')

plt.savefig(os.path.join(graphDir,'healthy_diabetic_histogram.svg'))
plt.savefig(os.path.join(graphDir,'healthy_diabetic_histogram.png'))

In [None]:
#normality test on the log-transformed areas, i.e. a check of whether the areas look log-normal
#nan_policy='omit' ignores the NaNs left where no fat area was available.
#note: the natural log is used here while the histogram below uses log10; the two differ only by a
#constant factor, which does not affect the skewness and kurtosis that normaltest is based on
stat,healthy_lognormal_p = stats.normaltest(np.log(paired['healthy fat area (cm2)']),nan_policy='omit')
stat,diabetic_lognormal_p = stats.normaltest(np.log(paired['diabetic fat area (cm2)']),nan_policy='omit')


In [None]:
#start a new figure: without this the histograms are drawn on whatever figure is current
#(e.g. the previous plot) whenever the code is run outside an inline notebook backend
plt.figure()

bins = np.arange(0.6,2.4,0.1)

#drop NaNs per column, so that each histogram shows exactly the sample its normality p-value was computed on
#(a plain paired.dropna() would also discard rows because of NaNs in unrelated columns)
healthyAreas = paired['healthy fat area (cm2)'].dropna()
diabeticAreas = paired['diabetic fat area (cm2)'].dropna()

plt.hist(np.log10(healthyAreas),density=True,bins=bins,alpha=0.5,label = 'healthy, p = ' + f'{healthy_lognormal_p:.3}')
plt.hist(np.log10(diabeticAreas),density=True,bins=bins,alpha = 0.5,label = 'diabetic, p = ' + f'{diabetic_lognormal_p:.3}')

plt.legend()

plt.xlabel('log$_{10}$(pericardial fat area (cm$^2$))')
plt.ylabel('probability density')

plt.savefig(os.path.join(graphDir,'healthy_diabetic_log_histogram.svg'))
plt.savefig(os.path.join(graphDir,'healthy_diabetic_log_histogram.png'))

In [None]:
#paired t-test on the log-transformed areas; nan_policy='omit' leaves out pairs where an area is missing
healthyFat = paired['healthy fat area (cm2)']
diabeticFat = paired['diabetic fat area (cm2)']
t,p = stats.ttest_rel(np.log(healthyFat),np.log(diabeticFat),nan_policy='omit')

#n is the number of pairs with both areas present, i.e. the pairs the test used.
#(paired.dropna() would also drop rows because of NaNs in unrelated columns and so could under-report n)
nPairs = int((healthyFat.notna() & diabeticFat.notna()).sum())

#written as p <= 0.05 so that a NaN p-value (test could not be computed) is never reported as significant
significant = p <= 0.05

print('difference between healthy and diabetic fat area is' + ('' if significant else ' not') + ' statistically significant, with t = ' + f'{t:.3}' + ', p = ' + f'{p:.3}' + ' (n = ' + str(nPairs) + ')')

In [None]:
#write the pairs table, now including the fat area columns, to both output locations
for outputPath in (
    './data/matched_diabetics_healthy_250_with_manual_fat.csv',
    './data/pericardial/matched_diabetics_healthy_250_with_manual_fat.csv',
):
    paired.to_csv(outputPath)

In [None]:
#same again for the original spreadsheet, which has more patient data associated with each f.eid
originalFile = './data/healthy_diabetics.csv'
original = pd.read_csv(originalFile, index_col=0)

#look up the manual fat area for every f.eid (NaN where get_fat has no area) and store it as a column
manualFat = original['f.eid'].apply(get_fat)
original.loc[:, 'manual fat area (cm2)'] = manualFat

#the file is overwritten in place
original.to_csv(originalFile)