This notebook randomly samples from the test set for quality control and for quantifying inter-observer variability.

In [None]:
import numpy as np

import glob

import os

import pandas as pd

import shutil

from zipfile import ZipFile

In [None]:
#folder holding the .cvi42wsx workspace files
DATADIR = './data/pericardial/wsx_round2/'

#folder holding the original per-participant zipfiles with the dicoms
RAWDATADIR = './data/imaging_by_participant/'

#bridge csv mapping UK Biobank patient names to f.eids
#(needed because the f.eids were accidentally stripped from the dicoms)
nameMapFile = './data/TT_804_32k_bridge_by_feid.csv'

#load the bridge table, then key it by patient name so f.eids can be looked up with .loc
bridgeTable = pd.read_csv(nameMapFile)
nameMap = bridgeTable.set_index('Patient_name')

In [None]:
#get a list of all of the wsx files
#glob returns matches in whatever order the filesystem yields them, so sort the
#listing: the seeded np.random.choice calls further down only reproduce the same
#sample if the list they draw from is in the same order on every machine and run.
#NOTE: if an earlier sample was drawn from an unsorted listing, sorting may change which files are picked
allwsx = sorted(glob.glob(os.path.join(DATADIR,'*.cvi42wsx')))

First - a random subset of one tenth of the files, rounded up (44 files), for an individual assessment by SP.

In [None]:
#Draw a QC sample of one tenth of the wsx files and build, under DATADIR/QC_sample,
#one folder per f.eid holding the contour file and the extracted long-axis dicoms.

#determinism yo - seed the global numpy RNG so the same draw is made on every run
#(this only holds if allwsx itself is in a stable order)
np.random.seed(11811)

#one tenth of the available files, rounded up
sampleSize = np.ceil(len(allwsx) / 10).astype(int)

#pick a subsample
wsxSample = np.random.choice(allwsx,size = sampleSize,replace=False)

#create a subfolder (exist_ok so that re-running the cell does not crash)
SAMPLEDIR = os.path.join(DATADIR,'QC_sample')
os.makedirs(SAMPLEDIR,exist_ok=True)

#remove the filepaths from the names...
wsxSampleNames = [os.path.basename(x) for x in wsxSample]

#get the patient names (the first 8 characters of each file name)
patientNames = [x[:8] for x in wsxSampleNames]

#get the f.eids using the lookup table..
feids = nameMap.loc[patientNames,'f.eid']
#a duplicated patient name in the lookup table would return extra rows and silently
#pair masks with the wrong f.eid in the zip() below, so check the lookup is one-to-one
assert len(feids) == len(wsxSample),"the patient name lookup did not return exactly one f.eid per sampled file"
feidStr = feids.apply(str)

#create a subdirectory for each feid
for feid in feidStr.values:
    os.makedirs(os.path.join(SAMPLEDIR,feid),exist_ok=True)

#copy all the mask files across, converting name to <f.eid>_contour
for wsxPath,feid in zip(wsxSample,feidStr.values):
    shutil.copy(wsxPath, os.path.join(SAMPLEDIR,feid,feid) + '_contour.cvi42wsx')

#get the zipfile for each feid
zipFilePaths = feidStr.apply(lambda x: os.path.join(RAWDATADIR,x[:2] + 'xxxxx',x,x + '_longaxis.zip'))
#.all() rather than .unique(): with a mix of found and missing files .unique() gives a
#two element array whose truth value is ambiguous, so the message below was never shown
assert zipFilePaths.apply(os.path.isfile).all(),"you messed something up and one or more of the files aren't where you thought"

#unzip each zipfile straight into its own f.eid subdirectory
#the f.eid is used directly instead of slicing it back out of the zip file name,
#and the archive is opened in a with-block so the file handle gets closed
for feid,zipPath in zip(feidStr.values,zipFilePaths.values):
    with ZipFile(zipPath,'r') as archive:
        archive.extractall(os.path.join(SAMPLEDIR,feid))




In [None]:
#show the f.eids looked up in the previous cell, ordered by their index (the patient name)
#as the last expression in the cell, the sorted series is rendered as the cell output
feids.sort_index()

Second - a random sample of 50 for inter-observer variability, drawn with its own seed. The existing masks are not copied into this sample.

In [None]:
#Draw a fixed-size sample of wsx files for the inter-observer comparison and extract the
#long-axis dicoms for each into DATADIR/interObserver_sample/<f.eid>/ (no masks are copied).

#determinism yo - seed the global numpy RNG so the same draw is made on every run
#(this only holds if allwsx itself is in a stable order)
np.random.seed(42069)

sampleSize = 50

#pick a subsample
wsxSample = np.random.choice(allwsx,size = sampleSize,replace=False)

#create a subfolder (exist_ok so that re-running the cell does not crash)
SAMPLEDIR = os.path.join(DATADIR,'interObserver_sample')
os.makedirs(SAMPLEDIR,exist_ok=True)

#remove the filepaths from the names...
wsxSampleNames = [os.path.basename(x) for x in wsxSample]

#get the patient names (the first 8 characters of each file name)
patientNames = [x[:8] for x in wsxSampleNames]

#get the f.eids using the lookup table..
feids = nameMap.loc[patientNames,'f.eid']
#a duplicated patient name in the lookup table would return extra rows, so check
#that exactly one f.eid came back per sampled file
assert len(feids) == len(wsxSample),"the patient name lookup did not return exactly one f.eid per sampled file"
feidStr = feids.apply(str)

#create a subdirectory for each feid
for feid in feidStr.values:
    os.makedirs(os.path.join(SAMPLEDIR,feid),exist_ok=True)

#DO NOT COPY THE MASKS!!!!!
#(presumably so the second observer is not influenced by the existing contours - confirm)

#get the zipfile for each feid
zipFilePaths = feidStr.apply(lambda x: os.path.join(RAWDATADIR,x[:2] + 'xxxxx',x,x + '_longaxis.zip'))
#.all() rather than .unique(): with a mix of found and missing files .unique() gives a
#two element array whose truth value is ambiguous, so the message below was never shown
assert zipFilePaths.apply(os.path.isfile).all(),"you messed something up and one or more of the files aren't where you thought"

#unzip each zipfile straight into its own f.eid subdirectory
#the f.eid is used directly instead of slicing it back out of the zip file name,
#and the archive is opened in a with-block so the file handle gets closed
for feid,zipPath in zip(feidStr.values,zipFilePaths.values):
    with ZipFile(zipPath,'r') as archive:
        archive.extractall(os.path.join(SAMPLEDIR,feid))

#write out a csv so we can look at it later
