This notebook exists so that we can try to clean up the data missing from UK Biobank. Preliminary work suggests that we can download anything that is listed in the bulk files.

In [None]:
import os

import zipfile

from itertools import compress

import glob

from IPython.display import clear_output

Load and parse all .bulk files, looking only at their union, and subset for those referring to CMR data.

In [None]:
#load bulk files, this will produce a list of strings
#(each entry is kept as one line terminated by exactly one '\n', because later
# cells slice entries by position and ''.join() them back into batch files)
bulkFiles = ['/workspace/storage/restricted-biobank/releases/REVISION_May2019/ID-27289/ukb27289.bulk','/workspace/storage/restricted-biobank/releases/REVISION_May2019/ID-29801/ukb29801.bulk']

bulk = set()

for b in bulkFiles:
    with open(b,'r') as f:
        for line in f:
            #normalise the line ending: a final line without a newline (or a
            #CRLF line) would otherwise count as a different entry from the
            #same record in the other file, and be mis-sliced later on
            entry = line.rstrip('\r\n')
            #blank lines carry no record, so they are not counted
            if entry.strip():
                bulk.add(entry + '\n')

#uniqueify (done by the set above); sort so the order is the same on every run
bulk = sorted(bulk)

print('total of ' + str(len(bulk)) + ' files listed in bulk')

In [None]:
#define dictionaries for mapping between human-readable names of zipfiles, and the UKB codes

code2name = {'20207_2_0':'_scout',
             '20208_2_0':'_longaxis',
             '20209_2_0':'_shortaxis',
             '20210_2_0':'_aorticdistensibility',
             '20211_2_0':'_cinetagging',
             '20212_2_0':'_lvot',
             '20213_2_0':'_bloodflow',
             '20214_2_0':'_experimentalshmollisequence',
            }

#inverse lookup: human-readable suffix -> UKB code
name2code = {v:k for k,v in code2name.items()}

#loop over bulk list, keeping elements which have been marked as CMR.
#The code starts at character 8 of an entry (characters 0-6 are used as the
#participant id elsewhere in this notebook). The newline is removed explicitly
#instead of slicing with [8:-1], which chopped the last character of the code
#(and therefore silently dropped the entry) whenever a line had no trailing newline.
cmrEntries = {f.rstrip('\n') for f in bulk}

#re-attach exactly one newline per entry (batch files are written with ''.join)
#and sort so the order is reproducible
bulk = sorted(e + '\n' for e in cmrEntries if e[8:] in code2name)

print('subsetted for only cmr, there are ' + str(len(bulk)) + ' files.')


Convert the resulting list into names of zipfiles. 

Check for the existence of those zipfiles at the appropriate location, and subset.

In [None]:
def bulk2zip(bulk):

    '''take one bulk entry and return the name of the matching human-readable zipfile.

    Newline characters are removed and a trailing file extension (if any) is
    dropped. Characters 0-6 are then used as the participant id and everything
    from character 8 onwards as the UKB code, which is looked up in code2name.
    Raises KeyError if the code is not in code2name.'''

    #strip newlines first, then discard whatever extension is left over
    stem, _ = os.path.splitext(bulk.replace('\n',''))

    #<id> + <human-readable suffix for the code> + '.zip'
    return stem[:7] + code2name[stem[8:]] + '.zip'


def zip2path(mainDir,zipName):

    '''return where zipName lives inside the hierarchy rooted at mainDir.

    The layout is mainDir/<first two id characters>xxxxx/<id>/<zipName>,
    where <id> is the first seven characters of zipName.'''

    participant = zipName[:7]

    #participants are grouped into folders named after the first two characters of their id
    bucket = participant[:2] + 'xxxxx'

    return os.path.join(mainDir,bucket,participant,zipName)

In [None]:
def check_file(mainDir,bulk):

    '''take a bulk field and check whether its zip equivalent exists in expected location within the file hierarchy. returns True if missing!'''

    zipName = bulk2zip(bulk)
    expectedPath = zip2path(mainDir,zipName)

    #the file IS THERE, so it is not missing
    if os.path.isfile(expectedPath):
        return False

    #no other misspellings are currently known, so anything else that was not found is missing
    if 'aorticdistensibility' not in zipName:
        return True

    #special case: look for the misspelled variant 'aorticdistensibilty' before giving up
    misspelledPath = expectedPath.replace('aorticdistensibility','aorticdistensibilty')

    #True if the file is not there under the alternative spelling either
    return not os.path.isfile(misspelledPath)
        

#create the root folder which can contain the subtree of downloaded/renamed files
mergable = './data/downloaded'
#makedirs also creates './data' when it does not exist yet, and is a no-op if the folder is already there
os.makedirs(mergable, exist_ok=True)

#alternative (disabled): only count a file as missing when it is absent from BOTH trees
# notThere = [check_file('./data/imaging_by_participant',f) and check_file(mergable,f) for f in bulk]

#one boolean per bulk entry, True when the zip is missing
notThere = [check_file('./data/imaging_by_participant',f) for f in bulk]

nMissing = sum(notThere)

#an empty bulk list would otherwise raise ZeroDivisionError here
pctMissing = 100*nMissing//len(bulk) if bulk else 0

print('of ' + str(len(bulk)) + ' cmr files listed in bulk, there are ' + str(nMissing) + ' files missing (' + str(pctMissing) + '%).')

Split the missing files into batches of 500 and download each batch with ukbfetch. Logs are written so that download failures can be checked later.

In [None]:
#keep only the bulk entries that were flagged as missing
fetch = list(compress(bulk,notThere))

batchSize = 500 #batches of 500 files at a time

#ceiling division: no batch at all when nothing is missing, and no empty
#trailing batch when the number of files is an exact multiple of batchSize
nBatches = (len(fetch) + batchSize - 1)//batchSize

#create a folder for batches and their corresponding logs
batchDir = './data/batches'
os.makedirs(batchDir, exist_ok=True)

#remember the .lis files left behind by earlier runs (and when they were last
#written) so that only lists produced by THIS run are counted further down
lisPattern = os.path.join(batchDir,'*.lis')
previousLists = {p: os.path.getmtime(p) for p in glob.glob(lisPattern)}

failedBatches = []

for b in range(nBatches):

    clear_output()
    print('downloading batch ' + str(b+1) + '/' + str(nBatches))

    #write a bulk file for the current batch
    batchFile = os.path.join(batchDir,'batch' + str(b).zfill(3) + '.bulk')

    with open(batchFile,'w') as f:
        f.write(''.join(fetch[b*batchSize:(b+1)*batchSize]))

    #invoke ukbfetch, write log
    outFile = os.path.splitext(batchFile)[0]
    status = os.system('ukbfetch -a./data/k2964.key -b' + batchFile + ' -o' + outFile)

    #a non-zero status means the command could not be run or reported an error
    #NOTE(review): assumes ukbfetch signals failure through its exit status - confirm
    if status != 0:
        failedBatches.append(batchFile)

clear_output()

#get the list of files which have actually been downloaded
#(only lists that are new, or were rewritten, during this run)
downloadedLists = [p for p in sorted(glob.glob(lisPattern))
                   if previousLists.get(p) != os.path.getmtime(p)]

downloaded = []
for l in downloadedLists:
    with open(l,'r') as f:
        #keep the line endings (later cells rely on them) but skip blank lines
        downloaded += [line for line in f if line.strip()]

if failedBatches:
    print('ukbfetch returned a non-zero exit status for: ' + ', '.join(failedBatches))

if fetch:
    print('from ' + str(len(fetch)) + ' attempted downloads, ' +str(len(downloaded)) + ' succeeded (' + str(100*len(downloaded)//len(fetch)) + '%).')
else:
    #avoids a ZeroDivisionError when there was nothing to fetch
    print('no files are missing, nothing was downloaded.')

#no need to log these... any failures will be re-attempted as part of the next run. Also so far there have been no failures

Check the integrity of the zipfiles (!), and move valid ones into a directory structure consistent with that already used so it can be merged.

In [None]:

#bookkeeping lists, all empty to begin with: entries whose zip turns out to be
#corrupted, and source/destination paths of the zips that are fine
corruptedFiles, legitSource, legitDest = [], [], []

#human-readable zip name for every downloaded entry
zipNames = list(map(bulk2zip,downloaded))

#where each of those zips belongs inside the mergable tree
mergableZipPaths = [zip2path(mergable,name) for name in zipNames]

In [None]:
#start from empty lists so that re-running this cell does not record the same files twice
corruptedFiles = []
legitSource = []
legitDest = []
unreadableFiles = []

for d,m in zip(downloaded,mergableZipPaths):

    #entries were read with readlines() and still end in a newline; without
    #stripping it the file can never be found and every download would be
    #reported as corrupted
    zipFileName = d.strip()

    try:
        #opening the archive fails with BadZipFile if the zip is malformed in some way;
        #the with-block makes sure the file handle is closed again straight away
        with zipfile.ZipFile(zipFileName):
            pass

    except zipfile.BadZipFile:
        #if zip is corrupted, add to list in bulk format ('<id> <code>\n') so it can be recorded
        bulkName = zipFileName.replace('.zip','')
        corruptedFiles.append(bulkName[:7] + ' ' + bulkName[8:] + '\n')

    except OSError:
        #missing or unreadable file: there is no corrupted archive to record
        #or delete, so it is tracked separately and only reported
        unreadableFiles.append(zipFileName)

    else:
        legitSource.append(zipFileName)
        legitDest.append(m)

#write out the list of failed/corrupted files...
with open('./data/batches/corrupted.bulk','w') as f:
    f.write(''.join(corruptedFiles))


print('of ' + str(len(downloaded)) +' downloaded files, ' + str(len(corruptedFiles)) + ' gave corrupted zips.')

if unreadableFiles:
    print(str(len(unreadableFiles)) + ' listed files could not be opened at all: ' + ', '.join(unreadableFiles))


In [None]:
#move every validated zip to its place inside the mergable tree
for s,d in zip(legitSource,legitDest):

    #make sure the destination folder (and any missing parents) exists
    destDir = os.path.dirname(d)
    os.makedirs(destDir, exist_ok=True)

    #do the move
    os.rename(s,d)

In [None]:
#clean up/remove corrupted files.
#entries are stored in bulk format, so rebuild the zip file name before deleting:
#the space becomes '_' and the trailing newline becomes '.zip'
for corruptedEntry in corruptedFiles:
    os.remove(corruptedEntry.replace(' ','_').replace('\n','.zip'))