This notebook is for developing the script for syncing data to a local machine, in order to speed up manual segmentation. 

In [None]:
import datetime
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from IPython.display import clear_output

In [None]:
# Read the table of matched diabetic / healthy participants
# (created by matching_diabetics_healthy_controls.ipynb).
matchedSubjects = pd.read_csv('./data/matched_diabetics_healthy_all.csv')

# Stack the two feid columns (diabetics first, then healthy), dropping blank
# entries, and keep every feid as a string.
feidColumns = [
    matchedSubjects[columnName].dropna().astype('int')
    for columnName in ('diabetic feid', 'healthy feid')
]
allFeids = pd.concat(feidColumns).reset_index(drop=True).apply(str)

In [None]:
# Load the healthy/diabetic table and take every participant id from it, as strings.
# NOTE: this rebinds allFeids, replacing the list built from the
# matched-subjects spreadsheet in the preceding cell.
original = pd.read_csv('./data/healthy_diabetics.csv')

allFeids = original['f.eid'].map(str)

In [None]:

# Top-level directory passed to get_path_to_participant_raw as the place where
# the per-participant zip files are expected to live.
rawTopDir = './data/imaging_by_participant/'

def get_path_to_participant_raw(topDir,feid , suffix ,search = True, searchRoot = './data/'):
    """Return the path of one participant's raw file, or '' if it cannot be found.

    The file is first looked for at its expected location,
    ``<topDir>/<first two characters of feid>xxxxx/<feid>/<feid><suffix>``.

    Parameters
    ----------
    topDir : str
        Directory that holds the per-participant folders.
    feid : str
        Participant identifier (must be a string; it is sliced and concatenated).
    suffix : str
        File name suffix appended to the feid, e.g. '_longaxis.zip'.
    search : bool, optional
        If True and the file is not at the expected location, fall back to a
        recursive search below ``searchRoot``. This chases up misplaced files
        but is SLOW.
    searchRoot : str, optional
        Directory searched recursively by the fallback. Defaults to './data/',
        the tree the fallback was previously hard-wired to.

    Returns
    -------
    str
        Path to the file, or '' when nothing was found.
    """
    fileName = feid + suffix
    expectedPath = os.path.join(topDir, feid[:2] + 'xxxxx', feid, fileName)

    if os.path.isfile(expectedPath):
        return expectedPath
    if not search:
        return ''

    print('searching...')
    # Escape the file name so any glob metacharacters in it are matched literally.
    pattern = os.path.join(searchRoot, '**', glob.escape(fileName))
    # glob returns matches in arbitrary order; sort so the same file is picked every run.
    candidates = sorted(glob.glob(pattern, recursive=True))
    clear_output()
    return candidates[0] if candidates else ''
    
def _locate_for_all_feids(suffix):
    """Look up <feid><suffix> for every feid at its expected location only (no slow search)."""
    return allFeids.apply(
        lambda feid: get_path_to_participant_raw(rawTopDir, str(feid), suffix, search=False)
    )


# One path per feid; '' marks a file that was not found.
allLaxFiles = _locate_for_all_feids('_longaxis.zip')
allScoutFiles = _locate_for_all_feids('_scout.zip')

nLaxNotFound = (allLaxFiles == '').sum()
nScoutNotFound = (allScoutFiles == '').sum()
print('missing data for ', nLaxNotFound, ' LAX files, and ', nScoutNotFound, ' scout files', sep='')

In [None]:
# Number of feids whose LAX zip was found while their scout zip was not.
scoutNotFound = allScoutFiles == ''
laxFound = allLaxFiles != ''
(scoutNotFound & laxFound).sum()

I have checked these manually, and they are not on the GPU machine... Why not? :'(

anyway...

In [None]:
bulkFile1 = '/workspace/storage/restricted-biobank/releases/REVISION_May2019/ID-27289/ukb27289.bulk'

bulkFile2 = '/workspace/storage/restricted-biobank/releases/REVISION_May2019/ID-29801/ukb29801.bulk'


# Load the bulk files as one block of raw text, which is easy to search.
bulkTexts = []
for bulkPath in (bulkFile1, bulkFile2):
    with open(bulkPath,'r') as f:
        bulkText = f.read()
    # Make sure each file's text ends with a newline, so the last record of one
    # file can never fuse with the first record of the next when concatenated.
    if bulkText and not bulkText.endswith('\n'):
        bulkText += '\n'
    bulkTexts.append(bulkText)

bulk = ''.join(bulkTexts)
    

In [None]:
def _mentioned_in_bulk(fieldMarker):
    """For every feid, report whether the text '<feid><fieldMarker>' occurs in the bulk text."""
    # This is a plain substring test on the raw text, not a parse of the bulk records.
    return allFeids.apply(lambda feid: (feid + fieldMarker) in bulk)


# 20207 is the code looked up for scout images and 20208 the one for LAX images.
scoutInBulk = _mentioned_in_bulk(' 20207')
laxInBulk = _mentioned_in_bulk(' 20208')

In [None]:
# Feids mentioned in a bulk file for which no zip file was found.
LaxInBulkButNoData = laxInBulk & (allLaxFiles == '')
ScoutInBulkButNoData = scoutInBulk & (allScoutFiles == '')

print(
    LaxInBulkButNoData.sum(),
    ' LAX files mentioned in bulk file but not found. ',
    ScoutInBulkButNoData.sum(),
    ' similar for Scout files',
    sep='',
)

In [None]:
r9a1 = pd.read_csv('/images/imaging_by_participant/index_and_data_extract_unprocessed_r9a.csv')

r9a2 = pd.read_csv('/images/imaging_by_participant/inventory_heart_MRI_plus_values_r9a.csv')

# Every distinct feid that appears in either CSV, as strings (to compare with allFeids).
csvFeids = np.unique(np.concatenate((r9a1['f.eid'].values,r9a2['feid'].values)) ).astype(str)

# isin performs a hashed membership test instead of scanning the whole
# csvFeids array once per feid; the resulting boolean Series is the same.
in_r9acsv = allFeids.isin(csvFeids)

print ( 'there are ' + str((~in_r9acsv).sum()) + ' feids not mentioned in the r9a_csv files')

# Note: "do not have data" is judged on the scout files only.
print('of these, ' + str(np.logical_and(~in_r9acsv,allScoutFiles=='').sum()) + ' do not have data')

In [None]:
# Overlap between "listed in a bulk file" and "absent from the r9a CSVs".
notInCsv = ~in_r9acsv
nLaxInBulkNotInCsv = (notInCsv & laxInBulk).sum()
nScoutInBulkNotInCsv = (notInCsv & scoutInBulk).sum()

print(
    'there are ',
    nLaxInBulkNotInCsv,
    ' feids that do have lax listed in the bulk but are not in the csvs, and ',
    nScoutInBulkNotInCsv,
    ' feids that have scouts listed in the bulk but are missing from the csvs ',
    sep='',
)

In [None]:
#LAX data missing but the feid IS listed in the csvs

missingLaxButListedInCsv = np.logical_and(allLaxFiles=='',in_r9acsv)

feids_missingLaxButListedInCsv = allFeids[missingLaxButListedInCsv]

# in_r9acsv covers feids from EITHER csv, but the lookup below is in r9a2 only.
# .loc raises KeyError for labels missing from the index, so restrict the
# lookup to the feids that r9a2 actually contains.
r9a2ByFeid = r9a2.set_index(r9a2['feid'].astype(str))
laxFeidsInR9a2 = feids_missingLaxButListedInCsv[feids_missingLaxButListedInCsv.isin(r9a2ByFeid.index)]

r9a2ByFeid.loc[laxFeidsInR9a2,'Long_axis']

In [None]:
#scout data missing but the feid IS listed in the csvs

missingScoutButListedInCsv = np.logical_and(allScoutFiles=='',in_r9acsv)

feids_missingScoutButListedInCsv = allFeids[missingScoutButListedInCsv]

# in_r9acsv covers feids from EITHER csv, but the lookup below is in r9a2 only.
# .loc raises KeyError for labels missing from the index, so restrict the
# lookup to the feids that r9a2 actually contains.
r9a2IndexedByFeid = r9a2.set_index(r9a2['feid'].astype(str))
scoutFeidsInR9a2 = feids_missingScoutButListedInCsv[feids_missingScoutButListedInCsv.isin(r9a2IndexedByFeid.index)]

r9a2IndexedByFeid.loc[scoutFeidsInR9a2,'Scout_images']

Combine these results into a single dataframe and write it out so it can be circulated.

In [None]:
# One row per feid, collecting every availability flag computed above.
summaryColumns = {
    'feid': allFeids,
    # in_r9acsv was built from the feids of both r9a csv files.
    'listed in inventory_heart_MRI_plus_values_r9a.csv': in_r9acsv,
    'scout zipfile found': allScoutFiles != '',
    'scout listed in .bulk files': scoutInBulk,
    'lax zipfile found': allLaxFiles != '',
    'lax listed in .bulk files': laxInBulk,
}
summary = pd.DataFrame(summaryColumns)

# Write the full table, then only the rows whose LAX zip file was not found.
summary.to_csv('./data/summary_records.csv', index=False)

laxMissing = summary[~summary['lax zipfile found']]
laxMissing.to_csv('./data/summary_lax_missing_records.csv', index=False)

However, we should also just MAKE a new .bulk file, which can be used to make a new query against the UKBB system. This should include both scout and lax images.


In [None]:
missingLax = allLaxFiles==''
missingScout = allScoutFiles==''


def _bulk_records(feids, fieldCode):
    """Return one '<feid> <fieldCode>' record per feid, newline-separated ('' if there are none)."""
    return '\n'.join(feid + ' ' + fieldCode for feid in feids)


def _join_bulk_sections(*sections):
    """Join the non-empty sections with newlines, so an empty section leaves no blank line."""
    return '\n'.join(section for section in sections if section)


#FIRST: THE FILES WHICH ARE LISTED IN BULK FILES ALREADY BUT NOT ON OUR SYSTEM
#add the identifier code to each feid, and convert to a string with a newline for each one.
BulkMissingLax = _bulk_records(allFeids[LaxInBulkButNoData], '20208_2_0')
BulkMissingScout = _bulk_records(allFeids[ScoutInBulkButNoData], '20207_2_0')

bulkQuery = _join_bulk_sections(BulkMissingLax, BulkMissingScout)

#write it out
with open('./data/existsNotDownloaded.bulk','w') as f:
    f.write(bulkQuery)

#SECOND: THE FILES WHICH ARE SUPPOSED TO EXIST BUT ARE NOT DOWNLOADABLE
#(i.e. not found on disk and not mentioned in the bulk files either)
BulkMissingLax = _bulk_records(allFeids[np.logical_and(missingLax,~laxInBulk)], '20208_2_0')
BulkMissingScout = _bulk_records(allFeids[np.logical_and(missingScout,~scoutInBulk)], '20207_2_0')

bulkQuery = _join_bulk_sections(BulkMissingLax, BulkMissingScout)

#write it out
with open('./data/shouldExistNotDownloadable.bulk','w') as f:
    f.write(bulkQuery)
    
    
    

Is there anything in particular about the dates on which people were imaged?

In [None]:
# Imaging-visit date of every participant, indexed by feid (as a string).
attendanceColumn = 'bio.Date.of.attending.assessment.centre.0.imaging'
attendanceRaw = original.set_index(original['f.eid'].astype(str))[attendanceColumn]

# Parse the day/month/year strings in one vectorised call. Unlike calling
# strptime on each row, a missing date becomes NaT instead of raising.
attendance = pd.to_datetime(attendanceRaw, format='%d/%m/%Y')

In [None]:
# Monthly histogram of imaging dates, split by whether the LAX zip file was found.
# (matplotlib.pyplot is imported as plt in the import cell at the top of the notebook.)
startyear = 2015
startmonth = 6 

endyear = 2019
endmonth = 4

# Bin edges: the first day of every month from start to end, inclusive.
# Months are enumerated as one running index m = year*12 + (month - 1).
months = np.array([np.datetime64(datetime.datetime(m//12, m%12+1, 1),'ns') for m in range(startyear*12+startmonth-1, endyear*12+endmonth)])

laxFileFound = allLaxFiles != ''

fig, ax = plt.subplots()

# Same binning and transparency for both groups so the two histograms are comparable.
for feidMask, groupLabel in ((laxFileFound, 'found LAX files'), (~laxFileFound, 'missing LAX files')):
    ax.hist(attendance[allFeids[feidMask]].values, alpha=0.5, bins=months, label=groupLabel)

ax.set_xlabel('date of imaging')
ax.set_ylabel('n')
ax.set_title('Imaging dates by LAX file availability')
ax.legend();