This notebook describes a procedure for obtaining age- and sex-matched healthy controls for diabetic participants in the UK Biobank.

In [None]:
import pandas as pd

import numpy as np

import matplotlib.pyplot as plt

import os

In [None]:
# Master table of participants (provided by Zahra).
MASTERFILE = './data/healthy_diabetics.csv'

# Read the whole file, then discard its first column.
masterData = pd.read_csv(MASTERFILE, low_memory=False)
masterData = masterData.iloc[:, 1:]

# Row masks for the two cohorts.
isDiabetic = masterData['diabetes'] == 'Diabetes'
isHealthy = masterData['healthy'] == 'healthy'

# reset_index() renumbers each cohort 0..n-1 (the old row labels are retained as an extra column).
diabetics = masterData.loc[isDiabetic, :].reset_index()
Healthy = masterData.loc[isHealthy, :].reset_index()

print('there are {} diabetics in the population, and {} healthy peoples'.format(diabetics.shape[0], Healthy.shape[0]))

Now, we should subset for those whose data we can access!

A quick sense-check - are there any people who are both diabetic and healthy within this table?

In [None]:
# Sanity check: True if any participant is flagged both as diabetic and as healthy.
((masterData['diabetes'] == 'Diabetes') & (masterData['healthy'] == 'healthy')).any()

Well, that's comforting - nobody is flagged as both diabetic and healthy.

In [None]:
def match_age_sex(row1,row2):
    '''Tell whether two participant rows agree on both sex and age.

    row1, row2: anything indexable by column name (e.g. a DataFrame row)
    holding 'bio.Sex.0.baseline' and
    'bio.Age.when.attended.assessment.centre.0.imaging'.

    Returns True only when both values are equal, otherwise False.
    '''
    sexKey = 'bio.Sex.0.baseline'
    ageKey = 'bio.Age.when.attended.assessment.centre.0.imaging'

    # Guard clause: a different sex can never be a match, so age is not looked at.
    if not (row1[sexKey] == row2[sexKey]):
        return False

    return bool(row1[ageKey] == row2[ageKey])
    

def get_match_feids(row,otherDF):
    '''Return the f.eid of every row in otherDF with the same sex and age as `row`.

    row: a single participant (e.g. one DataFrame row) holding
        'bio.Sex.0.baseline' and
        'bio.Age.when.attended.assessment.centre.0.imaging'.
    otherDF: DataFrame of candidate participants with those two columns
        plus 'f.eid'.

    Returns a numpy array of matching 'f.eid' values, in otherDF row order;
    the array is empty when nothing matches (including when otherDF is empty).
    '''
    sexCol = 'bio.Sex.0.baseline'
    ageCol = 'bio.Age.when.attended.assessment.centre.0.imaging'

    # Column-wise comparison instead of a Python callback per row: same result,
    # far less work per query, and well defined for an empty otherDF.
    # A missing (NaN) age never compares equal, so such rows never match.
    matches = (otherDF[sexCol] == row[sexCol]) & (otherDF[ageCol] == row[ageCol])

    feids = otherDF.loc[matches,'f.eid'].values

    return feids
    

In [None]:
# For every diabetic, count how many healthy participants share their sex and age.
numberOfMatches = diabetics.apply(lambda person: len(get_match_feids(person, Healthy)), axis=1)

In [None]:
# One bin per integer match count: edges sit on the half-integers from -0.5 to 59.5.
plt.hist(numberOfMatches,bins=np.arange(-0.5,60,1))

# Raw strings: '\#' is not a valid Python escape sequence and triggers a
# warning on recent Python versions. The label text itself is unchanged.
plt.xlabel(r'\# matches for a diabetic')

plt.ylabel(r'\# diabetics')

In [None]:
# Age bins shared by both cohorts: 2.5-year steps starting at 40.
b = np.arange(40,85,2.5)

ageCol = 'bio.Age.when.attended.assessment.centre.0.imaging'

# Overlay the age distribution of each cohort (diabetics first, then healthy).
for cohort, cohortLabel in ((diabetics, 'diabetic'), (Healthy, 'healthy')):
    plt.hist(cohort[ageCol], alpha=0.5, label=cohortLabel, bins=b)

plt.xlabel('age (years)')
plt.ylabel('n')
plt.legend()

In [None]:

# NOTE: this cell is not idempotent. It reshuffles `diabetics` and removes every
# chosen control from `Healthy`, so it should be run once on freshly loaded data.

#set random state to guarantee reproducibility
np.random.seed(42)

#randomly shuffle order of diabetics 
diabetics = diabetics.loc[np.random.permutation(diabetics.shape[0]),:]

matches = []

#move down the list of diabetics, selecting 1 match for each. Remove that match from the list of potential matches for others.
for _, diabetic in diabetics.iterrows():

    match_feids = get_match_feids(diabetic,Healthy)

    #if there are any matches!
    if match_feids.shape[0]>0:
        #select 1 at random
        match_feid = match_feids[np.random.randint(match_feids.shape[0])]

        #remove the corresponding row from Healthy dataframe so they cannot be selected again
        Healthy = Healthy.loc[ Healthy['f.eid'] != match_feid , : ]

    else:
        # no control available: record a missing value directly, rather than
        # a 0 placeholder that has to be overwritten with NaN afterwards
        # (that overwrite relied on silently upcasting an integer column)
        match_feid = np.nan

    #add paired results to list
    matches.append({'diabetic feid':diabetic['f.eid'],
                    'healthy feid': match_feid,
                    'sex': diabetic['bio.Sex.0.baseline'],
                    'age':diabetic['bio.Age.when.attended.assessment.centre.0.imaging']
                   })


# one row per diabetic; 'healthy feid' is NaN where no control could be assigned
matches = pd.DataFrame(matches)

In [None]:
# True for diabetics that were assigned a healthy control.
matched = matches['healthy feid'].notna()

# Draw the full population first and reuse its bin edges for the two subsets.
# Otherwise each histogram picks its own default bins and the overlaid bars
# are not comparable.
counts, ageBins, patches = plt.hist(matches['age'],alpha = 0.3 ,label = 'all diabetics')

plt.hist(matches.loc[matched,'age'],alpha = 0.3, label = 'matched diabetics', bins = ageBins)

plt.hist(matches.loc[~matched,'age'],alpha = 0.3, label = 'unmatched diabetics', bins = ageBins)

plt.xlabel('age (years)')

plt.ylabel('n')

plt.legend()

Checks and balances:
 - Is the number of unmatched diabetics in the matches dataframe >= the number of diabetics for whom no matches exist? 
 - Is the number of unmatched Healthy people consistent with the number of matched?

In [None]:
#does the number of unmatched healthy people + number of matches == the total number of healthy people?

# rows of `matches` without any missing value, i.e. diabetics that were paired
matched = len(matches.dropna())

# rows still left in Healthy, i.e. people never removed as somebody's control
unmatchedHealthy = len(Healthy)

unmatchedHealthy + matched

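So no healthy people are being lost: the number matched plus the number left unmatched equals the total number of healthy people printed when the data was loaded (see earlier in notebook).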

In [None]:
#is the number of unmatched diabetics >= the number for whom no match exists (around 120, see histogram above)?
nUnmatchedDiabetics = matches['healthy feid'].isna().sum()
print('there are {} unmatched diabetics'.format(nUnmatchedDiabetics))


In [None]:
# Save every diabetic together with their assigned control ('healthy feid' is NaN where none was found).
# index=False is not passed here, so the DataFrame index is written as well (as the first column).
matches.to_csv('./data/matched_diabetics_healthy_all.csv')

Now, take a random sample of 250 pairs.

In [None]:
#only valid matches, with the healthy feids written as ints for easier lookup later.
# The cast is done while building the new frame: assigning through
# `.loc[:, col] = ...` on the result of dropna() raises SettingWithCopyWarning,
# and on pandas 2.x it sets values in place, leaving the column as float.
positiveMatches = matches.dropna().astype({'healthy feid': 'int'})

#set random seed so this is deterministic
np.random.seed(44)

sampleSize = 250 #NUMBER OF PAIRS

#random choice of sampleSize row positions, no replacement
sampler = np.random.choice(positiveMatches.shape[0],sampleSize,replace=False)

subsample = positiveMatches.iloc[sampler,:]

# index=False: only the four pair columns are written
subsample.to_csv('./data/matched_diabetics_healthy_' + str(sampleSize) + '.csv',index=False)

In [None]:
# Read the sampled pairs back from the CSV named after sampleSize.
subsample = pd.read_csv('./data/matched_diabetics_healthy_{}.csv'.format(sampleSize))

Final check - are the feids unique within both columns? THIS IS IMPORTANT

In [None]:
# True when no diabetic feid appears more than once among the valid pairs.
positiveMatches['diabetic feid'].is_unique

In [None]:
# True when no healthy feid appears more than once among the valid pairs.
positiveMatches['healthy feid'].is_unique

Now, for ease of use, we write out a text file listing the raw data files that can actually be found on disk. This list lets us scp all the relevant files over to the destination machine. TODO: check precisely how much space they will take on the destination machine.

In [None]:
#convert to just a list of all of the feids
diabeticFeids = subsample['diabetic feid'].dropna().astype('int')
healthyFeids = subsample['healthy feid'].dropna().astype('int')

# diabetics first, then controls; renumbered 0..n-1 and rendered as strings
allFeids = pd.concat((diabeticFeids, healthyFeids)).reset_index(drop=True).apply(str)

In [None]:

# Top-level directory that is searched for each participant's raw imaging zipfile.
rawTopDir = '/images/imaging_by_participant/'

def get_path_to_participant_raw(topDir,feid , suffix):
    '''Locate one participant's raw file, if it exists.

    The path checked is
    <topDir>/<first two characters of feid>xxxxx/<feid>/<feid><suffix>.

    topDir: directory to search under.
    feid: participant id, as a string.
    suffix: string appended to feid to form the file name.

    Returns the path when it is an existing file, otherwise np.nan.
    '''
    bucket = feid[:2] + 'xxxxx'
    candidate = os.path.join(topDir, bucket, feid, feid + suffix)

    return candidate if os.path.isfile(candidate) else np.nan
    
#gets a list of paths to raw zipfiles, but with nans where file isn't there...
allRawFiles = allFeids.apply(
    lambda feid: get_path_to_participant_raw(topDir=rawTopDir, feid=str(feid), suffix='_longaxis.zip')
)

In [None]:
# keep only the entries for which a zipfile path was found
foundRawFiles = allRawFiles.dropna()

nTotal = allRawFiles.shape[0]
nFound = foundRawFiles.shape[0]

print('found {} of {}. Sort your shit out for the remaining {}'.format(nFound, nTotal, nTotal - nFound))

In [None]:
# One found path per line, newline-separated (no trailing newline).
listOfFilesToSync = '\n'.join(foundRawFiles.tolist())

with open('./data/listOfLAXZipfiles.txt','w+') as listFile:
    listFile.write(listOfFilesToSync)