In [2]:
import pandas as pd
import os
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import pickle
from PIL import Image
import cv2
from collections import Counter


# Get attributes and reformat

In [None]:
attributes_path = '' # tab-separated txt file with the relevant baseline characteristics and attributes exported from the UKBB showcase, e.g. '/well/papiez/shared/UKBB/shared/21015_baseline_characteristics.txt'
image_folder = '' # path to the folder containing the images, e.g. '/well/papiez/shared/UKBB/DataField_21015/'
train_imgs_path = '' # where to save the train split (written with DataFrame.to_csv further down)
val_imgs_path = '' # where to save the val split (written with DataFrame.to_csv further down)
test_imgs_path = '' # where to save the test split (written with DataFrame.to_csv further down)

Preprocess relevant attributes

In [2]:
# Load the tab-separated attribute table and keep only the image identifiers plus age, sex and BMI.
# NOTE(review): later cells read UK Biobank field columns (e.g. '54-0.0', '26410', '4080-0.0')
# that are not part of this selection — confirm which table those cells are meant to run on.
kept_columns = ['image.id', 'eid', 'image.instance', 'image.array', 'age_at_imaging', 'genetic_sex', 'bmi_at_imaging']
# .copy() yields an independent frame, so the column assignments below do not raise
# SettingWithCopyWarning on a slice of the freshly read table.
attribute_df = pd.read_csv(attributes_path, sep='\t')[kept_columns].copy()

# Age groups; pd.cut intervals are right-inclusive by default: (0, 50], (50, 60], (60, 70], (70, 100].
attribute_df['Age_multi'] = pd.cut(attribute_df['age_at_imaging'], bins=[0, 50, 60, 70, 100], labels=[0, 1, 2, 3])
# Binary age split at 58: (0, 58] -> 0, (58, 100] -> 1.
attribute_df['Age_binary'] = pd.cut(attribute_df['age_at_imaging'], bins=[0, 58, 100], labels=[0, 1])
# Drop rows with a missing value in any kept column (this includes ages that fall outside the bins).
attribute_df = attribute_df.dropna()


87540


Unnamed: 0,image.id,eid,image.instance,image.array,age_at_imaging,genetic_sex,bmi_at_imaging
0,1000058_21015_1_0.png,1000058,1,0,63.824176,0.0,36.8
1,1000075_21015_0_0.png,1000075,0,0,64.0,1.0,33.4
2,1000083_21015_0_0.png,1000083,0,0,42.0,1.0,26.9
3,1000143_21015_0_0.png,1000143,0,0,48.0,0.0,21.8
4,1000150_21015_0_0.png,1000150,0,0,44.0,0.0,19.6


In [None]:
# Recode genetic sex (0 -> 'F', 1 -> 'M', anything else -> NaN) and expose the column as 'Sex'.
sex_labels = {0: 'F', 1: 'M'}
attribute_df['genetic_sex'] = attribute_df['genetic_sex'].map(sex_labels)
attribute_df = attribute_df.rename(columns={'genetic_sex': 'Sex'})

In [None]:
# Bin BMI at imaging into four ordered categories labelled 0-3 (pd.cut intervals are right-inclusive).
bmi_edges = [0, 24, 26.5, 29.5, 100]
bmi_labels = [0, 1, 2, 3]
attribute_df['bmi_cat'] = pd.cut(attribute_df['bmi_at_imaging'], bins=bmi_edges, labels=bmi_labels)

In [None]:
# Normalise the field-22032 column name (with or without the '-0.0' suffix) to 'physical_activity'.
attribute_df = (
    attribute_df
    .rename(columns={'22032-0.0': '22032'})
    .rename(columns={'22032': 'physical_activity'})
)

In [None]:
# Strip the '-0.0' suffix from field 26410, then bin it into four ordered categories labelled 0-3
# (pd.cut intervals are right-inclusive).
attribute_df = attribute_df.rename(columns={'26410-0.0': '26410'})
deprivation_edges = [0, 8, 13, 23, 100]
attribute_df['deprivation_index'] = pd.cut(
    attribute_df['26410'],
    bins=deprivation_edges,
    labels=[0, 1, 2, 3],
)

In [37]:
# For location, assume that a missing centre at visit 1, 2 or 3 means the same centre as visit 0.
# The filled column is assigned back instead of calling fillna(inplace=True) on attribute_df[col]:
# that chained in-place form is deprecated in pandas and does not update the frame under copy-on-write.
for visit in (1, 2, 3):
    centre_col = f'54-{visit}.0'
    attribute_df[centre_col] = attribute_df[centre_col].fillna(attribute_df['54-0.0'])

# For every image, take the centre recorded at the visit the image belongs to.
# int(...) keeps the column name well-formed ('54-1.0') even if image.instance is stored as a float.
attribute_df['54'] = attribute_df.apply(lambda x: x[f'54-{int(x["image.instance"])}.0'], axis=1)
attribute_df = attribute_df.drop(columns=['54-0.0', '54-1.0', '54-2.0', '54-3.0'])

# Re-index the centre codes to 0-5; codes missing from the mapping become NaN in 'Centre'.
mapping = {11014: 0, 11016: 1, 11018: 2, 11020: 3, 11021: 4, 11024: 5}
attribute_df['Centre'] = attribute_df['54'].map(mapping)
# The raw centre code stays available under a descriptive name.
attribute_df = attribute_df.rename(columns={'54': 'assessment_centre'})

In [38]:
# For alcohol, keep only the visit-0 answer: visits 1, 2 and 3 are >80% NA and the value is
# assumed not to have changed.
repeat_visit_cols = [f'1558-{visit}.0' for visit in (1, 2, 3)]
attribute_df = (
    attribute_df
    .drop(columns=repeat_visit_cols)
    .rename(columns={'1558-0.0': '1558'})
    .rename(columns={'1558': 'alcohol'})
)

In [None]:
# Fewer than 0.1% of participants report a different ethnicity at a later visit, so only the
# first reported value is kept.
for field in ('21000', '22006'):
    attribute_df[field] = attribute_df[f'{field}-0.0']

# 1 where field 22006 equals 1, otherwise 0 (missing values also become 0).
attribute_df['gen_ethnicity'] = attribute_df['22006'].eq(1).astype('int64')

# First character of the field-21000 code rendered as a string
# (missing values render as 'nan' -> 'n', negative codes -> '-').
ethnicity_code_text = attribute_df['21000'].astype(str)
attribute_df['Ethnicity'] = ethnicity_code_text.str[0]

Adjust the BP labels when the individual is taking blood-pressure medication:

In [67]:

# Get BP for each visit (as opposed to for each image): average the two readings ('.0' and '.1')
# taken at a visit. DataFrame.mean skips NaN, so a single available reading is used as-is.
# 4080 is systolic BP, 4079 is diastolic BP.
for field in ('4080', '4079'):
    for visit in range(4):
        reading_cols = [f'{field}-{visit}.0', f'{field}-{visit}.1']
        attribute_df[f'{field}-{visit}'] = attribute_df[reading_cols].mean(axis=1)

# If a visit has no reading at all, replace it by the mean of the participant's other visits.
# The filled column is assigned back instead of calling fillna(inplace=True) on attribute_df[col]:
# that chained in-place form is deprecated in pandas and does not update the frame under copy-on-write.
for field in ('4080', '4079'):
    cols = [f'{field}-{visit}' for visit in range(4)]
    # Computed once per field; filling with the row mean does not change the row mean.
    visit_mean = attribute_df[cols].mean(axis=1)
    for col in cols:
        attribute_df[col] = attribute_df[col].fillna(visit_mean)
# still about 5% of rows are NaNs (participants with no reading at any visit)
# still about 5% of rows are Nans

In [68]:
# NOTE(review): absolute, user-specific path — consider moving it to the path configuration cell.
medication_path = '/well/papiez/users/hri611/python/MEDFAIR-PROJECT/MEDFAIR/ukbb_medication.txt'
medication_df = pd.read_csv(medication_path, sep='\t')
# 6153 is for women, 6177 is for men
# 2 is BP medication

# med-<visit> is 1 if any column that starts with '6153-<visit>' or '6177-<visit>' contains a 2
for i in range(4):
    visit_prefixes = ('6153-' + str(i), '6177-' + str(i))
    cols = [col for col in medication_df.columns if col.startswith(visit_prefixes)]
    medication_df[f'med-{i}'] = (medication_df[cols] == 2).any(axis=1).astype(int)

# If taking medication at baseline, assume they are also taking it at every later visit.
# Done with one boolean mask instead of three row-wise apply passes over the whole table.
on_medication_at_baseline = medication_df['med-0'] == 1
for i in range(1, 4):
    later_col = f'med-{i}'
    medication_df.loc[on_medication_at_baseline, later_col] = 1
    # Cast to float so the column dtype matches what the saved cell output shows (0.0 / 1.0).
    medication_df[later_col] = medication_df[later_col].astype(float)

# Inner join: participants without a medication record are dropped from attribute_df.
attribute_df = pd.merge(attribute_df, medication_df[['eid', 'med-0', 'med-1', 'med-2', 'med-3']], on='eid')

Unnamed: 0,eid,54-0.0,54-1.0,54-2.0,54-3.0,1558-0.0,1558-1.0,1558-2.0,1558-3.0,4079-0.0,...,4080-2,4080-3,4079-0,4079-1,4079-2,4079-3,med-0,med-1,med-2,med-3
0,1000011,11008,,,,5.0,,,,76.0,...,112.0,112.0,74.0,74.0,74.0,74.0,0,0.0,0.0,0.0
1,1000026,11001,,,,2.0,,,,75.0,...,133.0,133.0,75.5,75.5,75.5,75.5,0,0.0,0.0,0.0
2,1000032,11001,,,,6.0,,,,73.0,...,128.5,128.5,70.0,70.0,70.0,70.0,1,1.0,1.0,1.0
3,1000044,11018,,,,5.0,,,,70.0,...,133.5,133.5,71.0,71.0,71.0,71.0,0,0.0,0.0,0.0
4,1000058,11006,11024.0,,,1.0,1.0,,,79.0,...,152.0,152.0,81.5,89.5,85.5,85.5,0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
502361,6024070,11020,,,,5.0,,,,62.0,...,118.0,118.0,64.0,64.0,64.0,64.0,0,0.0,0.0,0.0
502362,6024088,11005,,,,6.0,,,,93.0,...,142.5,142.5,95.0,95.0,95.0,95.0,0,0.0,0.0,0.0
502363,6024092,11004,,,,3.0,,,,77.0,...,118.5,118.5,74.0,74.0,74.0,74.0,0,0.0,0.0,0.0
502364,6024107,11018,,,,2.0,,,,78.0,...,136.5,136.5,80.5,80.5,80.5,80.5,0,0.0,0.0,0.0


In [69]:
# Add 15 to SBP and 10 to DBP for every visit at which the participant is flagged as taking
# BP medication (med-<visit> == 1). Missing BP values stay missing.
# NOTE: this cell is not idempotent — running it a second time adds the offsets again.
medication_offsets = {
    '4080': 15,  # systolic BP
    '4079': 10,  # diastolic BP
}
for field, offset in medication_offsets.items():
    cols = [f'{field}-{i}' for i in range(4)]
    for i in range(4):
        med_col = 'med-' + str(i)
        bp_col = cols[i]
        # Masked vectorised addition instead of a row-wise apply over the whole frame.
        attribute_df.loc[attribute_df[med_col] == 1, bp_col] += offset

In [None]:
def _reading_at_imaging_visit(row, field):
    """Return row['<field>-<k>'] for the visit k equal to row['image.instance'] (k = 3 if it is not 0, 1 or 2)."""
    for visit in (0, 1, 2):
        if row['image.instance'] == visit:
            return row[f'{field}-{visit}']
    return row[f'{field}-3']


def _is_high_bp(row):
    """Return 1 if diastolic >= 80, or systolic >= 130, or the participant is on BP medication at the imaging visit; else 0."""
    if row.loc['adjusted_4079'] >= 80:
        return 1
    if row.loc['adjusted_4080'] >= 130:
        return 1
    if row['med-' + str(row['image.instance'])] == 1:
        return 1
    return 0


# Medication-adjusted BP at the visit each image was taken (4079 diastolic, 4080 systolic).
attribute_df['adjusted_4079'] = attribute_df.apply(_reading_at_imaging_visit, axis=1, field='4079')
attribute_df['adjusted_4080'] = attribute_df.apply(_reading_at_imaging_visit, axis=1, field='4080')

# NOTE(review): the label is computed before the NaN fill below, so a row with missing BP
# compares False against both thresholds and is labelled 0 unless the medication flag is set —
# confirm this is intended.
attribute_df['adjusted_high_bp'] = attribute_df.apply(_is_high_bp, axis=1)
attribute_df['binaryLabel'] = attribute_df['adjusted_high_bp']

# Replace NaN values with the mean SBP (about 0.2% of values).
# NOTE(review): only systolic ('adjusted_4080') is filled; diastolic NaNs are left as they are.
systolic = attribute_df['adjusted_4080']
mean_sbp = systolic.mean()
attribute_df['adjusted_4080'] = systolic.fillna(mean_sbp)


Add image paths and merge both dfs

In [None]:
# Collect the PNG files in the image folder. Matching on the '.png' extension (instead of the
# substring 'png' anywhere in the name) keeps stray files out of the integer parsing below;
# sorting makes the listing order deterministic.
image_files = sorted(name for name in os.listdir(image_folder) if name.endswith('.png'))
images_paths = [os.path.join(image_folder, name) for name in image_files]

image_paths = pd.DataFrame({'image_path': images_paths})
# File names look like '<eid>_<field>_<instance>_<array>.png', e.g. '1000058_21015_1_0.png'.
# os.path.basename is used instead of splitting on '/' so this also works with other path separators.
image_paths['image.id'] = image_paths['image_path'].apply(os.path.basename)
image_paths['eid'] = image_paths['image.id'].apply(lambda x: x.split('_')[0]).astype(int)
image_paths['image.instance'] = image_paths['image.id'].apply(lambda x: x.split('_')[2]).astype(int)
image_paths['image.array'] = image_paths['image.id'].apply(lambda x: x.split('_')[3].split('.')[0]).astype(int)

# Add image paths to the metadata df (inner join keeps only rows that have both metadata and an image file).
all_metadata_df = pd.merge(attribute_df, image_paths, on=['image.id', 'eid', 'image.array', 'image.instance'], how='inner')

Drop rare ethnicity values and one assessment centre

In [None]:
# Drop rows whose Ethnicity character is 'n' (from the string 'nan'), '6' or '-' (negative codes).
# .copy() makes the filtered frame independent so the .loc assignments below do not raise
# SettingWithCopyWarning.
all_metadata_df = all_metadata_df[~all_metadata_df['Ethnicity'].isin(['n', '6', '-'])].copy()
# Merge code '5' into '3' and relabel code '4' as '0', then store the codes as integers.
all_metadata_df.loc[all_metadata_df['Ethnicity'] == '5', 'Ethnicity'] = '3'
all_metadata_df.loc[all_metadata_df['Ethnicity'] == '4', 'Ethnicity'] = '0'
all_metadata_df['Ethnicity'] = all_metadata_df['Ethnicity'].astype(int)

# Drop assessment centre 11022 (about 0.2% of images). The raw centre code lives in
# 'assessment_centre': the '54' column was renamed when 'Centre' was created, so filtering
# on '54' raises a KeyError on a fresh run.
all_metadata_df = all_metadata_df[~all_metadata_df['assessment_centre'].isin([11022.0])]

# The old index is kept as an 'index' column, as before.
all_metadata_df = all_metadata_df.reset_index()

# Split Data

In [89]:
def split_811(all_meta, patient_ids, random_state=0):
    """Split metadata rows into train/val/test subsets (80/10/10 of the patient ids).

    The split is done on patient ids, not on rows, so every row of a given
    patient ends up in exactly one subset.

    Args:
        all_meta: DataFrame with an 'eid' column identifying the patient of each row.
        patient_ids: array-like of the patient ids to split.
        random_state: seed passed to both train_test_split calls; the default 0
            reproduces the original fixed split.

    Returns:
        Tuple (train_meta, val_meta, test_meta) of row subsets of ``all_meta``.
    """
    # 80% of the ids for training; the remaining 20% is halved into validation and test.
    train_ids, holdout_ids = train_test_split(patient_ids, test_size=0.2, random_state=random_state)
    val_ids, test_ids = train_test_split(holdout_ids, test_size=0.5, random_state=random_state)

    row_patient = all_meta['eid']
    train_meta = all_meta[row_patient.isin(train_ids)]
    val_meta = all_meta[row_patient.isin(val_ids)]
    test_meta = all_meta[row_patient.isin(test_ids)]
    return train_meta, val_meta, test_meta


In [94]:
# Split on the unique patient ids (a grouped split, not a stratified one): split_811 assigns
# ids to subsets and then selects rows by 'eid', so a patient's images never span two subsets.
patient_ids = np.unique(all_metadata_df['eid'])
sub_train, sub_val, sub_test = split_811(all_metadata_df, patient_ids)

# Write each subset's metadata to its CSV path.
for subset_df, csv_path in (
    (sub_train, train_imgs_path),
    (sub_val, val_imgs_path),
    (sub_test, test_imgs_path),
):
    subset_df.to_csv(csv_path)


# Save PKL files

In [None]:
path_to_data_split = '' # one of the train, val or test csvs written above
save_path = '' # where to write the pickled list of images

df = pd.read_csv(path_to_data_split)

images = []
# Iterate over the path column directly rather than doing a positional row lookup per image.
for image_path in df['image_path']:
    img = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)  # so it only has one channel
    if img is None:
        # cv2.imread signals an unreadable or missing file by returning None; fail here with
        # the offending path instead of an opaque error inside cv2.resize.
        raise FileNotFoundError(f'Could not read image: {image_path}')

    # resize to the input size in advance to save time during training
    img = cv2.resize(img, (256, 256))
    images.append(img)

# The pickle holds a plain list of 256x256 arrays, in the same order as the rows of the csv.
with open(save_path, 'wb') as f:
    pickle.dump(images, f)