In [1]:
# Predict EHR records using age sex with only fundus subjects

import os

import pandas as pd

# Root folder of the UKB exports. Set the UKB_DIR environment variable to run
# the notebook on another machine; the default keeps the original location.
ukb_root = os.environ.get('UKB_DIR', '/run/media/anton/Elements/UKB')
demofile = f'{ukb_root}/AgeSexDemographics_participant.csv'

# Demographics table; later cells read the 'eid', 'p21022' and 'p31' columns.
# NOTE(review): the displayed frame has an 'Unnamed: 0' column, which suggests
# the CSV was written with its index -- consider index_col=0 after confirming
# nothing depends on that column.
df = pd.read_csv(demofile)
df

Unnamed: 0,eid,p21022,p31,p22189,p50_i0,p21002_i0
0,5869068,62,Female,-3.11,157.0,70.8
1,1285065,57,Male,5.33,191.0,107.1
2,3875604,56,Male,-4.27,188.0,80.1
3,2324036,49,Female,0.74,161.0,88.4
4,1063851,59,Female,-4.75,155.0,59.7
...,...,...,...,...,...,...
502138,4305653,62,Female,-4.26,158.0,71.3
502139,2536038,41,Female,5.27,163.0,80.1
502140,3768226,48,Female,0.41,158.0,49.7
502141,2018360,70,Male,-1.48,175.0,89.8


In [4]:
# Get subjects and codes

import json
import os

# Root folder of the UKB exports. Set the UKB_DIR environment variable to run
# the notebook on another machine; the default keeps the original location.
ukb_root = os.environ.get('UKB_DIR', '/run/media/anton/Elements/UKB')
work_loc = f'{ukb_root}/21015'

# Get number of codes

with open(f'{work_loc}/codes.json', 'r') as f:
    codes = json.load(f)
D = len(codes)
print(D)

# Map from subject id (string key) to that subject's list of visits, where
# each visit is a list of integer code indices (see the sample printed below).
with open(f'{work_loc}/eidCodesMap.json', 'r') as f:
    eidCodesMap = json.load(f)

print(len(eidCodesMap))

# Show the visits of one subject as a sanity check (prints nothing if empty)
for visits in eidCodesMap.values():
    print(visits)
    break

# Keys are strings, so this is a lexicographic sort of the subject ids
subs = sorted(eidCodesMap)
print(subs[:5])

1470
10566
[[803], [803, 813, 818], [818, 1457], [635, 818, 1400, 1457], [635], [1012, 1012, 1433], [635, 817, 818]]
['1000080', '1000140', '1000309', '1000457', '1000522']


In [9]:
# Get age and sex for all subjects

import numpy as np

# Index the demographics table by eid once instead of running a full boolean
# scan of df for every subject. keep='first' mirrors the original behaviour of
# taking the first matching row when an eid appears more than once.
demo = df.drop_duplicates(subset='eid', keep='first').set_index('eid')

# Subject ids are string keys from the JSON map; the table stores integers
sub_eids = np.array([int(sub) for sub in subs], dtype=np.int64)

# Fail loudly, naming the offenders, if any subject has no demographics row
known = np.isin(sub_eids, demo.index.to_numpy())
if not known.all():
    raise KeyError(
        f'{int((~known).sum())} of {len(sub_eids)} subjects are missing from '
        f'the demographics table, e.g. {sub_eids[~known][:5].tolist()}'
    )

# Both arrays are aligned with the order of `subs`
ages = demo.loc[sub_eids, 'p21022'].to_numpy()
# Encode sex as 1 for 'Male' and 0 for anything else
sexes = (demo.loc[sub_eids, 'p31'] == 'Male').to_numpy().astype(int)

print(ages[:5])
print(sexes[:5])
print(len(ages), len(sexes))

[46 66 56 60 50]
[1 0 0 0 1]
10566 10566


In [38]:
# Create training data with clustered codes

import numpy as np

# Code strings that make up the target cluster; a subject is labelled positive
# when any of their visits contains at least one of these codes.
clusteredCodes = '''C61
C67
D09
D41
K40
N13
N20
N28
N30
N32
N35
N40
N41
N42
N50
R31
R33
R35
R39
R79
T83
Z08
Z46
'''.split()

# Position of every code string in `codes`, built in one pass (the last
# occurrence wins, as in a sequential scan). Visits store these positions.
codeToIdx = {code: i for i, code in enumerate(codes)}

# One slot per clustered code, in the same order; None marks a code that does
# not appear in `codes`.
clusteredCodesInt = [codeToIdx.get(code) for code in clusteredCodes]

unmapped = [code for code, idx in zip(clusteredCodes, clusteredCodesInt) if idx is None]
if unmapped:
    print(f'Warning: clustered codes not found in codes: {unmapped}')

print(clusteredCodesInt)

# Set for constant-time membership tests inside the per-visit scan
clusteredIdxSet = {idx for idx in clusteredCodesInt if idx is not None}

y = []
for i, sub in enumerate(subs):
    # any() stops at the first matching code, like the original early break
    diag = int(any(code in clusteredIdxSet
                   for visit in eidCodesMap[sub]
                   for code in visit))
    y.append(diag)
    if i % 1000 == 0:
        print(f'Done {i}')

# Binary label per subject, aligned with `subs`
y = np.array(y)

print(y[:50])

[121, 127, 163, 193, 623, 796, 803, 809, 811, 813, 815, 818, 819, 820, 827, 1006, 1008, 1010, 1012, 1049, 1193, 1397, 1426]
Done 0
Done 1000
Done 2000
Done 3000
Done 4000
Done 5000
Done 6000
Done 7000
Done 8000
Done 9000
Done 10000
[1 0 0 1 0 0 1 0 1 0 1 1 1 0 0 0 1 1 0 0 0 0 0 1 1 0 0 0 0 0 0 1 0 1 0 0 0
 0 0 0 1 0 0 0 0 1 0 0 0 0]


In [39]:
from sklearn.model_selection import train_test_split

import math

# Fixed seed so the class-balancing subsample and the train/test split give
# the same result on every run.
SPLIT_SEED = 0
split_rng = np.random.default_rng(SPLIT_SEED)

# Feature matrix: one row per subject, columns are (age, sex)
x = np.stack([ages, sexes], axis=1)

# Split the features by label
xz = x[y == 0]
xo = x[y == 1]

# Balance the classes by subsampling both to the size of the smaller one
n = min(len(xz), len(xo))

xzi = split_rng.permutation(len(xz))
xoi = split_rng.permutation(len(xo))

xx = np.concatenate([xz[xzi[:n]], xo[xoi[:n]]])
yy = np.concatenate([np.zeros(n), np.ones(n)])

# NOTE(review): an integer train_size is an absolute sample count, and
# floor(2*n/3) is one third of the 2*n balanced samples, so two thirds of the
# data end up in the test split -- confirm this is the intended ratio.
xtr, xt, ytr, yt = train_test_split(
    xx, yy,
    stratify=yy,
    train_size=math.floor(2*n/3),
    random_state=SPLIT_SEED,
)

print(xtr.shape)
print(ytr.shape)

(2014, 2)
(2014,)


In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, roc_auc_score

# Fit a logistic regression on the training split
reg = LogisticRegression(C=1000)
reg.fit(xtr, ytr)

# Hard predictions and per-class scores on the held-out split.
# Despite its name, `logits` holds predict_proba output (class probabilities).
yhat = reg.predict(xt)
logits = reg.predict_proba(xt)

# Baseline: accuracy of always predicting the more frequent class
p = np.mean(yt)
null = 1 - p if p < 0.5 else p

# Fraction of held-out samples predicted correctly
acc = np.mean(yt == yhat)

# Specificity, sensitivity and F1 score from the 2x2 confusion matrix
tn, fp, fn, tp = confusion_matrix(yt, yhat).ravel()
sn = tp / (tp + fn)
sp = tn / (tn + fp)
fs = 2*tp / (2*tp + fp + fn)
print(null, acc, sp, sn, fs)

# Binary AU ROC, scored on the positive-class column
auroc = roc_auc_score(yt, logits[:, 1])
print(auroc)

0.5 0.6910669975186104 0.6933002481389579 0.688833746898263 0.6903755284755037
0.7399072711487665
