In [1]:
# Test logistic regression when dropping repeat codes in diagnosis
# Should be much worse than previous LR
# Dropping repeat codes also stops the strategy of just guessing earlier codes

import json

work_loc = '/run/media/anton/Elements/UKB'

# Load the list of codes; D (the number of codes) is the width of every
# vector built later in the notebook.
with open(f'{work_loc}/codes.json', 'r') as codes_file:
    codes = json.load(codes_file)
D = len(codes)
print(D)

# Load the eid -> codes map from the "drop repeats" file and report how many
# eids it contains.
with open(f'{work_loc}/eidCodesMapDropRepeats.json', 'r') as map_file:
    eidCodesMap = json.load(map_file)
print(len(eidCodesMap))

# Show one value (the first in file order) to illustrate the structure:
# a list of lists of integer code indices.
if eidCodesMap:
    print(next(iter(eidCodesMap.values())))

1558
33204
[[870, 881], [748]]


In [2]:
# Create input data for LR
# x is a multi-hot vector of all codes from every entry before the last
# y is a multi-hot vector of the codes in the last entry

import numpy as np

def _multi_hot(code_groups):
    """Return a length-D float vector with 1 at every code index in code_groups.

    code_groups is an iterable of lists of integer code indices.
    """
    vec = np.zeros(D)
    for group in code_groups:
        for code in group:
            vec[code] = 1
    return vec


# Only eids with at least two entries can supply both a history (features)
# and a final entry (target).
histories = [entries for entries in eidCodesMap.values() if len(entries) >= 2]

# Row n of x flags every code seen before the final entry of history n;
# row n of y flags the codes of that final entry.
x = np.stack([_multi_hot(entries[:-1]) for entries in histories])
y = np.stack([_multi_hot(entries[-1:]) for entries in histories])

print(x.shape)
print(y.shape)

(26710, 1558)
(26710, 1558)


In [13]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix

# Fit one independent binary logistic regression per code column and report
# F-score, specificity and sensitivity on the held-out rows.

# Fixed seed: without it the train/test split (and therefore every metric
# printed below) changes on each run, so results cannot be reproduced.
SPLIT_SEED = 0

xtr, xt, ytr, yt = train_test_split(
    x, y, train_size=20000, random_state=SPLIT_SEED
)

for i in range(ytr.shape[1]):
    ytri = ytr[:, i]
    yti = yt[:, i]
    # Skip codes with no positive example in the training split (nothing to
    # learn from) or in the test split (sensitivity would be undefined).
    if not ytri.any() or not yti.any():
        continue
    reg = LogisticRegression(C=1000, max_iter=1000).fit(xtr, ytri)
    yhat = reg.predict(xt)
    # Accuracy is computed but deliberately not printed.
    acc = np.sum(yti == yhat)/len(yti)
    # Pin the label order so the matrix is always 2x2 and unpacks to four
    # values, even if only one class shows up in yti and yhat.
    tn, fp, fn, tp = confusion_matrix(yti, yhat, labels=[0.0, 1.0]).ravel()
    sp = tn / (tn+fp)               # specificity
    sn = tp / (tp+fn)               # sensitivity (recall)
    fs = 2*tp / (2*tp + fp + fn)    # F1 score
    # Keep this exact field order: index, code, F-score, specificity,
    # sensitivity. The results-reading cell takes the F-score from the
    # third whitespace-separated field.
    # NOTE(review): presumably this output is what was saved as
    # lr1000_results.txt - confirm how that file is produced.
    print(i, codes[i], fs, sp, sn)

3 A04 0.0 0.9991042102120036 0.0
7 A08 0.0 0.9998508130687752 0.0
8 A09 0.0 0.9963888052964189 0.0
16 A40 0.0 1.0 0.0
17 A41 0.13793103448275862 0.9988043640711404 0.10526315789473684
21 A49 0.0 0.9998508798091261 0.0
25 A63 0.0 1.0 0.0
26 A69 0.0 1.0 0.0
37 B00 0.0 1.0 0.0
39 B02 0.0 1.0 0.0
41 B07 0.0 0.9998508130687752 0.0
47 B18 0.0 1.0 0.0
56 B34 0.0 0.9992536199432751 0.0
57 B35 0.0 1.0 0.0
59 B37 0.0 0.99865571321882 0.0
60 B44 0.0 1.0 0.0
71 B80 0.0 1.0 0.0
79 B94 0.0 1.0 0.0
80 B95 0.0 0.9992535085100029 0.0
81 B96 0.0 0.9988045427375971 0.0
82 B97 0.0 0.9989547558608332 0.0
83 B98 0.0 0.9989549119140042 0.0
84 B99 0.0 1.0 0.0
93 C09 0.0 1.0 0.0
100 C18 0.0 0.9998508353221957 0.0
101 C19 0.0 1.0 0.0
102 C20 0.0 1.0 0.0
107 C25 0.0 1.0 0.0
113 C34 0.0 0.9995526394273785 0.0
117 C43 0.0 0.9994018244354718 0.0
118 C44 0.0 0.9930165477455595 0.0
122 C49 0.0 1.0 0.0
123 C50 0.030303030303030304 0.9975979582645248 0.02040816326530612
124 C51 0.0 1.0 0.0
127 C54 0.0 0.999701804085284

In [17]:
# Read all non-zero F score results
#
# Each line is split on whitespace and the third field is parsed as the
# F-score; lines whose F-score is greater than zero are echoed unchanged.
# NOTE(review): the lines presumably follow the
# "<index> <code> <f-score> <specificity> <sensitivity>" layout printed by the
# logistic-regression cell - confirm how the file was written.

with open('/run/media/anton/Elements/UKB/lr1000_results.txt', 'r') as f:
    # Iterate the file directly instead of loading every line into memory.
    for line in f:
        parts = line.split()
        if len(parts) < 3:
            # Blank or truncated line: there is no F-score field, and
            # indexing parts[2] would raise IndexError.
            continue
        fscore = float(parts[2])
        if fscore > 0:
            print(line, end='')

17 A41 0.13793103448275862 0.9988043640711404 0.10526315789473684
123 C50 0.030303030303030304 0.9975979582645248 0.02040816326530612
132 C61 0.028169014084507043 0.9966976883818673 0.020833333333333332
176 D12 0.02197802197802198 0.9905946601941747 0.01694915254237288
187 D23 0.06666666666666667 0.9985054550889254 0.05263157894736842
244 E03 0.01904761904761905 0.9971320754716981 0.011764705882352941
281 E78 0.0196078431372549 0.9904158293399289 0.012448132780082987
284 E83 0.06060606060606061 0.997907949790795 0.05555555555555555
288 E87 0.05063291139240506 0.9953481392557023 0.043478260869565216
310 F32 0.016129032258064516 0.9945643967990336 0.011494252873563218
315 F41 0.015037593984962405 0.9957614290039358 0.009615384615384616
317 F43 1.0 1.0 1.0
378 G56 0.031746031746031744 0.9978985289702792 0.020833333333333332
422 H25 0.017391304347826087 0.9921934792591459 0.011299435028248588
423 H26 0.10380622837370242 0.9915267293175165 0.0684931506849315
428 H33 0.09523809523809523 0.99