In [1]:
# Predict age and sex using all 38679 FCs

import pandas as pd

# Demographics export; the columns read later in this notebook are
# eid, p21022, p31 and p22189.
demofile = '/run/media/anton/Elements/UKB/AgeSexDemographics_participant.csv'

# Load the whole table and show it as the cell output.
df = pd.read_csv(filepath_or_buffer=demofile)
df

Unnamed: 0,eid,p21022,p31,p22189,p50_i0,p21002_i0
0,5869068,62,Female,-3.11,157.0,70.8
1,1285065,57,Male,5.33,191.0,107.1
2,3875604,56,Male,-4.27,188.0,80.1
3,2324036,49,Female,0.74,161.0,88.4
4,1063851,59,Female,-4.75,155.0,59.7
...,...,...,...,...,...,...
502138,4305653,62,Female,-4.26,158.0,71.3
502139,2536038,41,Female,5.27,163.0,80.1
502140,3768226,48,Female,0.41,158.0,49.7
502141,2018360,70,Male,-1.48,175.0,89.8


In [2]:
# Read fcs

import pickle

fcsfile = '/run/media/anton/Elements/UKB/fcs.pkl'

# Open through a context manager so the file handle is closed as soon as the
# pickle has been read (the bare open() call left it to the garbage collector).
# NOTE(review): pickle.load can execute arbitrary code -- only load this file
# if it comes from a trusted source.
with open(fcsfile, 'rb') as fh:
    fcs = pickle.load(fh)

# fcs is a mapping keyed by subject id; sort the ids so every later cell sees
# the subjects in the same, deterministic order.
subs = sorted(fcs)
print(len(subs))
print(subs[:10])

38679
[1000217, 1000499, 1000787, 1000888, 1001087, 1001218, 1001389, 1001563, 1002015, 1002250]


In [3]:
import math
import numpy as np

# NOTE: `heights` no longer holds height -- it now holds the Townsend
# deprivation index (column p22189). The name is kept because later cells
# refer to it.

ages = []
sexes = []
heights = []
selsubs = []

# Index the demographics by eid once instead of building a full boolean mask
# over the table for every subject. drop_duplicates keeps the first row per
# eid (the row `.iloc[0]` used to pick); reindex aligns the rows to `subs` and
# produces all-NaN rows for subjects that are absent from the table, so they
# are skipped below instead of crashing the loop.
demo = df.drop_duplicates(subset='eid').set_index('eid').reindex(subs)

for sub, age, sex, h in zip(subs, demo['p21022'], demo['p31'], demo['p22189']):
    # Test for NaN *before* converting: int(nan) raises ValueError, so with
    # the conversion first the skip could never be reached.
    if math.isnan(age) or math.isnan(h):
        continue
    ages.append(int(age))
    sexes.append(int(sex == 'Male'))  # 1 for 'Male', 0 for any other value
    heights.append(float(h))
    selsubs.append(sub)

print(len(heights))
print(ages[:10])
print(sexes[:10])
print(heights[:10])

38642
[63, 43, 64, 59, 62, 64, 69, 58, 54, 48]
[0, 0, 1, 1, 0, 0, 1, 0, 1, 0]
[-2.14, -0.62, 1.79, -4.82, -2.44, -4.78, -2.19, -5.23, 0.69, -4.65]


In [6]:
from sklearn.linear_model import Ridge, LogisticRegression
from sklearn.model_selection import train_test_split

def rmse(a, b):
    """Return the root-mean-square error between `a` and `b`.

    Both arguments may be NumPy arrays, plain Python sequences or scalars;
    they are converted with np.asarray and combined with NumPy broadcasting,
    so a scalar `b` (e.g. a mean used as a null prediction) is compared
    against every element of `a`.
    """
    # Coerce first: subtracting a scalar or a list from a plain list raises
    # TypeError, and train_test_split returns lists when given lists.
    a = np.asarray(a)
    b = np.asarray(b)
    return np.mean((a-b)**2)**0.5

# Fixed seed so the train/test split -- and therefore the printed scores --
# are identical on every run.
SEED = 0
# Number of selected subjects (taken from the front of the lists) used here.
N_SUBJECTS = 5000

fcsvec = []
yvec = []

# NOTE(review): `height` is not used for the sex target; presumably it stays
# in the loop so the target can be switched together with the Ridge variant
# below -- confirm before removing.
for sex,height,sub in zip(sexes[:N_SUBJECTS], heights[:N_SUBJECTS], selsubs[:N_SUBJECTS]):
    # if sex == 1:
    fcsvec.append(fcs[sub])
    yvec.append(sex)

xtr, xt, ytr, yt = train_test_split(fcsvec, yvec, train_size=0.8, random_state=SEED)
print(len(xtr))

# Regression variant (Ridge + RMSE against a predict-the-training-mean null):
# reg = Ridge(alpha=1000).fit(xtr, ytr)
# yhat = reg.predict(xt)
# null = rmse(yt, np.mean(ytr))
# loss = rmse(yt, yhat)
# print(null, loss)

reg = LogisticRegression(C=1000).fit(xtr, ytr)
yhat = reg.predict(xt)
# Null model: accuracy of always predicting the majority class of the 0/1
# test labels.
null = np.mean(yt)
if null < 0.5:
    null = 1-null
# NOTE: despite its name, `loss` is the classification accuracy on the test
# split (higher is better).
loss = np.mean(yhat == np.asarray(yt))
print(null, loss)


4000
0.527 0.908


In [None]:
# Conclusion: age and sex can be predicted reasonably well from the FCs.
# Weight and height cannot be predicted once sex is controlled for.
# The deprivation index cannot be predicted, whether or not sex is controlled for.