In [1]:
# Get subjects from demographics file

import pickle

# Pickled demographics for the test cohort; demo['race'] is a dict-like
# mapping whose entries are counted below.
demofile = '../../ImageNomer/data/anton/cohorts/test/demographics.pkl'

# NOTE: unpickling can execute arbitrary code, so only load trusted files.
with open(demofile, mode='rb') as f:
    demo = pickle.load(f)

# Number of entries in the race table.
print(len(demo['race']))

830


In [2]:
# Load big and small SNPs for all subjects with WRAT and RACE

from pathlib import Path
import numpy as np
import re

snpsdir = '../../ImageNomer/data/anton/cohorts/test/snps/'

# File names have the form "<digits>_set-<lowercase name>_snps.npy".
# The pattern is a raw string with the dot escaped, and it is applied with
# fullmatch so that trailing junk (e.g. "..._snps.npy.bak") is rejected.
snp_name_re = re.compile(r'([0-9]+)_set-([a-z]+)_snps\.npy')

# Subject id (string taken from the file name) -> SNP array, one dict per set.
big = dict()
small = dict()

for f in Path(snpsdir).iterdir():
    mobj = snp_name_re.fullmatch(f.name)
    if not mobj:
        continue
    subj, setname = mobj.groups()
    # Only the two sets used below are read; files of any other set are
    # skipped without being loaded.
    if setname == 'big':
        big[subj] = np.load(f)
    elif setname == 'small':
        small[subj] = np.load(f)

print(len(big))
print(len(small))

927
927


In [20]:
# Cohort selection: which race label to keep and which SNP set to read from.
cohort_race = 'AA'
snp_set = big

subs = []
snps = []
wrat = []
skipped = []

for sub, sub_race in demo['race'].items():
    if sub_race != cohort_race:
        continue
    # A subject needs both a SNP array and a WRAT score to be usable.
    if sub not in snp_set or sub not in demo['wrat']:
        skipped.append(sub)
        continue
    subs.append(sub)
    snps.append(snp_set[sub])
    wrat.append(demo['wrat'][sub])

if skipped:
    print(f'skipped {len(skipped)} subjects without SNPs or WRAT')
if not subs:
    raise ValueError(f'no usable subjects with race == {cohort_race!r}')

snps = np.stack(snps)
wrat = np.stack(wrat)

# One indicator column per SNP and genotype value (0, 1, 2), concatenated
# along the feature axis. Any entry that is not exactly 0, 1 or 2 (NaN
# included, since NaN == k is False) ends up False in all three indicators.
x0 = snps == 0
x1 = snps == 1
x2 = snps == 2
snps2 = np.concatenate([x0, x1, x2], axis=1)

print(snps2.shape)
print(wrat.shape)

(326, 106863)
(326,)


In [25]:
from sklearn.linear_model import Lasso
import torch

# Number of subjects in each training split; the remainder form the test split.
ntrain = 250

# Test RMSE of every random split, filled in by the evaluation loop.
rmses = []

# Labels used only by save() to build the output path and description.
# They must describe the data actually being fitted: the feature matrix is
# built from race == 'AA' subjects and the `big` SNP set.
race = 'AA'
subset = 'big'

def save(race, subset, i, w, trsubs, tsubs, rmse, outdir=None):
    """Pickle one fitted weight vector together with its train/test split.

    Parameters
    ----------
    race : label used in the default output directory.
    subset : SNP subset label; used in the file name and the description.
    i : index of the split; appended to `subset` to form the file name.
    w : fitted weights to store.
    trsubs, tsubs : subjects of the training and test split.
    rmse : test error, embedded in the description string.
    outdir : directory to write to. Defaults to the per-race lasso-wrat
        weights directory of the test cohort.

    The file `<outdir>/<subset><i>.pkl` holds a dict with the keys
    `w`, `trsubs`, `tsubs` and `desc`. Missing directories are created.
    """
    import os

    if outdir is None:
        outdir = f'../../ImageNomer/data/anton/cohorts/test/weights/snps/{race}/lasso-wrat'
    # open(..., 'wb') does not create parent directories, so make them first.
    os.makedirs(outdir, exist_ok=True)
    fname = os.path.join(outdir, f'{subset}{i}.pkl')
    desc = f'Lasso regression WRAT SNPs subset {subset} rmse: {rmse}'
    dct = dict(w=w, trsubs=trsubs, tsubs=tsubs, desc=desc)
    with open(fname, 'wb') as f:
        pickle.dump(dct, f)
        
# Reference point: RMSE obtained by always predicting the mean WRAT score.
print(np.mean((wrat-np.mean(wrat))**2)**0.5)

# Seeded generator so the 20 random splits are the same on every run.
split_seed = 0
rng = np.random.default_rng(split_seed)

nsub = snps2.shape[0]
if not 0 < ntrain < nsub:
    # An empty train or test split would yield a meaningless (NaN) RMSE.
    raise ValueError(f'ntrain={ntrain} must be between 1 and {nsub - 1}')

for i in range(20):
    idcs = rng.permutation(nsub)

    # Index with the split directly instead of permuting the full matrix first.
    xtr = snps2[idcs[:ntrain]]
    xt = snps2[idcs[ntrain:]]

    # Centre the features with training-split means only.
    mux = np.mean(xtr, axis=0, keepdims=True)
    xtr = xtr - mux
    xt = xt - mux

    ytr = wrat[idcs[:ntrain]]
    yt = wrat[idcs[ntrain:]]

    # Centre the target with the training-split mean only.
    mu = np.mean(ytr)
    ytr = ytr - mu
    yt = yt - mu

    clf = Lasso(alpha=0.01, max_iter=10000).fit(xtr, ytr)
    yhat = clf.predict(xt)
    w = clf.coef_.reshape(-1)

    # Disabled alternative: unregularised least squares on the GPU.
#     xxtr = torch.from_numpy(xtr).float().cuda()
#     xxt = torch.from_numpy(xt).float().cuda()
#     yytr = torch.from_numpy(ytr).float().cuda()
#     yyt = torch.from_numpy(yt).float().cuda()

#     w,_,_,_ = torch.linalg.lstsq(xxtr, yytr)
#     yhat = xxt@w
#     yhat = yhat.detach().cpu().numpy()
#     w = w.detach().cpu().numpy()

    rmse = np.mean((yhat-yt)**2)**0.5
    print(rmse)
    rmses.append(rmse)

    # Disabled: writes the weights and split of this iteration to disk.
#     save(race, subset, i, w, 
#          [subs[j] for j in idcs[:ntrain]], [subs[j] for j in idcs[ntrain:]], rmse)

# Mean test RMSE over all splits.
print(np.mean(rmses))

13.755878032166695
14.896290115366467
13.684123136004251
14.639846708134424
13.180180869491576
14.945645372954766
13.18500073511403
15.090998057531348
15.027730839125338
13.950272209479326
14.615829176127843
13.725455918922131
16.288350518245082
12.981151721559396
14.765880426583177
14.043708474601676
14.752909443792298
14.568976900299228
15.853515245547403
17.134390563357023
16.907376596942523
14.711881651458967
