In [1]:
import numpy as np
import pandas as pd
import random
import tqdm
import jax
import jax.numpy as jnp
from jax import vmap,grad
from functools import partial
from jax.example_libraries import stax
from jax.example_libraries.stax import Dense,Relu,Flatten,Sigmoid
from jax import jit
from jax.example_libraries import optimizers
from jax.tree_util import tree_multimap

In [36]:
rng=jax.random.PRNGKey(1)
num_task_sample=5
ethnic_grp_min_pop=60 #min population among all subpopulatiosn
reg_weight=0.1
lr=0.001

def linear_model(eth, x):
    # dictionary which contains randomly generated coefficnets for the linear model
    eth_coef = {}
    eth_errors = {}
    eth_errors_sigma2 = np.random.randint(1, 5, size=len(eth))
    eth_errors.update([(ethnicity, errors / 100) for ethnicity, errors in zip(eth, eth_errors_sigma2)])

    for i in eth:
        eth_coef[i] = np.random.uniform(-1, 1, (x.shape[1]))

    y = x.apply(lambda a: a @ eth_coef[a.name[0]] + np.random.normal(scale=eth_errors[a.name[0]]), axis=1)

    return y


def hetero_model(eth, x, max_threshold=20, num_heteromodel_causal_snps=10):
    threshold_vec = np.random.randint(max_threshold,
                                      size=len(eth))  # random choice of thresholds as many as ethnicities
    heteromodel_causal_snps = np.random.choice(x.columns, size=num_heteromodel_causal_snps,
                                               replace=False)  # choose 10 causal snps from columns of x
    heteromodel_coef = []  # array of 0 or 1 depending on whether a particular column is causal snp or not
    heteromodel_thresh = {}  # dictionary of thresholds for each ethnicity
    for i in x.columns:
        if i in heteromodel_causal_snps:
            heteromodel_coef.append(1)
        else:
            heteromodel_coef.append(0)

    j = 0
    for i in eth:
        heteromodel_thresh[i] = threshold_vec[j]
        j = j + 1

    y = x.apply(lambda t: 1 if t @ heteromodel_coef > heteromodel_thresh[t.name[0]] else 0, axis=1)

    return y


def compensatory_model(eth, x, max_threshold1=10, max_threshold2=5, num_heteromodel_causal_snps1=20,
                       num_heteromodel_causal_snps2=50):
    threshold_vec1 = np.random.randint(max_threshold1,
                                       size=len(eth))  # random choice of thresholds as many as ethnicities
    threshold_vec2 = np.random.randint(max_threshold2,
                                       size=len(eth))  # random choice of thresholds as many as ethnicities

    heteromodel_causal_snps = np.random.choice(x.columns,
                                               size=num_heteromodel_causal_snps1 + num_heteromodel_causal_snps2,
                                               replace=False)  # choose 10 causal snps from columns of x
    heteromodel_causal_snps1 = heteromodel_causal_snps[0:num_heteromodel_causal_snps1]
    heteromodel_causal_snps2 = heteromodel_causal_snps[num_heteromodel_causal_snps1:-1]
    heteromodel_coef1 = []  # array of 0 or 1 depending on whether a particular column is causal snp or not
    heteromodel_thresh1 = {}  # dictionary of thresholds for each ethnicity
    for i in x.columns:
        if i in heteromodel_causal_snps1:
            heteromodel_coef1.append(1)
        else:
            heteromodel_coef1.append(0)

    j = 0
    for i in eth:
        heteromodel_thresh1[i] = threshold_vec1[j]
        j = j + 1

    heteromodel_coef2 = []  # array of 0 or 1 depending on whether a particular column is causal snp or not
    heteromodel_thresh2 = {}  # dictionary of thresholds for each ethnicity
    for i in x.columns:
        if i in heteromodel_causal_snps2:
            heteromodel_coef2.append(1)
        else:
            heteromodel_coef2.append(0)

    j = 0
    for i in eth:
        heteromodel_thresh2[i] = threshold_vec2[j]
        j = j + 1

    y = x.apply(lambda t: 0 if (((t @ heteromodel_coef1 > heteromodel_thresh1[t.name[0]]) and
                                 (t @ heteromodel_coef2 > heteromodel_thresh2[t.name[0]])) |
                                ((t @ heteromodel_coef1 <= heteromodel_thresh1[t.name[0]]) and
                                 (t @ heteromodel_coef2 <= heteromodel_thresh2[t.name[0]])))
    else 1, axis=1)

    # return heteromodel_thresh1,heteromodel_thresh2,y
    return y


In [2]:
data=pd.read_csv('processed_data.csv',index_col=[0,1])
eth_ID=pd.read_csv('eth_ID.csv',index_col=0)
eth=eth_ID['eth'].unique()

In [41]:
def maml_logistic_model(eth_train, x_train, y_train, x_test, y_test,epochs=20000,batch_size=20,num_task_sample= 5,reg_weight=reg_weight):
    # define model
    num_features=x_train.shape[1]
    in_shape=(-1,num_features)
    net_init, net_apply = stax.serial(Dense(1),Sigmoid)
    opt_init, opt_update, get_params = optimizers.adam(step_size=lr)
    out_shape, net_params = net_init(rng, in_shape)
    opt_state = opt_init(net_params)

    # auxilliary functions
    def binary_cross_entropy(y_hat, y):
        bce = y * jnp.log(y_hat) + (1 - y) * jnp.log(1 - y_hat)
        return jnp.mean(-bce)

    def loss(params, inputs, targets):
        predictions = net_apply(params, inputs)
        #print(binary_cross_entropy(predictions,targets),jnp.linalg.norm(net_params[0], 1))
        return binary_cross_entropy(predictions,targets)+reg_weight * jnp.linalg.norm(net_params[0][0], 1)

    def accuracy(params, inputs, targets):
        predictions = net_apply(params, inputs)
        return jnp.mean((predictions >= 1/2) == (targets >= 1/2))

    def inner_update(p, x1, y1):
        grads = grad(loss)(p, x1, y1)
        inner_sgd_fn = lambda g, state: (state - lr * g)
        return tree_multimap(inner_sgd_fn,grads,p)
        #return [(w - lr * dw) for w, dw in zip(p, grads)]

    def maml_loss(p, x1, y1, x2, y2):
        p2 = inner_update(p, x1, y1)
        return loss(p2, x2, y2)

    # vmapped version of maml loss.
    # returns scalar for all tasks.
    def batch_maml_loss(p, x1_b, y1_b, x2_b, y2_b):
        task_losses = vmap(partial(maml_loss, p))(x1_b, y1_b, x2_b, y2_b)
        return jnp.mean(task_losses)

    @jit
    def step(i, opt_state, x1, y1, x2, y2):
        p = get_params(opt_state)
        g = grad(batch_maml_loss)(p, x1, y1, x2, y2)
        l = batch_maml_loss(p, x1, y1, x2, y2)
        return opt_update(i, g, opt_state), l

    np_batched_maml_loss = []

    for i in tqdm.tqdm(range(epochs)):
        x1_b, y1_b, x2_b, y2_b = sample_tasks(num_task_sample, batch_size, ethnic_grp_min_pop, eth_train, x_train,
                                              y_train)
        opt_state, l = step(i, opt_state, x1_b, y1_b, x2_b, y2_b)
        np_batched_maml_loss.append(l)
        #if i % 1000 == 0:
        #    print(i, 'maml_loss', l)
    net_params = get_params(opt_state)

    # meta testing
    # meta test; train with batch_size many examples from validation set on desired task

    # pre update prediction
    pre_predictions = vmap(partial(net_apply, net_params))(x_test)
    pre_error = loss(net_params, x_test, y_test)
    #print('pre update loss=' + str(pre_error))
    # post-update prediction
    indx = np.random.randint(x_test.shape[0], size=batch_size)
    test_indx = np.delete(np.arange(x_test.shape[0]), indx)
    x1, y1 = x_test[indx], y_test[indx]
    for i in range(batch_size):
        net_params = inner_update(net_params, x1, y1)
        # print('training loss '+str(l))
        # train_accuracy= accuracy(net_params,x1,y1)
        # print('train accuracy',train_accuracy)
        # post_error= loss(net_params,x_test[test_indx],y_test[test_indx])
        # print('Post step ' + str(i)+' update test MSE='+str(post_error))

    # post_predictions = vmap(partial(net_apply, net_params))(x_test)
    logistic_maml_acc = accuracy(net_params, x_test[test_indx], y_test[test_indx])
    #print('Test Accuracy on Task: MSE = ', logistic_maml_acc)

    return np_batched_maml_loss, logistic_maml_acc


def base_logistic_model(eth_train, x_train, y_train, x_test, y_test,epochs=20000,batch_size=20,reg_weight=reg_weight):
    num_features = x_train.shape[1]
    in_shape = (-1, num_features)

    basenet_init, basenet_apply = stax.serial(Dense(1), Sigmoid)
    out_shape, basenet_params = basenet_init(rng, input_shape=in_shape)
    opt_init, opt_update, get_params = optimizers.adam(step_size=lr)
    opt_state = opt_init(basenet_params)

    @jit
    def step(i, opt_state, x1, y1):
        p = get_params(opt_state)
        g = grad(batch_loss)(p, x1, y1)
        l = batch_loss(p, x1, y1)
        return opt_update(i, g, opt_state), l


    def binary_cross_entropy(y_hat, y):
        bce = y * jnp.log(y_hat) + (1 - y) * jnp.log(1 - y_hat)
        
        return jnp.mean(-bce)

    def loss(params, inputs, targets):
        predictions = basenet_apply(params, inputs)
        bce=binary_cross_entropy(predictions, targets)
        
        return binary_cross_entropy(predictions, targets) + reg_weight * jnp.linalg.norm(basenet_params[0][0], 1)


    def batch_loss(p,x1,y1):
        task_losses = vmap(partial(loss, p))(x1, y1)
        return jnp.mean(task_losses)

    def accuracy(params, inputs, targets):
        predictions = basenet_apply(params, inputs)
        return jnp.mean((predictions >= 1/2) == (targets >= 1/2))

    def update(p, x1, y1):
        grads = grad(loss)(p, x1, y1)
        inner_sgd_fn = lambda g, state: (state - lr * g)
        return tree_multimap(inner_sgd_fn,grads,p)

    np_batched_loss = []
    for i in tqdm.tqdm(range(epochs)):
        indices = np.random.randint(x_train.shape[0], size=batch_size * num_task_sample)
        x1, y1 = x_train.iloc[indices].to_numpy(), y_train.iloc[indices].to_numpy()
        opt_state, l = step(i, opt_state, x1, y1)
        np_batched_loss.append(l)
    basenet_params = get_params(opt_state)
    #  if i % 1000 == 0:
    #     train_loss = loss(basenet_params, x_train.to_numpy(), y_train.to_numpy())
    #    print(i, 'training loss', train_loss)

    indx = np.random.randint(x_test.shape[0], size=batch_size)
    test_indx = np.delete(np.arange(x_test.shape[0]), indx)
    x1, y1 = x_test[indx], y_test[indx]
    for i in range(batch_size):
        basenet_params = update(basenet_params, x1, y1)

    logistic_test_acc = accuracy(basenet_params, x_test, y_test)

    #print('test error accuracy', logistic_test_acc)
    return np_batched_loss, logistic_test_acc


In [42]:
def sample_tasks(outer_batch_size, inner_batch_size,ethnic_grp_min_pop,eth_train,x_train,y_train):
    # Select amplitude and phase for the task
    ethnic_grp_sample=random.sample(list(eth_train), k=outer_batch_size)

    def get_batch():
        xs, ys = [], []
        for j in ethnic_grp_sample:
            indices = np.random.randint(ethnic_grp_min_pop,size=inner_batch_size)
            x= x_train.loc[j].iloc[indices].to_numpy()
            y= y_train.loc[j].iloc[indices].to_numpy()
            xs.append(x)
            ys.append(y)
        return np.stack(xs), np.stack(ys)
    x1, y1 = get_batch()
    x2, y2 = get_batch()
    return x1, y1, x2, y2



In [51]:
epochs=5000
batch_size=30 #inner batch size for inner loop#K=20 #K-shot learning
num_task_sample= 4 #number of tasks to sample to meta train

data1=pd.read_csv('processed_data.csv',index_col=[0,1])
eth_ID=pd.read_csv('eth_ID.csv',index_col=[0])
eth=eth_ID['eth'].unique()

In [52]:
def task_maml(task,x,y):
   # task='MXL' #the ethnic group we want to test
#preparing training data by removing the task data from training data
    x_train=x.drop(task,axis=0)
    y_train=y.drop(task,axis=0)
    eth_train=np.delete(eth,np.where(eth == task))
    x_test=x.loc[task].to_numpy()
    y_test=y.loc[task].to_numpy()
    y_test=np.reshape(y_test,(-1,1))
    eth_train=np.delete(eth, np.where(eth == task))

    #train and test using different models
    beta = base_linear_model(eth_train, x_train, y_train, x_test, y_test, epochs, batch_size)
    lin_test_error = beta[1]
    lin_r2=beta[2]

    alpha=maml_model(eth_train,x_train,y_train,x_test,y_test,epochs,batch_size,num_task_sample)
    maml_error=alpha[1]
    maml_r2=alpha[2]


#report correlation coefficeint or R^2
    plt.plot(alpha[0])
    plt.plot(beta[0])
    plt.title('MSE ' + task)
    plt.savefig('lin_maml_gen_results_'+task+'.png')


    L = task, maml_error, lin_test_error,maml_r2, lin_r2
    lin_error_vec.append(L)
    #return error_vec


def logistic_task_maml(task,x,y,reg_weight,model_type):
 # task='MXL' #the ethnic group we want to test
 # preparing training data by removing the task data from training data
 x_train = x.drop(task, axis=0)
 y_train = y.drop(task, axis=0)
 eth_train = np.delete(eth, np.where(eth == task))
 x_test = x.loc[task].to_numpy()
 y_test = y.loc[task].to_numpy()
 y_test = np.reshape(y_test, (-1, 1))
 eth_train = np.delete(eth, np.where(eth == task))

 beta = base_logistic_model(eth_train, x_train, y_train, x_test, y_test, epochs, batch_size,reg_weight)
 lin_test_error = beta[1]

 alpha = maml_logistic_model(eth_train, x_train, y_train, x_test, y_test, epochs, batch_size, num_task_sample,reg_weight)
 maml_error = alpha[1]

 L = task, maml_error, lin_test_error
 if(model_type=='hetero'):
     hetero_acc_vec.append(L)
     plt.plot(alpha[0])
     plt.plot(beta[0])
     plt.title('loss ' + task)
     plt.savefig('hetero_maml_gen_results_' + task + '.png')

 if(model_type=='compensatory'):
     compensatory_acc_vec.append(L)
     #plt.plot(alpha[0])
     #plt.plot(beta[0])
     #plt.title('loss ' + task)
     #plt.savefig('compensatory_maml_gen_results_' + task + '.png')
 #return logistic_vec
#make changes so that all results are saved in a single file


lin_error_vec=[]
hetero_acc_vec=[]
compensatory_acc_vec=[]
weights_vec=np.arange(11)
tasks= random.sample(list(eth),k=3)
for reg in weights_vec:
    for t in tasks:
        for i in range(5):
            x = data1
            #y=linear_model(eth,data1)
            #task_maml(t,x,y)

            #y2= hetero_model(eth, data1)
            #logistic_task_maml(t,x,y2,model_type='hetero')

            y3=compensatory_model(eth,data1)
            logistic_task_maml(t, x, y3,reg/10,model_type='compensatory')

    L3 = pd.DataFrame(compensatory_acc_vec, columns=['task', 'maml_acc', 'lin_acc'])
    # print('Linear phenotype')
    # a=L1.groupby('task')
    # print(a.mean())
    # print('Heterogenous phenotype')
    # b=L2.groupby('task')
    # print(b.mean())
    print('reg weight',reg)
    print('Compensatory phenotype')
    c = L3.groupby('task')
    (c.mean()).to_csv('hyperparameter_optimization/reg'+str(reg)+'.csv')
   # print(c.mean())


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 916.92it/s]
100%|██████████████████████████████████████| 5000/5000 [00:43<00:00, 115.79it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1037.19it/s]
100%|██████████████████████████████████████| 5000/5000 [00:42<00:00, 117.03it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1056.82it/s]
100%|██████████████████████████████████████| 5000/5000 [00:43<00:00, 114.23it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1009.78it/s]
100%|██████████████████████████████████████| 5000/5000 [00:44<00:00, 112.55it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1172.86it/s]
100%|███████████████████████████████████████| 5000/5000 [06:33<00:00, 12.69it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1039.69it/s]
100%|███████████████████████████████████████| 5000/5000 [00:53<00:00, 93.03it/s]


reg weight 0
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1046.03it/s]
100%|██████████████████████████████████████| 5000/5000 [00:42<00:00, 117.65it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 999.95it/s]
100%|██████████████████████████████████████| 5000/5000 [00:42<00:00, 118.92it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1043.41it/s]
100%|███████████████████████████████████████| 5000/5000 [00:53<00:00, 93.29it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 940.37it/s]
100%|██████████████████████████████████████| 5000/5000 [00:46<00:00, 108.16it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1128.15it/s]
100%|██████████████████████████████████████| 5000/5000 [00:39<00:00, 125.79it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1154.06it/s]
100%|██████████████████████████████████████| 5000/5000 [00:40<00:00, 123.61it/s]


reg weight 1
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1097.38it/s]
100%|███████████████████████████████████████| 5000/5000 [01:10<00:00, 70.84it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:13<00:00, 362.90it/s]
100%|███████████████████████████████████████| 5000/5000 [26:57<00:00,  3.09it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:22<00:00, 217.55it/s]
100%|███████████████████████████████████████| 5000/5000 [19:14<00:00,  4.33it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:13<00:00, 366.59it/s]
100%|███████████████████████████████████████| 5000/5000 [02:38<00:00, 31.51it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|██████████████████████████████████████| 5000/5000 [00:12<00:00, 400.12it/s]
100%|███████████████████████████████████████| 5000/5000 [44:45<00:00,  1.86it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:12<00:00, 385.57it/s]
100%|███████████████████████████████████████| 5000/5000 [02:31<00:00, 32.98it/s]


reg weight 2
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:11<00:00, 441.21it/s]
100%|███████████████████████████████████████| 5000/5000 [02:16<00:00, 36.63it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 495.20it/s]
100%|█████████████████████████████████████| 5000/5000 [1:22:43<00:00,  1.01it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|███████████████████████████████████████| 5000/5000 [15:41<00:00,  5.31it/s]
100%|███████████████████████████████████████| 5000/5000 [02:31<00:00, 32.97it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:13<00:00, 381.29it/s]
100%|███████████████████████████████████████| 5000/5000 [02:28<00:00, 33.56it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 477.36it/s]
100%|███████████████████████████████████████| 5000/5000 [02:13<00:00, 37.46it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:11<00:00, 436.36it/s]
100%|███████████████████████████████████████| 5000/5000 [02:24<00:00, 34.70it/s]


reg weight 3
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 492.86it/s]
100%|███████████████████████████████████████| 5000/5000 [02:07<00:00, 39.12it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 471.67it/s]
100%|███████████████████████████████████████| 5000/5000 [02:25<00:00, 34.36it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:11<00:00, 451.18it/s]
100%|███████████████████████████████████████| 5000/5000 [02:03<00:00, 40.37it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 504.65it/s]
100%|███████████████████████████████████████| 5000/5000 [02:19<00:00, 35.73it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 503.51it/s]
100%|███████████████████████████████████████| 5000/5000 [02:14<00:00, 37.28it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 505.18it/s]
100%|███████████████████████████████████████| 5000/5000 [02:01<00:00, 41.02it/s]


reg weight 4
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 510.85it/s]
100%|███████████████████████████████████████| 5000/5000 [02:01<00:00, 41.06it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 498.42it/s]
100%|███████████████████████████████████████| 5000/5000 [02:02<00:00, 40.87it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 509.62it/s]
100%|███████████████████████████████████████| 5000/5000 [02:02<00:00, 40.96it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 509.21it/s]
100%|███████████████████████████████████████| 5000/5000 [02:01<00:00, 41.00it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|██████████████████████████████████████| 5000/5000 [00:14<00:00, 355.78it/s]
100%|███████████████████████████████████████| 5000/5000 [02:48<00:00, 29.75it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:11<00:00, 416.83it/s]
100%|███████████████████████████████████████| 5000/5000 [02:32<00:00, 32.87it/s]


reg weight 5
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:11<00:00, 424.22it/s]
100%|███████████████████████████████████████| 5000/5000 [02:28<00:00, 33.66it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:11<00:00, 428.31it/s]
100%|███████████████████████████████████████| 5000/5000 [02:26<00:00, 34.04it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:11<00:00, 434.92it/s]
100%|███████████████████████████████████████| 5000/5000 [02:17<00:00, 36.49it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 486.36it/s]
100%|███████████████████████████████████████| 5000/5000 [02:20<00:00, 35.47it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 507.16it/s]
100%|███████████████████████████████████████| 5000/5000 [02:02<00:00, 40.71it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 517.12it/s]
100%|███████████████████████████████████████| 5000/5000 [02:02<00:00, 40.77it/s]


reg weight 6
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:10<00:00, 499.23it/s]
100%|███████████████████████████████████████| 5000/5000 [02:01<00:00, 41.05it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 521.31it/s]
100%|███████████████████████████████████████| 5000/5000 [02:01<00:00, 41.04it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 509.13it/s]
100%|███████████████████████████████████████| 5000/5000 [02:02<00:00, 40.82it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:09<00:00, 513.81it/s]
100%|███████████████████████████████████████| 5000/5000 [02:02<00:00, 40.79it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1171.37it/s]
100%|██████████████████████████████████████| 5000/5000 [00:39<00:00, 125.52it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1130.31it/s]
100%|██████████████████████████████████████| 5000/5000 [00:39<00:00, 125.27it/s]


reg weight 7
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 906.51it/s]
100%|██████████████████████████████████████| 5000/5000 [00:40<00:00, 124.08it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1183.67it/s]
100%|██████████████████████████████████████| 5000/5000 [00:39<00:00, 127.65it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1194.07it/s]
100%|██████████████████████████████████████| 5000/5000 [00:42<00:00, 117.72it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1189.42it/s]
100%|██████████████████████████████████████| 5000/5000 [00:38<00:00, 129.48it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|██████████████████████████████████████| 5000/5000 [00:05<00:00, 959.43it/s]
100%|██████████████████████████████████████| 5000/5000 [00:40<00:00, 123.46it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1043.37it/s]
100%|██████████████████████████████████████| 5000/5000 [00:41<00:00, 121.20it/s]


reg weight 8
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1141.63it/s]
100%|██████████████████████████████████████| 5000/5000 [00:38<00:00, 128.73it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1160.33it/s]
100%|██████████████████████████████████████| 5000/5000 [00:41<00:00, 121.30it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1196.42it/s]
100%|███████████████████████████████████████| 5000/5000 [00:50<00:00, 99.06it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|██████████████████████████████████████| 5000/5000 [00:07<00:00, 666.29it/s]
100%|██████████████████████████████████████| 5000/5000 [00:45<00:00, 109.01it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1156.83it/s]
100%|██████████████████████████████████████| 5000/5000 [00:41<00:00, 119.98it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1181.54it/s]
100%|██████████████████████████████████████| 5000/5000 [00:41<00:00, 119.63it/s]


reg weight 9
Compensatory phenotype


  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1149.77it/s]
100%|██████████████████████████████████████| 5000/5000 [00:41<00:00, 119.81it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1134.02it/s]
100%|██████████████████████████████████████| 5000/5000 [00:42<00:00, 119.00it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1107.67it/s]
100%|██████████████████████████████████████| 5000/5000 [00:42<00:00, 119.00it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1184.53it/s]
100%|██████████████████████████████████████| 5000/5000 [00:42<00:00, 118.19it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████

100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1134.33it/s]
100%|██████████████████████████████████████| 5000/5000 [00:39<00:00, 126.58it/s]
  x_train = x.drop(task, axis=0)
  y_train = y.drop(task, axis=0)
100%|█████████████████████████████████████| 5000/5000 [00:04<00:00, 1216.69it/s]
100%|███████████████████████████████████████| 5000/5000 [01:21<00:00, 61.17it/s]


reg weight 10
Compensatory phenotype


In [113]:
prop_var=[]
for i in tqdm.tqdm(range(1,207)):
    for j in range(1,50):
        y,names=hetero_model(eth,data,max_threshold=j,num_heteromodel_causal_snps=i)
        phenotype_subprop=0
        subpop_phenotype_prop=[]
        for subpop in eth:
            subpop_phenotype_prop.append(sum(y.loc[subpop]==0)/y.loc[subpop].size)
        phenotype_subprop= np.mean(subpop_phenotype_prop)
        #print('num_causual_snps',i,'threshold',j,'average prop', phenotype_subprop,'variance',np.var(subpop_phenotype_prop))
        prop_var.append([i,j,phenotype_subprop,np.var(subpop_phenotype_prop),names])

100%|█████████████████████████████████████████| 206/206 [18:40<00:00,  5.44s/it]


In [44]:
from IPython.display import HTML
HTML('''<script>
var code_show_err = false; 
var code_toggle_err = function() {
 var stderrNodes = document.querySelectorAll('[data-mime-type="application/vnd.jupyter.stderr"]')
 var stderr = Array.from(stderrNodes)
 if (code_show_err){
     stderr.forEach(ele => ele.style.display = 'block');
 } else {
     stderr.forEach(ele => ele.style.display = 'none');
 }
 code_show_err = !code_show_err
} 
document.addEventListener('DOMContentLoaded', code_toggle_err);
</script>
To toggle on/off output_stderr, click <a onclick="javascript:code_toggle_err()">here</a>.''')

In [114]:
sweep_data=pd.DataFrame(np.vstack(prop_var))

  ary = asanyarray(ary)


In [120]:
alpha=sweep_data[np.abs(sweep_data[2]-0.5)<0.15]

In [132]:
alpha[alpha[3]<0.1]

Unnamed: 0,0,1,2,3,4
49,2,1,0.578056,0.017251,"[250058, 337299]"
50,2,2,0.507147,0.036235,"[332686, 374483]"
99,3,2,0.577633,0.051004,"[186918, 18774665, 336350]"
100,3,3,0.616316,0.083745,"[90627, 305203, 118696]"
400,9,9,0.350251,0.098266,"[16101460, 19860566, 307589, 231146, 451320, 4..."


In [156]:
compens_prop_var=[]
for i in tqdm.tqdm(range(8,13)):
    for j in tqdm.tqdm(range(18,22)):
        for k in range(46,50):
            for l in range(13,16):
                y,snp1,snp2=compensatory_model(eth,data,max_threshold1=k, max_threshold2=l, num_heteromodel_causal_snps1=i,
                               num_heteromodel_causal_snps2=j)
                phenotype_subprop=0
                subpop_phenotype_prop=[]
                for subpop in eth:
                    subpop_phenotype_prop.append(sum(y.loc[subpop]==0)/y.loc[subpop].size)
                phenotype_subprop= np.mean(subpop_phenotype_prop)
                #print('num_causual_snps',i,'threshold',j,'average prop', phenotype_subprop,'variance',np.var(subpop_phenotype_prop))
                compens_prop_var.append([i,j,k,l,phenotype_subprop,np.var(subpop_phenotype_prop),snp1,snp2])

  0%|                                                     | 0/5 [00:00<?, ?it/s]
  0%|                                                     | 0/4 [00:00<?, ?it/s][A
 25%|███████████▎                                 | 1/4 [00:02<00:07,  2.63s/it][A
 50%|██████████████████████▌                      | 2/4 [00:05<00:05,  2.72s/it][A
 75%|█████████████████████████████████▊           | 3/4 [00:08<00:02,  2.84s/it][A
100%|█████████████████████████████████████████████| 4/4 [00:11<00:00,  2.78s/it][A
 20%|█████████                                    | 1/5 [00:11<00:44, 11.13s/it]
  0%|                                                     | 0/4 [00:00<?, ?it/s][A
 25%|███████████▎                                 | 1/4 [00:02<00:08,  2.78s/it][A
 50%|██████████████████████▌                      | 2/4 [00:05<00:05,  2.82s/it][A
 75%|█████████████████████████████████▊           | 3/4 [00:08<00:02,  2.73s/it][A
100%|█████████████████████████████████████████████| 4/4 [00:11<00:00,  2.84s/it][

In [151]:
compens_prop_var

[[8,
  18,
  46,
  13,
  0.3110262152431703,
  0.1018468358415304,
  array(['319932', '239649', '368580', '18629302', '18672785', '100189',
         '363795', '18892411'], dtype=object),
  array(['155716', '227956', '222652', '234557', '216512', '305203',
         '384510', '18748059', '339622', '18459816', '337299', '240022',
         '256591', '191770', '280230', '288689', '1076045'], dtype=object)],
 [8,
  18,
  46,
  14,
  0.10896106206373556,
  0.0542561043128322,
  array(['276250', '291784', '280464', '19785655', '175689', '87794',
         '19792284', '19168107'], dtype=object),
  array(['90627', '20936', '340890', '886568', '256424', '186961',
         '18246623', '468152', '101217.1', '66541', '353221', '18672785',
         '222652', '51479', '18185740', '157893', '257950'], dtype=object)],
 [8,
  18,
  47,
  13,
  0.19963289051723557,
  0.0803098841251921,
  array(['493007', '213063', '21833', '155716', '386839', '18629302',
         '368845', '154290'], dtype=object),
  arra

In [157]:
csweep_data=pd.DataFrame(np.vstack(compens_prop_var))

  ary = asanyarray(ary)


In [162]:
beta=csweep_data[np.abs(csweep_data[4]-0.5)<0.05]

In [168]:
beta[beta[5]<0.15].loc[205]

0                                                   12
1                                                   19
2                                                   46
3                                                   14
4                                             0.490399
5                                             0.108417
6    [170889, 231146, 102910, 365830, 499561, 28821...
7    [66541, 186918, 101217.1, 18707056, 119767, 19...
Name: 205, dtype: object

In [155]:
csweep_data

Unnamed: 0,0,1,2,3,4,5,6,7
0,8,18,46,13,0.311026,0.101847,"[319932, 239649, 368580, 18629302, 18672785, 1...","[155716, 227956, 222652, 234557, 216512, 30520..."
1,8,18,46,14,0.108961,0.054256,"[276250, 291784, 280464, 19785655, 175689, 877...","[90627, 20936, 340890, 886568, 256424, 186961,..."
2,8,18,47,13,0.199633,0.08031,"[493007, 213063, 21833, 155716, 386839, 186293...","[397177, 353221, 305308, 51150, 237093, 247693..."
3,8,18,47,14,0.487254,0.158623,"[288211, 387312, 18856677, 17011006, 18774665,...","[285823, 124411, 18459816, 363795, 18246623, 1..."
4,8,19,46,13,0.324557,0.123008,"[21833, 307589, 19785655, 212578, 101217, 1076...","[973443, 101217.1, 355808, 314933, 19168107, 2..."
5,8,19,46,14,0.350227,0.127826,"[389017.1, 167376, 157893, 18774665, 918108, 3...","[434815, 155716, 285823, 364060, 399401, 27701..."
6,8,19,47,13,0.289823,0.145854,"[105794, 139418, 223694, 212578, 397177, 11708...","[213063, 336350, 18856677, 346484, 66541, 3372..."
7,8,19,47,14,0.293741,0.105497,"[307140, 387312, 105794, 19961472, 310558, 305...","[305203, 100189, 18748059, 77907, 242096, 3097..."
8,9,18,46,13,0.171131,0.069571,"[247693, 105794, 278735, 19860566, 311447, 144...","[18668339, 368845, 172167, 236224, 451320, 179..."
9,9,18,46,14,0.40895,0.123831,"[139418, 309719, 415044, 239636, 277015, 41622...","[20192154, 405457, 288211, 1039282, 314317, 18..."


In [63]:
#find average accuracy
for i in range(11):
    a=pd.read_csv('hyperparameter_optimization/reg'+str(i)+'.csv',index_col=[0])
    print(i)
    print(a.mean())
    print(a.mean()[0]-a.mean()[1])

0
maml_acc    0.982428
lin_acc     0.986871
dtype: float64
-0.004442818959554007
1
maml_acc    0.980151
lin_acc     0.980603
dtype: float64
-0.00045185089111332566
2
maml_acc    0.986250
lin_acc     0.983712
dtype: float64
0.0025384691026475448
3
maml_acc    0.968377
lin_acc     0.967478
dtype: float64
0.0008994738260906798
4
maml_acc    0.971514
lin_acc     0.970693
dtype: float64
0.0008215586344402226
5
maml_acc    0.967963
lin_acc     0.966407
dtype: float64
0.0015558454725477233
6
maml_acc    0.966335
lin_acc     0.965959
dtype: float64
0.00037598382858994306
7
maml_acc    0.960883
lin_acc     0.963364
dtype: float64
-0.002481365203857333
8
maml_acc    0.961727
lin_acc     0.964110
dtype: float64
-0.0023826316550925597
9
maml_acc    0.958852
lin_acc     0.963063
dtype: float64
-0.0042105102539062855
10
maml_acc    0.957979
lin_acc     0.962614
dtype: float64
-0.0046350652521307545


In [15]:
def linear_model(eth, x, hsq=0.1):
    # dictionary which contains randomly generated coefficnets for the linear model
    eth_coef = {}
    eth_errors = {}
    #eth_errors_sigma2 = np.ndarray()
    #eth_errors.update([(ethnicity, errors / 100) for ethnicity, errors in zip(eth, eth_errors_sigma2)])
    #varx=np.sum(np.multiply(np.cov(x,rowvar=False),np.identity(x.shape[1])))
    #eth_errors_sigma2=-varx+ (varx/hsq)
    for i in eth:
        varx = np.sum(np.multiply(np.cov(x.loc[i], rowvar=False), np.identity(x.shape[1])))
        eth_errors[i]=-varx+ (varx/hsq)
        print(i,eth_errors[i])
    #eth_errors.update(eth,eth_errors.values()/np.max(eth_errors.values()))
    total = sum(eth_errors.values(), 0.0)
    eth_errors = {k: v / total for k, v in eth_errors.items()}
    print(eth_errors)
    for i in eth:
        eth_coef[i] = np.random.uniform(-1, 1, (x.shape[1]))

    y = x.apply(lambda a: a @ eth_coef[a.name[0]] + np.random.normal(scale=eth_errors[a.name[0]]), axis=1)

    return y


In [16]:
y=linear_model(eth,data)

GBR 442.87252747252745
FIN 444.2486085343228
EAS 425.20524916943526
PUR 523.266990291262
CLM 495.2244337680163
IBS 445.83406806559685
PEL 450.52184873949574
SAS 480.387396071816
KHV 422.482374768089
ACB 548.010197368421
GWD 523.7977243994942
ESN 529.7588126159553
MSL 518.8840336134454
STU 474.36080372743146
EUR 450.0055658627086
YRI 529.147945688591
JPT 428.21079163554884
LWK 537.6567717996289
ASW 539.0360655737704
MXL 480.98660714285717
TSI 457.93193440310347
{'GBR': 0.043642088477034886, 'FIN': 0.04377769194693892, 'EAS': 0.04190109784198183, 'PUR': 0.05156441836148804, 'CLM': 0.04880101431858168, 'IBS': 0.04393392824711203, 'PEL': 0.044395877287163786, 'SAS': 0.04733892472912336, 'KHV': 0.041632777008866306, 'ACB': 0.05400269385947308, 'GWD': 0.05161671861375575, 'ESN': 0.052204143489367034, 'MSL': 0.05113250766200093, 'STU': 0.04674504486529513, 'EUR': 0.044345001105892144, 'YRI': 0.05214394669042836, 'JPT': 0.04219727369866951, 'LWK': 0.05298243388242664, 'ASW': 0.0531183539433761

In [69]:
y

eth  ID     
GBR  HG00096    -306.301587
     HG00097     863.037892
     HG00099    -394.822695
     HG00100    -461.646395
     HG00101     335.934810
                   ...     
SAS  NA21137     826.755650
     NA21141    -373.720839
     NA21142     367.485953
     NA21143   -1241.330977
     NA21144    -831.597308
Length: 2503, dtype: float64

In [23]:
ethID=pd.read_csv('ethID.csv')

In [26]:
ethID['eth']

0       FIN
1       FIN
2       FIN
3       FIN
4       FIN
       ... 
2704    GIH
2705    GIH
2706    GIH
2707    GIH
2708    GIH
Name: eth, Length: 2709, dtype: object

In [21]:
data

Unnamed: 0_level_0,Unnamed: 1_level_0,51479,821054,886568,918108,942451,973443,998741,1039282,1076045,21833,...,154290,175689,211156,239649,256777,280230,307589,339622,368580,384510
eth,ID,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
GBR,HG00096,1,1,2,0,2,0,1,2,0,0,...,2,1,1,0,2,2,1,0,2,1
GBR,HG00097,0,0,2,0,2,1,0,2,0,0,...,2,1,1,0,1,1,2,0,1,1
GBR,HG00099,0,1,2,0,2,0,0,2,0,0,...,2,1,2,0,0,1,2,0,0,0
GBR,HG00100,0,0,2,0,2,1,0,2,0,0,...,2,2,0,0,2,0,1,0,1,0
GBR,HG00101,0,0,2,0,2,0,1,1,0,0,...,2,1,1,0,2,2,1,0,2,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
SAS,NA21137,0,0,2,0,2,1,0,1,0,0,...,2,2,1,0,2,1,1,0,0,0
SAS,NA21141,1,0,2,0,2,1,0,2,0,0,...,2,2,0,0,2,2,0,1,2,2
SAS,NA21142,0,1,2,0,2,1,1,2,0,0,...,2,1,1,0,1,2,1,0,1,2
SAS,NA21143,0,0,2,0,2,0,1,2,0,0,...,2,1,2,1,2,1,2,0,0,0


In [27]:
import numpy as np
import pandas as pd



#read data
ethnic_dic=pd.read_csv('sampleID.csv',usecols=['Sample (Male/Female/Unknown)','Population(s)'])
#num_rows=20000
header_line=19
i=1
data1=pd.read_csv('/work/long_lab/1000_Genome_Data/20181203_biallelic_SNV/ftp.1000genomes.ebi.ac.uk/ALL.chr'+ str(i) +'.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz',sep='\t',
                header=header_line,nrows=2000)
for i in range(2,21):
    data=pd.read_csv('/work/long_lab/1000_Genome_Data/20181203_biallelic_SNV/ftp.1000genomes.ebi.ac.uk/ALL.chr'+ str(i) +'.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz',sep='\t',
                header=header_line,nrows=2000)
    data1=pd.concat([data1,data])

#print(data1.head)
#task='MXL' #the ethnic group we want to test
removal_list=['HG00104',
 'HG00134',
 'HG00135',
 'HG00152',
 'HG00156',
 'HG00249',
 'HG00270',
 'HG00302',
 'HG00303',
 'HG00312',
 'HG00359',
 'HG00377',
 'HG01471',
 'HG02168',
 'HG02169',
 'HG02170',
 'HG02173',
 'HG02176',
 'HG02358',
 'HG02405',
 'HG02436',
 'HG03171',
 'HG03393',
 'HG03398',
 'HG03431',
 'HG03462',
 'HG03549',
 'HG04301',
 'HG04302',
 'HG04303',
 'NA18527',
 'NA18576',
 'NA18791',
 'NA18955',
 'NA19044',
 'NA19359',
 'NA19371',
 'NA19398',
 'NA20537',
 'NA20816',
 'NA20829',
 'NA20831',
 'NA20873',
 'NA20883',
 'NA21121']


#processing data
def process_dict(data):
    data=data.rename(columns={'Sample (Male/Female/Unknown)':'ID','Population(s)':'eth'})
    data['eth']=data['eth'].apply(lambda x: x.split(',')[-1][1:])
    data['ID']=data['ID'].apply(lambda x: x.split(' ')[0])
    data.index=data['ID']
    data=data.drop('ID',axis=1)
    data=data.drop('NA18498')
    return data

def process_data(data,num_causal_snps=20):
    data=data[data['ALT'].isin(['A','C','G','T'])] #select SNPs with single ALT allele
    data['INFO']=data['INFO'].apply(lambda x: float(x.split(';')[3].split('=')[-1])) #extract allele freq information
    data=data[data['INFO']>0.05] #choose SNPs with allele freq more than 0.05
    data.index=data['POS'] #set ID col as index
    data=data.drop(['ID','#CHROM','POS','REF','ALT','QUAL','FILTER','INFO','FORMAT'],axis=1) #drop columns other than individual data
    data=data.applymap(lambda x: 2 if x=='1|1' else(0 if x=='0|0' else 1)) #sets 0|0 to 0 ...
    #data=data.drop(removal_list,axis=1)
    data=data.T
    causal_snps=np.arange(0,len(data.columns),len(data.columns)//num_causal_snps)
    data=data[data.columns[causal_snps]]
    data['eth']=ethID['eth']
    data['ID']=data.index
    data=data.set_index(['eth','ID'])
    return data





#eth_ID=process_dict(ethnic_dic)
#eth=eth_ID['eth'].unique()
ethID=pd.read_csv('ethID.csv')
eth=ethID['eth'].unique()


data1=process_data(data)
#data1.to_csv('processed_data_mar21.csv')
#eth_ID.to_csv('eth_ID.csv')

FileNotFoundError: [Errno 2] No such file or directory: '/work/long_lab/1000_Genome_Data/20181203_biallelic_SNV/ftp.1000genomes.ebi.ac.uk/ALL.chr1.shapeit2_integrated_v1a.GRCh38.20181129.phased.vcf.gz'

In [32]:
data1=pd.read_csv('processed_data_mar21.csv')

In [33]:
data1

Unnamed: 0,eth,ID,61849,72982,82590,88108,98593,101438,102611,106775,...,119864,123001,126972,128317,132236,137509,140837,144480,146119,148072
0,,HG00096,0,0,0,2,2,1,2,1,...,0,0,0,1,1,0,0,1,1,1
1,,HG00097,0,0,0,0,0,0,1,1,...,0,0,0,1,1,0,0,1,1,1
2,,HG00099,0,0,0,0,0,0,2,2,...,0,0,0,2,0,0,0,2,1,1
3,,HG00100,0,0,0,1,1,1,1,1,...,0,0,0,0,1,0,1,0,2,2
4,,HG00101,1,1,0,2,0,2,1,1,...,0,0,0,1,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2498,,NA21137,0,0,1,2,0,1,2,1,...,1,1,1,0,1,1,1,0,2,2
2499,,NA21141,1,1,0,2,1,0,1,1,...,0,0,0,0,0,0,1,0,2,2
2500,,NA21142,1,1,0,2,2,0,0,2,...,1,1,1,1,0,1,1,1,0,0
2501,,NA21143,0,0,1,2,0,0,1,1,...,1,1,1,0,0,1,1,0,1,1


In [35]:
a=ethID['ID']

In [36]:
b=data1['ID']

In [42]:
bthings=[]
notbthings=[]
for i in range(len(a)):
    for j in range(len(b)):
        if a[i]==b[j]:
            bthings.append(i)
            continue

In [50]:
data4=pd.read_csv('processed_data_mar21_4.csv')

In [61]:
data4=data4.drop(['Unnamed: 0'],axis=1)

In [72]:
data4['eth']='ALL'

In [71]:
IDs=data4['ID']

In [77]:
data4['ID'][1]==ethID['ID'][10]

False

In [78]:
for i in range(2548):
    for j in range(2709):
        if data4['ID'][i]==ethID['ID'][j]:
            data4['eth'][i]=ethID['eth'][j]
            continue
        
        

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data4['eth'][i]=ethID['eth'][j]


In [81]:
data4=data4.set_index(['eth','ID'])

In [82]:
data4.to_csv('processed_data_mar21.csv')

In [94]:
data=data.set_index(['eth','ID'])

In [64]:
rows_remove_list=[]
for i in range(data.shape[0]):
    for j in range(eth_ID.shape[0]):
        flag=0
        #print(i,j,data.iloc[i].name,eth_ID['ID'].iloc[j])
        if(data.iloc[i].name==eth_ID['ID'].iloc[j]):
            #print('si')
            data['eth'][i]=eth_ID['eth'].iloc[j]
            continue
    if flag==0:
            rows_remove_list.append(j)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['eth'][i]=eth_ID['eth'].iloc[j]


In [69]:
rows_remove_list

[2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,
 2547,

In [95]:
data.to_csv('processed_data_apr05.csv')

In [98]:
data.groupby('eth').count()

POS,61849,65955,66369,67744,68303,72450,72982,73765,77005,78705,...,2000377,2000733,2001588,2001665,2002396,2002459,2003829,2004055,2004404,2004447
eth,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
ACB,97,97,97,97,97,97,97,97,97,97,...,97,97,97,97,97,97,97,97,97,97
ASW,61,61,61,61,61,61,61,61,61,61,...,61,61,61,61,61,61,61,61,61,61
BEB,86,86,86,86,86,86,86,86,86,86,...,86,86,86,86,86,86,86,86,86,86
CDX,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
CEU,99,99,99,99,99,99,99,99,99,99,...,99,99,99,99,99,99,99,99,99,99
CHB,106,106,106,106,106,106,106,106,106,106,...,106,106,106,106,106,106,106,106,106,106
CHS,105,105,105,105,105,105,105,105,105,105,...,105,105,105,105,105,105,105,105,105,105
CLM,95,95,95,95,95,95,95,95,95,95,...,95,95,95,95,95,95,95,95,95,95
ESN,100,100,100,100,100,100,100,100,100,100,...,100,100,100,100,100,100,100,100,100,100
FIN,105,105,105,105,105,105,105,105,105,105,...,105,105,105,105,105,105,105,105,105,105


In [89]:
eth_ID

Unnamed: 0,ID,eth
0,HG00315,FIN
1,HG00327,FIN
2,HG00334,FIN
3,HG00339,FIN
4,HG00341,FIN
...,...,...
2704,NA20905,GIH
2705,NA21092,GIH
2706,NA21117,GIH
2707,NA21124,GIH


In [136]:
eth_ID=ethID.drop(rows_remove_list,axis=0)

In [137]:
eth_ID.to_csv('eth_ID.csv')

In [6]:
eth_ID=pd.read_csv('eth_ID.csv',index_col=[0])
eth=eth_ID['eth'].unique()

In [2]:
data=pd.read_csv('processed_data_mar29_raw_1.csv')

In [15]:
data['eth']=eth_ID['eth']
data['ID']=data.index
data=data.set_index(['eth','ID'])


In [28]:
data=data.T

In [29]:
data['eth'][2]

nan

In [4]:
data.index=data['POS'] #set ID col as index
data=data.drop(['ID','#CHROM','POS','POS.1','REF','ALT','QUAL','FILTER','INFO','FORMAT'],axis=1) #drop columns other than individual data
data=data.applymap(lambda x: 2 if x=='1|1' else(0 if x=='0|0' else 1)) #sets 0|0 to 0 ...
#data=data.drop(removal_list,axis=1)
data=data.T
#data=data[data.columns[causal_snps]]

In [24]:

data['eth']=eth_ID['eth']

In [19]:
data['ID']=data.index
data=data.set_index(['eth','ID'])

2548

In [27]:
eth_ID['eth']

0       FIN
1       FIN
2       FIN
3       FIN
4       FIN
       ... 
2704    GIH
2705    GIH
2706    GIH
2707    GIH
2708    GIH
Name: eth, Length: 2548, dtype: object