# Prep All Files for CAGI6-PRS Container

**Authorship:**
Adam Klie, *09/01/2021*
***
**Description:**
Per title
***
**TODOs:**
 - <font color='green'> Done TODO </font>
 - <font color='orange'> WIP TODO </font>
 - <font color='red'> Queued TODO </font>
***

## Set-up

In [90]:
# The classics
import numpy as np
import pandas as pd

# Other guys
import tqdm
from pyarrow import feather

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

In [15]:
bim = pd.read_csv("../MGB.bim", sep="\t", header=None)

In [16]:
fam = pd.read_csv("../MGB.fam", delim_whitespace=True, header=None)

## Step 1 - Make dummy files for testing

### Step 1a - Build a dummy bed file from given bim and fam

#### Make a PED file

In [4]:
fam.values[:10].shape, np.tile(bim[[4, 5]].values.flatten(), (10, 1)).shape, np.concatenate([fam.values[:10], np.tile(bim[[4, 5]].values.flatten(), (10, 1))], axis=1).shape

((10, 6), (10, 3407714), (10, 3407720))

In [5]:
sample_fam = fam.values[:10]

In [31]:
sample_fam[:, 5] = np.random.choice([1, 2], size=10)

In [33]:
np.savetxt("MGB_10_all/MGB_10_all.ped", 
           np.concatenate([sample_fam, np.tile(bim[[4, 5]].values.flatten(), (10, 1))], axis=1),
           fmt="%s")

#### Make a MAP file

In [25]:
bim[[0, 1, 2, 3]].to_csv(
    "MGB_10_all/MGB_10_all.map", sep="\t", header=None, index=False
)

#### Make a covariates.tsv file

1. We will have the “AGE" and ten PCs (PC1, PC2, .., PC10) in a separate covariate file. The file name is “MGB.covariates.txt". It is a space-delimited plain text file similar to plink covariate files. The first two columns are FID and IID from the MGB.fam file. The sample order can be different between MGB.fam and MGB-covariates.txt. 

In [58]:
pcs = np.random.normal(loc=0, scale=3, size=(10, 10))

In [59]:
pcs.shape, sample_fam[:, :2].shape, ages.shape

((10, 10), (10, 2), (10, 1))

In [60]:
ages = np.random.normal(loc=50, scale=10, size=(10, 1))

In [61]:
sample_covar = np.concatenate([sample_fam[:, :2], ages, pcs], axis=1)

In [62]:
sample_covar[0]

array(['MGB00001', 'MGB00001', 66.66865714162273, 1.2622905687385135,
       0.6632332986461236, -1.3031528057680857, 0.0012940779292404904,
       5.708772427155591, -6.319302885969882, 0.45256534112375046,
       0.28469001727733156, 3.002453648556721, -4.111321613459907],
      dtype=object)

In [63]:
sample_covar_df = pd.DataFrame(sample_covar)

In [64]:
sample_covar_df.columns = ["FID", "IID", "AGE"] + ["PC{}".format(i+1) for i in range(10)]

In [65]:
sample_covar_df.head()

Unnamed: 0,FID,IID,AGE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,MGB00001,MGB00001,66.668657,1.262291,0.663233,-1.303153,0.001294,5.708772,-6.319303,0.452565,0.28469,3.002454,-4.111322
1,MGB00002,MGB00002,53.190195,3.3063,-1.101641,1.97045,-0.872764,-3.840938,-0.671233,-0.165758,-0.421976,0.860991,-4.696233
2,MGB00003,MGB00003,38.929525,6.461285,-4.82919,0.6073,2.843771,-5.527301,-3.029981,5.825115,3.742236,6.044946,1.437086
3,MGB00004,MGB00004,58.494731,3.134901,-0.005092,1.167758,1.454855,3.297935,1.981656,-1.079379,1.793868,0.169963,-0.878017
4,MGB00005,MGB00005,60.674713,1.273421,1.723451,1.46698,2.157379,0.619361,1.180299,1.433365,4.477744,-4.105312,-0.212373


In [66]:
sample_covar_df.to_csv("MGB_10_all/MGB_10_all.covariates.txt", sep=" ", index=False)

### Step 1b - Make dummy age.summary.tsv file for z-scored age

In [67]:
age_means = np.random.normal(loc=50, scale=10, size=(4))

In [68]:
age_stds = np.abs(np.random.normal(loc=0, scale=10, size=(4)))

In [69]:
index = ["BCA", "CAD", "IBD", "T2D"]

In [70]:
pd.DataFrame(index=index, data={"MEAN":age_means, "STD": age_stds}).to_csv("MGB_10_all/age.summary.tsv", sep="\t", index=True)

## Step 2 - Make needed files for actual Docker

### Step 2a - Make SNP summary files from feathers and stats

#### BCA

In [2]:
geno = feather.read_feather("../../../cagi6-prs/features/bca_mixed_no_deviant/bca.age.matched.feather")
geno.head()

Unnamed: 0,index,1000198,1000745,1000752,1000889,1000933,1001179,1001373,1001539,1001624,...,BCAC-99980607,BCAC-99983670,BCAC-99983874,BCAC-99989217,BCAC-99995086,BCAC-99995719,BCAC-99995896,BCAC-99996920,BCAC-99997709,BCAC-99999978
0,rs6692780_C,1.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,...,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0
1,rs6701289_A,1.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,...,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0
2,rs6693486_T,1.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,...,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0
3,rs6656223_A,1.0,1.0,2.0,2.0,2.0,1.0,2.0,2.0,1.0,...,2.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,2.0
4,rs4394703_A,2.0,2.0,2.0,2.0,1.0,1.0,2.0,1.0,1.0,...,2.0,2.0,1.0,2.0,2.0,1.0,2.0,1.0,2.0,2.0


In [6]:
snps = ["_".join(var.split("_")[:-1]) for var in geno["index"]]
alleles = [var.split(".")[0][-1] for var in geno["index"]]
allele_mp = dict(zip(snps, alleles))
snps[:5], alleles[:5]

(['rs6692780', 'rs6701289', 'rs6693486', 'rs6656223', 'rs4394703'],
 ['C', 'A', 'T', 'A', 'A'])

In [7]:
snp_map = pd.read_csv("../../../cagi6-prs/snps/ukb.extracted.intersected.mgb.snps.tsv", sep="\t")
subset_snp_map = snp_map[snp_map["UKB_ID"].isin(snps)][["UKB_ID", "MGB_ID"]]
subset_snp_map

Unnamed: 0,UKB_ID,MGB_ID
233,rs6692780,1:5014583_T_C
234,rs6701289,1:5014728_G_A
235,rs6693486,1:5015467_A_T
236,rs6656223,1:5015844_G_A
255,rs4394703,1:5231892_C_A
...,...,...
1617655,rs532436,9:136149830_G_A
1617656,rs600038,9:136151806_T_C
1617663,rs11244061,9:136153981_C_T
1617665,rs649129,9:136154304_C_T


In [8]:
stats = pd.read_csv("../../../cagi6-prs/features/bca_mixed_no_deviant/bca.age.matched.train.stats.tsv", sep="\t")
subset_stats = stats[stats["0"].isin(snps)]
subset_stats

Unnamed: 0,0,mean,std
0,rs6692780,1.327469,0.669289
1,rs6701289,1.325825,0.669558
2,rs6693486,1.325820,0.669546
3,rs6656223,1.321889,0.670334
4,rs4394703,1.493254,0.616890
...,...,...,...
24436,rs532436,1.608489,0.561703
24437,rs600038,1.565365,0.583841
24438,rs11244061,1.760899,0.458790
24439,rs649129,1.565618,0.583823


In [9]:
summary_df = pd.merge(subset_snp_map, subset_stats, left_on="UKB_ID", right_on="0")
summary_df

Unnamed: 0,UKB_ID,MGB_ID,0,mean,std
0,rs6692780,1:5014583_T_C,rs6692780,1.327469,0.669289
1,rs6701289,1:5014728_G_A,rs6701289,1.325825,0.669558
2,rs6693486,1:5015467_A_T,rs6693486,1.325820,0.669546
3,rs6656223,1:5015844_G_A,rs6656223,1.321889,0.670334
4,rs4394703,1:5231892_C_A,rs4394703,1.493254,0.616890
...,...,...,...,...,...
24436,rs532436,9:136149830_G_A,rs532436,1.608489,0.561703
24437,rs600038,9:136151806_T_C,rs600038,1.565365,0.583841
24438,rs11244061,9:136153981_C_T,rs11244061,1.760899,0.458790
24439,rs649129,9:136154304_C_T,rs649129,1.565618,0.583823


In [10]:
summary_df = summary_df.set_index("UKB_ID").loc[pd.Index(snps).drop_duplicates()].reset_index()
summary_df

Unnamed: 0,index,MGB_ID,0,mean,std
0,rs6692780,1:5014583_T_C,rs6692780,1.327469,0.669289
1,rs6701289,1:5014728_G_A,rs6701289,1.325825,0.669558
2,rs6693486,1:5015467_A_T,rs6693486,1.325820,0.669546
3,rs6656223,1:5015844_G_A,rs6656223,1.321889,0.670334
4,rs4394703,1:5231892_C_A,rs4394703,1.493254,0.616890
...,...,...,...,...,...
24436,rs532436,9:136149830_G_A,rs532436,1.608489,0.561703
24437,rs600038,9:136151806_T_C,rs600038,1.565365,0.583841
24438,rs11244061,9:136153981_C_T,rs11244061,1.760899,0.458790
24439,rs649129,9:136154304_C_T,rs649129,1.565618,0.583823


In [11]:
summary_df["expected_allele"] = summary_df["index"].map(allele_mp)

In [12]:
final_df = summary_df[["index", "MGB_ID", "expected_allele", "mean", "std"]]

In [13]:
final_df.columns = ["UKB_ID", "MGB_ID", "EXPECTED_ALLELE", "MEAN", "STD"]

**Sanity checks**

In [17]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [19]:
actual_feather = feather.read_feather("MGB_10.bca.zscored.feather")

In [24]:
(final_df["MGB_ID"] == actual_feather["0"]).all()

True

In [29]:
(final_df["UKB_ID"] == stats["0"]).all()

True

In [31]:
(summary_df["index"] == stats["0"]).all()

True

**Save**

In [404]:
final_df.to_csv("bca.summary.tsv", sep="\t", index=False)

In [406]:
final_df["MGB_ID"].to_csv("bca.extract.txt", header=None, index=False)

#### CAD

In [33]:
geno = feather.read_feather("../../../cagi6-prs/features/baseline_snps_no_deviant-baseline_plus_samples/cad_age_matched/cad.age.matched.feather")
geno

Unnamed: 0,index,2437417,2283331,5568615,4626449,1610609,3617323,2209865,1919537,2067331,...,3420212,1193929,4872319,1838942,3294657,4556083,4620563,4056167,4531036,4174059
0,rs1692584_T,0.0000,0.0196,0.00000,1.0000,1.0,0.0,1.0,1.0,0.0,...,0.0,2.0000,1.0000,1.0,1.0,0.0,1.00000,2.0000,2.0,1.00000
1,rs1692585_T,0.2078,0.0000,0.26666,1.4745,1.0,0.0,1.0,1.0,0.0,...,0.0,2.0000,1.0000,1.0,1.0,0.0,0.95294,2.0000,2.0,1.08234
2,rs76282923_T,2.0000,2.0000,2.00000,1.0000,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0000,2.0000,2.0,2.0,2.0,2.00000,2.0000,1.0,2.00000
3,rs142707962_T,2.0000,2.0000,2.00000,1.0000,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0000,2.0000,2.0,2.0,2.0,2.00000,2.0000,1.0,2.00000
4,rs74590598_G,2.0000,2.0000,2.00000,1.0000,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0000,2.0000,2.0,2.0,2.0,2.00000,2.0000,1.0,2.00000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7464,rs559723_A,0.0000,2.0000,0.00000,1.0000,0.0,0.0,1.0,1.0,0.0,...,0.0,0.0000,0.0000,0.0,0.0,0.0,0.00000,1.0000,1.0,1.00000
7465,rs600038_T,1.0000,2.0000,2.00000,2.0000,0.0,2.0,2.0,2.0,2.0,...,1.0,1.0000,0.0000,1.0,2.0,2.0,1.00000,1.0000,1.0,2.00000
7466,rs11244061_C,1.0000,2.0000,2.00000,2.0000,1.0,2.0,2.0,2.0,2.0,...,1.0,2.0000,1.0000,2.0,2.0,2.0,2.00000,1.0039,1.0,2.00000
7467,rs649129_C,1.0000,2.0000,2.00000,2.0000,0.0,2.0,2.0,2.0,2.0,...,1.0,1.0039,0.0079,1.0,2.0,2.0,1.00000,1.0000,1.0,2.00000


In [34]:
geno[geno["index"].str.contains("rs12142712")] 

Unnamed: 0,index,2437417,2283331,5568615,4626449,1610609,3617323,2209865,1919537,2067331,...,3420212,1193929,4872319,1838942,3294657,4556083,4620563,4056167,4531036,4174059
219,rs12142712_A,2.0,2.0,2.0,2.0,2.0,1.0,2.0,2.0,2.0,...,2.0,2.0,2.0,0.0,1.0,2.0,2.0,2.0,2.0,2.0
220,rs12142712_A.1,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,...,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0,2.0


In [35]:
snp_map[snp_map["UKB_ID"].str.contains("rs12142712")]

Unnamed: 0,CHROM,POS,VARIANT,ALT_VARIANT,UKB_ID,UKB_REF,UKB_ALT,MGB_ID,MGB_A1,MGB_A2
436805,1,197572672,1:197572672:A:G,1:197572672:G:A,rs12142712,A,G,1:197572672_A_G,G,A


In [36]:
snps = ["_".join(var.split("_")[:-1]) for var in geno["index"]]
alleles = [var.split(".")[0][-1] for var in geno["index"]]
allele_mp = dict(zip(snps, alleles))
snps[:5], alleles[:5]

(['rs1692584', 'rs1692585', 'rs76282923', 'rs142707962', 'rs74590598'],
 ['T', 'T', 'T', 'T', 'G'])

In [37]:
snp_map = pd.read_csv("../../../cagi6-prs/snps/ukb.extracted.intersected.mgb.snps.tsv", sep="\t")
subset_snp_map = snp_map[snp_map["UKB_ID"].isin(snps)][["UKB_ID", "MGB_ID"]]
subset_snp_map

Unnamed: 0,UKB_ID,MGB_ID
4595,rs2184104,1:56912222_T_C
4596,rs6693179,1:56914602_G_A
4597,rs6684929,1:56914738_T_C
4598,rs6421496,1:56915593_C_T
4599,rs1889145,1:56915820_G_A
...,...,...
1617655,rs532436,9:136149830_G_A
1617656,rs600038,9:136151806_T_C
1617663,rs11244061,9:136153981_C_T
1617665,rs649129,9:136154304_C_T


In [38]:
stats = pd.read_csv("../../../cagi6-prs/features/baseline_snps_no_deviant-baseline_plus_samples/cad_age_matched/cad.age.matched.train.stats.tsv", sep="\t")
subset_stats = stats[stats["0"].isin(snps)]
subset_stats

Unnamed: 0,0,mean,std
0,rs1692584,1.234933,0.677531
1,rs1692585,1.247089,0.670771
2,rs76282923,1.901087,0.306297
3,rs142707962,1.901117,0.306254
4,rs74590598,1.901464,0.305765
...,...,...,...
7464,rs559723,0.974722,0.703103
7465,rs600038,1.573194,0.577394
7466,rs11244061,1.769694,0.451291
7467,rs649129,1.573252,0.577947


In [39]:
summary_df = pd.merge(subset_snp_map, subset_stats, left_on="UKB_ID", right_on="0")
summary_df

Unnamed: 0,UKB_ID,MGB_ID,0,mean,std
0,rs2184104,1:56912222_T_C,rs2184104,0.443815,0.587821
1,rs6693179,1:56914602_G_A,rs6693179,0.443409,0.587740
2,rs6684929,1:56914738_T_C,rs6684929,0.443407,0.587745
3,rs6421496,1:56915593_C_T,rs6421496,0.215563,0.433631
4,rs1889145,1:56915820_G_A,rs1889145,0.443433,0.587718
...,...,...,...,...,...
7464,rs532436,9:136149830_G_A,rs532436,1.617514,0.554517
7465,rs600038,9:136151806_T_C,rs600038,1.573194,0.577394
7466,rs11244061,9:136153981_C_T,rs11244061,1.769694,0.451291
7467,rs649129,9:136154304_C_T,rs649129,1.573252,0.577947


In [40]:
summary_df = summary_df.set_index("UKB_ID").loc[pd.Index(snps).drop_duplicates()].reset_index()
summary_df

Unnamed: 0,UKB_ID,MGB_ID,0,mean,std
0,rs1692584,1:2172917_T_C,rs1692584,1.234933,0.677531
1,rs1692585,1:2172919_T_C,rs1692585,1.247089,0.670771
2,rs76282923,1:14612626_T_C,rs76282923,1.901087,0.306297
3,rs142707962,1:14615321_T_C,rs142707962,1.901117,0.306254
4,rs74590598,1:14620817_G_C,rs74590598,1.901464,0.305765
...,...,...,...,...,...
7464,rs559723,9:136150484_A_G,rs559723,0.974722,0.703103
7465,rs600038,9:136151806_T_C,rs600038,1.573194,0.577394
7466,rs11244061,9:136153981_C_T,rs11244061,1.769694,0.451291
7467,rs649129,9:136154304_C_T,rs649129,1.573252,0.577947


In [41]:
summary_df["expected_allele"] = summary_df["UKB_ID"].map(allele_mp)

In [42]:
final_df = summary_df[["UKB_ID", "MGB_ID", "expected_allele", "mean", "std"]]

In [43]:
final_df.columns = ["UKB_ID", "MGB_ID", "EXPECTED_ALLELE", "MEAN", "STD"]

**Sanity checks**

In [44]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [46]:
actual_feather = feather.read_feather("MGB_10.cad.zscored.feather")

In [47]:
(final_df["MGB_ID"] == actual_feather["0"]).all()

True

In [48]:
(final_df["UKB_ID"] == stats["0"]).all()

True

In [50]:
(summary_df["UKB_ID"] == stats["0"]).all()

True

**Save**

In [419]:
final_df.to_csv("cad.summary.tsv", sep="\t", index=False)

In [420]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [421]:
final_df["MGB_ID"].to_csv("cad.extract.txt", header=None, index=False)

#### IBD

In [51]:
geno = feather.read_feather("../../../cagi6-prs/features/baseline_snps_no_deviant-baseline_plus_samples/ibd_age_matched/ibd.age.matched.feather")

In [52]:
snps = ["_".join(var.split("_")[:-1]) for var in geno["index"]]
alleles = [var.split(".")[0][-1] for var in geno["index"]]
allele_mp = dict(zip(snps, alleles))

In [54]:
snp_map = pd.read_csv("../../../cagi6-prs/snps/ukb.extracted.intersected.mgb.snps.tsv", sep="\t")
subset_snp_map = snp_map[snp_map["UKB_ID"].isin(snps)][["UKB_ID", "MGB_ID"]]
subset_snp_map

Unnamed: 0,UKB_ID,MGB_ID
1,rs2748983,1:1860087_C_T
3,rs3121830,1:1864526_C_T
4,rs2803316,1:1865298_A_G
5,rs3121831,1:1866004_G_A
7,rs2803322,1:1868066_C_G
...,...,...
1618412,rs1130640,9:139317979_C_T
1618413,rs10747043,9:139318310_G_A
1618414,rs10870171,9:139318530_C_T
1618415,rs10747044,9:139319129_G_T


In [55]:
stats = pd.read_csv("../../../cagi6-prs/features/baseline_snps_no_deviant-baseline_plus_samples/ibd_age_matched/ibd.age.matched.train.stats.tsv", sep="\t")
subset_stats = stats[stats["0"].isin(snps)]
subset_stats

Unnamed: 0,0,mean,std
0,rs4648827,1.465251,0.621988
1,rs6603809,1.464785,0.624073
2,rs742359,1.457913,0.625642
3,rs4648727,1.122482,0.697143
4,rs6681938,1.389553,0.645701
...,...,...,...
13170,rs1130640,1.483059,0.623002
13171,rs10747043,1.425462,0.642939
13172,rs10870171,1.425471,0.642822
13173,rs10747044,1.425117,0.642796


In [56]:
summary_df = pd.merge(subset_snp_map, subset_stats, left_on="UKB_ID", right_on="0").drop_duplicates("UKB_ID")

In [57]:
summary_df = summary_df.set_index("UKB_ID").loc[pd.Index(snps)].reset_index()

In [58]:
summary_df["expected_allele"] = summary_df["index"].map(allele_mp)

In [59]:
final_df = summary_df[["index", "MGB_ID", "expected_allele", "mean", "std"]]

In [60]:
final_df.columns = ["UKB_ID", "MGB_ID", "EXPECTED_ALLELE", "MEAN", "STD"]

**Sanity checks**

In [61]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [64]:
actual_feather = feather.read_feather("MGB_10.ibd.zscored.feather")

In [65]:
(final_df["MGB_ID"] == actual_feather["0"]).all()

True

In [66]:
(final_df["UKB_ID"] == stats["0"]).all()

True

In [67]:
(summary_df["index"] == stats["0"]).all()

True

**Save**

In [659]:
final_df.to_csv("ibd.summary.tsv", sep="\t", index=False)

In [660]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [661]:
final_df["MGB_ID"].to_csv("ibd.extract.txt", header=None, index=False)

In [662]:
geno

Unnamed: 0,index,3581610,2354527,4202089,2760159,3255707,2874909,5666210,5311838,2440028,...,4240432,2186161,4672889,3592212,5654809,2499270,1286390,4134497,3794449,1472707
0,rs4648827_C,1.0,2.0,2.0000,2.0,1.0,1.0,1.0039,2.0000,2.0,...,1.0000,2.0,1.0,2.0000,2.0,0.0,1.0000,1.0157,2.0,1.0
1,rs6603809_G,1.0,2.0,2.0000,2.0,1.0,1.0,1.0000,2.0000,2.0,...,1.0000,2.0,1.0,2.0000,2.0,0.0,1.0000,1.0000,2.0,1.0
2,rs742359_C,1.0,2.0,2.0000,2.0,1.0,1.0,1.0000,2.0000,2.0,...,1.0000,2.0,1.0,1.9922,2.0,0.0,1.0000,1.0000,2.0,1.0
3,rs4648727_C,1.0,2.0,2.0000,2.0,1.0,1.0,1.0000,1.0000,2.0,...,1.0000,2.0,1.0,2.0000,2.0,0.0,1.0000,1.0000,1.0,0.0
4,rs6681938_T,2.0,2.0,1.0039,1.0,1.0,1.0,1.0000,1.0000,1.0,...,1.0039,1.0,2.0,1.0941,1.0,2.0,1.0196,1.0039,1.0,2.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13170,rs1130640_C,2.0,2.0,1.0000,2.0,2.0,2.0,1.0000,2.0000,2.0,...,1.0000,2.0,1.0,1.0000,2.0,1.0,2.0000,2.0000,1.0,2.0
13171,rs10747043_G,2.0,2.0,1.0000,2.0,2.0,2.0,1.0000,2.0000,2.0,...,1.0000,2.0,0.0,1.0000,2.0,1.0,2.0000,1.0000,1.0,1.0
13172,rs10870171_C,2.0,2.0,1.0000,2.0,2.0,2.0,1.0000,2.0000,2.0,...,1.0000,2.0,0.0,1.0000,2.0,1.0,2.0000,1.0000,1.0,1.0
13173,rs10747044_G,2.0,2.0,1.0000,2.0,2.0,2.0,1.0000,1.9961,2.0,...,1.0000,2.0,0.0,1.0000,2.0,1.0,2.0000,1.0000,1.0,1.0


In [668]:
(final_df["UKB_ID"] == snps).all()

True

#### T2D

In [68]:
geno = feather.read_feather("../../../cagi6-prs/features/baseline_snps_no_deviant-baseline_plus_samples/t2d_age_matched/t2d.age.matched.feather")
geno

Unnamed: 0,index,2390912,4497935,2439435,3156276,5595263,2365645,1541814,1940466,4445773,...,5128738,2749635,2465958,2668039,2510487,1810779,4104408,2874016,5798474,1990762
0,rs2032562_T,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.0000,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
1,rs10864313_T,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.0000,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
2,rs4908693_C,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.6588,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
3,rs11121016_A,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.0000,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
4,rs11121018_T,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.0000,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6281,rs8176645_T,2.0,2.0,2.0,1.0,1.0039,0.0,2.0,1.1961,2.0000,...,2.0,2.0,2.0,1.0,1.3333,1.3176,2.0,1.0,2.0,1.0
6282,rs529565_T,2.0,2.0,2.0,1.0,2.0000,0.0,2.0,1.0000,2.0000,...,2.0,2.0,2.0,2.0,1.0000,1.0000,2.0,2.0,2.0,1.0
6283,rs532436_G,2.0,2.0,2.0,1.0,2.0000,0.0,2.0,2.0000,2.0000,...,2.0,2.0,2.0,2.0,1.0000,2.0000,2.0,2.0,2.0,1.0
6284,rs600038_T,2.0,1.0,2.0,1.0,1.0000,0.0,2.0,2.0000,1.0000,...,2.0,2.0,2.0,1.0,1.0000,2.0000,2.0,1.0,2.0,1.0


In [69]:
snps = ["_".join(var.split("_")[:-1]) for var in geno["index"]]
alleles = [var.split(".")[0][-1] for var in geno["index"]]
allele_mp = dict(zip(snps, alleles))

In [70]:
snp_map = pd.read_csv("../../../cagi6-prs/snps/ukb.extracted.intersected.mgb.snps.tsv", sep="\t")
subset_snp_map = snp_map[snp_map["UKB_ID"].isin(snps)][["UKB_ID", "MGB_ID"]]
subset_snp_map

Unnamed: 0,UKB_ID,MGB_ID
513,rs2032562,1:7801167_T_G
514,rs10864313,1:7801880_T_G
515,rs4908693,1:7803892_C_T
516,rs11121016,1:7804515_A_G
517,rs11121018,1:7807394_T_C
...,...,...
1617651,rs8176645,9:136149098_T_A
1617653,rs529565,9:136149500_T_C
1617655,rs532436,9:136149830_G_A
1617656,rs600038,9:136151806_T_C


In [71]:
stats = pd.read_csv("../../../cagi6-prs/features/baseline_snps_no_deviant-baseline_plus_samples/t2d_age_matched/t2d.age.matched.train.stats.tsv", sep="\t")
subset_stats = stats[stats["0"].isin(snps)]
subset_stats

Unnamed: 0,0,mean,std
0,rs2032562,0.559089,0.632914
1,rs10864313,0.559158,0.632983
2,rs4908693,0.559035,0.632656
3,rs11121016,0.558671,0.633194
4,rs11121018,0.558363,0.633092
...,...,...,...
6281,rs8176645,1.477631,0.570210
6282,rs529565,1.356116,0.659504
6283,rs532436,1.619826,0.554665
6284,rs600038,1.575347,0.577725


In [72]:
summary_df = pd.merge(subset_snp_map, subset_stats, left_on="UKB_ID", right_on="0").drop_duplicates("UKB_ID")

In [73]:
summary_df = summary_df.set_index("UKB_ID").loc[pd.Index(snps)].reset_index()

In [74]:
summary_df["expected_allele"] = summary_df["index"].map(allele_mp)

In [75]:
final_df = summary_df[["index", "MGB_ID", "expected_allele", "mean", "std"]]

In [76]:
final_df.columns = ["UKB_ID", "MGB_ID", "EXPECTED_ALLELE", "MEAN", "STD"]

**Sanity checks**

In [77]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [78]:
actual_feather = feather.read_feather("MGB_10.t2d.zscored.feather")

In [79]:
(final_df["MGB_ID"] == actual_feather["0"]).all()

True

In [80]:
(final_df["UKB_ID"] == stats["0"]).all()

True

In [81]:
(summary_df["index"] == stats["0"]).all()

True

In [82]:
(final_df["UKB_ID"] == snps).all()

True

**Save**

In [532]:
final_df.to_csv("t2d.summary.tsv", sep="\t", index=False)

In [533]:
final_df.shape

(6286, 5)

In [534]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [535]:
final_df["MGB_ID"].to_csv("t2d.extract.txt", header=None, index=False)

### Step 2b - Make an age summary file

In [83]:
age_df = pd.DataFrame(index=["BCA", "CAD", "IBD", "T2D"], columns=["MEAN", "STD"])

In [84]:
# bca
bca_tsv = pd.read_csv("/cellar/shared/carterlab/projects/InSNPtion/cagi6-prs/features/bca_mixed_no_deviant/bca.age.matched.zage.tsv", sep="\t")
bca_age = bca_tsv[bca_tsv["TRAIN"] == 1]["AGE"]
bca_age_mean, bca_age_std = bca_age.mean(), bca_age.std()
age_df.loc["BCA"] = [bca_age_mean, bca_age_std]

In [85]:
# cad
cad_tsv = pd.read_csv("/cellar/shared/carterlab/projects/InSNPtion/cagi6-prs/features/baseline_snps_no_deviant-baseline_plus_samples/cad_age_matched/cad.age.matched.zage.tsv", sep="\t")
cad_age = cad_tsv[cad_tsv["TRAIN"] == 1]["AGE"]
cad_age_mean, cad_age_std = cad_age.mean(), cad_age.std()
age_df.loc["CAD"] = [cad_age_mean, cad_age_std]

In [86]:
# ibd
ibd_tsv = pd.read_csv("/cellar/shared/carterlab/projects/InSNPtion/cagi6-prs/features/baseline_snps_no_deviant-baseline_plus_samples/ibd_age_matched/ibd.age.matched.zage.tsv", sep="\t")
ibd_age = ibd_tsv[ibd_tsv["TRAIN"] == 1]["AGE"]
ibd_age_mean, ibd_age_std = ibd_age.mean(), ibd_age.std()
age_df.loc["IBD"] = [ibd_age_mean, ibd_age_std]

In [87]:
# t2d
t2d_tsv = pd.read_csv("/cellar/shared/carterlab/projects/InSNPtion/cagi6-prs/features/baseline_snps_no_deviant-baseline_plus_samples/t2d_age_matched/t2d.age.matched.zage.tsv", sep="\t")
t2d_age = t2d_tsv[t2d_tsv["TRAIN"] == 1]["AGE"]
t2d_age_mean, t2d_age_std = t2d_age.mean(), t2d_age.std()
age_df.loc["T2D"] = [t2d_age_mean, t2d_age_std]

In [88]:
age_df.head()

Unnamed: 0,MEAN,STD
BCA,60.205014,9.961745
CAD,59.85252,6.374422
IBD,57.272552,7.832338
T2D,60.522807,6.395605


In [89]:
age_df.to_csv("age.summary.tsv", sep="\t", index=True)

## Step 3 - Test scripts for prepping files

### Step 3a - Build a feather file from raw and double check it matches model allele
Basis for `make-feather-for-loader.py` script

In [28]:
raw = pd.read_csv("MGB_10_all/MGB_10_all.raw", delim_whitespace=True)

In [30]:
IIDs = raw["IID"]

In [31]:
raw = raw.iloc[:, 6:].T
raw.columns = IIDs
raw.index.name = "0"

In [33]:
summary = pd.read_csv("MGB_10_all/ibd.age.old.summary.tsv", sep="\t")

In [37]:
snps = ["_".join(var.split("_")[:-1]) for var in raw.index]
alleles = [var[-1] for var in raw.index]
#allele_mp = dict(zip(snps, alleles))

In [46]:
raw_summary = pd.DataFrame({"INDEX":raw.index, "MGB_ID":snps, "ACTUAL_ALLELE":alleles})

In [53]:
merged_summary = pd.merge(raw_summary, summary, on="MGB_ID").set_index("INDEX")

In [58]:
ordered_merged_summary = merged_summary.loc[raw.index]

In [65]:
mismatched_pos = np.where(ordered_merged_summary["ACTUAL_ALLELE"] != ordered_merged_summary["EXPECTED_ALLELE"])[0]

In [70]:
raw.iloc[mismatched_pos, :] = 2 - raw.iloc[mismatched_pos, :]

In [74]:
zraw = raw.subtract(ordered_merged_summary["MEAN"].values, axis="index")
zraw = zraw.div(ordered_merged_summary["STD"].values, axis="index")

In [76]:
feather.read_feather("MGB_10_all/MGB_10_all.zscored.feather")

Unnamed: 0,0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,MGB00010
0,1:1776269_C_A_C,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405
1,1:1781220_T_C_T,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103
2,1:1796616_G_A_G,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522
3,1:1860087_C_T_T,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641
4,1:1864526_C_T_T,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202
...,...,...,...,...,...,...,...,...,...,...,...
5719,22:39774525_A_G_G,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654
5720,22:43561675_T_G_G,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974
5721,22:43561982_C_T_T,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861
5722,22:43562306_A_G_G,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088


#### 100x1000

In [151]:
feather.write_feather(raw.reset_index(), "MGB_100_1000.feather")

In [152]:
feather.read_feather("MGB_100_1000.feather")

Unnamed: 0,0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,...,MGB00091,MGB00092,MGB00093,MGB00094,MGB00095,MGB00096,MGB00097,MGB00098,MGB00099,MGB00100
0,rs3131972_A,1,0,0,2,1,2,1,2,0,...,1,1,1,2,1,0,1,1,0,1
1,1:840327_G_A_A,1,1,0,1,0,0,2,2,1,...,2,1,0,0,1,2,0,2,0,2
2,rs4970382_T,2,1,2,1,0,1,2,1,0,...,1,1,1,2,0,1,1,1,1,1
3,1:846808_C_T_C,1,0,1,0,1,0,0,1,2,...,0,1,0,1,1,1,0,1,0,1
4,Affx-15447216_C,0,1,0,2,2,1,1,0,1,...,1,1,0,2,2,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1:4496256_C_T_T,2,2,0,1,1,1,0,1,1,...,0,2,1,2,1,1,0,1,1,0
996,1:4496659_C_T_T,0,1,0,2,0,2,0,1,1,...,2,0,0,1,1,0,1,1,1,0
997,1:4497097_G_A_A,1,1,1,1,1,1,0,2,2,...,0,1,2,0,2,0,0,1,1,2
998,1:4497118_C_T_C,1,0,2,0,1,1,1,0,1,...,2,1,1,2,0,0,2,1,1,1


In [193]:
stats = pd.DataFrame(data={"0": raw.index[:1000], "mean": np.random.normal(size=1000), "std": np.random.normal(size=1000)})

In [194]:
stats.to_csv("MGB_100_1000.stats.tsv", sep="\t", index=False)

In [198]:
raw = feather.read_feather("MGB_100_1000.feather").set_index("0")

In [226]:
zraw = raw.subtract(stats["mean"].values, axis="index")
zraw = zraw.div(stats["std"].values, axis="index")

In [236]:
feather.write_feather(zraw.reset_index(), "MGB_100_1000.zscored.feather")

In [238]:
zraw

Unnamed: 0_level_0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,MGB00010,...,MGB00091,MGB00092,MGB00093,MGB00094,MGB00095,MGB00096,MGB00097,MGB00098,MGB00099,MGB00100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rs3131972_A,-0.435274,0.145781,0.145781,-1.016330,-0.435274,-1.016330,-0.435274,-1.016330,0.145781,-1.016330,...,-0.435274,-0.435274,-0.435274,-1.016330,-0.435274,0.145781,-0.435274,-0.435274,0.145781,-0.435274
1:840327_G_A_A,3.971477,3.971477,0.743766,3.971477,0.743766,0.743766,7.199187,7.199187,3.971477,7.199187,...,7.199187,3.971477,0.743766,0.743766,3.971477,7.199187,0.743766,7.199187,0.743766,7.199187
rs4970382_T,-3.526750,-2.746769,-3.526750,-2.746769,-1.966788,-2.746769,-3.526750,-2.746769,-1.966788,-3.526750,...,-2.746769,-2.746769,-2.746769,-3.526750,-1.966788,-2.746769,-2.746769,-2.746769,-2.746769,-2.746769
1:846808_C_T_C,2.722384,1.299216,2.722384,1.299216,2.722384,1.299216,1.299216,2.722384,4.145552,2.722384,...,1.299216,2.722384,1.299216,2.722384,2.722384,2.722384,1.299216,2.722384,1.299216,2.722384
Affx-15447216_C,5.412217,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,5.412217,1.196706,1.196706,...,1.196706,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,1.196706,5.412217,5.412217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1:4496256_C_T_T,-1.126842,-1.126842,0.850135,-0.138353,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,-0.138353,...,0.850135,-1.126842,-0.138353,-1.126842,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,0.850135
1:4496659_C_T_T,-2.201933,-4.277609,-2.201933,-6.353284,-2.201933,-6.353284,-2.201933,-4.277609,-4.277609,-2.201933,...,-6.353284,-2.201933,-2.201933,-4.277609,-4.277609,-2.201933,-4.277609,-4.277609,-4.277609,-2.201933
1:4497097_G_A_A,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,0.356939,-0.948368,-0.948368,0.356939,...,0.356939,-0.295714,-0.948368,0.356939,-0.948368,0.356939,0.356939,-0.295714,-0.295714,-0.948368
1:4497118_C_T_C,-11.307576,-20.985076,-1.630075,-20.985076,-11.307576,-11.307576,-11.307576,-20.985076,-11.307576,-20.985076,...,-1.630075,-11.307576,-11.307576,-1.630075,-20.985076,-20.985076,-1.630075,-11.307576,-11.307576,-11.307576


In [237]:
feather.read_feather("../../test.zscored.feather")

Unnamed: 0,0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,...,MGB00091,MGB00092,MGB00093,MGB00094,MGB00095,MGB00096,MGB00097,MGB00098,MGB00099,MGB00100
0,rs3131972_A,-0.435274,0.145781,0.145781,-1.016330,-0.435274,-1.016330,-0.435274,-1.016330,0.145781,...,-0.435274,-0.435274,-0.435274,-1.016330,-0.435274,0.145781,-0.435274,-0.435274,0.145781,-0.435274
1,1:840327_G_A_A,3.971477,3.971477,0.743766,3.971477,0.743766,0.743766,7.199187,7.199187,3.971477,...,7.199187,3.971477,0.743766,0.743766,3.971477,7.199187,0.743766,7.199187,0.743766,7.199187
2,rs4970382_T,-3.526750,-2.746769,-3.526750,-2.746769,-1.966788,-2.746769,-3.526750,-2.746769,-1.966788,...,-2.746769,-2.746769,-2.746769,-3.526750,-1.966788,-2.746769,-2.746769,-2.746769,-2.746769,-2.746769
3,1:846808_C_T_C,2.722384,1.299216,2.722384,1.299216,2.722384,1.299216,1.299216,2.722384,4.145552,...,1.299216,2.722384,1.299216,2.722384,2.722384,2.722384,1.299216,2.722384,1.299216,2.722384
4,Affx-15447216_C,5.412217,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,5.412217,1.196706,...,1.196706,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,1.196706,5.412217,5.412217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1:4496256_C_T_T,-1.126842,-1.126842,0.850135,-0.138353,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,...,0.850135,-1.126842,-0.138353,-1.126842,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,0.850135
996,1:4496659_C_T_T,-2.201933,-4.277609,-2.201933,-6.353284,-2.201933,-6.353284,-2.201933,-4.277609,-4.277609,...,-6.353284,-2.201933,-2.201933,-4.277609,-4.277609,-2.201933,-4.277609,-4.277609,-4.277609,-2.201933
997,1:4497097_G_A_A,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,0.356939,-0.948368,-0.948368,...,0.356939,-0.295714,-0.948368,0.356939,-0.948368,0.356939,0.356939,-0.295714,-0.295714,-0.948368
998,1:4497118_C_T_C,-11.307576,-20.985076,-1.630075,-20.985076,-11.307576,-11.307576,-11.307576,-20.985076,-11.307576,...,-1.630075,-11.307576,-11.307576,-1.630075,-20.985076,-20.985076,-1.630075,-11.307576,-11.307576,-11.307576


### Step 3b - Phenotype files for dataloading
Build a tsv and an ids file for dataloading. Basis for `make-pheno-for-loader.py` script

In [130]:
tsv = fam.iloc[:100, :].copy()

In [131]:
tsv.columns = ["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"]

In [132]:
tsv["AGE"] = -9

In [133]:
tsv["FH"] = -9

In [134]:
tsv["ETH"] = "EUR"

In [161]:
tsv.to_csv("MGB_100_1000.tsv", sep="\t")

In [138]:
ids = tsv["IID"].values

In [139]:
np.savetxt("MGB_100_1000.ids.txt", ids, fmt="%s")

### Step 3c - Test dataloading

In [71]:
import sys

In [72]:
sys.path.append("/cellar/users/aklie/projects/InSNPtion/cagi6-prs-docker/ThisIsTheGoodPartPecker_ThisIsWhereTheJobGetsFun_AskAndYouShallReceive/")

In [73]:
import SNPAndClinicalLoader

In [85]:
feath = feather.read_feather("MGB_10_all/MGB_10_all.bca.zscored.feather")
feath = feath.set_index("0")

In [86]:
loader = SNPAndClinicalLoader.get_loader(
    ids_file="MGB_10_all/MGB_10_all.ids.txt",
    genotype_file=feath,
    phenotype_file="MGB_10_all/MGB_10_all.bca.tsv",
    disease_column="PHENOTYPE",
    batch_size=10,
    shuffle=True,
    num_workers=2,
)

In [90]:
for batch_num, (snp, pheno, eth, fh, sex, age) in enumerate(loader):
    print(batch_num)

0


In [92]:
snp.size(), pheno.size(), eth.size(), fh.size(), sex.size(), age.size()

(torch.Size([10, 12034]),
 torch.Size([10, 1]),
 torch.Size([10, 1]),
 torch.Size([10, 1]),
 torch.Size([10, 1]),
 torch.Size([10, 1]))

In [97]:
snp[0], pheno[0], eth[0], fh[0], sex[0], age[0]

(tensor([ 0.4628,  0.4584,  0.4585,  ..., -1.2344, -1.2725, -1.8072]),
 tensor([0]),
 tensor([0.]),
 tensor([-9]),
 tensor([2]),
 tensor([1.2881], dtype=torch.float64))

In [104]:
np.unique(snp[:, 1000], return_counts=True)

(array([0.9720436], dtype=float32), array([10]))

# Scratch
Place for old or testing code

## Building other test sets

### 100 x 1000

#### PED

In [96]:
# Run this to generate a 100 individual by 1000 SNP simulated test set
ped = []
for i, (row, ind) in tqdm.tqdm(enumerate(fam.iloc[:100, :].iterrows())):
    rand_genotype = np.concatenate(
        np.stack(bim.iloc[:1000, :][[4, 5]].apply(np.random.choice, size=2, axis=1))
    )
    ped.append((list(ind.values) + list(rand_genotype)))
ped = np.array(ped)

100it [00:06, 15.61it/s]


In [97]:
ped[:, 5] = np.random.choice([1, 2], size=100)

In [98]:
np.savetxt("MGB_100_1000/MGB_100_1000.ped", ped, delimiter=" ", fmt="%s")

#### BIM

In [99]:
bim[[0, 1, 2, 3]].iloc[:1000, :].to_csv(
    "MGB_100_1000/MGB_100_1000.map", sep="\t", header=None, index=False
)

### Full set

In [20]:
num = 100
for i in range(int(len(fam)/num)+1):
    start = num*i
    end = num*(i+1)
    if end > len(fam):
        end = len(fam)
    print(start, end)
    np.savetxt("full/MGB_{}_{}.ped".format(start, end), np.concatenate([fam.values[start:end], np.tile(bim[[4, 5]].values.flatten(), (num, 1))], axis=1), fmt="%s")
    if i == 3:
        break

0 100
100 200
200 300
300 400


## Dataloading tests

In [595]:
#Step 1: Load raw and save intermediate data structures
print("Loading raw file from {}...".format("MGB_10.cad.raw"))
raw = pd.read_csv("MGB_10.cad.raw", delim_whitespace=True)
IIDs = raw["IID"]
raw = raw.iloc[:, 6:].T
raw.columns = IIDs
snps = ["_".join(var.split("_")[:-1]) for var in raw.index]
raw.index.name = "0"
alleles = [var[-1] for var in raw.index]
raw.index = snps
raw_summary = pd.DataFrame({"INDEX":raw.index, "MGB_ID":snps, "ACTUAL_ALLELE":alleles})

Loading raw file from MGB_10.cad.raw...


In [597]:
#Step 2: Load summary file for SNPs
print("Loading SNP summary file from {}...".format("cad.summary.tsv"))
summary = pd.read_csv("cad.summary.tsv", sep="\t")

Loading SNP summary file from cad.summary.tsv...


In [598]:
len(snps)

7451

In [599]:
summary

Unnamed: 0,UKB_ID,MGB_ID,EXPECTED_ALLELE,MEAN,STD
0,rs1692584,1:2172917_T_C,T,1.234933,0.677531
1,rs1692585,1:2172919_T_C,T,1.247089,0.670771
2,rs76282923,1:14612626_T_C,T,1.901087,0.306297
3,rs142707962,1:14615321_T_C,T,1.901117,0.306254
4,rs74590598,1:14620817_G_C,G,1.901464,0.305765
...,...,...,...,...,...
7464,rs559723,9:136150484_A_G,A,0.974722,0.703103
7465,rs600038,9:136151806_T_C,T,1.573194,0.577394
7466,rs11244061,9:136153981_C_T,C,1.769694,0.451291
7467,rs649129,9:136154304_C_T,C,1.573252,0.577947


In [600]:
#Step 3: Combine these files
print("Merging raw with SNP summary file...")
merged_summary = pd.merge(summary, raw_summary, on="MGB_ID").set_index("INDEX")

Merging raw with SNP summary file...


In [601]:
merged_summary

Unnamed: 0_level_0,UKB_ID,MGB_ID,EXPECTED_ALLELE,MEAN,STD,ACTUAL_ALLELE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1:2172917_T_C,rs1692584,1:2172917_T_C,T,1.234933,0.677531,T
1:2172919_T_C,rs1692585,1:2172919_T_C,T,1.247089,0.670771,T
1:14612626_T_C,rs76282923,1:14612626_T_C,T,1.901087,0.306297,T
1:14615321_T_C,rs142707962,1:14615321_T_C,T,1.901117,0.306254,T
1:14620817_G_C,rs74590598,1:14620817_G_C,G,1.901464,0.305765,G
...,...,...,...,...,...,...
9:136150484_A_G,rs559723,9:136150484_A_G,A,0.974722,0.703103,G
9:136151806_T_C,rs600038,9:136151806_T_C,T,1.573194,0.577394,T
9:136153981_C_T,rs11244061,9:136153981_C_T,C,1.769694,0.451291,C
9:136154304_C_T,rs649129,9:136154304_C_T,C,1.573252,0.577947,C


In [602]:
#Step 4: Clean and order raw
dups = merged_summary[merged_summary["MGB_ID"].duplicated()]
add_on = pd.DataFrame(index=dups["MGB_ID"], columns=raw.columns, data=np.repeat(dups["MEAN"].values.reshape(len(dups),1), raw.shape[1], axis=1))
raw = pd.concat([raw, add_on])

In [603]:
ordered_raw = raw.loc[merged_summary["MGB_ID"].drop_duplicates()]

In [604]:
(ordered_raw.index == merged_summary.index).all()

True

In [605]:
geno

Unnamed: 0,index,2390912,4497935,2439435,3156276,5595263,2365645,1541814,1940466,4445773,...,5128738,2749635,2465958,2668039,2510487,1810779,4104408,2874016,5798474,1990762
0,rs2032562_T,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.0000,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
1,rs10864313_T,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.0000,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
2,rs4908693_C,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.6588,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
3,rs11121016_A,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.0000,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
4,rs11121018_T,2.0,0.0,0.0,2.0,0.0000,1.0,1.0,0.0000,0.0000,...,1.0,0.0,1.0,1.0,1.0000,0.0000,1.0,1.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6281,rs8176645_T,2.0,2.0,2.0,1.0,1.0039,0.0,2.0,1.1961,2.0000,...,2.0,2.0,2.0,1.0,1.3333,1.3176,2.0,1.0,2.0,1.0
6282,rs529565_T,2.0,2.0,2.0,1.0,2.0000,0.0,2.0,1.0000,2.0000,...,2.0,2.0,2.0,2.0,1.0000,1.0000,2.0,2.0,2.0,1.0
6283,rs532436_G,2.0,2.0,2.0,1.0,2.0000,0.0,2.0,2.0000,2.0000,...,2.0,2.0,2.0,2.0,1.0000,2.0000,2.0,2.0,2.0,1.0
6284,rs600038_T,2.0,1.0,2.0,1.0,1.0000,0.0,2.0,2.0000,1.0000,...,2.0,2.0,2.0,1.0,1.0000,2.0000,2.0,1.0,2.0,1.0


In [606]:
merged_summary

Unnamed: 0_level_0,UKB_ID,MGB_ID,EXPECTED_ALLELE,MEAN,STD,ACTUAL_ALLELE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1:2172917_T_C,rs1692584,1:2172917_T_C,T,1.234933,0.677531,T
1:2172919_T_C,rs1692585,1:2172919_T_C,T,1.247089,0.670771,T
1:14612626_T_C,rs76282923,1:14612626_T_C,T,1.901087,0.306297,T
1:14615321_T_C,rs142707962,1:14615321_T_C,T,1.901117,0.306254,T
1:14620817_G_C,rs74590598,1:14620817_G_C,G,1.901464,0.305765,G
...,...,...,...,...,...,...
9:136150484_A_G,rs559723,9:136150484_A_G,A,0.974722,0.703103,G
9:136151806_T_C,rs600038,9:136151806_T_C,T,1.573194,0.577394,T
9:136153981_C_T,rs11244061,9:136153981_C_T,C,1.769694,0.451291,C
9:136154304_C_T,rs649129,9:136154304_C_T,C,1.573252,0.577947,C


In [607]:
ordered_raw[ordered_raw.index.str.contains("8:106256501_C_A")]

IID,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,MGB00010
8:106256501_C_A,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8:106256501_C_A,1.999862,1.999862,1.999862,1.999862,1.999862,1.999862,1.999862,1.999862,1.999862,1.999862


In [608]:
merged_summary[merged_summary["MGB_ID"].str.contains("8:106256501_C_A")]

Unnamed: 0_level_0,UKB_ID,MGB_ID,EXPECTED_ALLELE,MEAN,STD,ACTUAL_ALLELE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
8:106256501_C_A,rs10110132,8:106256501_C_A,C,1.409267,0.653169,C
8:106256501_C_A,rs10110132,8:106256501_C_A,C,1.999862,0.005697,C


In [609]:
geno[geno["index"].str.contains("rs10110132")]

Unnamed: 0,index,2390912,4497935,2439435,3156276,5595263,2365645,1541814,1940466,4445773,...,5128738,2749635,2465958,2668039,2510487,1810779,4104408,2874016,5798474,1990762


In [610]:
#Step 4: Correcting mismatched alleles, TODO: put number corrected
print("Correcting mismatched alleles...")
mismatched_pos = np.where(merged_summary["ACTUAL_ALLELE"] != merged_summary["EXPECTED_ALLELE"])[0]
ordered_raw.iloc[mismatched_pos, :] = 2 - ordered_raw.iloc[mismatched_pos, :]

#Step5 Z-score
print("Z-scoring genotypes...")
zraw = ordered_raw.subtract(merged_summary["MEAN"].values, axis="index")
zraw = zraw.div(merged_summary["STD"].values, axis="index")


Correcting mismatched alleles...
Z-scoring genotypes...


In [611]:
ordered_raw[""]

IID,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,MGB00010
1:2172917_T_C,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1:2172919_T_C,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1:14612626_T_C,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1:14615321_T_C,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1:14620817_G_C,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
...,...,...,...,...,...,...,...,...,...,...
9:136150484_A_G,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9:136151806_T_C,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9:136153981_C_T,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9:136154304_C_T,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [612]:
merged_summary

Unnamed: 0_level_0,UKB_ID,MGB_ID,EXPECTED_ALLELE,MEAN,STD,ACTUAL_ALLELE
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1:2172917_T_C,rs1692584,1:2172917_T_C,T,1.234933,0.677531,T
1:2172919_T_C,rs1692585,1:2172919_T_C,T,1.247089,0.670771,T
1:14612626_T_C,rs76282923,1:14612626_T_C,T,1.901087,0.306297,T
1:14615321_T_C,rs142707962,1:14615321_T_C,T,1.901117,0.306254,T
1:14620817_G_C,rs74590598,1:14620817_G_C,G,1.901464,0.305765,G
...,...,...,...,...,...,...
9:136150484_A_G,rs559723,9:136150484_A_G,A,0.974722,0.703103,G
9:136151806_T_C,rs600038,9:136151806_T_C,T,1.573194,0.577394,T
9:136153981_C_T,rs11244061,9:136153981_C_T,C,1.769694,0.451291,C
9:136154304_C_T,rs649129,9:136154304_C_T,C,1.573252,0.577947,C


In [614]:
zraw[zraw.index.str.contains("8:106256501_C_A")]

IID,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,MGB00010
8:106256501_C_A,-0.626586,-0.626586,-0.626586,-0.626586,-0.626586,-0.626586,-0.626586,-0.626586,-0.626586,-0.626586
8:106256501_C_A,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [687]:
(zraw["MGB00002"] == zraw["MGB00004"]).all()

True

In [678]:
zraw

IID,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,MGB00010
1:2172917_T_C,-0.346748,-0.346748,-0.346748,-0.346748,-0.346748,-0.346748,-0.346748,-0.346748,-0.346748,-0.346748
1:2172919_T_C,-0.368366,-0.368366,-0.368366,-0.368366,-0.368366,-0.368366,-0.368366,-0.368366,-0.368366,-0.368366
1:14612626_T_C,-2.941869,-2.941869,-2.941869,-2.941869,-2.941869,-2.941869,-2.941869,-2.941869,-2.941869,-2.941869
1:14615321_T_C,-2.942387,-2.942387,-2.942387,-2.942387,-2.942387,-2.942387,-2.942387,-2.942387,-2.942387,-2.942387
1:14620817_G_C,-2.948226,-2.948226,-2.948226,-2.948226,-2.948226,-2.948226,-2.948226,-2.948226,-2.948226,-2.948226
...,...,...,...,...,...,...,...,...,...,...
9:136150484_A_G,0.035952,0.035952,0.035952,0.035952,0.035952,0.035952,0.035952,0.035952,0.035952,0.035952
9:136151806_T_C,-0.992726,-0.992726,-0.992726,-0.992726,-0.992726,-0.992726,-0.992726,-0.992726,-0.992726,-0.992726
9:136153981_C_T,-1.705538,-1.705538,-1.705538,-1.705538,-1.705538,-1.705538,-1.705538,-1.705538,-1.705538,-1.705538
9:136154304_C_T,-0.991877,-0.991877,-0.991877,-0.991877,-0.991877,-0.991877,-0.991877,-0.991877,-0.991877,-0.991877


# References