# Notebook Title

**Authorship:**
Author, *MM/DD/YYYY*
***
**Description:**
Notebook to do some cool stuff
***
**TODOs:**
 - <font color='green'> Done TODO </font>
 - <font color='orange'> WIP TODO </font>
 - <font color='red'> Queued TODO </font>
***

## Set-up

In [178]:
# The classics
import numpy as np
import pandas as pd

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

## Part 1 - Build a dummy bed file from given bim and fam

In [128]:
bim = pd.read_csv("MGB.bim", sep="\t", header=None)

In [129]:
fam = pd.read_csv("MGB.fam", delim_whitespace=True, header=None)

In [93]:
ped = []
for row, ind in fam.iloc[:100, :].iterrows():
    rand_genotype = np.concatenate(
        np.stack(bim.iloc[:1000, :][[4, 5]].apply(np.random.choice, size=2, axis=1))
    )
    ped.append((list(ind.values) + list(rand_genotype)))

In [94]:
np.array(ped).shape

(100, 2006)

In [96]:
np.savetxt("MGB_100_1000.ped", np.array(ped), delimiter=" ", fmt="%s")

In [97]:
bim[[0, 1, 2, 3]].iloc[:1000, :].to_csv(
    "MGB_100_1000.map", sep="\t", header=None, index=False
)

## Part 2 - Build a feather file from raw and double check it matches model allele
If alleles don't match, do the 2- thing

In [144]:
from pyarrow import feather

In [145]:
raw = pd.read_csv("MGB_100_1000.raw", delim_whitespace=True)

In [146]:
IIDs = raw["IID"]

In [147]:
raw = raw.iloc[:, 6:].T
raw.columns = IIDs
raw.index.name = "0"

In [151]:
feather.write_feather(raw.reset_index(), "MGB_100_1000.feather")

In [152]:
feather.read_feather("MGB_100_1000.feather")

Unnamed: 0,0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,...,MGB00091,MGB00092,MGB00093,MGB00094,MGB00095,MGB00096,MGB00097,MGB00098,MGB00099,MGB00100
0,rs3131972_A,1,0,0,2,1,2,1,2,0,...,1,1,1,2,1,0,1,1,0,1
1,1:840327_G_A_A,1,1,0,1,0,0,2,2,1,...,2,1,0,0,1,2,0,2,0,2
2,rs4970382_T,2,1,2,1,0,1,2,1,0,...,1,1,1,2,0,1,1,1,1,1
3,1:846808_C_T_C,1,0,1,0,1,0,0,1,2,...,0,1,0,1,1,1,0,1,0,1
4,Affx-15447216_C,0,1,0,2,2,1,1,0,1,...,1,1,0,2,2,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1:4496256_C_T_T,2,2,0,1,1,1,0,1,1,...,0,2,1,2,1,1,0,1,1,0
996,1:4496659_C_T_T,0,1,0,2,0,2,0,1,1,...,2,0,0,1,1,0,1,1,1,0
997,1:4497097_G_A_A,1,1,1,1,1,1,0,2,2,...,0,1,2,0,2,0,0,1,1,2
998,1:4497118_C_T_C,1,0,2,0,1,1,1,0,1,...,2,1,1,2,0,0,2,1,1,1


## Part 3 - Z-score based on training metrics

In [193]:
stats = pd.DataFrame(data={"0": raw.index[:1000], "mean": np.random.normal(size=1000), "std": np.random.normal(size=1000)})

In [194]:
stats.to_csv("MGB_100_1000.stats.tsv", sep="\t", index=False)

In [198]:
raw = feather.read_feather("MGB_100_1000.feather").set_index("0")

In [226]:
zraw = raw.subtract(stats["mean"].values, axis="index")
zraw = zraw.div(stats["std"].values, axis="index")

In [236]:
feather.write_feather(zraw.reset_index(), "MGB_100_1000.zscored.feather")

In [238]:
zraw

Unnamed: 0_level_0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,MGB00010,...,MGB00091,MGB00092,MGB00093,MGB00094,MGB00095,MGB00096,MGB00097,MGB00098,MGB00099,MGB00100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rs3131972_A,-0.435274,0.145781,0.145781,-1.016330,-0.435274,-1.016330,-0.435274,-1.016330,0.145781,-1.016330,...,-0.435274,-0.435274,-0.435274,-1.016330,-0.435274,0.145781,-0.435274,-0.435274,0.145781,-0.435274
1:840327_G_A_A,3.971477,3.971477,0.743766,3.971477,0.743766,0.743766,7.199187,7.199187,3.971477,7.199187,...,7.199187,3.971477,0.743766,0.743766,3.971477,7.199187,0.743766,7.199187,0.743766,7.199187
rs4970382_T,-3.526750,-2.746769,-3.526750,-2.746769,-1.966788,-2.746769,-3.526750,-2.746769,-1.966788,-3.526750,...,-2.746769,-2.746769,-2.746769,-3.526750,-1.966788,-2.746769,-2.746769,-2.746769,-2.746769,-2.746769
1:846808_C_T_C,2.722384,1.299216,2.722384,1.299216,2.722384,1.299216,1.299216,2.722384,4.145552,2.722384,...,1.299216,2.722384,1.299216,2.722384,2.722384,2.722384,1.299216,2.722384,1.299216,2.722384
Affx-15447216_C,5.412217,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,5.412217,1.196706,1.196706,...,1.196706,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,1.196706,5.412217,5.412217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1:4496256_C_T_T,-1.126842,-1.126842,0.850135,-0.138353,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,-0.138353,...,0.850135,-1.126842,-0.138353,-1.126842,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,0.850135
1:4496659_C_T_T,-2.201933,-4.277609,-2.201933,-6.353284,-2.201933,-6.353284,-2.201933,-4.277609,-4.277609,-2.201933,...,-6.353284,-2.201933,-2.201933,-4.277609,-4.277609,-2.201933,-4.277609,-4.277609,-4.277609,-2.201933
1:4497097_G_A_A,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,0.356939,-0.948368,-0.948368,0.356939,...,0.356939,-0.295714,-0.948368,0.356939,-0.948368,0.356939,0.356939,-0.295714,-0.295714,-0.948368
1:4497118_C_T_C,-11.307576,-20.985076,-1.630075,-20.985076,-11.307576,-11.307576,-11.307576,-20.985076,-11.307576,-20.985076,...,-1.630075,-11.307576,-11.307576,-1.630075,-20.985076,-20.985076,-1.630075,-11.307576,-11.307576,-11.307576


In [237]:
feather.read_feather("../../test.zscored.feather")

Unnamed: 0,0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,...,MGB00091,MGB00092,MGB00093,MGB00094,MGB00095,MGB00096,MGB00097,MGB00098,MGB00099,MGB00100
0,rs3131972_A,-0.435274,0.145781,0.145781,-1.016330,-0.435274,-1.016330,-0.435274,-1.016330,0.145781,...,-0.435274,-0.435274,-0.435274,-1.016330,-0.435274,0.145781,-0.435274,-0.435274,0.145781,-0.435274
1,1:840327_G_A_A,3.971477,3.971477,0.743766,3.971477,0.743766,0.743766,7.199187,7.199187,3.971477,...,7.199187,3.971477,0.743766,0.743766,3.971477,7.199187,0.743766,7.199187,0.743766,7.199187
2,rs4970382_T,-3.526750,-2.746769,-3.526750,-2.746769,-1.966788,-2.746769,-3.526750,-2.746769,-1.966788,...,-2.746769,-2.746769,-2.746769,-3.526750,-1.966788,-2.746769,-2.746769,-2.746769,-2.746769,-2.746769
3,1:846808_C_T_C,2.722384,1.299216,2.722384,1.299216,2.722384,1.299216,1.299216,2.722384,4.145552,...,1.299216,2.722384,1.299216,2.722384,2.722384,2.722384,1.299216,2.722384,1.299216,2.722384
4,Affx-15447216_C,5.412217,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,5.412217,1.196706,...,1.196706,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,1.196706,5.412217,5.412217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1:4496256_C_T_T,-1.126842,-1.126842,0.850135,-0.138353,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,...,0.850135,-1.126842,-0.138353,-1.126842,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,0.850135
996,1:4496659_C_T_T,-2.201933,-4.277609,-2.201933,-6.353284,-2.201933,-6.353284,-2.201933,-4.277609,-4.277609,...,-6.353284,-2.201933,-2.201933,-4.277609,-4.277609,-2.201933,-4.277609,-4.277609,-4.277609,-2.201933
997,1:4497097_G_A_A,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,0.356939,-0.948368,-0.948368,...,0.356939,-0.295714,-0.948368,0.356939,-0.948368,0.356939,0.356939,-0.295714,-0.295714,-0.948368
998,1:4497118_C_T_C,-11.307576,-20.985076,-1.630075,-20.985076,-11.307576,-11.307576,-11.307576,-20.985076,-11.307576,...,-1.630075,-11.307576,-11.307576,-1.630075,-20.985076,-20.985076,-1.630075,-11.307576,-11.307576,-11.307576


## Part 4 - Phenotype files for dataloading
Build a tsv and an ids file for dataloading

In [130]:
tsv = fam.iloc[:100, :].copy()

In [131]:
tsv.columns = ["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"]

In [132]:
tsv["AGE"] = -9

In [133]:
tsv["FH"] = -9

In [134]:
tsv["ETH"] = "EUR"

In [161]:
tsv.to_csv("MGB_100_1000.tsv", sep="\t")

In [138]:
ids = tsv["IID"].values

In [139]:
np.savetxt("MGB_100_1000.ids.txt", ids, fmt="%s")

## Part 5 - Test dataloading

In [239]:
import sys

In [240]:
sys.path.append("/cellar/users/aklie/cagi6-prs-docker")

In [241]:
import SNPLoader

In [243]:
loader = SNPLoader.get_loader(
    ids_file="../test/MGB_100_1000.ids.txt",
    genotype_file="../test/MGB_100_1000.zscored.feather",
    phenotype_file="../test/MGB_100_1000.tsv",
    disease_column="PHENOTYPE",
    batch_size=100,
    shuffle=True,
    num_workers=2,
)

In [244]:
for batch_num, (snp, pheno, eth, fh) in enumerate(loader):
    print(batch_num)

0


In [245]:
snp.size(), pheno.size(), eth.size(), fh.size()

(torch.Size([100, 1000]),
 torch.Size([100, 1]),
 torch.Size([100, 1]),
 torch.Size([100, 1]))

In [251]:
np.unique(snp[:, 0], return_counts=True)

(array([-1.0163296 , -0.43527448,  0.14578073], dtype=float32),
 array([15, 65, 20]))

# Scratch
Place for old or testing code

# References