# Notebook Title

**Authorship:**
Author, *MM/DD/YYYY*
***
**Description:**
Notebook to do some cool stuff
***
**TODOs:**
 - <font color='green'> Done TODO </font>
 - <font color='orange'> WIP TODO </font>
 - <font color='red'> Queued TODO </font>
***

## Set-up

In [36]:
# The classics
import numpy as np
import pandas as pd

# Other guys
import tqdm
from pyarrow import feather

# Autoreload extension
if 'autoreload' not in get_ipython().extension_manager.loaded:
    %load_ext autoreload
%autoreload 2

## Part 0 - Make dummy files for testing

### Part 0a - Build a dummy bed file from given bim and fam

In [2]:
bim = pd.read_csv("../val/MGB.bim", sep="\t", header=None)

In [3]:
fam = pd.read_csv("../val/MGB.fam", delim_whitespace=True, header=None)

#### 100 x 1000

##### PED

In [96]:
# Run this to generate a 100 individual by 1000 SNP simulated test set
ped = []
for i, (row, ind) in tqdm.tqdm(enumerate(fam.iloc[:100, :].iterrows())):
    rand_genotype = np.concatenate(
        np.stack(bim.iloc[:1000, :][[4, 5]].apply(np.random.choice, size=2, axis=1))
    )
    ped.append((list(ind.values) + list(rand_genotype)))
ped = np.array(ped)

100it [00:06, 15.61it/s]


In [97]:
ped[:, 5] = np.random.choice([1, 2], size=100)

In [98]:
np.savetxt("MGB_100_1000/MGB_100_1000.ped", ped, delimiter=" ", fmt="%s")

##### BIM

In [99]:
bim[[0, 1, 2, 3]].iloc[:1000, :].to_csv(
    "MGB_100_1000/MGB_100_1000.map", sep="\t", header=None, index=False
)

#### 100 x All SNPs

##### PED

In [4]:
fam.values[:10].shape, np.tile(bim[[4, 5]].values.flatten(), (10, 1)).shape, np.concatenate([fam.values[:10], np.tile(bim[[4, 5]].values.flatten(), (10, 1))], axis=1).shape

((10, 6), (10, 3407714), (10, 3407720))

In [5]:
sample_fam = fam.values[:10]

In [31]:
sample_fam[:, 5] = np.random.choice([1, 2], size=10)

In [33]:
np.savetxt("MGB_10_all/MGB_10_all.ped", 
           np.concatenate([sample_fam, np.tile(bim[[4, 5]].values.flatten(), (10, 1))], axis=1),
           fmt="%s")

##### MAP

In [25]:
bim[[0, 1, 2, 3]].to_csv(
    "MGB_10_all/MGB_10_all.map", sep="\t", header=None, index=False
)

#### Full set

In [20]:
num = 100
for i in range(int(len(fam)/num)+1):
    start = num*i
    end = num*(i+1)
    if end > len(fam):
        end = len(fam)
    print(start, end)
    np.savetxt("full/MGB_{}_{}.ped".format(start, end), np.concatenate([fam.values[start:end], np.tile(bim[[4, 5]].values.flatten(), (num, 1))], axis=1), fmt="%s")
    if i == 3:
        break

0 100
100 200
200 300
300 400


### Part 0b - Make a dummy covariates.tsv file

1. We will have the “AGE" and ten PCs (PC1, PC2, .., PC10) in a separate covariate file. The file name is “MGB.covariates.txt". It is a space-delimited plain text file similar to plink covariate files. The first two columns are FID and IID from the MGB.fam file. The sample order can be different between MGB.fam and MGB-covariates.txt. 

In [58]:
pcs = np.random.normal(loc=0, scale=3, size=(10, 10))

In [59]:
pcs.shape, sample_fam[:, :2].shape, ages.shape

((10, 10), (10, 2), (10, 1))

In [60]:
ages = np.random.normal(loc=50, scale=10, size=(10, 1))

In [61]:
sample_covar = np.concatenate([sample_fam[:, :2], ages, pcs], axis=1)

In [62]:
sample_covar[0]

array(['MGB00001', 'MGB00001', 66.66865714162273, 1.2622905687385135,
       0.6632332986461236, -1.3031528057680857, 0.0012940779292404904,
       5.708772427155591, -6.319302885969882, 0.45256534112375046,
       0.28469001727733156, 3.002453648556721, -4.111321613459907],
      dtype=object)

In [63]:
sample_covar_df = pd.DataFrame(sample_covar)

In [64]:
sample_covar_df.columns = ["FID", "IID", "AGE"] + ["PC{}".format(i+1) for i in range(10)]

In [65]:
sample_covar_df.head()

Unnamed: 0,FID,IID,AGE,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,MGB00001,MGB00001,66.668657,1.262291,0.663233,-1.303153,0.001294,5.708772,-6.319303,0.452565,0.28469,3.002454,-4.111322
1,MGB00002,MGB00002,53.190195,3.3063,-1.101641,1.97045,-0.872764,-3.840938,-0.671233,-0.165758,-0.421976,0.860991,-4.696233
2,MGB00003,MGB00003,38.929525,6.461285,-4.82919,0.6073,2.843771,-5.527301,-3.029981,5.825115,3.742236,6.044946,1.437086
3,MGB00004,MGB00004,58.494731,3.134901,-0.005092,1.167758,1.454855,3.297935,1.981656,-1.079379,1.793868,0.169963,-0.878017
4,MGB00005,MGB00005,60.674713,1.273421,1.723451,1.46698,2.157379,0.619361,1.180299,1.433365,4.477744,-4.105312,-0.212373


In [66]:
sample_covar_df.to_csv("MGB_10_all/MGB_10_all.covariates.txt", sep=" ", index=False)

### Part 0c - Make dummy age.summary.tsv file

In [67]:
age_means = np.random.normal(loc=50, scale=10, size=(4))

In [68]:
age_stds = np.abs(np.random.normal(loc=0, scale=10, size=(4)))

In [69]:
index = ["BCA", "CAD", "IBD", "T2D"]

In [70]:
pd.DataFrame(index=index, data={"MEAN":age_means, "STD": age_stds}).to_csv("MGB_10_all/age.summary.tsv", sep="\t", index=True)

## Part 1 - Make a SNP summary file

#### BCA

In [100]:
geno = feather.read_feather("../../../cagi6-prs/features/baseline_snps-baseline_plus_samples/bca_age_old/bca.age.old.feather")

In [101]:
snps = ["_".join(var.split("_")[:-1]) for var in geno["index"]]
alleles = [var[-1] for var in geno["index"]]
allele_mp = dict(zip(snps, alleles))

In [102]:
snp_map = pd.read_csv("../../../cagi6-prs/snps/ukb.extracted.intersected.mgb.snps.tsv", sep="\t")

In [103]:
subset_snp_map = snp_map[snp_map["UKB_ID"].isin(snps)][["UKB_ID", "MGB_ID"]]

In [104]:
stats = pd.read_csv("../../../cagi6-prs/features/baseline_snps-baseline_plus_samples/bca_age_old/bca.age.old.train.stats.tsv", sep="\t")

In [105]:
subset_stats = stats[stats["0"].isin(snps)]

In [106]:
summary_df = pd.merge(subset_snp_map, subset_stats, left_on="UKB_ID", right_on="0")

In [107]:
summary_df["expected_allele"] = summary_df["UKB_ID"].map(allele_mp)

In [108]:
final_df = summary_df[["UKB_ID", "MGB_ID", "expected_allele", "mean", "std"]]

In [109]:
final_df.columns = ["UKB_ID", "MGB_ID", "EXPECTED_ALLELE", "MEAN", "STD"]

In [112]:
final_df.to_csv("MGB_10_all/bca.age.old.summary.tsv", sep="\t", index=False)

In [113]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [114]:
final_df["MGB_ID"].to_csv("MGB_10_all/bca.age.old.extract.txt", header=None, index=False)

#### CAD

In [115]:
geno = feather.read_feather("../../../cagi6-prs/features/baseline_snps-baseline_plus_samples/cad_age_old/cad.age.old.feather")

In [116]:
snps = ["_".join(var.split("_")[:-1]) for var in geno["index"]]
alleles = [var[-1] for var in geno["index"]]
allele_mp = dict(zip(snps, alleles))

In [117]:
snp_map = pd.read_csv("../../../cagi6-prs/snps/ukb.extracted.intersected.mgb.snps.tsv", sep="\t")

In [118]:
subset_snp_map = snp_map[snp_map["UKB_ID"].isin(snps)][["UKB_ID", "MGB_ID"]]

In [119]:
stats = pd.read_csv("../../../cagi6-prs/features/baseline_snps-baseline_plus_samples/cad_age_old/cad.age.old.train.stats.tsv", sep="\t")

In [120]:
subset_stats = stats[stats["0"].isin(snps)]

In [121]:
summary_df = pd.merge(subset_snp_map, subset_stats, left_on="UKB_ID", right_on="0")

In [122]:
summary_df["expected_allele"] = summary_df["UKB_ID"].map(allele_mp)

In [123]:
final_df = summary_df[["UKB_ID", "MGB_ID", "expected_allele", "mean", "std"]]

In [124]:
final_df.columns = ["UKB_ID", "MGB_ID", "EXPECTED_ALLELE", "MEAN", "STD"]

In [125]:
final_df

Unnamed: 0,UKB_ID,MGB_ID,EXPECTED_ALLELE,MEAN,STD
0,rs2184104,1:56912222_T_C,T,0.454777,0.584347
1,rs6693179,1:56914602_G_A,G,0.454400,0.584899
2,rs6684929,1:56914738_T_C,T,0.454394,0.584916
3,rs6421496,1:56915593_C_T,C,0.220789,0.434829
4,rs1889145,1:56915820_G_A,G,0.454426,0.584876
...,...,...,...,...,...
3959,rs2519093,9:136141870_C_T,C,1.613001,0.558514
3960,rs532436,9:136149830_G_A,G,1.612299,0.558720
3961,rs600038,9:136151806_T_C,T,1.569488,0.581841
3962,rs649129,9:136154304_C_T,C,1.569779,0.581575


In [126]:
final_df.to_csv("MGB_10_all/cad.age.old.summary.tsv", sep="\t", index=False)

In [127]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [128]:
final_df["MGB_ID"].to_csv("MGB_10_all/cad.age.old.extract.txt", header=None, index=False)

#### IBD

In [3]:
geno = feather.read_feather("../../../cagi6-prs/features/baseline_snps-baseline_plus_samples/ibd_age_old/ibd.age.old.feather")

In [4]:
snps = ["_".join(var.split("_")[:-1]) for var in geno["index"]]
alleles = [var[-1] for var in geno["index"]]
allele_mp = dict(zip(snps, alleles))

In [2]:
snp_map = pd.read_csv("../../../cagi6-prs/snps/ukb.extracted.intersected.mgb.snps.tsv", sep="\t")

In [5]:
subset_snp_map = snp_map[snp_map["UKB_ID"].isin(snps)][["UKB_ID", "MGB_ID"]]

In [6]:
stats = pd.read_csv("../../../cagi6-prs/features/baseline_snps-baseline_plus_samples/ibd_age_old/ibd.age.old.train.stats.tsv", sep="\t")

In [7]:
subset_stats = stats[stats["0"].isin(snps)]

In [8]:
summary_df = pd.merge(subset_snp_map, subset_stats, left_on="UKB_ID", right_on="0")

In [10]:
summary_df["expected_allele"] = summary_df["UKB_ID"].map(allele_mp)

In [14]:
final_df = summary_df[["UKB_ID", "MGB_ID", "expected_allele", "mean", "std"]]

In [15]:
final_df.columns = ["UKB_ID", "MGB_ID", "EXPECTED_ALLELE", "MEAN", "STD"]

In [18]:
final_df.to_csv("MGB_10_all/ibd.age.old.summary.tsv", sep="\t", index=False)

In [25]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [27]:
final_df["MGB_ID"].to_csv("MGB_10_all/ibd.age.old.extract.txt", header=None, index=False)

#### T2D

In [129]:
geno = feather.read_feather("../../../cagi6-prs/features/baseline_snps-baseline_plus_samples/t2d_age_old/t2d.age.old.feather")

In [130]:
snps = ["_".join(var.split("_")[:-1]) for var in geno["index"]]
alleles = [var[-1] for var in geno["index"]]
allele_mp = dict(zip(snps, alleles))

In [131]:
snp_map = pd.read_csv("../../../cagi6-prs/snps/ukb.extracted.intersected.mgb.snps.tsv", sep="\t")

In [132]:
subset_snp_map = snp_map[snp_map["UKB_ID"].isin(snps)][["UKB_ID", "MGB_ID"]]

In [133]:
stats = pd.read_csv("../../../cagi6-prs/features/baseline_snps-baseline_plus_samples/t2d_age_old/t2d.age.old.train.stats.tsv", sep="\t")

In [134]:
subset_stats = stats[stats["0"].isin(snps)]

In [135]:
summary_df = pd.merge(subset_snp_map, subset_stats, left_on="UKB_ID", right_on="0")

In [136]:
summary_df["expected_allele"] = summary_df["UKB_ID"].map(allele_mp)

In [137]:
final_df = summary_df[["UKB_ID", "MGB_ID", "expected_allele", "mean", "std"]]

In [138]:
final_df.columns = ["UKB_ID", "MGB_ID", "EXPECTED_ALLELE", "MEAN", "STD"]

In [139]:
final_df.to_csv("MGB_10_all/t2d.age.old.summary.tsv", sep="\t", index=False)

In [140]:
final_df.shape

(3462, 5)

In [141]:
final_df["MGB_ID"].isin(bim[1]).all()

True

In [142]:
final_df["MGB_ID"].to_csv("MGB_10_all/t2d.age.old.extract.txt", header=None, index=False)

## Part 2 - Build a feather file from raw and double check it matches model allele
Basis for `make-feather-for-loader.py` script

In [28]:
raw = pd.read_csv("MGB_10_all/MGB_10_all.raw", delim_whitespace=True)

In [30]:
IIDs = raw["IID"]

In [31]:
raw = raw.iloc[:, 6:].T
raw.columns = IIDs
raw.index.name = "0"

In [33]:
summary = pd.read_csv("MGB_10_all/ibd.age.old.summary.tsv", sep="\t")

In [37]:
snps = ["_".join(var.split("_")[:-1]) for var in raw.index]
alleles = [var[-1] for var in raw.index]
#allele_mp = dict(zip(snps, alleles))

In [46]:
raw_summary = pd.DataFrame({"INDEX":raw.index, "MGB_ID":snps, "ACTUAL_ALLELE":alleles})

In [53]:
merged_summary = pd.merge(raw_summary, summary, on="MGB_ID").set_index("INDEX")

In [58]:
ordered_merged_summary = merged_summary.loc[raw.index]

In [65]:
mismatched_pos = np.where(ordered_merged_summary["ACTUAL_ALLELE"] != ordered_merged_summary["EXPECTED_ALLELE"])[0]

In [70]:
raw.iloc[mismatched_pos, :] = 2 - raw.iloc[mismatched_pos, :]

In [74]:
zraw = raw.subtract(ordered_merged_summary["MEAN"].values, axis="index")
zraw = zraw.div(ordered_merged_summary["STD"].values, axis="index")

In [76]:
feather.read_feather("MGB_10_all/MGB_10_all.zscored.feather")

Unnamed: 0,0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,MGB00010
0,1:1776269_C_A_C,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405,-0.175405
1,1:1781220_T_C_T,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103,-0.584103
2,1:1796616_G_A_G,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522,-0.596522
3,1:1860087_C_T_T,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641,0.138641
4,1:1864526_C_T_T,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202,0.144202
...,...,...,...,...,...,...,...,...,...,...,...
5719,22:39774525_A_G_G,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654,0.572654
5720,22:43561675_T_G_G,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974,0.463974
5721,22:43561982_C_T_T,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861,0.463861
5722,22:43562306_A_G_G,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088,0.474088


#### 100x1000

In [151]:
feather.write_feather(raw.reset_index(), "MGB_100_1000.feather")

In [152]:
feather.read_feather("MGB_100_1000.feather")

Unnamed: 0,0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,...,MGB00091,MGB00092,MGB00093,MGB00094,MGB00095,MGB00096,MGB00097,MGB00098,MGB00099,MGB00100
0,rs3131972_A,1,0,0,2,1,2,1,2,0,...,1,1,1,2,1,0,1,1,0,1
1,1:840327_G_A_A,1,1,0,1,0,0,2,2,1,...,2,1,0,0,1,2,0,2,0,2
2,rs4970382_T,2,1,2,1,0,1,2,1,0,...,1,1,1,2,0,1,1,1,1,1
3,1:846808_C_T_C,1,0,1,0,1,0,0,1,2,...,0,1,0,1,1,1,0,1,0,1
4,Affx-15447216_C,0,1,0,2,2,1,1,0,1,...,1,1,0,2,2,1,1,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1:4496256_C_T_T,2,2,0,1,1,1,0,1,1,...,0,2,1,2,1,1,0,1,1,0
996,1:4496659_C_T_T,0,1,0,2,0,2,0,1,1,...,2,0,0,1,1,0,1,1,1,0
997,1:4497097_G_A_A,1,1,1,1,1,1,0,2,2,...,0,1,2,0,2,0,0,1,1,2
998,1:4497118_C_T_C,1,0,2,0,1,1,1,0,1,...,2,1,1,2,0,0,2,1,1,1


In [193]:
stats = pd.DataFrame(data={"0": raw.index[:1000], "mean": np.random.normal(size=1000), "std": np.random.normal(size=1000)})

In [194]:
stats.to_csv("MGB_100_1000.stats.tsv", sep="\t", index=False)

In [198]:
raw = feather.read_feather("MGB_100_1000.feather").set_index("0")

In [226]:
zraw = raw.subtract(stats["mean"].values, axis="index")
zraw = zraw.div(stats["std"].values, axis="index")

In [236]:
feather.write_feather(zraw.reset_index(), "MGB_100_1000.zscored.feather")

In [238]:
zraw

Unnamed: 0_level_0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,MGB00010,...,MGB00091,MGB00092,MGB00093,MGB00094,MGB00095,MGB00096,MGB00097,MGB00098,MGB00099,MGB00100
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
rs3131972_A,-0.435274,0.145781,0.145781,-1.016330,-0.435274,-1.016330,-0.435274,-1.016330,0.145781,-1.016330,...,-0.435274,-0.435274,-0.435274,-1.016330,-0.435274,0.145781,-0.435274,-0.435274,0.145781,-0.435274
1:840327_G_A_A,3.971477,3.971477,0.743766,3.971477,0.743766,0.743766,7.199187,7.199187,3.971477,7.199187,...,7.199187,3.971477,0.743766,0.743766,3.971477,7.199187,0.743766,7.199187,0.743766,7.199187
rs4970382_T,-3.526750,-2.746769,-3.526750,-2.746769,-1.966788,-2.746769,-3.526750,-2.746769,-1.966788,-3.526750,...,-2.746769,-2.746769,-2.746769,-3.526750,-1.966788,-2.746769,-2.746769,-2.746769,-2.746769,-2.746769
1:846808_C_T_C,2.722384,1.299216,2.722384,1.299216,2.722384,1.299216,1.299216,2.722384,4.145552,2.722384,...,1.299216,2.722384,1.299216,2.722384,2.722384,2.722384,1.299216,2.722384,1.299216,2.722384
Affx-15447216_C,5.412217,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,5.412217,1.196706,1.196706,...,1.196706,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,1.196706,5.412217,5.412217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1:4496256_C_T_T,-1.126842,-1.126842,0.850135,-0.138353,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,-0.138353,...,0.850135,-1.126842,-0.138353,-1.126842,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,0.850135
1:4496659_C_T_T,-2.201933,-4.277609,-2.201933,-6.353284,-2.201933,-6.353284,-2.201933,-4.277609,-4.277609,-2.201933,...,-6.353284,-2.201933,-2.201933,-4.277609,-4.277609,-2.201933,-4.277609,-4.277609,-4.277609,-2.201933
1:4497097_G_A_A,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,0.356939,-0.948368,-0.948368,0.356939,...,0.356939,-0.295714,-0.948368,0.356939,-0.948368,0.356939,0.356939,-0.295714,-0.295714,-0.948368
1:4497118_C_T_C,-11.307576,-20.985076,-1.630075,-20.985076,-11.307576,-11.307576,-11.307576,-20.985076,-11.307576,-20.985076,...,-1.630075,-11.307576,-11.307576,-1.630075,-20.985076,-20.985076,-1.630075,-11.307576,-11.307576,-11.307576


In [237]:
feather.read_feather("../../test.zscored.feather")

Unnamed: 0,0,MGB00001,MGB00002,MGB00003,MGB00004,MGB00005,MGB00006,MGB00007,MGB00008,MGB00009,...,MGB00091,MGB00092,MGB00093,MGB00094,MGB00095,MGB00096,MGB00097,MGB00098,MGB00099,MGB00100
0,rs3131972_A,-0.435274,0.145781,0.145781,-1.016330,-0.435274,-1.016330,-0.435274,-1.016330,0.145781,...,-0.435274,-0.435274,-0.435274,-1.016330,-0.435274,0.145781,-0.435274,-0.435274,0.145781,-0.435274
1,1:840327_G_A_A,3.971477,3.971477,0.743766,3.971477,0.743766,0.743766,7.199187,7.199187,3.971477,...,7.199187,3.971477,0.743766,0.743766,3.971477,7.199187,0.743766,7.199187,0.743766,7.199187
2,rs4970382_T,-3.526750,-2.746769,-3.526750,-2.746769,-1.966788,-2.746769,-3.526750,-2.746769,-1.966788,...,-2.746769,-2.746769,-2.746769,-3.526750,-1.966788,-2.746769,-2.746769,-2.746769,-2.746769,-2.746769
3,1:846808_C_T_C,2.722384,1.299216,2.722384,1.299216,2.722384,1.299216,1.299216,2.722384,4.145552,...,1.299216,2.722384,1.299216,2.722384,2.722384,2.722384,1.299216,2.722384,1.299216,2.722384
4,Affx-15447216_C,5.412217,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,5.412217,1.196706,...,1.196706,1.196706,5.412217,-3.018806,-3.018806,1.196706,1.196706,1.196706,5.412217,5.412217
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,1:4496256_C_T_T,-1.126842,-1.126842,0.850135,-0.138353,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,...,0.850135,-1.126842,-0.138353,-1.126842,-0.138353,-0.138353,0.850135,-0.138353,-0.138353,0.850135
996,1:4496659_C_T_T,-2.201933,-4.277609,-2.201933,-6.353284,-2.201933,-6.353284,-2.201933,-4.277609,-4.277609,...,-6.353284,-2.201933,-2.201933,-4.277609,-4.277609,-2.201933,-4.277609,-4.277609,-4.277609,-2.201933
997,1:4497097_G_A_A,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,-0.295714,0.356939,-0.948368,-0.948368,...,0.356939,-0.295714,-0.948368,0.356939,-0.948368,0.356939,0.356939,-0.295714,-0.295714,-0.948368
998,1:4497118_C_T_C,-11.307576,-20.985076,-1.630075,-20.985076,-11.307576,-11.307576,-11.307576,-20.985076,-11.307576,...,-1.630075,-11.307576,-11.307576,-1.630075,-20.985076,-20.985076,-1.630075,-11.307576,-11.307576,-11.307576


## Part 3 - Phenotype files for dataloading
Build a tsv and an ids file for dataloading. Basis for `make-pheno-for-loader.py` script

In [130]:
tsv = fam.iloc[:100, :].copy()

In [131]:
tsv.columns = ["FID", "IID", "PAT", "MAT", "SEX", "PHENOTYPE"]

In [132]:
tsv["AGE"] = -9

In [133]:
tsv["FH"] = -9

In [134]:
tsv["ETH"] = "EUR"

In [161]:
tsv.to_csv("MGB_100_1000.tsv", sep="\t")

In [138]:
ids = tsv["IID"].values

In [139]:
np.savetxt("MGB_100_1000.ids.txt", ids, fmt="%s")

## Part 4 - Test dataloading

In [71]:
import sys

In [72]:
sys.path.append("/cellar/users/aklie/projects/InSNPtion/cagi6-prs-docker/ThisIsTheGoodPartPecker_ThisIsWhereTheJobGetsFun_AskAndYouShallReceive/")

In [73]:
import SNPAndClinicalLoader

In [85]:
feath = feather.read_feather("MGB_10_all/MGB_10_all.bca.zscored.feather")
feath = feath.set_index("0")

In [86]:
loader = SNPAndClinicalLoader.get_loader(
    ids_file="MGB_10_all/MGB_10_all.ids.txt",
    genotype_file=feath,
    phenotype_file="MGB_10_all/MGB_10_all.bca.tsv",
    disease_column="PHENOTYPE",
    batch_size=10,
    shuffle=True,
    num_workers=2,
)

In [90]:
for batch_num, (snp, pheno, eth, fh, sex, age) in enumerate(loader):
    print(batch_num)

0


In [92]:
snp.size(), pheno.size(), eth.size(), fh.size(), sex.size(), age.size()

(torch.Size([10, 12034]),
 torch.Size([10, 1]),
 torch.Size([10, 1]),
 torch.Size([10, 1]),
 torch.Size([10, 1]),
 torch.Size([10, 1]))

In [97]:
snp[0], pheno[0], eth[0], fh[0], sex[0], age[0]

(tensor([ 0.4628,  0.4584,  0.4585,  ..., -1.2344, -1.2725, -1.8072]),
 tensor([0]),
 tensor([0.]),
 tensor([-9]),
 tensor([2]),
 tensor([1.2881], dtype=torch.float64))

In [104]:
np.unique(snp[:, 1000], return_counts=True)

(array([0.9720436], dtype=float32), array([10]))

# Scratch
Place for old or testing code

# References