# oneK Genomes Analysis
#### Adam Klie<br>05/20/2020<br>CSE284 Project
Notebook to look at genotypes and phenotypes in 1000Genomes

In [2]:
import pandas as pd
import numpy as np
import os
from scipy.stats import zscore
from sklearn.model_selection import train_test_split

## Phenotypes
Run oneK_phenotypes.sh to generate tab-delimited genotype information for prediction

In [None]:
# Functions for model probability calculations
def p_blue(m1_sum, m2_sum):
    """Return exp(m1_sum) / (1 + exp(m1_sum) + exp(m2_sum)).

    The largest exponent (or 0, for the constant 1 in the denominator) is
    subtracted from every term before exponentiating, so large inputs no
    longer overflow to inf and produce nan.

    Parameters
    ----------
    m1_sum, m2_sum : float or array-like
        Linear predictors of the first and second model.

    Returns
    -------
    float or numpy.ndarray
        The probability associated with the first predictor.
    """
    # The constant 1 in the denominator is exp(0), hence the 0 in the maximum.
    shift = np.maximum(0, np.maximum(m1_sum, m2_sum))
    numerator = np.exp(m1_sum - shift)
    return numerator / (np.exp(-shift) + numerator + np.exp(m2_sum - shift))

def p_other(m1_sum, m2_sum):
    """Return exp(m2_sum) / (1 + exp(m1_sum) + exp(m2_sum)).

    The largest exponent (or 0, for the constant 1 in the denominator) is
    subtracted from every term before exponentiating, so large inputs no
    longer overflow to inf and produce nan.

    Parameters
    ----------
    m1_sum, m2_sum : float or array-like
        Linear predictors of the first and second model.

    Returns
    -------
    float or numpy.ndarray
        The probability associated with the second predictor.
    """
    # The constant 1 in the denominator is exp(0), hence the 0 in the maximum.
    shift = np.maximum(0, np.maximum(m1_sum, m2_sum))
    numerator = np.exp(m2_sum - shift)
    return numerator / (np.exp(-shift) + np.exp(m1_sum - shift) + numerator)
    
# Load irisplex parameters for each SNP, sort on ID
# expanduser("~") resolves the home directory even when HOME is not set in the
# environment, where os.environ["HOME"] would raise KeyError.
IRISPLEX = os.path.join(os.path.expanduser("~"), "project/datasets", "irisplex.bed")
# The file is read without a header row; the column names are supplied here.
iris_header = ["chr", "pos1", "pos2", "id", "minor_allele", "b1", "b2"]
# Index by SNP id and sort so rows can be paired with the sorted genotype table.
iris = pd.read_csv(IRISPLEX, sep='\t', header=None, names=iris_header).set_index("id").sort_index()

# Load the genotypes of individuals at each SNP, sort on ID and reconfigure genotypes for 3 alleles (NEED TO CHECK THIS FOR NEW DATA)
# expanduser("~") resolves the home directory even when HOME is not set in the
# environment, where os.environ["HOME"] would raise KeyError.
GENOTYPE = os.path.join(os.path.expanduser("~"), "project/datasets/oneKGenomes", "iris_oneK_genotypes.tab")
gt = pd.read_csv(GENOTYPE, sep='\t').set_index("ID").sort_index()
# Keep only the sample columns, then replace each value x with 2 - x on the
# three listed rsID rows (axis=1 hands the lambda one SNP row at a time).
# NOTE(review): presumably the table counts the opposite allele from the one
# the model parameters refer to at these SNPs -- confirm for new data.
gt = gt.loc[:, "HG00096":"NA21144"].apply(lambda x: 2-x if x.name in ['rs12896399', 'rs12913832', 'rs16891982'] else x, axis=1)

# Perform predictions for each individual based on genotype and parameters
# Constant terms added to the two linear predictors.
a1 = 3.94
a2 = 0.65

# np.dot pairs genotype rows with coefficient rows purely by position, so both
# tables must list exactly the same SNP IDs in the same order. Fail loudly
# instead of producing silently misaligned probabilities.
if not gt.index.equals(iris.index):
    raise ValueError("SNP IDs in the genotype table do not match the IrisPlex parameter table")

predictions = {}
for col in gt.columns:
    # Only columns whose name contains "HG" or "NA" are scored.
    if ("HG" in col) or ("NA" in col):
        pred1 = np.dot(gt[col], iris["b1"]) + a1
        pred2 = np.dot(gt[col], iris["b2"]) + a2
        blue = p_blue(pred1, pred2)
        other = p_other(pred1, pred2)
        # The three probabilities sum to one; brown is the remainder.
        brown = 1 - blue - other
        predictions[col] = [pred1, pred2, blue, other, brown]

# One row per individual, one column per computed quantity.
predict = pd.DataFrame.from_dict(predictions, orient='index', columns=["pred1", "pred2", "blue",
                                                                       "other", "brown"])

In [None]:
def get_color_pred(x):
    """Return the name of the most probable eye color for one row.

    `x` must support lookup of "blue", "brown" and "other". Ties are broken
    in the order blue, then brown, then other.
    """
    blue, brown, other = x["blue"], x["brown"], x["other"]
    blue_wins = blue >= brown and blue >= other
    if blue_wins:
        return "blue"
    return "brown" if brown >= other else "other"

In [None]:
# Label every individual (row) with its most probable eye color.
predict["predicted_eye_color"] = predict.apply(get_color_pred, axis=1)

In [None]:
# Number of individuals per predicted color, followed by the overall total.
color_counts = predict["predicted_eye_color"].value_counts()
print(color_counts)
print(color_counts.sum())

In [None]:
# Check to see if matches homework
color = predict[["blue", "other", "brown"]]
for ind in ["NA12249", "NA20509", "NA12750"]:
    # color.loc[ind] is a single row, i.e. a Series, which only has axis 0;
    # passing axis=1 to Series.idxmax is rejected by newer pandas releases.
    color_pred = color.loc[ind].idxmax()
    print(ind, color_pred)

# Full prediction rows for the same three individuals.
print(predict.loc["NA12249"])
print(predict.loc["NA20509"])
print(predict.loc["NA12750"])

In [None]:
# Use numeric labels for phenotypes
label_mapping = dict(brown=0, blue=1, other=2)


def get_label(x):
    """Return the integer label for the row's "predicted_eye_color" value."""
    color_name = x["predicted_eye_color"]
    return label_mapping[color_name]
# Attach the numeric label to every individual (row).
predict["label"] = predict.apply(get_label, axis=1)

In [None]:
# Numeric label column, indexed like `predict`; show the first five entries.
labels = predict.loc[:, "label"]
labels.head(5)

In [None]:
# Absolute class counts, then the same counts as fractions of the total.
label_counts = labels.value_counts()
print(label_counts)
label_counts.div(label_counts.sum())

In [None]:
# 80/20 split, stratified on the label so both parts keep the class balance.
# A fixed random_state makes the split reproducible across notebook runs,
# which matters because the resulting IDs are written to disk and reused.
train_labels, val_labels = train_test_split(labels, test_size=0.2, stratify=labels.values, random_state=42)

In [None]:
# Class counts in the training part, then as fractions of the training total.
train_counts = train_labels.value_counts()
print(train_counts)
train_counts.div(train_counts.sum())

In [None]:
# Class counts in the validation part, then as fractions of its total.
val_counts = val_labels.value_counts()
print(val_counts)
val_counts.div(val_counts.sum())

In [None]:
# Training and val split for labels
# The index of each label Series identifies the individuals in that part.
train_ids, val_ids = train_labels.index, val_labels.index

In [None]:
# Save to files
# Each label Series is written with its index kept as the first column.
for _label_part, _label_path in ((train_labels, 'train_labels.csv'), (val_labels, 'val_labels.csv')):
    _label_part.to_csv(_label_path, index=True)

## Extracting genotypes from 1000Genomes vcf files and filtered SNP list from openSNP
Using openSNP_filtered_rsids.list as input, run oneK_genotypes.sh to generate oneK_genotypes.tab
Also get train and val split from phenotypes (train_ids, val_ids)

In [3]:
# Load in genotypes from oneK_genotypes.sh output
# The table is tab-separated and its first line is used as the header.
oneK_genotypes = pd.read_csv('oneK_genotypes.tab', delimiter='\t')

In [None]:
# Save the final list of rsids, with reference allele for openSNP
# Only the ID and REF columns are written, tab-separated, without the row index.
oneK_genotypes.to_csv('oneK_rsids.tab', sep='\t', columns=["ID", "REF"], index=False)

In [None]:
# Rows become SNP IDs; only the columns from HG00096 through NA21144 are kept.
data = (
    oneK_genotypes
    .set_index("ID")
    .loc[:, "HG00096":"NA21144"]
)

In [None]:
# progress_apply only exists after tqdm.pandas() has been called, and tqdm is
# not imported in this notebook, so use the plain DataFrame.apply instead.
# apply() passes each column of `data` to zscore; the result is then
# transposed so the former columns become rows.
# NOTE(review): if each SNP (row of `data`) should be standardized instead,
# this needs axis=1 -- confirm the intended direction.
data_z = data.apply(zscore).T

In [None]:
# `data` is indexed by SNP ID with one column per individual, while
# train_ids / val_ids hold individual IDs. Select the matching columns and
# transpose so each row is one individual; data.loc[train_ids] would look
# the individual IDs up in the SNP index instead.
# NOTE(review): the raw genotype values are kept here; switch to data_z if
# the standardized values were meant to be exported -- confirm.
train = data.loc[:, train_ids].T
val = data.loc[:, val_ids].T

In [None]:
# Number of rows in the training and validation tables.
print(train.shape[0])
print(val.shape[0])

In [None]:
# Element-wise check that each feature table is ordered like its labels.
print(train.index == train_labels.index)
# Compare against the validation labels; the original compared val.index
# with itself, which is always True.
print(val.index == val_labels.index)

In [None]:
# Write both tables to CSV, keeping the row index as the first column.
train.to_csv(path_or_buf='train_set.csv', index=True)
val.to_csv(path_or_buf='val_set.csv', index=True)