# Perform sanity checks on datasets and complete final preprocessing steps
Adam Klie<br>
06/05/2020

In [1]:
import pandas as pd
import numpy as np
import os
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm_notebook
tqdm_notebook.pandas()

## Get means and standard deviations of each SNP in training set to apply to test set

In [5]:
# Read the genotype table, keep the first row for each "ID", index by "ID",
# and keep only the columns from "HG00096" through "NA21144" (label slice,
# both ends inclusive).
working_df = (
    pd.read_csv('oneKGenomes/data/oneK_genotypes.tsv', sep='\t')
    .drop_duplicates("ID")
    .set_index("ID")
    .loc[:, "HG00096":"NA21144"]
)

In [6]:
# Row-wise (axis=1) mean and standard deviation over the selected columns,
# stored as two columns keyed by the same "ID" index as working_df.
# DataFrame.std uses the pandas default (ddof=1).
z_df = pd.DataFrame(
    {
        'means': working_df.mean(axis=1).to_numpy(),
        'std': working_df.std(axis=1).to_numpy(),
    },
    index=working_df.index,
)

In [8]:
# Persist the per-ID means and standard deviations as a tab-separated file;
# the sanity-check cells further down read this file back.
z_df.to_csv('oneKGenomes/data/train_set_stats.tsv', sep='\t')

In [9]:
# Drop both names before the next section loads other datasets.
del working_df, z_df

## Check that train, val and test share the same SNP set and that train and val individuals do not overlap

In [None]:
# Load the three pickled splits. Later cells index them as
# frame.loc[individual_id, rsid], i.e. rows are individuals, columns are SNPs.
train = pd.read_pickle('oneKGenomes/data/train_set.pickle')

In [None]:
val = pd.read_pickle('oneKGenomes/data/val_set.pickle')

In [None]:
test = pd.read_pickle('openSNP/data/test_set.pickle')

In [None]:
# Make sure IDs don't intersect
# Displays the row labels shared by train and val; an empty set means no overlap.
set(train.index).intersection(val.index)

In [None]:
# Make sure the ordering of SNPs is correct
# Each line prints how many column positions carry different labels (0 = identical order).
# NOTE(review): the element-wise comparison presumably requires equal column
# counts and raises otherwise -- confirm.
print((train.columns != val.columns).sum())
print((test.columns != val.columns).sum())

In [None]:
# Release the three frames before the next section reloads what it needs.
del train, val, test

## Sanity check that the z-scores make sense

In [None]:
# Sanity check genotypes to z-score conversions
# Inputs: the stored training set, the genotype table it is checked against,
# and the per-ID means/stds written earlier in this notebook.
train = pd.read_pickle('oneKGenomes/data/train_set.pickle')
genotypes =  pd.read_pickle('oneKGenomes/data/oneK_genotypes.pickle')
z_df = pd.read_csv('oneKGenomes/data/train_set_stats.tsv', sep='\t')

In [None]:
# Index the genotype table by "ID" so a single value can be looked up by label.
gt_index = genotypes.set_index("ID")
del genotypes

In [None]:
# Check this against screenshot
gt_index.head()

In [None]:
# Genotype value for one hand-picked individual (column) and SNP (row).
gt = gt_index["HG02330"].loc["rs7917054"]

In [None]:
# Mean and standard deviation recorded for the same SNP.
stats = z_df.set_index("ID").loc["rs7917054"]
m = stats.loc["means"]
sd = stats.loc["std"]

In [None]:
# True when the stored training value equals (genotype - mean) / std after
# rounding both sides to 5 decimals.
round(train.loc["HG02330", "rs7917054"], 5) == round((gt-m)/sd, 5)

In [None]:
# z_df is not deleted here; the next cell rebinds it.
del train, gt_index

In [None]:
# Inputs for the test-set check: the stored test set, the openSNP genotype
# table it is checked against, the per-ID stats file, and the rsID table.
test = pd.read_pickle('openSNP/data/test_set.pickle')
genotypes = pd.read_pickle('openSNP/data/openSNP_final_genotypes.pickle')
z_df = pd.read_csv('oneKGenomes/data/train_set_stats.tsv', sep='\t')
ref_alleles = pd.read_csv('oneKGenomes/data/oneK_rsids.tsv', sep='\t')

In [None]:
# Index the genotype table by "rsid" so single values can be looked up by label.
gt_index = genotypes.set_index("rsid")
del genotypes

In [None]:
gt_index.head()

In [None]:
ref_alleles = ref_alleles.set_index("ID")

In [None]:
# Display the rsID-table row for one SNP (visual inspection only).
ref_alleles.loc["rs7537756"]

In [None]:
# Index the stats by "ID" so the next cell can look up one SNP with .loc.
z_df = z_df.set_index("ID")

In [None]:
# Spot-check the test set: for 10 randomly chosen individuals, compare the
# stored value for one SNP with (genotype - mean) / std recomputed from the
# genotype table and the stats file.
import random  # imported here because the notebook's import cell does not import it

# random.sample() needs a sequence; sampling directly from a set raises
# TypeError on Python 3.11+. Draw from the unique row labels as a list.
test_ids = random.sample(list(test.index.unique()), 10)
rsid = "rs3131972"

# The mean/std depend only on the SNP, so look them up once, outside the loop.
stats = z_df.loc[rsid]
m = stats.loc["means"]
sd = stats.loc["std"]

print("Test set value", "Genotype z-score")
# `sample_id` rather than `id`: a top-level loop variable named `id` would
# shadow the builtin for the rest of the notebook session.
for sample_id in test_ids:
    gt = gt_index[sample_id].loc[rsid]
    z_gt = test.loc[sample_id, rsid]
    observed = round(z_gt, 5)
    expected = round((gt - m) / sd, 5)
    print(observed, expected)
    print("Same?", observed == expected)

In [None]:
# Average number of missing values per column of gt_index (columns are looked
# up by individual ID elsewhere in this notebook). DataFrame.sum() is the
# vectorised form of applying the builtin `sum` to every column.
avg_na = gt_index.isna().sum().mean()

In [None]:
# Average missing count divided by the number of columns in the test set.
avg_na/test.shape[1]

In [None]:
# Release everything this section created.
del test, gt_index, ref_alleles, z_df, test_ids, avg_na

# Check phenotype labels

In [None]:
# Test labels, indexed by the first column of the CSV.
test_labels = pd.read_csv('openSNP/data/test_labels.csv', index_col=0)

In [None]:
test_phenotypes = pd.read_csv('openSNP/data/openSNP_initial_phenotypes.tsv', sep='\t')

In [None]:
# Index by user_id so the merge below can join on the index.
test_phenotypes = test_phenotypes.set_index('user_id')

In [None]:
# Display phenotypes and labels side by side, joined on the index (default inner join).
# NOTE(review): assumes the index of test_labels holds the same user IDs -- confirm.
test_phenotypes.merge(test_labels, left_index=True, right_index=True)

In [None]:
del test_labels, test_phenotypes

## Generate SNP-subset pickles (rsIDs listed in iris_rsids.txt) for each dataset

In [None]:
# Reload the three splits; they are subset by column further down.
train = pd.read_pickle('oneKGenomes/data/train_set.pickle')

In [None]:
val = pd.read_pickle('oneKGenomes/data/val_set.pickle')

In [6]:
test = pd.read_pickle('openSNP/data/test_set.pickle')

In [None]:
# Collect the entries of iris_rsids.txt, one per line, with trailing
# whitespace (including the newline) removed.
with open('iris_rsids.txt') as f:
    iris_ids = [line.rstrip() for line in f]

In [None]:
# Write one pickle per split containing only the columns listed in iris_ids.
# Selecting with a list raises KeyError if any listed column is absent.
train[iris_ids].to_pickle('oneKGenomes/data/iris_train.pickle')
val[iris_ids].to_pickle('oneKGenomes/data/iris_val.pickle')
test[iris_ids].to_pickle('openSNP/data/iris_test.pickle')

In [None]:
del train, val, test

## Run a linear association analysis (PLINK `--linear`) to obtain per-SNP P-values

In [None]:
%%bash
# Run PLINK on the text fileset with prefix `oneK` and write a binary fileset
# (--make-bed) under the same prefix, in the current working directory.
DATA_DIR=~/project/datasets/oneKGenomes/data
# DATA_DIR is only referenced by the commented-out vcftools conversion below.
#vcftools --gzvcf ${DATA_DIR}/oneK_genotypes.vcf.gz --plink --out oneK 
plink --file oneK --allow-no-sex --make-bed --out oneK

In [None]:
# Create phenotypes
# Stack the train and val label tables into one frame.
train_labels = pd.read_csv('oneKGenomes/data/train_labels.csv')
val_labels = pd.read_csv('oneKGenomes/data/val_labels.csv')
all_labels = pd.concat([train_labels, val_labels])

In [None]:
# Number of distinct user IDs across both label files.
len(set(all_labels["user_id"].values))

In [None]:
all_labels.head()

In [None]:
# zscore is already imported at the top of the notebook; this re-import is harmless.
from scipy.stats import zscore
# Standardise the label column to use as the phenotype value.
all_labels["phen"] = zscore(all_labels["label"].values)

In [None]:
# The first two output columns both carry the user ID.
all_labels["sample"] = all_labels["user_id"]

In [None]:
all_labels["fam"] = all_labels["user_id"]

In [None]:
# Space-delimited phenotype file passed to plink via --pheno in the GWAS cell.
# NOTE(review): the header row written here is "sample fam phen"; confirm PLINK
# handles a header that does not start with "FID IID" the way you expect.
all_labels[["sample", "fam", "phen"]].to_csv('oneK.phen', sep=' ', index=False)

In [None]:
%%bash

# Linear association test with PLINK, a tab-delimited copy of the results,
# and clumping of the association output.
DATADIR=~/project/datasets
PREFIX=${DATADIR}/oneK            # input prefix for --bfile; also locates the .phen file
USERPREFIX=${DATADIR}/oneK_gwas   # output prefix for --out

# Association test
plink --bfile $PREFIX --pheno ${PREFIX}.phen --allow-no-sex --linear --out $USERPREFIX

# Strip leading whitespace, then turn every whitespace run into a single tab
sed -r -e 's/^\s+//g' -e 's/\s+/\t/g' ${USERPREFIX}.assoc.linear > ${USERPREFIX}.assoc.linear.tab

# Clump on the P column of the association output
plink --bfile $PREFIX --clump ${USERPREFIX}.assoc.linear --clump-field P --out $USERPREFIX

In [None]:
!tail -n +2 /home/aklie/project/datasets/oneK_gwas.clumped | sed '/^$/d' | wc -l

# Generate some new datasets using p-values

In [2]:
# Reload the three splits; the loop below writes column subsets of each.
train = pd.read_pickle('oneKGenomes/data/train_set.pickle')

In [3]:
val = pd.read_pickle('oneKGenomes/data/val_set.pickle')

In [4]:
test = pd.read_pickle('openSNP/data/test_set.pickle')

In [5]:
# Load the tab-delimited association results produced by the GWAS cell.
# The delimiter is passed by keyword: from pandas 2.0 every read_csv argument
# after the path is keyword-only, so a positional '\t' raises TypeError.
gwas_out = pd.read_csv('oneK_gwas.assoc.linear.tab', sep='\t')

In [6]:
# Write train/val/test subsets restricted to the i SNPs with the smallest P,
# for several values of i.
# Rank once: the ordering does not depend on i, and every subset is a prefix
# of it. (sort_values places missing P values last by default.)
ranked_rsids = gwas_out.sort_values('P')["SNP"].values
for i in [1000, 10000, 50000, 100000]:
    print("Saving ", i)
    curr_rsids = ranked_rsids[:i]
    # Selecting with an array of labels raises KeyError if a SNP is missing
    # from that split.
    train[curr_rsids].to_pickle('oneKGenomes/data/{}_train_set.pickle'.format(i))
    val[curr_rsids].to_pickle('oneKGenomes/data/{}_val_set.pickle'.format(i))
    test[curr_rsids].to_pickle('openSNP/data/{}_test_set.pickle'.format(i))

Saving  1000
Saving  10000
Saving  50000
Saving  100000


In [10]:
# Release the association table and the three splits now that the subsets are written.
del gwas_out, train, val, test

In [10]:
#pd.read_pickle('openSNP/data/100000_test_set.pickle')

In [11]:
#whos

## Process IrisPlex on openSNP for ROC curves

In [3]:
# Load the prediction table for the openSNP individuals.
pred = pd.read_csv('openSNP/data/openSNP_iris_prediction.tsv', sep='\t')

In [None]:
pred_id = pred.set_index("user_id")

In [None]:
# Keep the three per-class columns plus the predicted and recorded eye colour.
pred_cols = pred_id[["index", "brown", "blue", "other", "predicted_eye_color", "eye_color"]]

In [None]:
pred_cols

In [None]:
# Integer codes per column: predictions use 'other' for class 2, while the
# recorded eye colour uses 'green' for class 2.
labels = {'brown':0, 'blue':1, 'other':2} 
labels2 = {'brown':0, 'blue':1, 'green':2} 
  
# Remap the values of the dataframe 
# The nested dict applies each mapping only to the column it is keyed by.
final_preds = pred_cols.replace({"predicted_eye_color": labels, "eye_color":labels2}) 

In [None]:
# NOTE(review): index=False omits the user_id index from the output; only the
# "index" column identifies rows in the written file -- confirm this is intended.
final_preds.to_csv('openSNP/data/openSNP_final_iris_preds.tsv', sep='\t', index=False)

In [12]:
# Class balance of the test labels.
pd.read_csv('openSNP/data/test_labels.csv')["label"].value_counts()