In [0]:
import sys

sys.path.append("../include_utils/")

import ipyparallel as ipp
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import cyvcf
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE
import seaborn as sns
from IPython.display import FileLink
import urllib2
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
import pysam
from collections import OrderedDict, namedtuple
import operator
import multiprocessing as mp
from hdfstorehelper import HDFStoreHelper
import dill

# Absolute paths to the external command-line tools this notebook shells out to.
# Each name is assigned exactly once so a later edit cannot silently diverge.
samtools = "/home/cfriedline/gpfs/src/samtools-1.2/samtools"
bcftools = "/home/cfriedline/gpfs/src/bcftools-1.2/bcftools"
picard = "/home/cfriedline/gpfs/src/broadinstitute-picard-03a1d72/dist/picard.jar"
java = "/home/cfriedline/g/src/jdk1.8.0_60/bin/java"
perl = "/home/cfriedline/gpfs/opt/ActivePerl-5.16/bin/perl"

# NOTE(review): vcfutils invokes whichever "perl" is on PATH, not the `perl` path above - confirm intended.
vcfutils = "perl /home/cfriedline/g/src/bcftools-1.2/vcfutils.pl"
vcftools = "/home/cfriedline/bin/vcftools"
tabix = "/home/cfriedline/gpfs/src/samtools-1.2/htslib-1.2.1/tabix"
bgzip = "/home/cfriedline/gpfs/src/samtools-1.2/htslib-1.2.1/bgzip"


def setup_r():
    """Point the process environment at the local R installation.

    Sets ``R_HOME`` and rebuilds ``LD_LIBRARY_PATH`` as
    ``<R_HOME>/lib:<previous value>:/home/cfriedline/lib64``. When
    ``LD_LIBRARY_PATH`` was not set beforehand the middle entry is simply
    omitted instead of raising ``KeyError`` (and instead of leaving an empty
    path component behind).
    """
    r_home = '/home/cfriedline/g/R3/lib64/R'
    os.environ['R_HOME'] = r_home

    search_path = ["%s/lib" % r_home]
    previous = os.environ.get('LD_LIBRARY_PATH')
    if previous:
        search_path.append(previous)
    search_path.append("/home/cfriedline/lib64")
    os.environ['LD_LIBRARY_PATH'] = ":".join(search_path)

setup_r()
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
r = robjects.r

%reload_ext autoreload
%autoreload 2
%matplotlib inline
%reload_ext rpy2.ipython

In [0]:
# Input VCFs (gzipped) for the two variants of the data set compared below:
# the "notimputed" call set and the "beagle40" (imputed) call set.
notimputed_vcf_gz = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/notimputed/isect.vcf.gz.sorted.gz"
imputed_vcf_gz = '/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle40/isect.vcf.gz.sorted.gz'

In [0]:
# Order matters: index 0 is the not-imputed VCF, index 1 the imputed one.
vcfs = [notimputed_vcf_gz, imputed_vcf_gz]
for v in vcfs:
    # Run vcftools with --012 on each gzipped VCF. The VCF path itself is
    # passed as the --out prefix, so the outputs are written next to the input.
    !$vcftools --gzvcf $v --012 --out $v

In [0]:
# Paths of the 012 genotype matrices: each VCF path with ".012" appended.
z12s = [vcf_path + ".012" for vcf_path in vcfs]

In [0]:
# Tab-separated lookup table whose first column (used as the index) holds the
# raw sample names.
translation_df = pd.read_csv("translation_table.csv", sep="\t", index_col=0)
def get_translated_name(n):
    """Translate a raw sample name into ``<pop>_<indiv>_<dup>``.

    Parameters
    ----------
    n : str
        Sample name; surrounding whitespace is stripped before the lookup.

    Returns
    -------
    str
        The ``pop``, ``indiv`` and ``dup`` fields of the matching row of
        ``translation_df`` joined by underscores.

    Raises
    ------
    KeyError
        If the stripped name is not in the index of ``translation_df``.
    """
    # .loc is the explicit label-based lookup; the deprecated .ix indexer is
    # not available in later pandas releases.
    row = translation_df.loc[n.strip()]
    return "%s_%d_%d" % (row['pop'], row.indiv, row.dup)

In [0]:
translation_df.head()

In [0]:
def get_z12_df(z12_file):
    """Load a ``.012`` genotype matrix into a labelled DataFrame.

    Reads three files: ``z12_file`` (tab-separated integers, one row per
    sample), ``z12_file + ".indv"`` (one sample name per line) and
    ``z12_file + ".pos"`` (tab-separated contig and position, one line per
    locus).

    Parameters
    ----------
    z12_file : str
        Path to the ``.012`` matrix.

    Returns
    -------
    pandas.DataFrame
        Rows indexed by the translated sample names, columns named
        ``<contig>_<pos>``.
    """
    indv_file = "%s.indv" % z12_file
    pos_file = "%s.pos" % z12_file
    z12_rows = []
    # Context manager so the handle is closed even if a line fails to parse.
    with open(z12_file) as z12_handle:
        for line in z12_handle:
            line = line.strip()
            if not line:
                # Tolerate blank lines (e.g. a trailing newline at end of file).
                continue
            z12_rows.append([int(x) for x in line.split("\t")])
    z12_data = np.array(z12_rows)
    positions = pd.read_csv(pos_file, sep="\t", names=['contig', 'pos'])
    individuals = pd.read_csv(indv_file, names=['sample_name'])
    df = pd.DataFrame(z12_data)
    # The first column is not a genotype (presumably the per-row individual
    # index written by vcftools --012 - confirm against the vcftools manual).
    df = df.drop(0, axis=1)
    df.columns = positions.apply(lambda x: "%s_%s" % (x.contig, x.pos), axis=1)
    df.index = [get_translated_name(x) for x in individuals.sample_name]
    return df
z12_dfs = [get_z12_df(x) for x in z12s]

In [0]:
def assign_population(df):
    """Add a ``population`` column to ``df`` in place.

    The population is the part of each row label before the first
    underscore. The labels are read straight from the index rather than via a
    row-wise ``apply``, which avoids materialising every row as a Series and
    also works for a frame that has no columns yet.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose index holds string labels; modified in place.
    """
    df['population'] = [label.split("_")[0] for label in df.index]
# Tag every genotype frame with its population column (in place).
for genotype_df in z12_dfs:
    assign_population(genotype_df)

In [0]:
z12_dfs[0].head()

In [0]:
def get_correction(n):
    """Return the finite-sample-size correction factor ``2n / (2n - 1)``.

    Parameters
    ----------
    n : number
        Number of individuals.

    Returns
    -------
    float
    """
    # Force float arithmetic: with an integer n, Python 2's "/" performs floor
    # division and the factor would silently collapse to 1.
    return (2.0 * n) / (2.0 * n - 1)

def get_allele_freqs(locus, debug=False):
    """Summarise allele frequencies and heterozygosity for one locus.

    Parameters
    ----------
    locus : pandas.Series
        Genotypes coded 0 / 1 / 2, with -1 marking a missing call.
    debug : bool, optional
        When true, print the resulting Series before returning it.

    Returns
    -------
    pandas.Series
        ``P`` / ``Q`` (allele counts: twice the 0 resp. 2 genotypes plus the
        1 genotypes), ``p`` / ``q`` (those counts divided by twice the number
        of called individuals), ``He`` (``2pq`` times the sample-size
        correction), ``Ho`` (fraction of genotypes coded 1) and ``Fis``
        (``1 - Ho/He``). If no individual has a call, every ratio is NaN;
        if ``He`` is zero, ``Fis`` is NaN.
    """
    nan = float("nan")
    c = locus[locus != -1].value_counts()
    num_individuals = int(c.sum())
    hom_first = c.get(0, 0)
    het = c.get(1, 0)
    hom_second = c.get(2, 0)
    P = 2 * hom_first + het
    Q = 2 * hom_second + het

    if num_individuals == 0:
        # Every genotype is missing: nothing can be estimated, so report NaN
        # instead of dividing by zero.
        p = q = He = Ho = Fis = nan
    else:
        total_alleles = 2.0 * num_individuals
        p = P / total_alleles
        q = Q / total_alleles
        # Tolerance instead of exact equality: p + q can differ from 1.0 by a
        # rounding error even when the counts are consistent.
        assert abs(p + q - 1.0) < 1e-9
        He = 2 * p * q * get_correction(num_individuals)
        Ho = het * 1.0 / num_individuals
        # He is zero for a monomorphic locus; Fis is undefined there.
        Fis = 1 - (Ho / He) if He > 0 else nan

    ret = pd.Series({"p":p,
                      "q":q,
                      "P":P,
                      "Q":Q,
                      "He":He,
                      "Ho":Ho,
                      "Fis":Fis})
    if debug:
        print(ret)
    return ret

In [0]:
# Per-locus allele statistics for each data set. The last column
# ("population") is not a locus, so it is excluded positionally; .iloc
# replaces the deprecated .ix indexer.
allele_freqs = [x.iloc[:, :-1].apply(get_allele_freqs, args=(False,)) for x in z12_dfs]

In [0]:
# Minor allele frequency per locus: the smaller of p and q in each column.
mafs = [
    freq_table.apply(lambda locus_stats: min(locus_stats["p"], locus_stats["q"]))
    for freq_table in allele_freqs
]

In [0]:
mafs[0].head()

In [0]:
mafs[1].head()

In [0]:
# Compare per-locus MAF between the two data sets on a single scatter plot.
maf_fig, maf_ax = plt.subplots()
maf_ax.scatter(mafs[0], mafs[1])
maf_ax.set_title("MAF")
maf_ax.set_xlabel("not imputed")
maf_ax.set_ylabel("imputed")
plt.show()

In [0]:
def swap_alleles(locus, af):
    """Recode a locus by exchanging genotype codes 0 and 2 when ``p`` is minor.

    Columns whose name contains no underscore are returned untouched. For
    the others, ``af[locus.name]`` supplies ``p`` and ``q``; when ``p`` equals
    the smaller of the two, codes 0 and 2 are exchanged (other values are
    left as they are). Otherwise the locus is returned unchanged.
    """
    if "_" not in locus.name:
        return locus
    locus_freqs = af[locus.name]
    p_freq = locus_freqs["p"]
    minor = min(p_freq, locus_freqs["q"])
    if minor == p_freq:
        return locus.replace({0:2,2:0})
    return locus

In [0]:
# Apply swap_alleles column by column, pairing each genotype frame with the
# allele-frequency table computed from it.
z12_swapped = [
    z12_dfs[k].apply(swap_alleles, args=(allele_freqs[k],))
    for k in range(len(z12_dfs))
]

In [0]:
z12_dfs[0].head()

In [0]:
z12_swapped[0].head()

In [0]:
# Number the populations 1..N in alphabetical order of their names.
pop_id = dict(
    (pop_name, number)
    for number, pop_name in enumerate(sorted(z12_dfs[0]['population'].unique()), 1)
)
pop_id

In [0]:
def assign_popid(series):
    """Set ``series['popid']`` from the module-level ``pop_id`` mapping.

    The value of ``series['population']`` is used as the lookup key. The
    same (modified) series is returned, which lets the function be used with
    a row-wise ``DataFrame.apply``.
    """
    population = series['population']
    series['popid'] = pop_id[population]
    return series

In [0]:
# Row-wise pass that appends the numeric "popid" column to every swapped frame.
z12_swapped = [swapped_df.apply(assign_popid, axis=1) for swapped_df in z12_swapped]

In [0]:
z12_swapped[0].head()

In [0]:
def center_and_standardize_value(val, u, var):
    """Center and scale a single genotype value.

    Parameters
    ----------
    val : number
        Genotype code; -1 marks a missing call.
    u : number
        Mean to subtract.
    var : number
        Variance; the centered value is divided by its square root.

    Returns
    -------
    float
        ``(val - u) / sqrt(var)``, or 0.0 when the value is missing or the
        variance is zero (the ratio would otherwise be NaN or infinite).
    """
    if val == -1 or var == 0:
        return 0.0
    return (val-u)/np.sqrt(var)

def center_and_standardize(locus, af):
    """Standardize one genotype column for PCA.

    Columns whose name contains no underscore are passed through unchanged.
    For the others, the variance is ``maf * (1 - maf)`` with ``maf`` the
    smaller of ``p`` and ``q`` from ``af[locus.name]``, and the mean is taken
    over the values that are not -1. Each value is then transformed with
    ``center_and_standardize_value``.
    """
    if "_" not in locus.name:
        return locus
    locus_freqs = af[locus.name]
    maf = min(locus_freqs["p"], locus_freqs["q"])
    var = maf*(1-maf)
    # Missing calls (-1) must not contribute to the mean.
    called = [genotype for genotype in locus if genotype != -1]
    u = np.mean(called)
    return locus.apply(center_and_standardize_value, args=(u, var))

In [0]:
# Standardize every swapped frame with the allele frequencies of its data set.
pca_std = [
    z12_swapped[k].apply(center_and_standardize, args=(allele_freqs[k],))
    for k in range(len(z12_swapped))
]

In [0]:
# Numeric PCA input: every column except the trailing two ("population" and
# "popid"). .iloc makes the positional slice explicit and replaces the
# deprecated .ix indexer.
pca_std_data = [x.iloc[:, :-2] for x in pca_std]

In [0]:
# Python handles to the R functions `prcomp` and `summary`, looked up by name
# in the embedded R session so they can be called like Python functions.
prcomp = r('prcomp')
summary = r('summary')

In [0]:
# Run R's prcomp on each standardized matrix. Centering and scaling are
# switched off because the data were already standardized above.
prcomp_res = [prcomp(std_matrix, scale=False, center=False) for std_matrix in pca_std_data]

In [0]:
def get_pca_x(res):
    """Convert the ``x`` element of an R ``prcomp`` result to a DataFrame.

    The R object's first and second ``names`` entries become the index and
    the column labels of the returned frame.
    """
    r_scores = res.rx2("x")
    score_names = r_scores.names
    x = pd.DataFrame(pandas2ri.ri2py(r_scores))
    x.index = score_names[0]
    x.columns = score_names[1]
    return x

In [0]:
# One frame per data set, extracted from the corresponding prcomp result.
pca_x = [get_pca_x(fit) for fit in prcomp_res]

In [0]:
# Maps a numeric population id onto [0, 1] for the colormap lookup.
norm = mcolors.Normalize(min(pop_id.values()), max(pop_id.values()))
def plot_pca(key, pca_std, pca_std_data, pca_x, prcomp_res):
    """Scatter PC1 against PC2, coloured by population.

    Parameters
    ----------
    key : str
        Data-set label shown in the title.
    pca_std : pandas.DataFrame
        Frame carrying the ``population`` column; joined with ``pca_x`` on
        the index.
    pca_std_data : pandas.DataFrame
        Matrix that was fed to prcomp; only its column count is used (for
        the title).
    pca_x : pandas.DataFrame
        Frame providing the ``PC1`` and ``PC2`` columns.
    prcomp_res :
        R prcomp result; row 2 of its summary ``importance`` table labels
        the axes.
    """
    joined = pca_std.join(pca_x)
    fig, ax = plt.subplots()
    fig.set_size_inches(10,8)

    # One scatter call per population instead of one per sample. groupby
    # yields the populations in sorted order, so the legend handles are built
    # in label order and need no further sorting (Patch objects have no
    # meaningful ordering of their own).
    handles = []
    for pop, members in joined.groupby('population'):
        color = cm.rainbow(norm(pop_id[pop]))
        # Wrap the RGBA tuple in a list so it is never mistaken for a
        # sequence of scalar values to colour-map.
        ax.scatter(members.PC1,
                   members.PC2,
                   s=50,
                   c=[color])
        handles.append(mpatches.Patch(color=color, label=pop))

    ax.set_title("PCA of n=%d samples (%s) on %d loci" % (len(joined), key, len(pca_std_data.columns)))
    imp = summary(prcomp_res).rx("importance")[0]
    ax.set_xlabel("PC1 (%g)" % imp.rx(2,1)[0])
    ax.set_ylabel("PC2 (%g)" % imp.rx(2,2)[0])
    ax.legend(handles=handles)
    plt.show()

In [0]:
# Draw one PCA figure per data set, keeping the four per-data-set lists aligned.
for key, std_df, std_matrix, scores, fit in zip(["not imputed", "imputed"],
                                                pca_std, pca_std_data,
                                                pca_x, prcomp_res):
    plot_pca(key, std_df, std_matrix, scores, fit)

In [0]:
# Preview only the first rows; rendering the whole genotype frame bloats the notebook.
z12_dfs[1].head()

In [0]:
# Persist the results of each data set next to its .012 file: the frames go
# into a fresh "isect.hd5" store, the prcomp results into "isect_prcomp.dill".
for i, key in enumerate(["not_imputed", "imputed"]):
    filedir = os.path.dirname(z12s[i])
    hdffile = os.path.join(filedir, "isect.hd5")
    print(hdffile)
    # Start from a clean store; plain Python instead of a shell "rm -f" so
    # paths containing spaces or shell metacharacters are handled safely.
    if os.path.exists(hdffile):
        os.remove(hdffile)
    hdf = HDFStoreHelper(hdffile)
    hdf['z12_df'] = z12_dfs[i]
    hdf['z12_swapped'] = z12_swapped[i]
    hdf['pca_std'] =  pca_std[i]
    hdf['pca_std_data'] = pca_std_data[i]
    hdf['mafs'] = mafs[i]
    hdf['allele_freqs'] = allele_freqs[i]
    hdf['pca_x'] = pca_x[i]
    print(hdf.get_group_names())
    # NOTE(review): the whole prcomp_res list (both data sets) is written into
    # each directory, unlike the per-index frames above - confirm intended.
    # Binary mode plus a context manager: pickled data is bytes, and the
    # handle is flushed and closed deterministically.
    with open(os.path.join(filedir, "isect_prcomp.dill"), "wb") as dill_out:
        dill.dump(prcomp_res, dill_out)

In [0]:
%%R
# Evaluate tw_calc.R in the embedded R session.
source("tw_calc.R")
# Read the file "twtable" (no header row) into the R variable `test`.
# NOTE(review): presumably the code in tw_calc.R looks this table up by name - confirm.
test=read.table("twtable", header=F)

In [0]:
# Python handle to the R object named TWcalc (presumably defined by tw_calc.R - confirm).
TWcalc = r('TWcalc')

In [0]:
# Give each matrix its own plain variable name so it can be pushed to R by name.
pca_std_data_notimputed = pca_std_data[0]
pca_std_data_imputed = pca_std_data[1]

In [0]:
# Copy both matrices into the R session under the same variable names (-i = input).
%R -i pca_std_data_notimputed
%R -i pca_std_data_imputed

In [0]:
%%R
# Run TWcalc on each matrix.
# NOTE(review): the meaning of the second argument (12) is not visible here - confirm in tw_calc.R.
tw_notimputed = TWcalc(as.matrix(pca_std_data_notimputed),12)
tw_imputed = TWcalc(as.matrix(pca_std_data_imputed),12)

In [0]:
# Pull the second element of each TWcalc result back into Python
# (presumably the p-values, since they are fed to get_sig_tracywidom - confirm).
tws = [r("tw_notimputed[[2]]"), r("tw_imputed[[2]]")]

In [0]:
def get_sig_tracywidom(tw_p, alpha=0.05):
    """Collect the leading run of p-values that do not exceed ``alpha``.

    Iteration stops at the first p-value greater than ``alpha``; that
    value and its position are printed.

    Parameters
    ----------
    tw_p : iterable of float
        P-values in order.
    alpha : float, optional
        Significance threshold (default 0.05).

    Returns
    -------
    tuple
        ``(count, values)``: the number of leading p-values that are not
        greater than ``alpha`` and the list of those values. An empty input
        gives ``(0, [])``.
    """
    ps = []
    for i, p in enumerate(tw_p):
        if p > alpha:
            print(i, p)
            break
        ps.append(p)
    return len(ps), ps
    

In [0]:
# Count and list of leading significant values, one result per data set.
[get_sig_tracywidom(tw_values) for tw_values in tws]

In [0]:
# `hdf` is whatever store the export loop assigned last, so this lists the
# groups of the final data set only.
hdf.get_group_names()