In [0]:
import sys

sys.path.append("../../include_utils/")

import ipyparallel as ipp
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE
import seaborn as sns
from IPython.display import FileLink
#import urllib2
import urllib.request as urllib2
import urllib
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
import pysam
from collections import OrderedDict, namedtuple
import operator
import multiprocessing as mp
import pickle
from IPython.display import FileLink, FileLinks, display

# Absolute paths to the external command-line tools used by this notebook.
# NOTE(review): these are user-specific absolute paths; adjust when running elsewhere.
samtools = "/home/cfriedline/gpfs/src/samtools-1.3/samtools"
bcftools = "/home/cfriedline/gpfs/src/bcftools-1.3/bcftools"
picard = "/home/cfriedline/gpfs/src/broadinstitute-picard-03a1d72/dist/picard.jar"
java = "/home/cfriedline/g/src/jdk1.8.0_60/bin/java"
perl = "/home/cfriedline/gpfs/opt/ActivePerl-5.18/bin/perl"

# vcfutils is launched through whichever `perl` is first on PATH, not the `perl` path above.
vcfutils = "perl /home/cfriedline/g/src/bcftools-1.3/vcfutils.pl"
vcftools = "/home/cfriedline/bin/vcftools"
tabix = "/home/cfriedline/gpfs/src/htslib-1.3/tabix"
bgzip = "/home/cfriedline/gpfs/src/htslib-1.3/bgzip"


def setup_r():
    """Point the process environment at the R installation before rpy2 is imported.

    Sets ``R_HOME`` and rebuilds ``LD_LIBRARY_PATH`` as
    ``<R_HOME>/lib : <previous LD_LIBRARY_PATH> : /home/cfriedline/lib64``.

    Unlike a plain ``os.environ['LD_LIBRARY_PATH']`` lookup this works when the
    variable is not set yet, and calling it repeatedly (e.g. when the cell is
    re-run) does not keep appending the same directories.
    """
    r_home = '/home/cfriedline/g/R3/lib64/R'
    os.environ['R_HOME'] = r_home

    candidates = ["%s/lib" % r_home]
    # The variable may be absent (fresh shell) or empty; skip empty entries.
    candidates.extend(os.environ.get('LD_LIBRARY_PATH', '').split(":"))
    candidates.append("/home/cfriedline/lib64")

    # De-duplicate while preserving search order so the call is idempotent.
    search_path = []
    for entry in candidates:
        if entry and entry not in search_path:
            search_path.append(entry)
    os.environ['LD_LIBRARY_PATH'] = ":".join(search_path)

setup_r()
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
r = robjects.r

%reload_ext autoreload
%autoreload 2
%matplotlib inline
%reload_ext rpy2.ipython

In [0]:
ni_dir = "/home/cfriedline/eckertlab/Mitra/dDocent"
#imp_dir = "/home/cfriedline/eckertlab/Mitra/mapping/split_parallel/collapsed/work/samtools1.3/beagle40/" 

In [0]:
notimputed_vcf_gz = os.path.join(ni_dir, "good_snps.recode.vcf.gz")
#imputed_vcf_gz = os.path.join(imp_dir, "isect_snps.recode.vcf.gz_sorted.vcf.gz")

In [0]:
vcfs = [notimputed_vcf_gz]#, imputed_vcf_gz]

In [0]:
for v in vcfs:
    !$vcftools --gzvcf $v --012 --out $v

In [0]:
z12s = ["%s.012" % x for x in vcfs]

In [0]:
def get_z12_df(z12_file):
    """Load a ``.012`` genotype matrix and its sidecar files into a DataFrame.

    Parameters
    ----------
    z12_file : str
        Path to the tab-separated integer matrix. The companion files
        ``<z12_file>.indv`` (one sample name per line) and ``<z12_file>.pos``
        (tab-separated contig and position) are read from the same location.

    Returns
    -------
    pandas.DataFrame
        One row per sample (index = upper-cased sample name) and one column per
        locus (label ``"<contig>-<pos>"``). The first column of the matrix file
        is dropped before the locus labels are applied.
    """
    indv_file = "%s.indv" % z12_file
    pos_file = "%s.pos" % z12_file

    # Use a context manager so the file handle is closed deterministically.
    with open(z12_file) as handle:
        z12_data = np.array([[int(x) for x in line.strip().split("\t")]
                             for line in handle])

    positions = pd.read_csv(pos_file, sep="\t", names=['contig', 'pos'])
    samples = pd.read_csv(indv_file, names=['sample_name'])

    df = pd.DataFrame(z12_data)
    df = df.drop(0, axis=1)
    # Build the labels in one pass instead of a row-wise DataFrame.apply.
    df.columns = ["%s-%s" % (contig, pos)
                  for contig, pos in zip(positions.contig, positions.pos)]
    df.index = [name.upper() for name in samples.sample_name]
    return df
z12_dfs = [get_z12_df(x) for x in z12s]


In [0]:
z12_dfs[0].head()

In [0]:
def extract_pop(row):
    """Return the population code encoded at the start of ``row.name``.

    Names beginning with "LP" yield the first two characters; every other
    name yields its first three characters.
    """
    sample_name = row.name
    prefix_length = 2 if sample_name.startswith("LP") else 3
    return sample_name[:prefix_length]

def assign_population(df):
    """Add a 'population' column to ``df`` in place.

    The value for each row is derived from the row label by ``extract_pop``.
    The frame is mutated and nothing is returned.
    """
    populations = df.apply(extract_pop, axis=1)
    df['population'] = populations
[assign_population(x) for x in z12_dfs];

In [0]:
[x.shape for x in z12_dfs]

In [0]:
z12_dfs[0].head()

In [0]:
def get_correction(n):
    """Return the finite-sample-size correction factor ``2n / (2n - 1)``."""
    return (2*n)/(2*n-1)


def get_allele_freqs(locus, debug=False):
    """Summarise allele frequencies and heterozygosity for one locus.

    Parameters
    ----------
    locus : pandas.Series
        Genotypes coded 0 / 1 / 2, with -1 marking a missing call.
    debug : bool, optional
        When true, print the resulting Series before returning it.

    Returns
    -------
    pandas.Series or None
        Entries ``p``, ``q`` (allele frequencies), ``P``, ``Q`` (allele
        counts), ``He`` (expected heterozygosity with the finite-sample
        correction), ``Ho`` (observed heterozygosity) and ``Fis``.
        ``None`` is returned when every genotype is missing. ``Fis`` is NaN
        when ``He`` is zero (monomorphic locus).
    """
    c = locus[locus != -1].value_counts()
    num_individuals = c.sum()
    # Bail out before doing any arithmetic on an all-missing locus.
    if num_individuals == 0:
        return None
    total_alleles = 2.0 * num_individuals

    # Heterozygotes contribute one copy to each allele count.
    PQ = c.get(1, 0)
    P = 2 * c.get(0, 0) + PQ
    Q = 2 * c.get(2, 0) + PQ

    p = P / total_alleles
    q = Q / total_alleles
    # Sanity check with a tolerance: exact float equality can fail by one ulp.
    assert np.isclose(p + q, 1.0)

    He = 2 * p * q * get_correction(num_individuals)
    Ho = PQ * 1.0 / num_individuals
    # He == 0 for a monomorphic locus; report NaN explicitly instead of
    # triggering a 0/0 floating-point warning.
    Fis = 1 - (Ho / He) if He > 0 else np.nan

    ret = pd.Series({"p": p,
                     "q": q,
                     "P": P,
                     "Q": Q,
                     "He": He,
                     "Ho": Ho,
                     "Fis": Fis})
    if debug:
        print(ret)
    return ret

In [0]:
# Drop the trailing 'population' column positionally; .iloc replaces the removed .ix indexer.
allele_freqs = [x.iloc[:, :-1].apply(get_allele_freqs, args=(False,)) for x in z12_dfs]

In [0]:
[x.shape for x in allele_freqs]

In [0]:
mafs = [x.apply(lambda x: min(x["p"], x["q"])) for x in allele_freqs]

In [0]:
mafs[0].head()

In [0]:
mafs[0].describe()

In [0]:
def swap_alleles(locus, af):
    """Recode a genotype column so that 0 and 2 are exchanged when ``p`` is the minor frequency.

    Columns whose name contains no underscore are passed through unchanged.
    For the others, ``af[locus.name]`` supplies the ``"p"`` and ``"q"``
    frequencies; when the smaller of the two equals ``p`` the codes 0 and 2
    are swapped, otherwise the column is returned as is.
    """
    column_name = locus.name
    if "_" not in column_name:
        return locus

    locus_freqs = af[column_name]
    p_freq = locus_freqs["p"]
    if min(p_freq, locus_freqs["q"]) != p_freq:
        return locus
    return locus.replace({0: 2, 2: 0})

In [0]:
z12_swapped = []
for i, z12 in enumerate(z12_dfs):
    z12_swapped.append(z12.apply(swap_alleles, args=(allele_freqs[i],)))

In [0]:
z12_dfs[0].head(20)

In [0]:
allele_freqs[0]

In [0]:
z12_swapped[0].head(20)

In [0]:
pop_id = {}
i = 1
for p in sorted(z12_dfs[0]['population'].unique()):
    pop_id[p] = i
    i+=1
pop_id

In [0]:
pops = pd.read_csv("~/eckertlab/Mitra/Pops.txt", sep="\t").drop_duplicates()

def get_species_label(row):
    """Look up the species for ``row.population`` in the module-level ``pops`` table.

    Returns the ``Species`` value of the first ``pops`` record whose ``Pop``
    equals the row's population; raises IndexError when there is no match.
    """
    matching_species = pops.loc[pops.Pop == row.population, "Species"]
    return matching_species.values[0]

In [0]:
def assign_popid(series):
    """Attach the numeric population id to a row and return the row.

    The id is read from the module-level ``pop_id`` mapping using the row's
    'population' value and stored under the 'popid' key.
    """
    population = series['population']
    series['popid'] = pop_id[population]
    return series

In [0]:
z12_swapped = [x.apply(assign_popid, axis=1) for x in z12_swapped]

In [0]:
z12_swapped[0].head()

In [0]:
# save_df() is only defined in a later cell, so calling it here fails on a fresh
# kernel (Restart & Run All). Write the table directly, using the same
# tab-separated format and message that save_df() produces.
z12_swapped_path = os.path.join(ni_dir, "%s.txt" % "z12_swapped")
z12_swapped[0].to_csv(z12_swapped_path, header=True, index=True, sep="\t")
print("saved %s" % z12_swapped_path)

In [0]:
def center_and_standardize_value(val, u, var):
    """Center and scale one genotype value.

    Missing calls (-1) map to 0.0. A zero ``var`` (monomorphic locus) also
    maps to 0.0 instead of dividing by zero, so no NaN/inf values leak into
    the matrix handed to the PCA. Otherwise returns ``(val - u) / sqrt(var)``.
    """
    if val == -1 or var == 0:
        return 0.0
    return (val-u)/np.sqrt(var)


def center_and_standardize(locus, af):
    """Standardize a genotype column; non-locus columns pass through unchanged.

    Parameters
    ----------
    locus : pandas.Series
        Column to transform. Only columns whose name contains an underscore
        are treated as loci.
    af : mapping
        ``af[locus.name]`` must provide the ``"p"`` and ``"q"`` frequencies.

    Returns
    -------
    pandas.Series
        Each value centered on the mean of the non-missing genotypes and
        divided by ``sqrt(maf * (1 - maf))``; missing genotypes become 0.0.
    """
    if "_" not in locus.name:
        return locus
    freqs = af[locus.name]
    maf = min(freqs["p"], freqs["q"])
    var = maf*(1-maf)
    # Mean over observed genotypes only (-1 marks a missing call).
    u = locus[locus != -1].mean()
    return locus.apply(center_and_standardize_value, args=(u, var))

In [0]:
pca_std = []
for i, df in enumerate(z12_swapped):
    pca_std.append(df.apply(center_and_standardize, args=(allele_freqs[i],)))

In [0]:
for i, df in enumerate(pca_std):
    df['species'] = df.apply(get_species_label, axis=1)

In [0]:
# Keep only the locus columns: the last three columns are population, popid and species.
# .iloc replaces the removed .ix indexer.
pca_std_data = [x.iloc[:, :-3] for x in pca_std]

In [0]:
# Spot-check the spread of the first ten standardized loci (.iloc replaces the removed .ix).
pca_std_data[0].iloc[:, 0:10].apply(np.std)

In [0]:
pca_std_data_ni = pca_std_data[0]
#pca_std_data_imp = pca_std_data[1]

In [0]:
pca_std_data_ni.shape#, pca_std_data_imp.shape

In [0]:
pca_std_data_ni.to_csv(os.path.join(ni_dir, "pca_std_data.txt"), header=True, index=True, sep="\t")

## Run PCA

In [0]:
%%R
library(data.table)
ni_dir = '/home/cfriedline/eckertlab/Mitra/dDocent'
data_ni = fread(paste(ni_dir, '/pca_std_data.txt', sep=''), sep="\t", data.table=F)
rownames(data_ni) = data_ni$V1

drops = c("V1")
data_ni = data_ni[,!(names(data_ni) %in% drops)]
res_ni = prcomp(data_ni, scale=F, center=F)
rownames(res_ni$x) = rownames(data_ni)

fname = 'pca_res.rds'
ni = paste(ni_dir, "/", fname, sep='')
saveRDS(res_ni, ni)

In [0]:
r("res_ni = readRDS('%s/pca_res.rds')" % ni_dir);
#r("res_imp = readRDS('%s/pca_res.rds')" % imp_dir);

In [0]:
def get_pca_x(res):
    """Convert the ``x`` element of an R result object into a labelled DataFrame.

    The element is converted with ``pandas2ri.ri2py``; its first and second
    ``names`` entries become the DataFrame index and columns respectively.
    """
    scores = res.rx2("x")
    row_labels, column_labels = scores.names[0], scores.names[1]
    x = pd.DataFrame(pandas2ri.ri2py(scores))
    x.index = row_labels
    x.columns = column_labels
    return x

In [0]:
print(r('res_ni').rx2('x').names[0])

In [0]:
summary = r('summary')

In [0]:
prcomp_res = [x for x in [r['res_ni']]]#, r['res_imp']]]

In [0]:
pca_x = [get_pca_x(x) for x in [r['res_ni']]]#, r['res_imp']]]

In [0]:
pca_x[0].index = pca_std_data_ni.index
#pca_x[1].index = pca_std_data_imp.index

In [0]:
pca_x[0].shape

In [0]:
pca_std[0].species.unique()

In [0]:
sns.set_style("white")
norm = mcolors.Normalize(min(pop_id.values()), max(pop_id.values()))

def get_prop_var_from_summary(res, pc):
    """Return the entry at row 2, column ``pc`` of the ``importance`` table of ``summary(res)``.

    NOTE(review): presumably row 2 is the proportion of variance explained,
    as the function name suggests; confirm against R's summary of a prcomp fit.
    """
    importance = summary(res).rx("importance")[0]
    return importance.rx(2, pc)[0]

def _plot_pca(joined, group_col, colors, n_loci, key, prcomp_res, x, y,
              suffix, legend_anchor, legend_ncol):
    """Shared scatter/legend/save logic behind plot_pca_pop and plot_pca_spp.

    Draws one scatter call per value of ``group_col`` (instead of one per
    sample), labels the axes with the summary value for each component, adds a
    patch legend for every key in ``colors``, saves the figure to
    ``<key>_<x>_<y>_<suffix>.pdf`` in the working directory and returns that
    file name.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    for group, members in joined.groupby(group_col):
        # `color=` (not `c=`) so an RGBA tuple is never mistaken for data values.
        ax.scatter(members['PC%d' % x],
                   members['PC%d' % y],
                   s=50,
                   color=colors[group])
    ax.set_title("PCA of n=%d samples on %d dDocent loci (%s)" % (len(joined), n_loci, key))
    ax.set_xlabel("PC{} ({})".format(x, get_prop_var_from_summary(prcomp_res, x)))
    ax.set_ylabel("PC{} ({})".format(y, get_prop_var_from_summary(prcomp_res, y)))

    handles = [mpatches.Patch(color=colors[name], label=name) for name in sorted(colors)]
    ax.legend(handles=handles, bbox_to_anchor=legend_anchor, ncol=legend_ncol)

    out_file = "{}_{}_{}_{}.pdf".format(key.replace(" ", "_"), x, y, suffix)
    fig.savefig(out_file)
    plt.show()
    return out_file


def plot_pca_pop(key, pca_std, pca_std_data, pca_x, prcomp_res, x, y):
    """Scatter principal components ``x`` vs ``y`` with one colour per population.

    Parameters
    ----------
    key : str
        Dataset label used in the title and in the output file name.
    pca_std : pandas.DataFrame
        Per-sample table that includes a 'population' column.
    pca_std_data : pandas.DataFrame
        Locus-only table; only its column count is used (for the title).
    pca_x : pandas.DataFrame
        PCA scores with columns named ``PC<n>``, joined to ``pca_std`` on the index.
    prcomp_res : R object
        Passed to ``get_prop_var_from_summary`` for the axis labels.
    x, y : int
        Component numbers to plot.

    Returns
    -------
    str
        Name of the saved PDF (``<key>_<x>_<y>_pop.pdf``).
    """
    joined = pca_std.join(pca_x)
    # Colour each population present in the data by its numeric id on the rainbow map.
    colors = {pop: cm.rainbow(norm(pop_id[pop])) for pop in joined['population'].unique()}
    return _plot_pca(joined, 'population', colors, len(pca_std_data.columns),
                     key, prcomp_res, x, y, "pop", (1.25, .9), 2)


def plot_pca_spp(key, pca_std, pca_std_data, pca_x, prcomp_res, x, y):
    """Scatter principal components ``x`` vs ``y`` with one colour per species.

    Takes the same arguments as ``plot_pca_pop``; ``pca_std`` must include a
    'species' column whose values are 'SWWP', 'PA' or 'PF' (any other value
    raises KeyError).

    Returns
    -------
    str
        Name of the saved PDF (``<key>_<x>_<y>_spp.pdf``).
    """
    joined = pca_std.join(pca_x)
    colors = {'SWWP': 'blue', 'PA': 'orange', 'PF': 'green'}
    return _plot_pca(joined, 'species', colors, len(pca_std_data.columns),
                     key, prcomp_res, x, y, "spp", (1.17, .9), 1)

# Species plots first, then population plots; each for PC1/PC2 and PC3/PC4.
for i, key in enumerate(["not imputed"]):
    for plot_fn in (plot_pca_spp, plot_pca_pop):
        for pc_x, pc_y in ((1, 2), (3, 4)):
            f = plot_fn(key, pca_std[i], pca_std_data[i], pca_x[i], prcomp_res[i], pc_x, pc_y)
            display(FileLink(f))

In [0]:
def get_missing_fraction(locus):
    """Return the fraction of entries in ``locus`` that are coded as missing (-1)."""
    c = len(locus[locus == -1])
    return c/len(locus)

# The helper keeps its own name so that re-running this cell works: previously the
# function was called `missing` and was overwritten by the Series assigned below.
missing = z12_dfs[0].apply(get_missing_fraction)

In [0]:
missing

In [0]:
print(pd.DataFrame(missing[:-1].describe()).apply(np.round, args=(2,)).to_csv(header=False, sep="="))

In [0]:
# The last entry of `missing` comes from the non-genotype 'population' column, so
# exclude it from the histogram as well as from the summary text.
locus_missing = missing[:-1]
sns.distplot(locus_missing, kde=False, bins=100)
plt.title("dDocent calls missing across individuals")
plt.text(0.01, 500, pd.DataFrame(locus_missing.describe()).apply(np.round, args=(2,)).to_csv(header=False, sep="="))
fig = plt.gcf()
fig.set_size_inches(10, 8)
plt.show()

In [0]:
def save_df(dirname, fname, df):
    """Write ``df`` to ``<dirname>/<fname>.txt`` as tab-separated text.

    Both the header row and the index are written; a confirmation line with
    the destination path is printed afterwards. Returns None.
    """
    destination = os.path.join(dirname, "%s.txt" % fname)
    df.to_csv(destination, sep="\t", header=True, index=True)
    print("saved %s" % destination)

In [0]:
%%R
# Load the standardized genotype matrices for both datasets and the Tracy-Widom helpers.
# NOTE(review): ni_dir here differs from the directory the Python cells wrote
# pca_std_data.txt to (.../Mitra/dDocent) -- confirm which location is intended.
# NOTE(review): the imputed inputs (imp_dir) are commented out in the Python cells of
# this notebook, so the *_imp objects below may have no matching data -- confirm.
library(data.table)
ni_dir ='/home/cfriedline/eckertlab/Mitra/mapping/split_parallel/collapsed/work/samtools1.3/'
imp_dir = '/home/cfriedline/eckertlab/Mitra/mapping/split_parallel/collapsed/work/samtools1.3/beagle40/'
data_ni = fread(paste(ni_dir, '/pca_std_data.txt', sep=''), sep="\t", data.table=F)
data_imp = fread(paste(imp_dir, '/pca_std_data.txt', sep=''), sep="\t", data.table=F)
# The first column (V1) holds the row labels: promote it to rownames, then drop it.
rownames(data_ni) = data_ni$V1
rownames(data_imp) = data_imp$V1
drops = c("V1")
data_ni = data_ni[,!(names(data_ni) %in% drops)]
data_imp = data_imp[,!(names(data_imp) %in% drops)]
# tw_calc.R and twtable are read from the current working directory.
# NOTE(review): presumably tw_calc.R defines TWcalc and uses `test` -- confirm.
source("tw_calc.R")
test=read.table("twtable", header=F)

In [0]:
%%R
tw_ni = TWcalc(as.matrix(data_ni),20)
tw_imp = TWcalc(as.matrix(data_imp),20)

In [0]:
tws = [r("tw_ni[[2]]"), r("tw_imp[[2]]")]

In [0]:
def get_sig_tracywidom(tw_p):
    """Collect the leading run of p-values that are not greater than 0.05.

    Iteration stops at the first value above 0.05, whose position and value
    are printed. Returns ``(count, values)`` for the values collected before
    that point.
    """
    significant = []
    for position, p_value in enumerate(tw_p):
        if not p_value > 0.05:
            significant.append(p_value)
            continue
        print(position, p_value)
        break
    return len(significant), significant
    

In [0]:
tw_num = [get_sig_tracywidom(x) for x in tws]

### Tracy-Widom

```
Not imputed: 13
Imputed: 15
```

In [0]:
# One covariate table per dataset: the leading PCs counted as significant by
# get_sig_tracywidom (tw_num holds (count, p-values) pairs).
# .iloc replaces the removed .ix indexer, and zip() avoids hard-coding exactly two datasets.
pca_cov = [scores.iloc[:, 0:n_sig] for scores, (n_sig, _) in zip(pca_x, tw_num)]

In [0]:
[x.shape for x in pca_cov]

In [0]:
for d, f in zip([ni_dir, imp_dir], pca_cov):
    save_df(d, 'pca_cov', f)

In [0]:
#pca_std_pheno = pheno.join(pca_cov, how="inner").join(pca_maf.ix[:,:-2], how="inner")
pca_std_pheno = []
for i, df in enumerate(pca_cov):
    df = pheno.join(pca_cov[i], how='inner').join(z12_swapped[i], how='inner')
    print(df.shape)
    pca_std_pheno.append(df)

In [0]:
for i, d in enumerate([ni_dir, imp_dir]):
    save_df(d, "z12_df", z12_dfs[i])
    save_df(d, "z12_swapped", z12_swapped[i])
    save_df(d, "pca_std", pca_std[i])
    save_df(d, "pca_std_data", pca_std_data[i])
    save_df(d, "mafs", mafs[i])
    save_df(d, "allele_freqs", allele_freqs[i])
    save_df(d, "pca_x", pca_x[i])

In [0]:
pop_allele_data = []

for i, df in enumerate(z12_swapped):
    pop_data = {}
    for group, data in df.groupby('population'):
        data = data.drop(['population', 'popid'], axis=1)
        print(i, group, data.shape)
        gt = data.apply(get_allele_freqs, debug=False)
        pop_data[group] = gt.to_dict()
    pop_allele_data.append(pop_data)

In [0]:
for i, d in enumerate([ni_dir, imp_dir]):
    pickle.dump(pop_allele_data[i], 
                open(os.path.join(d, "pop_allele_data.pkl"), "wb"), 
                protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
pca_x[0].head(), pca_x[1].head()

In [0]:
pca_x_merge = pd.merge(pca_x[0], pca_x[1], suffixes=("_ni", "_imp"), left_index=True, right_index=True)

In [0]:
sp.stats.linregress(pca_x_merge.PC1_ni, pca_x_merge.PC1_imp)

In [0]:
plt.scatter(pca_x_merge.PC1_ni, pca_x_merge.PC1_imp)