In [0]:
import scandir
import os, sys
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import dill
import random
import vcf
from hdfstorehelper import HDFStoreHelper
import statsmodels.api as sm
import statsmodels.formula.api as smf
import operator
import traceback
%load_ext rpy2.ipython
from rpy2.robjects import pandas2ri as p2r
p2r.activate()
r = ro.r
import shutil
from utils import read_df, save_df
from pathlib import Path, PurePath
from ipyparallel import Client
from collections import Counter, defaultdict, namedtuple, OrderedDict
from scipy.stats import mannwhitneyu, ks_2samp, f_oneway
import tables
import ujson
import pickle

In [0]:
rc = Client(profile="sge")

In [0]:
dview = rc[:]
lview = rc.load_balanced_view()
len(dview)

In [0]:
analysis_dir = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/beagle40/"

In [0]:
# GEMMA results live in <analysis_dir>/gemma_run/output. The second join used to
# reference `gemma_imputed.ipynba_dir`, a mangled form of `gemma_dir` that
# raised NameError on a fresh kernel.
gemma_dir = os.path.join(analysis_dir, "gemma_run", "output")

In [0]:
ni_data = read_df("/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/ni", "z12_df")

In [0]:
def percent_missing(snp):
    """Return the fraction of entries in ``snp`` that equal -1.

    ``snp`` must provide ``value_counts()`` (e.g. a pandas Series). When the
    value -1 never occurs, the integer 0 is returned.
    """
    tally = snp.value_counts()
    if -1 in tally:
        return tally[-1] / np.sum(tally)
    return 0

In [0]:
# NOTE(review): this rebinds the name `percent_missing` from the function to
# the result of applying it, so re-running this cell without re-running the
# `def` cell passes the previous result (not a function) to `apply`.
# Consider storing the result under a distinct name.
percent_missing = ni_data.apply(percent_missing)

In [0]:
phenos = ["mass", "tdt", "pd"]

In [0]:
# Use a context manager so the pickle file is closed as soon as it is read.
with open(os.path.join(gemma_dir, "combined_dfs.pkl"), "rb") as handle:
    combined_dfs = pickle.load(handle)

In [0]:
# Use a context manager so the pickle file is closed as soon as it is read.
with open(os.path.join(gemma_dir, "effect_snps.pkl"), "rb") as handle:
    effect_snps = pickle.load(handle)

In [0]:
effect_snps.keys()

In [0]:
gt_base_df = read_df(analysis_dir, 'gt_base_df')

In [0]:
# Sorted, de-duplicated prefixes (text before the first "_") of the row labels.
pops = sorted({label.split("_")[0] for label in gt_base_df.index})

In [0]:
def count_genotypes(snp):
    """Tally allele occurrences across the genotype calls of one SNP.

    Every entry that ``float()`` cannot convert (e.g. ``"A/T"``) contributes
    its first and last character as two allele observations. Entries that
    ``float()`` accepts (such as NaN) are treated as missing and skipped.

    Returns
    -------
    list of (allele, count) tuples, sorted by ascending count.
    """
    counts = Counter()
    for gt in snp:
        try:
            float(gt)  # succeeds for NaN / numeric placeholders -> missing call
        except (TypeError, ValueError):
            # Only a failed conversion marks a real genotype string. A bare
            # ``except`` would also hide KeyboardInterrupt and unrelated bugs.
            counts[gt[0]] += 1
            counts[gt[-1]] += 1
    return sorted(counts.items(), key=operator.itemgetter(1))

In [0]:
# Population label = text before the first "_" of each row label. Built
# straight from the index: apply(axis=1) would materialise every row as a
# Series just to read its name.
gt_base_df['population'] = [name.split("_")[0] for name in gt_base_df.index]

In [0]:
pop_allele_data = {}

def add_allele_freq(gt_list):
    """Attach a relative frequency to every ``(allele, count)`` pair.

    Parameters
    ----------
    gt_list : sequence of (allele, count)
        Allele tallies for one SNP, in the shape returned by
        ``count_genotypes``.

    Returns
    -------
    OrderedDict
        ``allele -> [count, count / total]`` in input order. Empty when
        ``gt_list`` is empty (no called genotypes).
    """
    ret = OrderedDict()
    total = sum(count for _, count in gt_list)
    for allele, count in gt_list:
        # Normalising by the grand total is correct for 1, 2 or more alleles.
        # The previous two-branch version raised IndexError on an empty tally
        # and, for more than two alleles, kept only the first with frequency 1.0.
        ret[allele] = [count, count / total]
    return ret

# For every population, store a nested mapping
# column -> allele -> [count, frequency] under the population's label.
for group, data in gt_base_df.groupby('population'):
    data = data.drop('population', axis=1)
    print(group, data.shape)
    allele_counts = data.apply(count_genotypes)
    gt = allele_counts.apply(add_allele_freq)
    pop_allele_data[group] = gt.to_dict()

In [0]:
gemma_gt = read_df(analysis_dir, '_gemma_gt').replace("NA", np.nan)

In [0]:
gemma_gt.head()

In [0]:
pd.DataFrame(pop_allele_data['VA1'])

In [0]:
# `allele_freqs` is only read from disk in a later cell, so load it here too;
# otherwise this preview raises NameError on a fresh Restart & Run All.
allele_freqs = read_df(analysis_dir, "allele_freqs")
allele_freqs

In [0]:
# Leave out the 'population' label column added to gt_base_df earlier, so its
# strings are not tallied as if they were genotype calls.
gt_counts = gt_base_df.drop('population', axis=1).apply(count_genotypes)

In [0]:
gt_counts_af = gt_counts.apply(add_allele_freq)

In [0]:
gt_counts_af.head()

In [0]:
gemma_gt.head()

In [0]:
# Assign the genotype table, the population labels and the base directory
# through the direct view so that code executed on the engines can refer to
# these names (the remote functions below read `gemma_gt` and `pops`).
dview['gemma_gt'] = gemma_gt

dview['pops'] = pops

dview['analysis_dir'] = analysis_dir

In [0]:
%%px 
import os, pickle, traceback

In [0]:
with open(os.path.join(analysis_dir, "pop_allele_data.pkl"), "wb") as o:
    pickle.dump(pop_allele_data, o, pickle.HIGHEST_PROTOCOL)

In [0]:
%%px 
if 'pop_allele_data' not in dir():
    # Load the table pickled by this notebook. The context manager closes the
    # file instead of leaking the handle returned by a bare open().
    with open(os.path.join(analysis_dir, "pop_allele_data.pkl"), "rb") as handle:
        pop_allele_data = pickle.load(handle)

In [0]:
def compute_obs_heterozygosity(snp):
    """Observed heterozygosity of one SNP: heterozygous calls / valid calls.

    A call is valid when it is a string of at least three characters whose
    second character is ``/`` or ``|`` (e.g. ``"A/T"``). Everything else
    (NaN, other placeholders) is skipped, so it affects neither the numerator
    nor the denominator.

    Returns
    -------
    float
        ``het / total``, or ``nan`` when there is no valid call.
    """
    het = 0
    total = 0
    for gt in snp:
        if not isinstance(gt, str) or len(gt) < 3 or gt[1] not in "/|":
            continue  # missing or malformed call
        total += 1
        if gt[0] != gt[-1]:
            het += 1
    if total == 0:
        return float("nan")
    return het / total

def compute_exp_heterozygosity(snp):
    """Expected heterozygosity of one SNP from its allele frequencies.

    Alleles are the first and last character of every non-empty string call;
    non-string entries (e.g. NaN) are skipped. The result is the sum of
    ``2 * p_i * p_j`` over all unordered allele pairs: ``2pq`` for a
    biallelic SNP and 0 for a monomorphic one. The earlier product form
    returned 2 for a monomorphic SNP.

    Returns
    -------
    float
        Expected heterozygosity, or ``nan`` when no allele was observed.
    """
    c = Counter()
    for gt in snp:
        if not isinstance(gt, str) or not gt:
            continue  # missing call
        c[gt[0]] += 1
        c[gt[-1]] += 1
    total = sum(c.values())
    if total == 0:
        return float("nan")
    freqs = [count / total for count in c.values()]
    He = 0.0
    for idx, p in enumerate(freqs):
        for q in freqs[idx + 1:]:
            He += 2 * p * q
    return He

In [0]:
het_bins = np.linspace(0,1,20)

In [0]:
# Expected heterozygosity per genotype column, as a one-column frame that
# also carries the column label in `rs`.
He = (
    gt_base_df
    .drop("population", axis=1)
    .apply(compute_exp_heterozygosity)
    .to_frame("He")
)
He['rs'] = He.index

In [0]:
# Observed heterozygosity per genotype column, as a one-column frame that
# also carries the column label in `rs`.
Ho = (
    gt_base_df
    .drop("population", axis=1)
    .apply(compute_obs_heterozygosity)
    .to_frame("Ho")
)
Ho['rs'] = Ho.index

In [0]:
He.head()

In [0]:
# Inner-join each phenotype's entry of combined_dfs with the observed (Ho)
# and then the expected (He) heterozygosity frame.
tdt_ho, mass_ho, pd_ho = (
    combined_dfs[name].join(Ho, how="inner") for name in ("tdt", "mass", "pd")
)

tdt_he, mass_he, pd_he = (
    combined_dfs[name].join(He, how="inner") for name in ("tdt", "mass", "pd")
)

In [0]:
# Tag every row with the index of the het_bins interval its heterozygosity
# falls into (np.digitize).
for frame in (tdt_ho, mass_ho, pd_ho):
    frame['het_bin'] = np.digitize(frame.Ho, het_bins)

for frame in (tdt_he, mass_he, pd_he):
    frame['het_bin'] = np.digitize(frame.He, het_bins)

In [0]:
PhenoContainer = namedtuple("PhenoContainer", ["He", "Ho", "hmean", "sig", "relaxed"])

In [0]:
# One PhenoContainer per phenotype: He frame, Ho frame, the combined_dfs entry,
# and the effect_snps entries keyed with 0.999 (`sig`) and 0.995 (`relaxed`).
_het_frames = OrderedDict([
    ("mass", (mass_he, mass_ho)),
    ("pd", (pd_he, pd_ho)),
    ("tdt", (tdt_he, tdt_ho)),
])
PC = {
    pheno: PhenoContainer(
        he_frame,
        ho_frame,
        combined_dfs[pheno],
        effect_snps[(pheno, 'gamma_hmean', 'total_effect', 0.999)],
        effect_snps[(pheno, 'gamma_hmean', 'total_effect', 0.995)],
    )
    for pheno, (he_frame, ho_frame) in _het_frames.items()
}

In [0]:
# Expected heterozygosity against gamma_hmean (axis label "PIP"), one figure
# per phenotype.
for pheno in PC:
    plt.scatter(PC[pheno].He.gamma_hmean, PC[pheno].He.He)
    plt.xlabel("PIP")
    plt.ylabel(r"$H_{exp}$")
    # Title follows the loop variable; it used to be the literal "TDT" on
    # every figure, mislabelling the mass and pd plots.
    plt.title(pheno.upper())
    plt.show()

In [0]:
# Observed heterozygosity against gamma_hmean (axis label "PIP"), one figure
# per phenotype.
for pheno, container in PC.items():
    ho_frame = container.Ho
    plt.scatter(ho_frame.gamma_hmean, ho_frame.Ho)
    plt.xlabel("PIP")
    plt.ylabel(r"$H_{obs}$")
    plt.title(pheno.upper())
    plt.show()

In [0]:
# Spot check: print the [count, frequency] entry of one SNP's minor allele in
# every population where that allele was observed.
test_snp = 'ctg7180005039298_50'
# .loc is the label-based replacement for the removed DataFrame.ix indexer.
test_minor = gemma_gt.loc[test_snp, "minor"]
print(test_minor)
for p in pop_allele_data:
    if test_minor in pop_allele_data[p][test_snp]:
        print(pop_allele_data[p][test_snp][test_minor])

In [0]:
gemma_gt.head()

In [0]:
# Number of gemma_gt columns (from the third onward) per prefix before "_".
pop_counts = Counter(col.split("_")[0] for col in gemma_gt.columns[2:])

In [0]:
# Sum of all stored allele counts (first element of each [count, frequency]
# entry) per population.
pop_counts_alleles = Counter()
for pop_name, snp_table in pop_allele_data.items():
    for allele_table in snp_table.values():
        for count_and_freq in allele_table.values():
            pop_counts_alleles[pop_name] += count_and_freq[0]

In [0]:
pop_counts

In [0]:
pop_counts_alleles

In [0]:
# Print each population's share of the column count and of the allele count.
total_ind = np.sum(list(pop_counts.values()))
total_alleles = np.sum(list(pop_counts_alleles.values()))
for p, n_ind in pop_counts.items():
    ind_share = n_ind / total_ind
    allele_share = pop_counts_alleles[p] / total_alleles
    print(p, ind_share, allele_share)

In [0]:
allele_freqs = read_df(analysis_dir, "allele_freqs")

In [0]:
mafs = allele_freqs.apply(lambda x: np.min((x.p, x.q)))

In [0]:
# Assign the per-population counts and the allele-frequency tables through
# the direct view; `do_pairwise_storey` below reads `pop_counts_alleles` and
# `mafs` as free names when it runs remotely.
dview['pop_counts_alleles'] = pop_counts_alleles
dview['pop_counts'] = pop_counts
dview['allele_freqs'] = allele_freqs
dview['mafs'] = mafs

# From Storz and Kelley (2008), Eqn 2

# $D_{ij} = \bigg(\sum_{k}\frac{n_k}{n}D_{ij,k}\bigg) + \bigg(\sum_{k}\frac{n_k}{n}(q_{i,k}q_{j,k} - q_iq_j)\bigg)$



In [0]:
@lview.remote()
def do_pairwise_storey(sig_list):
    """Among-population allele-frequency covariance for every SNP pair.

    For each pair (i, j) with j before i in ``sig_list`` this evaluates
    ``sum_k (n_k / n) * (q_ik * q_jk - q_i * q_j)``, where ``q_ik`` is the
    minor-allele frequency of SNP i in population k (0.0 if the allele is
    absent there), ``q_i`` comes from ``mafs``, ``n_k`` from
    ``pop_counts_alleles`` and ``n`` is the sum of all ``n_k``. Only this
    second sum of the D_ij decomposition is computed; the within-population
    D_ij,k term is not.

    Submitted through ``lview.remote()``. The free names ``gemma_gt``,
    ``mafs``, ``pops``, ``pop_counts_alleles`` and ``pop_allele_data`` must
    exist wherever the function executes, so everything is kept inside this
    one body.

    Returns
    -------
    dict
        ``(snp_i, snp_j) -> value``.
    """
    import numpy as np

    ret = {}
    n = np.sum(list(pop_counts_alleles.values()))
    # One label lookup per SNP instead of one per pair; .loc replaces the
    # removed DataFrame.ix indexer.
    minor = {snp: gemma_gt.loc[snp, "minor"] for snp in sig_list}
    for i, snp_i in enumerate(sig_list):
        minor_allele_i = minor[snp_i]
        qi = mafs[snp_i]
        for j in range(i):
            snp_j = sig_list[j]
            minor_allele_j = minor[snp_j]
            qj = mafs[snp_j]

            k_sum = 0
            for p in pops:
                nk = pop_counts_alleles[p]
                alleles_i = pop_allele_data[p][snp_i]
                alleles_j = pop_allele_data[p][snp_j]

                # A minor allele absent from a population has frequency 0 there.
                qik = alleles_i[minor_allele_i][1] if minor_allele_i in alleles_i else 0.0
                qjk = alleles_j[minor_allele_j][1] if minor_allele_j in alleles_j else 0.0

                k_sum += ((nk/n) * ((qik*qjk)-(qi*qj)))

            ret[snp_i, snp_j] = k_sum
    return ret


@lview.remote()
def do_pairwise_eckert(sig_list):
    """Unweighted covariance of minor-allele frequencies across populations.

    For each pair (i, j) with j before i in ``sig_list`` this returns
    ``mean_k(q_ik * q_jk) - mean_k(q_ik) * mean_k(q_jk)``, where ``q_ik`` is
    the minor-allele frequency of SNP i in population k (0.0 if the allele is
    absent there) and every population in ``pops`` has equal weight.

    Submitted through ``lview.remote()``. The free names ``gemma_gt``,
    ``pops`` and ``pop_allele_data`` must exist wherever the function
    executes.

    Returns
    -------
    dict
        ``(snp_i, snp_j) -> value``.
    """
    import numpy as np

    ret = {}
    # One label lookup per SNP instead of one per pair; .loc replaces the
    # removed DataFrame.ix indexer.
    minor = {snp: gemma_gt.loc[snp, "minor"] for snp in sig_list}
    for i, snp_i in enumerate(sig_list):
        minor_allele_i = minor[snp_i]
        for j in range(i):
            snp_j = sig_list[j]
            minor_allele_j = minor[snp_j]
            # Two separate lists rather than a dict keyed by SNP id: a dict
            # collapses to one entry when the same id occurs twice, which made
            # the later two-element access raise IndexError.
            freqs_i = []
            freqs_j = []
            in_prods = []
            for p in pops:
                alleles_i = pop_allele_data[p][snp_i]
                alleles_j = pop_allele_data[p][snp_j]

                paf_i = alleles_i[minor_allele_i][1] if minor_allele_i in alleles_i else 0.0
                paf_j = alleles_j[minor_allele_j][1] if minor_allele_j in alleles_j else 0.0

                freqs_i.append(paf_i)
                freqs_j.append(paf_j)
                in_prods.append(paf_i * paf_j)
            avg_in_prod = np.mean(in_prods)
            across_prod = np.mean(freqs_i) * np.mean(freqs_j)
            ret[snp_i, snp_j] = (avg_in_prod-across_prod)
    return ret



In [0]:
# Compare the two pairwise statistics on each phenotype's significant SNPs.
for pheno in PC:
    sig = list(PC[pheno].sig)
    # The remote functions return result handles (other cells read them with
    # `.r`). Submit both jobs first, then block on .get() to obtain the
    # {(snp_i, snp_j): value} dicts; the handles themselves cannot be iterated
    # or indexed by pair.
    storey_job = do_pairwise_storey(sig)
    eckert_job = do_pairwise_eckert(sig)
    storey = storey_job.get()
    eckert = eckert_job.get()

    storey_vals = []
    eckert_vals = []
    for pair in storey:
        storey_vals.append(storey[pair])
        eckert_vals.append(eckert[pair])

    sns.distplot(storey_vals, label="storey")
    sns.distplot(eckert_vals, label="eckert")
    plt.xlabel("pairwise D")
    plt.title(pheno)
    plt.legend()
    f, p = f_oneway(storey_vals, eckert_vals)
    plt.text(0.02, 61, r"$F = %.3f, p = %.3f$" % (f, p))
    plt.show()

In [0]:
def get_nulls_by_het(n, sig_df, df):
    """Draw ``n`` random SNP sets matched to ``sig_df`` by heterozygosity bin.

    Parameters
    ----------
    n : int
        Number of null sets to draw.
    sig_df : DataFrame
        Rows of the significant SNPs; only its index is used.
    df : DataFrame
        All SNPs, with columns ``het_bin`` and ``rs``; must contain every
        label of ``sig_df.index``.

    Returns
    -------
    (data, het_bins)
        ``data`` is a list of ``n`` lists of ``rs`` values. Each list holds,
        for every bin occupied by a significant SNP, as many non-significant
        SNPs (sampled without replacement) as there are significant SNPs in
        that bin. ``het_bins`` lists those bins.

    Raises
    ------
    ValueError
        From ``Series.sample`` when a bin has fewer non-significant SNPs than
        significant ones.
    """
    unassoc = df.drop(sig_df.index)
    # .loc / .items replace the removed .ix / .iteritems APIs.
    het_bin_counts = df.loc[sig_df.index, 'het_bin'].value_counts()
    het_bins = het_bin_counts.index.tolist()
    # The candidate pool of a bin is identical for every replicate: slice once.
    pools = {
        het_bin: unassoc.loc[unassoc.het_bin == het_bin, 'rs']
        for het_bin in het_bins
    }
    data = []
    for _ in range(n):
        inner = []
        for het_bin, het_count in het_bin_counts.items():
            inner.extend(pools[het_bin].sample(het_count).tolist())
        data.append(inner)
    return data, het_bins

def get_nulls_naive(n, sig_df, df):
    """Draw ``n`` random SNP sets of ``len(sig_df)`` from the other SNPs.

    Rows whose labels appear in ``sig_df.index`` are removed from ``df``
    first; each set is a list of sampled ``rs`` values. The second element of
    the returned tuple is always an empty list.
    """
    candidates = df.drop(sig_df.index)
    draws = []
    for _ in range(n):
        draws.append(candidates.rs.sample(len(sig_df)).tolist())
    return draws, []

In [0]:
get_nulls = get_nulls_by_het
dview['get_nulls_naive'] = get_nulls_naive
dview['get_nulls'] = get_nulls
dview['get_nulls_by_het'] = get_nulls

In [0]:
def num_pairwise(n):
    """Number of unordered pairs among ``n`` items, as ``n*n/2 - n/2`` (true division)."""
    half_of_square = (n * n) / 2
    half_of_n = n / 2
    return half_of_square - half_of_n

In [0]:
# Pairwise values among each phenotype's significant SNPs, read from each
# remote call through its `.r` attribute.
gwas_D = {
    pheno: do_pairwise_storey(list(PC[pheno].sig)).r
    for pheno in PC
}

In [0]:
# 1000 heterozygosity-matched null SNP sets per phenotype.
nulls = {}
sig_het_bins = {}
for pheno in PC:
    # .loc replaces the removed .ix indexer. list() makes the indexer a plain
    # list of labels, matching how `sig` is consumed elsewhere in this notebook.
    sig = PC[pheno].hmean.loc[list(PC[pheno].sig)]
    nulls[pheno], sig_het_bins[pheno] = get_nulls_by_het(1000, sig, PC[pheno].He)


In [0]:
len(nulls['tdt'][0])

In [0]:
# Submit one remote job per null SNP set.
nulls_D = {}
for pheno in PC:
    # `do_pairwise` is not defined anywhere in this notebook, so this cell
    # raised NameError on a fresh kernel. Use the same statistic as the
    # observed values in gwas_D so nulls and observations are comparable.
    nulls_D[pheno] = [do_pairwise_storey(null_list) for null_list in nulls[pheno]]

In [0]:
def _abs_pairwise_D(result):
    """Absolute values from a pending job, a dict of values, or an array."""
    # Unfinished jobs expose their payload through `.r`; values processed by
    # an earlier run of this cell have no such attribute and pass through, so
    # the cell can be re-run.
    finished = result.r if hasattr(result, "r") else result
    if isinstance(finished, dict):
        # The pairwise functions return {(snp_i, snp_j): value}; np.abs needs
        # the numbers, not the mapping.
        finished = list(finished.values())
    return np.abs(finished)


for pheno in PC:
    nulls_D[pheno] = [_abs_pairwise_D(x) for x in nulls_D[pheno]]

In [0]:
sns.set_context("talk")

In [0]:
# Distribution of the observed pairwise values per phenotype.
for pheno in PC:
    d = gwas_D[pheno]
    if isinstance(d, dict):
        # do_pairwise_storey returns {(snp_i, snp_j): value}; distplot needs
        # the numeric values as a sequence.
        d = list(d.values())
    sns.distplot(d, label="sig")
    plt.xlabel("Pairwise D")
    plt.title("%s (n=%d, pairwise=%d)" % (pheno.upper(), len(list(PC[pheno].sig)), len(d)))
    plt.legend()
    plt.show()

In [0]:
# Median of every null set's values, grouped by phenotype.
null_medians = {
    pheno: [np.median(null_values) for null_values in nulls_D[pheno]]
    for pheno in PC
}

In [0]:
# Distribution of the first null set of each phenotype.
for pheno in PC:
    first_null = nulls_D[pheno][0]
    sns.distplot(first_null)
    plt.title("nulls %s " % pheno)
    plt.show()

In [0]:
# Per phenotype: overlay the distribution of null-set medians with the
# distribution of observed values and mark three reference lines.
sns.set_context("talk")
for pheno in PC:
    n = pd.Series(null_medians[pheno])  # one median per null set
    d = pd.Series(gwas_D[pheno])  # observed values for this phenotype
    sns.distplot(n, label="null")
    sns.distplot(d, label="observed")
    plt.title("%s (n = %d)" % (pheno, len(PC[pheno].sig)))
    # Reference lines: 95th percentile of the null medians, then the median
    # and the 95th percentile of the observed values.
    plt.axvline(x=n.quantile(0.95), c="red", zorder=0, label="null 95th")
    plt.axvline(x=np.median(d), c="blue", zorder=0, label="obs. median")
    plt.axvline(x=d.quantile(0.95), c="green", zorder=0, label="obs. 95th")
    plt.xlabel("D")
    plt.legend()
    plt.show()