In [0]:
import scandir
import os, sys
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import dill
import random
import vcf
from hdfstorehelper import HDFStoreHelper
import statsmodels.api as sm
import statsmodels.formula.api as smf
import operator
import traceback
%load_ext rpy2.ipython
from rpy2.robjects import pandas2ri as p2r
p2r.activate()
r = ro.r
import shutil
from utils import read_df, save_df
from pathlib import Path, PurePath
from ipyparallel import Client
from collections import Counter, defaultdict, namedtuple, OrderedDict
from scipy.stats import mannwhitneyu, ks_2samp
import tables
import ujson
import pickle

In [0]:
# Connect an ipyparallel Client using the "sge" profile
# (presumably a Sun Grid Engine cluster profile — confirm).
rc = Client(profile="sge")

In [0]:
# View over all engines of the client; used to push variables to the engines.
dview = rc[:]
# Load-balanced view; used to decorate the functions that run remotely.
lview = rc.load_balanced_view()
# Displayed value: size of the all-engines view.
len(dview)

In [0]:
# Directory that holds every input/output file referenced by this notebook.
# NOTE(review): absolute, user-specific path — consider making it configurable.
analysis_dir = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/beagle40/"

In [0]:
# Trait keys; the hmeans and quants dictionaries are keyed by the same strings.
phenos = ["mass", "tdt", "pd"]

In [0]:
# Trait key -> path of the tab-separated "hmean" table inside analysis_dir.
hmeans = {
    pheno: os.path.join(analysis_dir, fname)
    for pheno, fname in (
        ("mass", "mcmc_mass_hmean.txt"),
        ("tdt", "mcmc_tdt_hmean.txt"),
        ("pd", "mcmc_pd_hmean.txt"),
    )
}

# Trait key -> path of the tab-separated quantile/cutoff table inside analysis_dir.
quants = {
    pheno: os.path.join(analysis_dir, fname)
    for pheno, fname in (
        ("mass", "mass_quant.txt"),
        ("tdt", "tdt_quant.txt"),
        ("pd", "pd_quant.txt"),
    )
}

In [0]:
def read_hmean(key):
    """Read the tab-separated table registered under ``key`` in ``hmeans``.

    The first column becomes the index. Every column that can be parsed as
    numeric is converted; columns that cannot are returned unchanged.

    Parameters
    ----------
    key : str
        Key into the module-level ``hmeans`` dictionary.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``key`` is not present in ``hmeans``.
    """
    def _numeric_or_unchanged(column):
        # Same contract as pd.to_numeric(errors="ignore"), which is deprecated
        # in recent pandas releases: keep the column as-is when parsing fails.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return pd.read_csv(hmeans[key], sep="\t", index_col=0).apply(_numeric_or_unchanged)

In [0]:
def read_quant(key):
    """Read the tab-separated table registered under ``key`` in ``quants``.

    The first column becomes the index. Every column that can be parsed as
    numeric is converted; columns that cannot are returned unchanged.

    Parameters
    ----------
    key : str
        Key into the module-level ``quants`` dictionary.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    KeyError
        If ``key`` is not present in ``quants``.
    """
    def _numeric_or_unchanged(column):
        # Same contract as pd.to_numeric(errors="ignore"), which is deprecated
        # in recent pandas releases: keep the column as-is when parsing fails.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    return pd.read_csv(quants[key], sep="\t", index_col=0).apply(_numeric_or_unchanged)

In [0]:
# Load the hmean table of each trait (read order: mass, pd, tdt).
mass_hmean, pd_hmean, tdt_hmean = (
    read_hmean(pheno) for pheno in ("mass", "pd", "tdt")
)


In [0]:
# Load the quantile/cutoff table of each trait (read order: mass, pd, tdt).
mass_quant, pd_quant, tdt_quant = (
    read_quant(pheno) for pheno in ("mass", "pd", "tdt")
)

In [0]:
# Rows of the mass table whose postrb_hmean exceeds the strict / relaxed cutoff
# stored in the first column of the mass quantile table.
# .loc performs the label lookup previously done with .ix, which newer pandas
# releases no longer provide.
mass_sig = mass_hmean[mass_hmean.postrb_hmean > mass_quant.loc['cutoff'].values[0]]
mass_relaxed = mass_hmean[mass_hmean.postrb_hmean > mass_quant.loc['relaxed_cutoff'].values[0]]

In [0]:
# Per trait, keep the 20 rows with the highest postrb_hmean.
# NOTE(review): this rebinds mass_sig, discarding the cutoff-based selection
# assigned to the same name earlier in the notebook.
mass_sig = mass_hmean.sort_values('postrb_hmean', ascending=False).head(20)
pd_sig = pd_hmean.sort_values('postrb_hmean', ascending=False).head(20)
tdt_sig = tdt_hmean.sort_values('postrb_hmean', ascending=False).head(20)

# NOTE(review): this overwrites tdt_sig with the 20 rows having the LOWEST
# postrb_hmean (tail of a descending sort). Every later use of tdt_sig sees
# these rows, not the top 20 — confirm this is intentional.
tdt_sig = tdt_hmean.sort_values('postrb_hmean', ascending=False).tail(20)

In [0]:
# Distribution of postrb_hmean among the selected mass rows.
sns.distplot(mass_sig.postrb_hmean);

In [0]:
# Distribution of postrb_hmean among the selected pd rows.
sns.distplot(pd_sig.postrb_hmean);

In [0]:
# Distribution of postrb_hmean among the selected tdt rows.
sns.distplot(tdt_sig.postrb_hmean);

In [0]:
# Load the stored 'gt_base_df' table through the project helper read_df.
# Presumably rows are samples and columns are SNPs holding genotype strings — confirm.
gt_base_df = read_df(analysis_dir, 'gt_base_df')

In [0]:
# Sorted unique population labels: the part of each index label before the first "_".
pops = sorted({label.split("_")[0] for label in gt_base_df.index})

In [0]:
def count_genotypes(snp):
    """Count the alleles found in an iterable of genotype calls.

    For every call that is not numeric/missing, the first and the last
    character are each counted as one allele observation (e.g. ``"A/T"``
    contributes one ``A`` and one ``T``).

    Parameters
    ----------
    snp : iterable
        Genotype calls; entries convertible to ``float`` (such as NaN),
        ``None`` and empty entries are treated as missing and skipped.

    Returns
    -------
    list of (allele, count) tuples
        Sorted by ascending count.
    """
    counts = Counter()
    for gt in snp:
        try:
            float(gt)  # succeeds for NaN / numeric entries, i.e. a missing call
        except (TypeError, ValueError):
            # Only conversion failures mark a real call; a bare ``except`` would
            # also swallow KeyboardInterrupt and hide unrelated errors.
            if not gt:
                continue  # None or empty string: nothing to index into
            counts[gt[0]] += 1
            counts[gt[-1]] += 1
    return sorted(counts.items(), key=operator.itemgetter(1))

In [0]:
# Tag each row with its population: the part of the index label before the first "_".
gt_base_df['population'] = [label.split("_")[0] for label in gt_base_df.index]

In [0]:
# population label -> {column name -> OrderedDict(allele -> [count, frequency])};
# filled by the per-population loop in this cell.
pop_allele_data = {}

def add_allele_freq(gt_list):
    """Attach a relative frequency to every allele count.

    Parameters
    ----------
    gt_list : sequence of (allele, count) pairs
        As produced by ``count_genotypes``; may be empty or contain any
        number of alleles.

    Returns
    -------
    collections.OrderedDict
        ``allele -> [count, count / total]`` in the order of ``gt_list``.
        A single allele gets frequency ``1.0``; an empty input yields an
        empty mapping (previously an ``IndexError``).
    """
    # Normalising by the sum over all alleles covers the one- and two-allele
    # cases and also gives correct values when more than two alleles occur
    # (previously only the first allele was kept, with frequency 1.0).
    total = sum(count for _, count in gt_list)
    ret = OrderedDict()
    for allele, count in gt_list:
        ret[allele] = [count, count / total]
    return ret

# For each population, tally the alleles of every column and convert the
# tallies to frequencies; the result is stored per population as a dict
# keyed by column name.
for group, data in gt_base_df.groupby('population'):
    # the grouping column is not a genotype column, so remove it before counting
    data = data.drop('population', axis=1)
    print(group, data.shape)
    gt = data.apply(count_genotypes).apply(add_allele_freq)
    pop_allele_data[group] = gt.to_dict()

In [0]:
# Load the stored '_pimass_gt' table and turn literal "NA" strings into NaN.
pimass_gt = read_df(analysis_dir, '_pimass_gt').replace("NA", np.nan)

In [0]:
# Preview the first rows of pimass_gt.
pimass_gt.head()

In [0]:
# Inspect the per-column allele entries stored for population 'VA1'.
pd.DataFrame(pop_allele_data['VA1'])

In [0]:
# Allele tallies per column over all samples. The 'population' label column
# added to gt_base_df is dropped first; otherwise its strings would be tallied
# as if they were genotype calls.
gt_counts = gt_base_df.drop('population', axis=1).apply(count_genotypes)

In [0]:
# Turn each column's allele tallies into allele -> [count, frequency] mappings.
gt_counts_af = gt_counts.apply(add_allele_freq)

In [0]:
# Preview the first entries of gt_counts_af.
gt_counts_af.head()

In [0]:
# Push pimass_gt to the engines under the same name (read by do_pairwise there).
dview['pimass_gt'] = pimass_gt

In [0]:
# Push the population labels to the engines (read by do_pairwise there).
dview['pops'] = pops

In [0]:
# Push analysis_dir to the engines; they use it to locate the pickled allele data.
dview['analysis_dir'] = analysis_dir

In [0]:
# Import on the engines the modules that the remotely executed code relies on.
%px import os, pickle, traceback

In [0]:
# Persist the per-population allele data so the engines can load it from disk.
pkl_path = os.path.join(analysis_dir, "pop_allele_data.pkl")
with open(pkl_path, "wb") as out_handle:
    pickle.dump(pop_allele_data, out_handle, protocol=pickle.HIGHEST_PROTOCOL)

In [0]:
%px pop_allele_data = pickle.load(open(os.path.join(analysis_dir, "pop_allele_data.pkl"), "rb"))

In [0]:
def compute_heterozygosity(snp):
    """Return the fraction of genotype calls that are heterozygous.

    A call is a string whose second character is ``/`` or ``|``; it is
    heterozygous when its first and last characters differ.

    Parameters
    ----------
    snp : iterable
        Genotype calls; entries that are not such strings (NaN, ``None``,
        too-short strings) are treated as missing and ignored.

    Returns
    -------
    float
        ``heterozygous calls / all calls``, or NaN when there is no call at
        all (previously a ``ZeroDivisionError``).
    """
    het = 0
    total = 0
    for gt in snp:
        # Skip anything that is not an "X<sep>Y" call so that missing values do
        # not raise and heterozygotes are only counted among counted calls.
        if not isinstance(gt, str) or len(gt) < 3 or gt[1] not in "/|":
            continue
        total += 1
        if gt[0] != gt[-1]:
            het += 1
    if total == 0:
        return float("nan")
    return het / total
# Observed heterozygosity of every genotype column (the population label column
# is excluded), as a one-column frame that also carries its index as 'rs'.
Ho = (
    gt_base_df
    .drop("population", axis=1)
    .apply(compute_heterozygosity)
    .to_frame("Ho")
)
Ho['rs'] = Ho.index

# 10 evenly spaced bin edges on [0, 1] used to bin heterozygosity values.
het_bins = np.linspace(0,1,10)

In [0]:
# Attach heterozygosity to each trait table: the table's 'rs' column is matched
# against the index of Ho, keeping only rows present in both.
tdt_ho, mass_ho, pd_ho = (
    hmean.join(Ho, on="rs", rsuffix="_ho", how="inner")
    for hmean in (tdt_hmean, mass_hmean, pd_hmean)
)

In [0]:
# Label every row with the index of the heterozygosity bin its Ho falls into.
for ho_df in (tdt_ho, mass_ho, pd_ho):
    ho_df['het_bin'] = np.digitize(ho_df.Ho, het_bins)

In [0]:
# Scatter of postrb_hmean (x, labelled "PIP") against observed heterozygosity
# (y), one figure per trait. Each figure takes both axes from the same table;
# the Mass and PD figures previously plotted tdt_ho.Ho on the y axis.
for title, ho_df in (("TDT", tdt_ho), ("Mass", mass_ho), ("PD", pd_ho)):
    plt.scatter(ho_df.postrb_hmean, ho_df.Ho)
    plt.xlabel("PIP")
    plt.ylabel(r"$H_{O}$")
    plt.title(title)
    plt.show()

In [0]:
@lview.remote()
def do_pairwise(sig_list):
    """Among-population covariance of minor-allele frequencies for every SNP pair.

    For each unordered pair ``(i, j)`` of entries in ``sig_list`` the value

        mean_p(f_i[p] * f_j[p]) - mean_p(f_i[p]) * mean_p(f_j[p])

    is computed, where ``f_x[p]`` is the frequency in population ``p`` of the
    allele listed in the ``"minor"`` column of ``pimass_gt`` for SNP ``x``.

    Runs on a remote engine and reads ``pimass_gt``, ``pops`` and
    ``pop_allele_data`` from the engine namespace.

    Parameters
    ----------
    sig_list : list
        SNP identifiers (index labels of ``pimass_gt``).

    Returns
    -------
    list of float
        One value per SNP pair that has data in at least one population.

    Notes
    -----
    Review notes on behaviour:

    * The statistic is appended once per pair, after all populations have been
      visited. It was previously appended inside the population loop, which
      emitted running partial values (the first always exactly 0).
    * Populations where either allele is absent from ``pop_allele_data`` are
      skipped rather than counted as frequency 0 — unchanged from before;
      confirm that this is the intended treatment.
    """
    # Imported here because the body executes in the engine's namespace.
    import numpy as np
    import traceback
    ret = []
    for i, snp_i in enumerate(sig_list):
        # .loc does the label lookup that .ix used to do (removed from pandas)
        minor_allele_i = pimass_gt.loc[snp_i, "minor"]
        for j in range(i):
            snp_j = sig_list[j]
            minor_allele_j = pimass_gt.loc[snp_j, "minor"]
            in_prods = []
            freqs_i = []
            freqs_j = []
            for p in pops:
                try:
                    paf_i = pop_allele_data[p][snp_i][minor_allele_i][1]
                    paf_j = pop_allele_data[p][snp_j][minor_allele_j][1]
                except KeyError:
                    continue  # allele not present in this population (fixed)
                except Exception:
                    traceback.print_exc()
                    continue
                freqs_i.append(paf_i)
                freqs_j.append(paf_j)
                in_prods.append(paf_i * paf_j)
            if not in_prods:
                continue  # no population had both alleles; nothing to average
            avg_in_prod = np.mean(in_prods)
            across_prod = np.mean(freqs_i) * np.mean(freqs_j)
            ret.append(avg_in_prod - across_prod)
    return ret

In [0]:
def get_nulls(n, sig_df, df, random_state=None):
    """Draw ``n`` random SNP sets matched to the significant set by ``het_bin``.

    Candidates are the rows of ``df`` that are not in ``sig_df`` (by index
    label) and whose ``het_bin`` equals a ``het_bin`` of one of the
    significant rows.

    Parameters
    ----------
    n : int
        Number of sets to draw.
    sig_df : pandas.DataFrame
        Its index labels identify the significant rows; all must exist in ``df``.
    df : pandas.DataFrame
        Must provide the columns ``rs`` and ``het_bin``.
    random_state : int, optional
        Seed for reproducible draws. ``None`` (default) keeps the previous
        behaviour of using NumPy's global random state.

    Returns
    -------
    list of list
        ``n`` lists, each holding ``len(sig_df)`` values of ``rs`` sampled
        without replacement.

    Raises
    ------
    KeyError
        If an index label of ``sig_df`` is missing from ``df``.
    ValueError
        If fewer than ``len(sig_df)`` candidates are available.
    """
    rng = None
    if random_state is not None:
        # Local import: this function is also pushed to the engines, where a
        # module-level ``np`` may not exist.
        import numpy as np
        # One generator shared by all draws so replicates differ from each other.
        rng = np.random.RandomState(random_state)
    unassoc = df.drop(sig_df.index)
    # .loc does the label lookup that .ix used to do (removed from pandas)
    sig_bins = df.loc[sig_df.index, 'het_bin'].unique()
    unassoc = unassoc[unassoc['het_bin'].isin(sig_bins)]
    return [unassoc.rs.sample(len(sig_df), random_state=rng).tolist() for _ in range(n)]

In [0]:
# Push get_nulls to the engines under the same name.
dview['get_nulls'] = get_nulls

In [0]:
# Pairwise values for the SNP ids in tdt_sig; .r retrieves the remote result.
ld = do_pairwise(tdt_sig.rs.tolist()).r

In [0]:
# 1000 random SNP sets matched to tdt_sig, drawn from tdt_ho.
nulls = get_nulls(1000, tdt_sig, tdt_ho)

In [0]:
# Submit one pairwise computation per null SNP set; the returned handles are
# polled with .ready() and collected with .r in later cells.
test = []
for i, n in enumerate(nulls):
    if i % 100 == 0:
        print(i)  # progress marker every 100 submissions
    test.append(do_pairwise(n))

In [0]:
# Number of null submissions that have finished so far.
sum(handle.ready() for handle in test)

In [0]:
@lview.remote()
def ks_test(arr1, arr2):
    """Run SciPy's two-sample Kolmogorov-Smirnov test on ``arr1`` vs ``arr2``.

    Executed remotely through the load-balanced view; returns whatever
    ``scipy.stats.ks_2samp`` returns.
    """
    # Imported here because the body executes in the engine's namespace.
    import scipy.stats as stats
    result = stats.ks_2samp(arr1, arr2)
    return result

@lview.remote()
def mwu_test(arr1, arr2):
    """Run SciPy's Mann-Whitney U test on ``arr1`` vs ``arr2``.

    Executed remotely through the load-balanced view; returns whatever
    ``scipy.stats.mannwhitneyu`` returns with its default arguments.
    NOTE(review): the default ``alternative`` differs between SciPy versions —
    confirm which sidedness the engines' SciPy applies.
    """
    # Imported here because the body executes in the engine's namespace.
    import scipy.stats as stats
    result = stats.mannwhitneyu(arr1, arr2)
    return result

In [0]:
# Retrieve the result list of every null submission.
test_r = [x.r for x in test]

In [0]:
# Submit a KS test of the observed values (ld) against each null result.
ks = [ks_test(ld, x) for x in test_r]

In [0]:
# Retrieve the result of every KS submission.
ks_r = [x.r for x in ks]

In [0]:
# Keep only the p-value (second element) of each KS result.
ks_pvals = [pval for _stat, pval in ks_r]

In [0]:
# Distribution of the KS-test p-values (observed vs. each null set).
# The legend label is "KS"; the earlier "KW" did not match the test that was run.
sns.distplot(ks_pvals, bins=25, label="KS")
plt.legend()

In [0]:
# Submit a Mann-Whitney U test of the observed values (ld) against each null result.
mwu = [mwu_test(ld, x) for x in test_r]

In [0]:
# Number of Mann-Whitney submissions that have finished so far.
sum(handle.ready() for handle in mwu)

In [0]:
# Retrieve the result of every Mann-Whitney submission.
mwu_r = [x.r for x in mwu]

In [0]:
# Keep only the p-value (second element) of each Mann-Whitney result.
mwu_pvals = [pval for _stat, pval in mwu_r]

In [0]:
# Distribution of the Mann-Whitney p-values (observed vs. each null set).
sns.distplot(mwu_pvals, label="MW");
plt.legend()

In [0]:
# Two further, independently drawn collections of 1000 matched null SNP sets;
# they are compared against each other below (null vs. null).
nulls2 = get_nulls(1000, tdt_sig, tdt_ho)
nulls3 = get_nulls(1000, tdt_sig, tdt_ho)

In [0]:
# Submit the pairwise computation for every SNP set of both null collections.
n2 = [do_pairwise(x) for x in nulls2]
n3 = [do_pairwise(x) for x in nulls3]

In [0]:
# Number of n2 submissions that have finished so far.
sum(x.ready() for x in n2)

In [0]:
# Number of n3 submissions that have finished so far.
sum(x.ready() for x in n3)

In [0]:
# Retrieve the result list of every n2 submission.
n2_r = [x.r for x in n2]

In [0]:
# Retrieve the result list of every n3 submission.
n3_r = [x.r for x in n3]

In [0]:
# KS p-value for each positional pairing of the two null collections
# (null vs. null), computed locally.
null_pvals = [ks_2samp(null_a, null_b)[1] for null_a, null_b in zip(n2_r, n3_r)]

In [0]:
# Distribution of the null-vs-null KS p-values.
sns.distplot(null_pvals);

In [0]:
# Distribution of the observed-vs-null KS p-values, for visual comparison.
sns.distplot(ks_pvals);

In [0]:
# Sizes of the two p-value collections compared below.
len(ks_pvals), len(null_pvals)

In [0]:
# KS test between the observed-vs-null and the null-vs-null p-value distributions.
ks_2samp(ks_pvals, null_pvals)

In [0]:
# Mann-Whitney U test between the same two p-value distributions.
mannwhitneyu(ks_pvals, null_pvals)