In [0]:
import scandir
import os
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import dill

In [0]:
# FASTA assembly file; a later cell parses it with SeqIO to record the length of every record.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
assembly = "/home/cfriedline/gpfs/assemblies/gypsy/masurca_new/CA/10-gapclose/genome.ctg.fasta"

In [0]:
# Load the rpy2 IPython extension so the %R / %%R magics used in later cells are available.
%load_ext rpy2.ipython
# Short handle to the embedded R interpreter; later cells use it as r('name') and r[name].
r = ro.r

In [0]:
# Directory that is walked below to collect the per-phenotype output files;
# dump_session() also writes pimass.dill into this same directory.
# NOTE(review): absolute, user-specific path -- adjust before running elsewhere.
filedir = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle40/output_comeault_isect/"

In [0]:
def dump_session():
    """Serialize the current interpreter session to ``pimass.dill`` in ``filedir``.

    dill's module-level ``recurse`` and ``fmode`` settings are changed as a
    side effect and remain changed after the call.
    """
    session_file = os.path.join(filedir, "pimass.dill")
    dill.settings.update({'recurse': True, 'fmode': dill.HANDLE_FMODE})
    dill.dump_session(filename=session_file)

In [0]:
# Index every file under filedir by phenotype and by kind of output.
# Each dict maps phenotype -> list of full paths. The phenotype is the second
# "_"-separated field of the file name; the kind is decided by the first of
# 'path', 'mcmc', 'gamma', 'snp' that occurs anywhere in the file name.
path_files = {}
mcmc_files = {}
gamma_files = {}
snp_files = {}
for root, dirs, files in scandir.walk(filedir):
    for f in files:
        d = f.split("_")
        if len(d) < 2:
            # No phenotype field in the name. This happens for the pimass.dill
            # file that dump_session() writes into this same directory, which
            # used to raise IndexError on d[1] when the notebook was re-run.
            continue
        pheno = d[1]
        if pheno not in path_files:
            # All four dicts get the key together so later lookups by
            # phenotype never fail for a phenotype that was seen at all.
            path_files[pheno] = []
            mcmc_files[pheno] = []
            gamma_files[pheno] = []
            snp_files[pheno] = []
        full_path = os.path.join(root, f)
        if 'path' in f:
            path_files[pheno].append(full_path)
        elif 'mcmc' in f:
            mcmc_files[pheno].append(full_path)
        elif 'gamma' in f:
            gamma_files[pheno].append(full_path)
        elif 'snp' in f:
            snp_files[pheno].append(full_path)

In [0]:
%%R
# Load coda before the cells below look up mcmc, mcmc.list and effectiveSize.
library(coda)

In [0]:
# Python handles to the R objects named `mcmc` and `mcmc.list`.
# NOTE(review): a later cell rebinds the name `mcmc` to a plain dict, so the
# cell that calls mcmc(...) must run before that rebinding happens.
mcmc = r('mcmc')
mcmc_list = r('mcmc.list')

In [0]:
# Load the tab-separated "path" tables: dfs maps phenotype -> list of
# DataFrames, one per file listed in path_files for that phenotype.
dfs = {}
phenos = ["mass", "pd", "tdt"]
for pheno in phenos:
    frames = [pd.read_csv(x, sep="\t") for x in path_files[pheno]]
    # Drop the last column by position. .iloc is the explicit positional
    # indexer; the previously used .ix guessed between labels and positions
    # and no longer exists in pandas >= 1.0.
    # NOTE(review): presumably the last column is an empty artifact of a
    # trailing tab in the files -- confirm.
    frames = [x.iloc[:, :-1] for x in frames]
    for df in frames:
        # Column headers may carry surrounding whitespace.
        df.columns = [x.strip() for x in df.columns]
    dfs[pheno] = frames

In [0]:
# Peek at the first rows of the first "mass" path table.
dfs['mass'][0].head()

In [0]:
# Build, per phenotype, the (optionally thinned) path tables both as pandas
# DataFrames (path_mcmc) and as R objects created by R's `mcmc` (path_mcmc_r).
path_mcmc_r = {}
path_mcmc = {}
# Fraction of rows kept from each table; 1 keeps every row.
thin = 1
# Look the R function up here instead of relying on the global name `mcmc`,
# which a later cell rebinds to a dict; this keeps the cell re-runnable.
as_r_mcmc = r('mcmc')
for key, dflist in dfs.items():
    # Sample once per table and reuse the result. Sampling separately for the
    # pandas and R copies (as before) draws two different random subsets as
    # soon as thin < 1, so the two views would no longer describe the same rows.
    # sort_index() restores the original row order after sampling.
    thinned = [x.sample(frac=thin).sort_index() for x in dflist]
    path_mcmc[key] = thinned
    path_mcmc_r[key] = [as_r_mcmc(pandas2ri.DataFrame(x)) for x in thinned]

In [0]:
# Combine each phenotype's R-side chains with R's mcmc.list, one object per phenotype.
path_mcmc_list_mass, path_mcmc_list_pd, path_mcmc_list_tdt = (
    mcmc_list(path_mcmc_r[pheno_key]) for pheno_key in ('mass', 'pd', 'tdt')
)

In [0]:
# Pass the three Python-side objects into the R session (-i) under the same names.
%R -i path_mcmc_list_mass -i path_mcmc_list_pd -i path_mcmc_list_tdt

In [0]:
%%R
# Apply effectiveSize to every element of each list, giving one result per
# element; the results are read back into pandas by get_effective_sizes().
effective_sizes_mass = lapply(path_mcmc_list_mass,effectiveSize)
effective_sizes_pd = lapply(path_mcmc_list_pd,effectiveSize)
effective_sizes_tdt = lapply(path_mcmc_list_tdt,effectiveSize)

In [0]:
def get_effective_sizes(r_name):
    """Convert the R list called ``r_name`` into a pandas DataFrame.

    Each element of the R list becomes one row. The column labels are the
    R ``names`` of the list's first element.
    """
    r_list = r[r_name]
    rows = [pandas2ri.ri2py(element) for element in r_list]
    table = pd.DataFrame(rows)
    first_element = r_list.rx2(1)
    table.columns = r('names')(first_element)
    return table
# Effective-size tables for the three phenotypes, built in the order tdt, pd, mass.
ne_tdt, ne_pd, ne_mass = (
    get_effective_sizes(r_list_name)
    for r_list_name in ('effective_sizes_tdt', 'effective_sizes_pd', 'effective_sizes_mass')
)


In [0]:
# Column-wise mean and standard deviation of the TDT effective sizes.
# Parenthesised single-argument print produces the same output on Python 2
# and is also valid on Python 3 (the bare statement form is not).
print(ne_tdt.mean())
print(ne_tdt.std())


In [0]:
# Column-wise mean of the PD effective sizes.
ne_pd.mean()

In [0]:
# Column-wise mean of the mass effective sizes.
ne_mass.mean()

In [0]:
# R summary() of each phenotype's mcmc.list, prefixed with a label.
# Formatting label and summary into one string keeps the Python 2 output
# ("LABEL <summary>") unchanged while also being valid Python 3.
print("%s %s" % ("MASS", r("summary")(path_mcmc_list_mass)))
print("%s %s" % ("PD", r("summary")(path_mcmc_list_pd)))
print("%s %s" % ("TDT", r("summary")(path_mcmc_list_tdt)))

In [0]:
%%R
# R-side plot() of the three lists that were passed into the R session above.
plot(path_mcmc_list_mass)
plot(path_mcmc_list_pd)
plot(path_mcmc_list_tdt)

In [0]:
# Per phenotype, read every tab-separated mcmc file and place the tables side
# by side. Every column gets the file's run index as a suffix ("<col>_<index>").
# NOTE(review): this rebinds `mcmc`, which earlier held the R `mcmc` function.
mcmc = {}
for pheno, files in mcmc_files.items():
    frames = []
    for f in files:
        # Run index: last "_"-separated field of the file name, up to the first ".".
        index = os.path.basename(f).split("_")[-1].split(".")[0]
        testdf = pd.read_csv(f, sep="\t")
        testdf.columns = ["%s_%s" % (x.strip(), index) for x in testdf.columns]
        frames.append(testdf)
    # Concatenate once per phenotype; concatenating inside the loop re-copies
    # the growing frame for every file. A phenotype without files still gets
    # an empty DataFrame, as before.
    mcmc[pheno] = pd.concat(frames, axis=1) if frames else pd.DataFrame()

In [0]:
# Row-wise means over all mass columns whose name contains a given tag.
# 'beta_' (with underscore) picks up the beta_<index> columns but not betarb_<index>.
mcmc_mass = mcmc['mass']
avg_pip_mass, avg_effect_mass, avg_pip_mass_rb, avg_effect_mass_rb = (
    mcmc_mass[[col for col in mcmc_mass.columns if tag in col]].apply(np.mean, axis=1)
    for tag in ('postc', 'beta_', 'postrb', 'betarb')
)

In [0]:
# contig_pips[pheno][contig] -> running sums of 'postc', 'beta', 'betarb', 'postrb'.
contig_pips = {}
def get_contig_pip(row, pheno, run=1):
    """Add one SNP row's statistics to the running sums of its contig.

    The contig name is the row's ``rs_<run>`` value with its last
    "_"-separated field removed. The row's ``postc_<run>``, ``postrb_<run>``,
    ``beta_<run>`` and ``betarb_<run>`` values are added to
    ``contig_pips[pheno][contig]``.

    Parameters
    ----------
    row : object with the ``*_<run>`` attributes (e.g. a DataFrame row)
    pheno : key under which the sums are stored in ``contig_pips``
    run : run index used as the column suffix. Defaults to 1, the suffix
        that was previously hard-coded.

    Returns
    -------
    None; ``contig_pips`` is updated in place.
    """
    per_pheno = contig_pips.setdefault(pheno, {})

    def field(name):
        # Resolve e.g. "postc" -> row.postc_1 for run=1.
        return getattr(row, "%s_%s" % (name, run))

    # rpartition drops the final "_"-separated field; an id without any "_"
    # yields the empty string, matching "_".join(parts[:-1]).
    contig = field("rs").rpartition("_")[0]
    totals = per_pheno.setdefault(contig, {'postc': 0,
                                           'beta': 0,
                                           'betarb': 0,
                                           'postrb': 0})
    for stat in ('postc', 'postrb', 'beta', 'betarb'):
        totals[stat] += field(stat)

# Feed every row of every phenotype table through get_contig_pip.
# apply() is used only for its side effect on contig_pips; its result is discarded.
for pheno, df in mcmc.items():
    # Progress marker; the parenthesised form works on Python 2 and 3 alike.
    print(pheno)
    df.apply(get_contig_pip, args=(pheno,), axis=1)


In [0]:
# One DataFrame per phenotype: after the transpose, rows are contigs and
# columns are the summed statistics.
contig_pip_dfs = {
    pheno_key: pd.DataFrame(per_contig).T
    for pheno_key, per_contig in contig_pips.items()
}

In [0]:
from Bio import SeqIO
# Map every FASTA record name in the assembly to {"length": <sequence length>}.
contig_lengths = {
    record.name: {"length": len(record)}
    for record in SeqIO.parse(assembly, "fasta")
}

In [0]:
# Transposed so that each record name is a row and "length" is the only column.
contig_length_df = pd.DataFrame(contig_lengths).T

In [0]:
contig_length_df.head()

In [0]:
# Add the length column to the per-contig mass sums.
# NOTE(review): presumes the contig names derived from SNP ids match the FASTA record names -- confirm.
contig_pip_mass = contig_pip_dfs['mass'].join(contig_length_df)

In [0]:
# Per-contig summed postrb for the TDT phenotype.
tdt_contig_postrb = contig_pip_dfs['tdt']['postrb']
plt.plot(tdt_contig_postrb, label="TDT")
plt.legend()
plt.show()

In [0]:
# Row-wise means over all PD columns whose name contains a given tag.
# 'beta_' (with underscore) picks up the beta_<index> columns but not betarb_<index>.
mcmc_pd = mcmc['pd']
avg_pip_pd, avg_effect_pd, avg_pip_pd_rb, avg_effect_pd_rb = (
    mcmc_pd[[col for col in mcmc_pd.columns if tag in col]].apply(np.mean, axis=1)
    for tag in ('postc', 'beta_', 'postrb', 'betarb')
)

In [0]:
# Row-wise means over all TDT columns whose name contains a given tag.
# 'beta_' (with underscore) picks up the beta_<index> columns but not betarb_<index>.
mcmc_tdt = mcmc['tdt']
avg_pip_tdt, avg_effect_tdt, avg_pip_tdt_rb, avg_effect_tdt_rb = (
    mcmc_tdt[[col for col in mcmc_tdt.columns if tag in col]].apply(np.mean, axis=1)
    for tag in ('postc', 'beta_', 'postrb', 'betarb')
)

In [0]:
# Mass: row-wise averaged postrb and betarb values plotted against row position.
# The x range is fixed to the number of rows before anything is drawn.
plt.xlim(0, len(mcmc_mass))
#plt.plot(avg_pip_mass, label="PIP")
#plt.plot(avg_effect_mass, alpha=0.5, label="Beta")
for series, legend_label in ((avg_pip_mass_rb, "PIP (RB)"),
                             (avg_effect_mass_rb, "Beta (RB)")):
    plt.plot(series, alpha=0.5, label=legend_label)
plt.title("Mass")
plt.xlabel("SNP")
plt.legend()
plt.show()

In [0]:
# PD: row-wise averaged postrb and betarb values plotted against row position.
# The x range is fixed to the number of rows before anything is drawn.
plt.xlim(0, len(mcmc_pd))
#plt.plot(avg_pip_pd, label="PIP")
#plt.plot(avg_effect_pd, alpha=0.5, label="Beta")
for series, legend_label in ((avg_pip_pd_rb, "PIP (RB)"),
                             (avg_effect_pd_rb, "Beta (RB)")):
    plt.plot(series, alpha=0.5, label=legend_label)
plt.title("PD")
plt.xlabel("SNP")
plt.legend()
plt.show()

In [0]:
# TDT: row-wise averaged postrb and betarb values plotted against row position.
# The x range is fixed to the number of rows before anything is drawn.
plt.xlim(0, len(mcmc_tdt))
#plt.plot(avg_pip_tdt, label="PIP")
#plt.plot(avg_effect_tdt, alpha=0.5, label="Beta")
for series, legend_label in ((avg_pip_tdt_rb, "PIP (RB)"),
                             (avg_effect_tdt_rb, "Beta (RB)")):
    plt.plot(series, alpha=0.5, label=legend_label)
plt.title("TDT")
plt.xlabel("SNP")
plt.legend()
plt.show()

In [0]:
# Peek at the first rows of the combined PD table.
mcmc_pd.head()

In [0]:
# Per phenotype, read every snp file and place the tables side by side.
# Values stay as strings (no type conversion is done here). Every column gets
# the file's run index as a suffix ("<col>_<index>").
snps = {}
for pheno, files in snp_files.items():
    frames = []
    for f in files:
        # Run index: last "_"-separated field of the file name, up to the first ".".
        index = os.path.basename(f).split("_")[-1].split(".")[0]
        # The context manager closes each file; previously one handle per file
        # was opened in this loop and never closed.
        with open(f) as h:
            h.readline()  # the first line is discarded
            header = h.readline().strip().split()  # column names come from the second line
            data = [line.strip().split() for line in h]

        testdf = pd.DataFrame(data, columns=header)
        testdf.columns = ["%s_%s" % (x.strip(), index) for x in testdf.columns]
        frames.append(testdf)
    # Concatenate once per phenotype rather than re-copying a growing frame
    # for every file; a phenotype without files still gets an empty DataFrame.
    snps[pheno] = pd.concat(frames, axis=1) if frames else pd.DataFrame()

In [0]:
# Keep only the mass columns that belong to run index 1. Columns are named
# "<col>_<index>", so the suffix is matched exactly: the former substring test
# ('_1' in x) would also have selected runs 10, 11, ... and any column whose
# base name happens to contain "_1".
snps_mass = snps['mass'][[x for x in snps['mass'] if x.endswith('_1')]]

In [0]:
# Peek at the first rows of the selected mass SNP columns.
snps_mass.head()

In [0]:
def read_gamma(f):
    """Read a whitespace-delimited table into an all-float DataFrame.

    The first line supplies the column names; every following line is split
    on whitespace into one row. The literal string 'NA' is replaced by NaN
    before the whole table is cast to float.

    Parameters
    ----------
    f : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
        One row per data line, all columns of dtype float.
    """
    # The context manager closes the handle even if parsing fails; the file
    # was previously left open.
    with open(f) as h:
        header = h.readline().strip().split()
        d = [line.strip().split() for line in h]
    df = pd.DataFrame(d, columns=header)
    return df.replace('NA', np.nan).astype(float)
# Only the first gamma file found for each phenotype is loaded.
gamma_mass, gamma_pd, gamma_tdt = (
    read_gamma(gamma_files[pheno_key][0]) for pheno_key in ('mass', 'pd', 'tdt')
)

In [0]:
# snp_density[value] -> number of times that value was seen across all rows.
snp_density = {}
def get_snp_density(row):
    """Count the non-missing values of ``row``, skipping its first entry.

    Every value left after dropping NaNs from ``row[1:]`` increments its
    counter in the module-level ``snp_density`` dict. Returns None.
    """
    for snp_id in row[1:].dropna():
        snp_density[snp_id] = snp_density.get(snp_id, 0) + 1
    
# Run the counter over every row of gamma_mass. apply() is used for its side
# effect on snp_density; x is not read again in the cells shown here.
x=gamma_mass.apply(get_snp_density, axis=1)


In [0]:
def percent_difference(x, y):
    """Return the absolute difference of ``x`` and ``y`` as a percentage of their mean.

    Both arguments are converted with ``float()`` first, so numeric strings
    are accepted.
    """
    first, second = float(x), float(y)
    midpoint = np.mean([first, second])
    gap = np.abs(first - second)
    return (gap / midpoint) * 100

In [0]:
def get_quantile_max(name, data, q, cutoff=0.01):
    """Summarise ``data`` as a named Series of quantiles and derived cutoffs.

    Parameters
    ----------
    name : label stored as the ``name`` of the returned Series
    data : pandas Series of values to summarise
    q : list of quantile levels. It must contain 0.99, because the '0.99'
        entry is read back below (a KeyError is raised otherwise).
    cutoff : fixed threshold stored under 'cutoff'. Defaults to 0.01, the
        value that was previously hard-coded.

    Returns
    -------
    pandas.Series
        Indexed by the quantile levels as strings (e.g. '0.95', '0.99') plus
        'median_val', 'mean_val', 'cutoff', 'x99_cutoff' (percent difference
        between the 0.99 quantile and the cutoff), 'x99_median' (percent
        difference between the 0.99 quantile and the median) and
        'relaxed_cutoff' (the 0.99 quantile itself).
    """
    d = data.quantile(q)
    # String labels let the quantiles share one index with the named entries
    # added below.
    d.index = [str(x) for x in d.index]
    d['median_val'] = data.median()
    d['mean_val'] = data.mean()
    d['cutoff'] = cutoff
    d["x99_cutoff"] = percent_difference(d['0.99'], d['cutoff'])
    d["x99_median"] = percent_difference(d['0.99'], d['median_val'])
    d['relaxed_cutoff'] = d['0.99']
    d.name = name
    return d

In [0]:
# Quantile summaries of the postrb_0 column for each phenotype.
mass_quant, pd_quant, tdt_quant = (
    get_quantile_max(label, frame['postrb_0'], [0.95, 0.99])
    for label, frame in (("mass", mcmc_mass), ("pd", mcmc_pd), ("tdt", mcmc_tdt))
)

In [0]:
# Show the three quantile summaries separated by blank lines. The parenthesised
# single-argument print gives the same output on Python 2 and is valid Python 3.
print("%s\n\n%s\n\n%s\n" % (mass_quant, pd_quant, tdt_quant))

In [0]:
# Rows whose postrb_0 exceeds the fixed cutoff of each phenotype's summary.
sig_snps_mass = mcmc_mass.loc[mcmc_mass['postrb_0'] > mass_quant['cutoff']]
sig_snps_tdt = mcmc_tdt.loc[mcmc_tdt['postrb_0'] > tdt_quant['cutoff']]
sig_snps_pd = mcmc_pd.loc[mcmc_pd['postrb_0'] > pd_quant['cutoff']]

# Rows whose postrb_0 exceeds the relaxed cutoff (the 0.99 quantile).
relaxed_sig_snps_mass = mcmc_mass.loc[mcmc_mass['postrb_0'] > mass_quant['relaxed_cutoff']]
relaxed_sig_snps_tdt = mcmc_tdt.loc[mcmc_tdt['postrb_0'] > tdt_quant['relaxed_cutoff']]
relaxed_sig_snps_pd = mcmc_pd.loc[mcmc_pd['postrb_0'] > pd_quant['relaxed_cutoff']]

In [0]:
# Two histograms of |betarb_0| for mass: rows above the fixed cutoff, then rows
# above the relaxed cutoff. Each figure is annotated with its row count.
for subset, count_y, heading in (
    (sig_snps_mass, 20, r"Mass ($> %.2f$)" % mass_quant.cutoff),
    (relaxed_sig_snps_mass, 50, r"Mass 99th($> %.5f$)" % mass_quant.relaxed_cutoff),
):
    plt.hist(np.abs(subset.betarb_0.values))
    plt.text(0.03, count_y, r"$n = %d$" % len(subset))
    plt.title(heading)
    plt.xlabel(r"$\beta$")
    plt.show()

In [0]:
# Two histograms of |betarb_0| for TDT: rows above the fixed cutoff, then rows
# above the relaxed cutoff. Each figure is annotated with its row count.
for subset, count_y, heading in (
    (sig_snps_tdt, 80, r"TDT ($> %.2f$)" % tdt_quant.cutoff),
    (relaxed_sig_snps_tdt, 100, r"TDT 99th ($> %.5f$)" % tdt_quant.relaxed_cutoff),
):
    plt.hist(np.abs(subset.betarb_0.values))
    plt.text(0.05, count_y, r"$n = %d$" % len(subset))
    plt.title(heading)
    plt.xlabel(r"$\beta$")
    plt.show()

In [0]:
# Two histograms of |betarb_0| for PD: rows above the fixed cutoff, then rows
# above the relaxed cutoff. Each figure is annotated with its row count.
for subset, count_y, heading in (
    (sig_snps_pd, 12, r"PD ($> %.2f$)" % pd_quant.cutoff),
    (relaxed_sig_snps_pd, 60, r"PD 99th ($> %.5f$)" % pd_quant.relaxed_cutoff),
):
    plt.hist(np.abs(subset.betarb_0.values))
    plt.text(0.02, count_y, r"$n = %d$" % len(subset))
    plt.title(heading)
    plt.xlabel(r"$\beta$")
    plt.show()

In [0]:
# (rows, columns) of the rows that passed the fixed cutoff, per phenotype.
sig_snps_mass.shape

In [0]:
sig_snps_tdt.shape

In [0]:
sig_snps_pd.shape

In [0]:
# Summary statistics of the 'snp' column of each gamma table.
# NOTE(review): presumably this column is the number of SNPs in the model per sampled iteration -- confirm.
gamma_mass['snp'].describe()

In [0]:
gamma_tdt['snp'].describe()

In [0]:
gamma_pd['snp'].describe()

In [0]:
# Grand mean of the row-wise means over all postrb columns, per phenotype.
mass_pip_mean, pd_pip_mean, tdt_pip_mean = (
    np.mean(frame[[col for col in frame if 'postrb' in col]].apply(np.mean, axis=1))
    for frame in (mcmc_mass, mcmc_pd, mcmc_tdt)
)

In [0]:
# Grand mean of the row-wise means of the absolute betarb values, per phenotype.
mass_beta_mean, pd_beta_mean, tdt_beta_mean = (
    np.mean(
        frame[[col for col in frame if 'betarb' in col]]
        .apply(np.abs, axis=1)
        .apply(np.mean, axis=1)
    )
    for frame in (mcmc_mass, mcmc_pd, mcmc_tdt)
)

In [0]:
# Display the three postrb grand means (mass, pd, tdt).
mass_pip_mean, pd_pip_mean, tdt_pip_mean

In [0]:
# Display the three absolute-betarb grand means (mass, pd, tdt).
mass_beta_mean, pd_beta_mean, tdt_beta_mean

In [0]:
# The postrb columns of each phenotype table; their correlations are shown in the next cells.
mass_pip, pd_pip, tdt_pip = (
    frame[[col for col in frame if 'postrb' in col]]
    for frame in (mcmc_mass, mcmc_pd, mcmc_tdt)
)

In [0]:
# Pairwise correlation between the postrb columns, per phenotype.
mass_pip.corr()

In [0]:
pd_pip.corr()

In [0]:
tdt_pip.corr()

In [0]:
# 0.95 quantile of postrb_0 for mass (the same quantity stored as '0.95' in mass_quant).
mcmc_mass.postrb_0.quantile(q=[0.95])

In [0]:
# Save the session to pimass.dill under filedir.
dump_session()