In [0]:
import scandir
import os
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import dill
import random
import cyvcf
from hdfstorehelper import HDFStoreHelper
import statsmodels.api as sm
import statsmodels.formula.api as smf
import operator
import traceback
%load_ext rpy2.ipython
from rpy2.robjects import pandas2ri as p2r
p2r.activate()
r = ro.r
from collections import defaultdict
from sklearn import preprocessing
import scipy as sp
import shutil

In [0]:
# Root directories of the two parallel analyses: SNP calls without imputation
# and the Beagle-imputed calls.
analysis_dir_notimp = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/notimputed/"
analysis_dir_imp = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle40/"

# Wrap the "isect.hd5" file found in each analysis directory, in the same
# (not-imputed, imputed) order.
hdf_notimp, hdf_imp = [
    HDFStoreHelper(os.path.join(root, "isect.hd5"))
    for root in (analysis_dir_notimp, analysis_dir_imp)
]

In [0]:
# Analysis directories ordered [not-imputed, imputed]. Later cells walk this
# list with enumerate() and index the parallel per-dataset lists by position.
analysis_dir = [analysis_dir_notimp, analysis_dir_imp]

In [0]:
# HDF stores in the same [not-imputed, imputed] order as analysis_dir.
hdfs = [hdf_notimp, hdf_imp]

In [0]:
# Load the serialized per-population allele table from each analysis directory.
# SECURITY NOTE: dill, like pickle, can execute arbitrary code while loading;
# only point this at files produced by this project.
pop_allele_freqs = []
for d in analysis_dir:
    infile = os.path.join(d, "pop_allele_freqs.dill")
    # Binary mode is what pickle-style payloads need, and the context manager
    # closes the handle instead of leaking it for the life of the kernel.
    with open(infile, "rb") as dill_fh:
        paf = dill.load(dill_fh)
    pop_allele_freqs.append(paf)

In [0]:
# Build one SNP table per dataset: two rows per SNP ("<snp>_1" from the "P"
# row and "<snp>_2" from the "Q" row of each population's table), one column
# per population, plus an empty trailing "blank" column.
snpsfile_df = []
for paf in pop_allele_freqs:
    pops = sorted(paf)
    paf_data = defaultdict(defaultdict)
    for popn in pops:
        af = paf[popn]
        for snp in af:
            # .loc is the label-based replacement for the deprecated .ix;
            # both keys here are labels ("P"/"Q" row, SNP column).
            paf_data["%s_1" % snp][popn] = af.loc["P", snp]
            paf_data["%s_2" % snp][popn] = af.loc["Q", snp]
    # Outer keys become columns, so transpose to get one row per SNP allele.
    df = pd.DataFrame(paf_data).T
    # Empty last column; presumably there to produce the trailing tab the
    # downstream tab-separated files need -- TODO confirm against bayenv2 docs.
    df['blank'] = ""
    snpsfile_df.append(df)

In [0]:
# Column labels of the first SNP table without the "blank" padding column.
# Used below to select and order the bioclim columns the same way.
environ_cols = snpsfile_df[0].columns.drop("blank")

In [0]:
def _scaled_bioclim(store):
    """Return the standardized bioclim table held in one HDF store.

    Keeps the entries whose name contains 'BIO', transposes so each of them
    becomes a row, restricts/orders the columns to ``environ_cols``, casts to
    float and applies ``preprocessing.scale`` to every row.
    """
    table = store['bioclim']
    bio_names = [name for name in table if 'BIO' in name]
    by_variable = table[bio_names].T
    by_variable = by_variable[environ_cols].astype(float)
    return by_variable.apply(preprocessing.scale, axis=1)


# One standardized table per dataset, in the same order as hdfs.
bioclim = [_scaled_bioclim(store) for store in hdfs]

In [0]:
# Append the same empty "blank" column the SNP tables carry.
bioclim = [frame.assign(blank="") for frame in bioclim]

In [0]:
# Display the first (not-imputed) standardized bioclim table as a visual check.
bioclim[0]

In [0]:
# Write, for every dataset, the SNP table ("snpsfile") and the matching
# environment table ("environfile") as header-less tab-separated files.
for i in range(len(snpsfile_df)):
    target_dir = analysis_dir[i]
    # SNP rows are written without their labels.
    snpsfile_df[i].to_csv(os.path.join(target_dir, "snpsfile"),
                          sep="\t", header=False, index=False)
    # Environment rows keep their label as the first field.
    bioclim[i].to_csv(os.path.join(target_dir, "environfile"),
                      sep="\t", header=False, index=True)

```bash
~/g/src/bayenv2_public/bayenv2 -i snpsfile -p 6 -k 100000 -r 187564 > matrix.out
```

In [0]:
# Parse matrix.out in each analysis directory into a list of blocks. A new
# block starts at every line containing "VAR-COVAR"; that header line and all
# following lines (tab-split) are collected until the next header. Lines before
# the first header are ignored.
vcov = []
for d in analysis_dir:
    vcovs = []
    current = None
    # The context manager closes the file; the original iterated over a bare
    # open() and left the handle to the garbage collector.
    with open(os.path.join(d, "matrix.out")) as matrix_fh:
        for line in matrix_fh:
            if "VAR-COVAR" in line:
                current = []
                vcovs.append(current)
            if current is not None:
                current.append(line.strip().split("\t"))
    vcov.append(vcovs)

In [0]:
# Turn every parsed block into a DataFrame: drop the block's first (header)
# line, build the frame from the remaining rows and transpose it.
vcov_dfs = [
    [pd.DataFrame(block[1:]).T for block in dataset_blocks]
    for dataset_blocks in vcov
]

In [0]:
# For each dataset, compute the Pearson correlation between every pair of
# consecutive matrices (k, k+1).
matrix_correlations = []
for vcov_df in vcov_dfs:
    temp = []
    # Pair each matrix directly with its successor. The original scanned all
    # (i, j) combinations just to pick j == i + 1, which is O(n^2).
    for idf, jdf in zip(vcov_df[:-1], vcov_df[1:]):
        # Label-based, end-inclusive column slice; the drop-in replacement for
        # the deprecated .ix on these integer-labelled columns. Presumably this
        # trims trailing non-matrix columns -- TODO confirm on real output.
        idf = idf.loc[:, :len(idf) - 1]
        jdf = jdf.loc[:, :len(jdf) - 1]
        # Cells were read as text, so convert to float before correlating.
        idf = [float(x) for x in idf.values.flatten()]
        jdf = [float(x) for x in jdf.values.flatten()]
        assert len(idf) == len(jdf)
        temp.append(sp.stats.pearsonr(idf, jdf)[0])
    matrix_correlations.append(temp)

In [0]:
# One line plot per dataset: correlation of each consecutive matrix pair, in
# the order the matrices were read.
for corr_values in matrix_correlations:
    n_pairs = len(corr_values)
    plt.plot(range(n_pairs), corr_values)
    plt.title("Pearson correlations among %d adjacent VCOV matrices" % n_pairs)
    plt.show()

In [0]:
# Save the last matrix of every dataset as a header-less, index-less
# tab-separated file named "matrix_last.out" in its analysis directory.
for i in range(len(analysis_dir)):
    final_matrix = vcov_dfs[i][-1]
    final_matrix.to_csv(os.path.join(analysis_dir[i], "matrix_last.out"),
                        sep="\t", index=False, header=False)

In [0]:
# Copies of the SNP tables without the "blank" padding column. drop() returns
# new frames, so snpsfile_df itself is left unchanged.
snp_indv = [x.drop("blank", axis=1) for x in snpsfile_df]

In [0]:
# Tag every row with its SNP name: the row label minus its last
# "_"-separated token (the "_1"/"_2" suffix).
for s in snp_indv:
    # Read the labels straight from the index. The original used a row-wise
    # DataFrame.apply, which builds a Series per row only to look at its name.
    s['snp_name'] = ["_".join(label.split("_")[0:-1]) for label in s.index]

In [0]:
# Split each dataset's SNP table into one file per SNP under
# <analysis_dir>/bayenv/<snp>.txt and remember the written paths.
# bayenv[i] is the list of per-SNP files for dataset i.
bayenv = []
for i, s in enumerate(snp_indv):
    written = []
    bayenv.append(written)
    outdir = os.path.join(analysis_dir[i], "bayenv")
    if not os.path.exists(outdir):
        os.mkdir(outdir)
    for group, data in s.groupby("snp_name"):
        bayenv_file = os.path.join(outdir, "%s.txt" % group)
        written.append(bayenv_file)
        # Work on a separate frame: remove the grouping key and put the empty
        # "blank" column back as the last field.
        per_snp = pd.DataFrame(data)
        per_snp['blank'] = ""
        per_snp = per_snp.drop("snp_name", axis=1)
        per_snp.to_csv(bayenv_file, sep="\t", header=False, index=False)

In [0]:
# Show a summary (file count and first three paths per dataset) instead of
# dumping every generated per-SNP path into the cell output.
[(len(files), files[:3]) for files in bayenv]

In [0]:
# Write one "bayenv_parallel" job file per dataset into its bayenv directory:
# one bayenv2 command line per SNP file, each prefixed with `taskset -c <cpu>`.
bayenv_exe = "/home/cfriedline/g/src/bayenv2_public/bayenv2"
# "snpfile", "matrixfile" and "rand" are placeholders replaced per SNP below;
# "environfile" is already the real file name.
bayenv_opt = "-i snpfile -m matrixfile -e environfile -p 6 -k 100000 -n 19 -t -X -c -f -r rand"

# CPUs 0 .. max_cpu-1 are handed out round-robin.
max_cpu = 30
# The token template is loop-invariant, so build it once.
bayenv_template = " ".join([bayenv_exe, bayenv_opt]).split()

for i, bayenv_files in enumerate(bayenv):
    if not bayenv_files:
        # No SNP files for this dataset: nothing to schedule. Indexing
        # bayenv_files[0] here would raise IndexError.
        continue
    thedir = analysis_dir[i]
    bayenv_dir = os.path.dirname(bayenv_files[0])
    # The commands use bare file names, so the shared inputs must sit next to
    # the per-SNP files.
    shutil.copy(os.path.join(thedir, "matrix_last.out"), bayenv_dir)
    shutil.copy(os.path.join(thedir, "environfile"), bayenv_dir)
    with open(os.path.join(bayenv_dir, "bayenv_parallel"), "w") as o:
        for job_number, bayenv_file in enumerate(bayenv_files):
            cpu = job_number % max_cpu
            # NOTE(review): seeds come from the unseeded global `random`
            # generator, so the job file differs between runs.
            replacements = {
                "snpfile": os.path.basename(bayenv_file),
                "matrixfile": "matrix_last.out",
                "rand": str(int(random.getrandbits(16))),
            }
            bayenv_cmd = [replacements.get(token, token) for token in bayenv_template]
            o.write("taskset -c %d %s\n" % (cpu, " ".join(bayenv_cmd)))

## Run bayenv jobs

```bash
cat bayenv_parallel | parallel -j 30 --eta --
```

In [0]:
# Column names for the Bayes-factor output: for every row label of the
# bioclim table, three consecutive columns suffixed _bf, _r and _p.
vartypes = ['bf', 'r', 'p']
bf_cols = ["%s_%s" % (b, v) for b in bioclim[0].index for v in vartypes]

In [0]:
# Read the result tables from each dataset's bayenv directory:
#   XtX_out.environfile    -> one "XtX" column per SNP
#   bf_environ.environfile -> the bf_cols columns per SNP
# Both are indexed by SNP name (the per-SNP file name without ".txt").
xtxs = []
bfs = []
for dir_name in analysis_dir:
    bayenv_dir = os.path.join(dir_name, "bayenv")
    xtx = os.path.join(bayenv_dir, "XtX_out.environfile")
    bf = os.path.join(bayenv_dir, "bf_environ.environfile")

    xtx_df = pd.read_csv(xtx, sep="\t", header=None, index_col=0, names=['XtX'])
    xtx_df.index = [x.replace(".txt", "") for x in xtx_df.index]
    # Name the index AFTER replacing it: assigning a plain list as the index
    # creates an unnamed Index, so the original order lost "snp" for XtX.
    xtx_df.index.name = "snp"

    bf_df = pd.read_csv(bf, sep="\t", header=None, index_col=0)
    # Drop the last parsed column before applying the expected names;
    # presumably an empty column from a trailing tab -- TODO confirm.
    bf_df = bf_df.drop(bf_df.columns[-1], axis=1)
    bf_df.index = [x.replace(".txt", "") for x in bf_df.index]
    bf_df.index.name = "snp"
    bf_df.columns = bf_cols

    bfs.append(bf_df)
    xtxs.append(xtx_df)

In [0]:
# Labels for the two datasets, in the same order as analysis_dir.
impute_status = ["notimputed", "imputed"]

# Write the Bayes-factor and XtX tables of every dataset as tab-separated
# files with header and index.
for i, dir_name in enumerate(analysis_dir):
    status = impute_status[i]
    for name_pattern, table in (("bayenv_bf_%s.txt", bfs[i]),
                                ("bayenv_xtx_%s.txt", xtxs[i])):
        table.to_csv(os.path.join(dir_name, name_pattern % status),
                     sep="\t", index=True, header=True)

In [0]:
# Combine XtX and Bayes-factor columns per SNP (index join, XtX on the left),
# save the combined table and keep it for plotting.
joined = []
for i, dir_name in enumerate(analysis_dir):
    results_path = os.path.join(dir_name, "bayenv_results_%s.txt" % impute_status[i])
    combined = xtxs[i].join(bfs[i])
    combined.to_csv(results_path, sep="\t", index=True, header=True)
    joined.append(combined)

In [0]:
# Use seaborn's "notebook" plotting context for the figures that follow.
sns.set_context("notebook")

In [0]:
# Distribution of XtX per dataset, drawn on identical axis limits so the two
# figures can be compared directly.
for i in range(len(joined)):
    sns.distplot(joined[i].XtX)
    plt.title(impute_status[i])
    plt.xlim(4.5, 8.5)
    plt.ylim(0, 1.6)
    plt.show()

In [0]:
# NOTE(review): loads seaborn's example "tips" dataset and shows its head.
# Nothing else in the visible notebook uses it -- looks like a scratch cell;
# consider removing.
sns.load_dataset("tips").head()

In [0]:
def plot_bf_vs_xtx(df, imputed, dir_name):
    """Scatter every Bayes-factor column against the first column of ``df``.

    Draws one subplot per column after the first (x = that column, y = the
    first column), saves the figure as ``bf_vs_xtx_<imputed>.pdf`` inside
    ``dir_name`` and then shows it.

    Parameters
    ----------
    df : pandas.DataFrame
        First column holds the y values; every further column is plotted on
        the x axis of its own panel. Panel titles use the column name up to
        its first underscore.
    imputed : str
        Label appended to every panel title and used in the output file name.
    dir_name : str
        Directory the PDF is written to.
    """
    n_cols = 4
    num_figs = len(df.columns) - 1
    # Keep the original 5x4 layout, but add rows when there are more panels
    # than that grid can hold (ceil division without importing math).
    n_rows = max(5, -(-num_figs // n_cols))
    plt.gcf().set_size_inches(20, 20)

    # y values come from the frame that was passed in. The original read an
    # undefined global `test` here.
    y_values = df.iloc[:, 0]
    # Columns 1..num_figs inclusive. The original loop stopped one short and
    # never plotted the last column.
    for i in range(1, num_figs + 1):
        plt.subplot(n_rows, n_cols, i)
        plt.title("%s (%s)" % (df.columns[i].split("_")[0], imputed))
        plt.xlabel("BF")
        plt.ylabel(df.columns[0])
        plt.scatter(df.iloc[:, i], y_values)

    plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                        wspace=.5, hspace=.5)
    plt.savefig("%s.pdf" % os.path.join(dir_name, "bf_vs_xtx_%s" % imputed))
    plt.show()

In [0]:
# For every dataset, plot XtX against each column whose name contains 'bf'.
for i, res in enumerate(joined):
    bf_columns = [col for col in res if 'bf' in col]
    xtx_only = pd.DataFrame(res['XtX'])
    df = xtx_only.join(res[bf_columns])
    plot_bf_vs_xtx(df, impute_status[i], analysis_dir[i])

In [0]:
# One-row summary per dataset of XtX outliers, defined as values more than
# 3 MAD away from the median on either side.
summary_columns = ['state', 'median_XtX', 'MAD', 'lower_cutoff', 'upper_cutoff', 'n_lower', 'n_upper']
outliers = []
for i, res in enumerate(joined):
    XtX = res.XtX
    M = np.median(XtX)
    MAD = sm.robust.mad(XtX)
    lower_cutoff = M - 3 * MAD
    upper_cutoff = M + 3 * MAD
    n_lower = len(XtX[XtX < lower_cutoff])
    n_upper = len(XtX[XtX > upper_cutoff])
    # A flat list becomes a single column; transposing gives one summary row.
    o = pd.DataFrame([impute_status[i], M, MAD,
                      lower_cutoff, upper_cutoff,
                      n_lower, n_upper]).T
    o.columns = summary_columns
    outliers.append(o)

In [0]:
# Stack the per-dataset outlier summaries into one table for display.
pd.concat(outliers)

In [0]:
# Leftover interactive introspection: IPython's `??` prefix shows the source
# of sm.robust.mad, which the outlier cell uses for the XtX cutoffs. This is
# IPython-only syntax, not plain Python.
# NOTE(review): consider removing it from the finished notebook.
??sm.robust.mad