In [0]:
!uname -a

In [0]:
cd ~/ipynb/gypsy_moth/

In [0]:
import scandir
import os
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import dill
import random
from hdfstorehelper import HDFStoreHelper
import statsmodels.api as sm
import statsmodels.formula.api as smf
import operator
import traceback
%load_ext rpy2.ipython
from rpy2.robjects import pandas2ri as p2r
p2r.activate()
r = ro.r
from collections import defaultdict
from sklearn import preprocessing
import scipy as sp
import shutil
import pickle
from utils import read_df, save_df
import iterlib

In [0]:
analysis_dir_notimp = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/ni/"
analysis_dir_imp = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/beagle40/"

In [0]:
analysis_dir = [analysis_dir_notimp, analysis_dir_imp]

In [0]:
%%R
library(sp)
library(raster)

In [0]:
# Handles to the R functions `raster` and `extract`, looked up by name in the
# embedded R session.
raster = r("raster")
extract = r("extract")

# Sampling-site coordinates keyed by population label, stored as [lat, lon].
gps = {'QC32':[47.2509807, -79.4060515],
      'QC93': [46.90826, -70.8061075],
      'NC': [36.449125, -76.024672],
      'NY': [42.897768, -74.094761],
      'VA1': [38.657615, -77.463603],
      'VA2': [38.857470, -77.695003]}
# Transpose so that each population is a row with two coordinate columns.
gps_df = pd.DataFrame(gps).T
gps_df.columns = ['lat','lon']

# Convert the coordinates to an R object with the columns reordered to
# (lon, lat) before handing them to `extract`.
latlon = pandas2ri.py2ri(gps_df[['lon', 'lat']])

bioclim_dir = "/gpfs_fs/home/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/bioclim_13"
# List every .bil file in bioclim_dir via the shell, then sort the paths.
bioclim = !ls {bioclim_dir}/*.bil
bioclim = sorted(bioclim)
# Start from the coordinate table and join one column per .bil file onto it.
bioclim_df = pd.DataFrame(gps_df)
for b in bioclim:
    rast = raster(b)
    # Column name = upper-cased part of the file name before the first "_".
    bio = os.path.basename(b).split("_")[0].upper()
    vals = pd.DataFrame(pandas2ri.ri2py(extract(rast, latlon)))
    # Extracted values are relabelled with the population index positionally,
    # i.e. this assumes `extract` returns one value per row of `latlon`, in
    # the same order -- TODO confirm.
    vals.index = bioclim_df.index
    vals.columns = [bio]
    bioclim_df = bioclim_df.join(vals)
bioclim_df = bioclim_df.sort_index()

In [0]:
bioclim_df

In [0]:
for d in analysis_dir:
    save_df(d, "bioclim_df", bioclim_df)

In [0]:
z12_swapped = []
for d in analysis_dir:
    z12_swapped.append(read_df(d, "z12_swapped"))

In [0]:
z12_swapped[1].head()

In [0]:
# Load the pickled per-population allele data of every analysis directory,
# in the same order as `analysis_dir`.
# NOTE(review): pickle.load can execute arbitrary code from the file; only
# load pickles that were produced by this project.
pop_allele_freqs = []
for d in analysis_dir:
    infile = os.path.join(d, "pop_allele_data.pkl")
    # The context manager closes the handle right after loading instead of
    # leaving it open until garbage collection.
    with open(infile, "rb") as handle:
        paf = pickle.load(handle)
    pop_allele_freqs.append(paf)

In [0]:
# Build one table per analysis directory with two rows per SNP
# ("<snp>_1" from the "P" row and "<snp>_2" from the "Q" row of the allele
# data) and one column per population, plus a trailing empty "blank" column.
# NOTE(review): "P"/"Q" are presumably the two allele rows -- confirm against
# the code that wrote pop_allele_data.pkl.
snpsfile_df = []
for paf in pop_allele_freqs:
    pops = sorted(paf)
    paf_data = defaultdict(dict)
    for popn in pops:
        af = pd.DataFrame(paf[popn])
        for snp in af:
            # .loc is the label-based replacement for the removed `.ix`.
            paf_data["%s_1" % snp][popn] = af.loc["P", snp]
            paf_data["%s_2" % snp][popn] = af.loc["Q", snp]
    df = pd.DataFrame(paf_data).T
    # Empty last column, so every written row ends with a trailing tab.
    df['blank'] = ""
    snpsfile_df.append(df)

In [0]:
environ_cols = snpsfile_df[0].columns.drop("blank")

In [0]:
environ_cols

In [0]:
bioclim = [read_df(x, "bioclim_df") for x in analysis_dir]

In [0]:
print(bioclim[1].to_csv())

In [0]:
# Keep only the table read from the second analysis directory.
# NOTE(review): this cell rebinds `bioclim` from itself, so re-running it
# without re-running the cell that reads the tables will not reproduce it.
bioclim = [bioclim[1]]
# Keep the columns whose name contains 'BIO', then put variables on rows and
# populations on columns, ordered like the SNP table's population columns.
bioclim = [x[[y for y in x if 'BIO' in y]] for x in bioclim]
bioclim = [x.T for x in bioclim]
bioclim = [x[environ_cols] for x in bioclim]
bioclim = [x.astype(float) for x in bioclim]
# Standardise every row (one variable across populations). Scaling the 2-D
# array with axis=1 keeps the result a DataFrame; DataFrame.apply with a
# function returning an ndarray yields a Series of arrays on pandas >= 0.23,
# which would break the `.assign` that follows.
bioclim = [
    pd.DataFrame(preprocessing.scale(x.values, axis=1), index=x.index, columns=x.columns)
    for x in bioclim
]

In [0]:
bioclim = [x.assign(blank=lambda x: "") for x in bioclim]

In [0]:
bioclim.append(bioclim[0])

In [0]:
# For each SNP table, write "snpsfile" and "environfile" into the analysis
# directory with the same position in `analysis_dir`.
for i,snps in enumerate(snpsfile_df):
    snp_outfile = os.path.join(analysis_dir[i], "snpsfile")
    env_outfile = os.path.join(analysis_dir[i], "environfile")
    # Tab-separated, without header and without row labels.
    snps.to_csv(snp_outfile, sep="\t", header=False, index=False)
    envs = bioclim[i]
    # Tab-separated, without header; the row label is written as the first
    # field of every line.
    # NOTE(review): confirm bayenv2 accepts a leading label field here.
    envs.to_csv(env_outfile, sep="\t", header=False, index=True)

## Run bayenv to create covariance matrix

```bash
~/g/src/bayenv2_public/bayenv2 -i snpsfile -p 6 -k 100000 -r 187564 > matrix.out
```

In [0]:
analysis_dir

In [0]:
# Parse matrix.out of every analysis directory except the first. Each line
# containing "VAR-COVAR" starts a new block; that header line and all lines
# after it (until the next header) are stored as tab-split field lists.
vcov = []
for d in analysis_dir[1:]:
    vcovs = []
    current = None
    # Context manager so the file is closed once parsing finishes.
    with open(os.path.join(d, "matrix.out")) as matrix_fh:
        for line in matrix_fh:
            if "VAR-COVAR" in line:
                current = []
                vcovs.append(current)
            # Lines before the first header are skipped.
            if current is not None:
                current.append(line.strip().split("\t"))
    vcov.append(vcovs)

In [0]:
# One list of DataFrames per parsed matrix.out: for every block, drop its
# first (header) line, build a frame from the remaining rows and transpose it.
vcov_dfs = [
    [pd.DataFrame(block[1:]).T for block in blocks]
    for blocks in vcov
]

In [0]:
# Pearson correlation between every pair of consecutive matrices of each
# run, used to judge how much the matrix still changes between outputs.
matrix_correlations = []
for vcov_df in vcov_dfs:
    temp = []
    # Walk adjacent pairs directly instead of scanning all (i, j) for j == i+1.
    for idf, jdf in zip(vcov_df, vcov_df[1:]):
        # Keep the first len(frame) columns, i.e. the square part of the
        # transposed frame; any extra columns are dropped. Positional
        # slicing replaces the removed `.ix` (the columns are a default
        # 0..n-1 index, so the selection is the same).
        idf = idf.iloc[:, :len(idf)]
        jdf = jdf.iloc[:, :len(jdf)]
        idf = [float(x) for x in idf.values.flatten()]
        jdf = [float(x) for x in jdf.values.flatten()]
        assert len(idf) == len(jdf)
        temp.append(sp.stats.pearsonr(idf, jdf)[0])
    matrix_correlations.append(temp)

In [0]:
for matrix_correlation in matrix_correlations:
    plt.plot(list(range(len(matrix_correlation))),matrix_correlation)
    plt.title("Pearson correlations among %d adjacent VCOV matrices" % len(matrix_correlation))
    plt.show()

In [0]:
for i, d in enumerate(analysis_dir[1:]):
    out_matrix = os.path.join(d, "matrix_last.out")
    matrix = vcov_dfs[i][-1]
    matrix.to_csv(out_matrix, sep="\t", index=False, header=False)

In [0]:
snp_indv = [x.drop("blank", axis=1) for x in snpsfile_df]

In [0]:
# Add a "snp_name" column holding each row label without its last
# "_"-separated token (the "_1"/"_2" allele suffix), so that both rows of a
# SNP can be grouped together. Computed straight from the index instead of
# a row-wise DataFrame.apply, which builds a Series per row.
for s in snp_indv:
    s['snp_name'] = ["_".join(name.split("_")[0:-1]) for name in s.index]

In [0]:
%connect_info

In [0]:
def write_bayenv_files():
    """Write one bayenv input file per SNP for every table in ``snp_indv``.

    For the table at position ``k`` the files go to
    ``<analysis_dir[k]>/bayenv/<snp_name>.txt`` (the directory is created if
    it does not exist). Each file holds the rows of one SNP, tab-separated,
    without header or row labels, with the ``snp_name`` column removed and an
    empty ``blank`` column appended.

    Returns
    -------
    list of list of str
        For each table, the paths of the files written, in groupby order.
    """
    written = []
    for table_idx, snp_table in enumerate(snp_indv):
        paths = []
        written.append(paths)
        target_dir = os.path.join(analysis_dir[table_idx], "bayenv")
        if not os.path.exists(target_dir):
            os.mkdir(target_dir)
        for snp_name, rows in snp_table.groupby("snp_name"):
            path = os.path.join(target_dir, "%s.txt" % snp_name)
            paths.append(path)
            # Work on a fresh frame so the grouped table is not modified.
            rows = pd.DataFrame(rows)
            rows['blank'] = ""
            rows = rows.drop("snp_name", axis=1)
            rows.to_csv(path, index=False, header=False, sep="\t")
    return written

In [0]:
bayenv = write_bayenv_files()

In [0]:
bayenv = !find {os.path.join(analysis_dir[1], "bayenv")} | grep tg | grep 'txt$'

In [0]:
bayenv = [[], sorted(bayenv)]

In [0]:
len(bayenv[1])

In [0]:
bayenv_jobs = 100

In [0]:
# Despite the name, this is the number of SNP files per job file: it is used
# as the slice width/step when the file list is split. Clamp to at least 1 so
# a short file list does not round down to a zero step (range() rejects 0).
num_chunks = max(1, int(np.round(len(bayenv[1])/bayenv_jobs)))

In [0]:
num_chunks

In [0]:
bayenv_exe = "/home/cfriedline/g/src/bayenv2_public/bayenv2"
# Template; "snpfile", "matrixfile", "environfile" and "rand" are placeholders
# that are overwritten by position for every SNP file below.
bayenv_opt = "-i snpfile -m matrixfile -e environfile -p 6 -k 100000 -n 19 -t -X -c -f -r rand"

# Write "bayenv_parallel_<n>" job files, one command line per SNP file.
# NOTE(review): `random` is not seeded here, so the -r values differ between
# runs of this cell.
total = 0
for i, bayenv_files in enumerate(bayenv):
    # The first entry is skipped.
    if i == 0:
        continue
    cpu = 0
    max_cpu = 30
    thedir = analysis_dir[i]
    # All SNP files of this entry live in the directory of the first one.
    bayenv_dir = os.path.dirname(bayenv_files[0])
    # The commands reference these two files by bare name, so copy them next
    # to the SNP files.
    shutil.copy(os.path.join(thedir, "matrix_last.out"), bayenv_dir)
    shutil.copy(os.path.join(thedir, "environfile"), bayenv_dir)
    chunks=[bayenv_files[x:x+num_chunks] for x in range(0, len(bayenv_files), num_chunks)]
    # Separate index name: the inner loop previously reused `i` and
    # overwrote the outer loop variable.
    for chunk_idx, chunk in enumerate(chunks):
        with open(os.path.join(bayenv_dir, "bayenv_parallel_%d" % chunk_idx), "w") as o:
            for bayenv_file in chunk:
                bayenv_cmd = " ".join([bayenv_exe, bayenv_opt]).split()
                # Cycle the pinned CPU id through 0..max_cpu-1.
                if cpu == max_cpu:
                    cpu = 0

                # Positions in the split template: 2 = -i value, 4 = -m value,
                # 6 = -e value, last = -r value.
                bayenv_cmd[2] = os.path.basename(bayenv_file)
                bayenv_cmd[4] = "matrix_last.out"
                bayenv_cmd[6] = "environfile"
                bayenv_cmd[-1] = int(random.getrandbits(16))
                bayenv_cmd.insert(0, "taskset -c %d" % cpu)
                # Output prefix = SNP file name without ".txt".
                bayenv_cmd.append("-o %s" % os.path.basename(bayenv_file).replace(".txt",""))
                o.write("%s\n" % " ".join([str(x) for x in bayenv_cmd]))
                total += 1
                cpu += 1

In [0]:
len(chunks)

In [0]:
total

In [0]:
from ipyparallel import Client

In [0]:
rc = Client(profile='sge')
dv = rc[:]
lv = rc.load_balanced_view()
len(dv)

In [0]:
def get_line_count(f):
    """Return the number of newline characters in file ``f``.

    Counts the same thing ``wc -l`` reports, but in pure Python: no shell is
    spawned, so paths containing spaces or shell metacharacters are safe and
    the function does not depend on IPython's ``!`` syntax.

    Parameters
    ----------
    f : str
        Path of the file to inspect.

    Returns
    -------
    int
        Number of ``\\n`` bytes in the file (a final line without a trailing
        newline is not counted).

    Raises
    ------
    OSError
        If the file cannot be opened or read.
    """
    count = 0
    with open(f, "rb") as handle:
        # Read in 1 MiB blocks so large files are not loaded into memory.
        for block in iter(lambda: handle.read(1 << 20), b""):
            count += block.count(b"\n")
    return count

In [0]:
dv['get_line_count'] = get_line_count

In [0]:
bayenv_parallel = !find {os.path.join(analysis_dir[1], "bayenv")} | grep bayenv_parallel_

In [0]:
total = 0
for f in bayenv_parallel:
    print(f)
    total += get_line_count(f)
total

In [0]:
bayenv_parallel = sorted(bayenv_parallel)
len(bayenv_parallel)

In [0]:
analysis_dir

In [0]:
bayenv_parallel[0:10]

In [0]:
!head -n1 {bayenv_parallel[0]}

In [0]:
def write_qsub_files(bayenv_parallel):
    """Write one executable qsub script per job file.

    Script number ``n`` is written to ``<analysis_dir[1]>/bayenv/qsub_<n>.sh``
    (mode 0o744). It carries the scheduler directives listed below and pipes
    the contents of the n-th job file into ``~/bin/parallel``. The path of
    every script is printed as it is written.

    Parameters
    ----------
    bayenv_parallel : iterable of str
        Paths of the job files.

    Returns
    -------
    list of str
        Paths of the generated scripts, in input order.
    """
    script_paths = []
    for job_idx, job_file in enumerate(bayenv_parallel):
        script_path = os.path.join(
            os.path.join(analysis_dir[1], "bayenv"),
            "qsub_%d.sh" % job_idx,
        )
        script_paths.append(script_path)
        script_lines = [
            "#!/bin/bash",
            "#$ -N bayenv%d" % job_idx,
            "#$ -V",
            "#$ -cwd",
            "#$ -pe smp 30",
            "#$ -j y",
            "#$ -q all.q",
            "unset module",
            "echo \"Running on $HOSTNAME\"",
            "cat %s | ~/bin/parallel -j 30 --progress --" % job_file,
        ]
        with open(script_path, "w") as script:
            # Make the script executable for its owner.
            os.chmod(script.name, 0o744)
            print(script.name)
            script.write("\n".join(script_lines) + "\n")
    return script_paths
qsub_files = write_qsub_files(bayenv_parallel)

In [0]:
cat /home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/beagle40/bayenv/qsub_0.sh

In [0]:
# Write an executable "qsub_runner.sh" into the bayenv directory of every
# analysis directory except the first; it submits each generated qsub script.
for i, ad in enumerate(analysis_dir):
    # The first analysis directory is skipped.
    if i == 0:
        continue
    d = os.path.join(ad, "bayenv")
    with open(os.path.join(d, "qsub_runner.sh"), "w") as o:
        # rwx for the owner, read-only for group and others.
        os.chmod(o.name, 0o744)
        o.write("#!/bin/bash\n")
        o.write("unset module\n")
        # One `qsub <script>` line per entry of qsub_files.
        for q in qsub_files:
            o.write("qsub %s\n" % q)

In [0]:
!head -n5 {os.path.join(d, "qsub_runner.sh")}

In [0]:
qhost = !qhost | grep godel

In [0]:
# DESTRUCTIVE: for every host line captured in `qhost`, except the three hosts
# listed below, ssh to the host and send signal 9 to processes matched by
# `pkill bayenv`.
for elem in qhost:
    # The host name is the first whitespace-separated field of the line.
    host = elem.split()[0]
    if not host in ['godel200',
                   'godel21',
                   'godel37']:
        !ssh {host} pkill -9 bayenv

## Run bayenv jobs

```bash
cd ~/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/beagle40/bayenv

rm -f XtX_out.environfile
rm -f bf_environ.environfile
rm -f bayenv*.o*
rm -f bayenv*.po*
rm -f *.xtx
rm -f *.bf
rm -f *.txt.freqs

./qsub_runner.sh
```

In [0]:
# Column labels for the .bf results: for every row label of the scaled
# environment table, three names "<label>_bf", "<label>_r", "<label>_p",
# in that order.
vartypes = ['bf', 'r', 'p']
bf_cols = [
    "%s_%s" % (env_var, vartype)
    for env_var in bioclim[0].index
    for vartype in vartypes
]

In [0]:
# Sorted full paths of the *.xtx result files in the bayenv directory.
# Matching on the literal ".xtx" suffix replaces `grep '.xtx'`, whose
# unescaped, unanchored dot also matched names merely containing "xtx"; the
# three-part join no longer relies on analysis_dir[1] ending with "/".
xtxs = sorted(
    os.path.join(analysis_dir[1], "bayenv", x)
    for x in os.listdir(os.path.join(analysis_dir[1], "bayenv"))
    if x.endswith(".xtx")
)
len(xtxs)

In [0]:
# Sorted full paths of the *.bf result files in the bayenv directory.
# Matching on the literal ".bf" suffix replaces `grep '.bf'`, whose
# unescaped, unanchored dot also matched names merely containing "bf"; the
# three-part join no longer relies on analysis_dir[1] ending with "/".
bfs = sorted(
    os.path.join(analysis_dir[1], "bayenv", x)
    for x in os.listdir(os.path.join(analysis_dir[1], "bayenv"))
    if x.endswith(".bf")
)
len(bfs)

In [0]:
len(bfs), len(xtxs)

In [0]:
xtxs[0]

In [0]:
def read_bayenv(args):
    """Read the first record of one bayenv result file.

    Parameters
    ----------
    args : tuple
        ``(f, key, cols)`` packed into one argument so the function can be
        used with ``map``:

        * ``f`` -- path (or file-like object) of a tab-separated file without
          header whose first field is the row label;
        * ``key`` -- ``"xtx"`` or ``"bf"`` (case-insensitive); for ``"xtx"``
          the single value column is named ``"xtx"``, for ``"bf"`` the last
          column is dropped (presumably an empty field produced by a
          trailing tab -- TODO confirm);
        * ``cols`` -- optional list of column names applied to the result.

    Returns
    -------
    pandas.Series
        The first row of the file; its name is the row label with ``".txt"``
        removed, its index holds the column names.
    """
    f, key, cols = args
    # Imported locally because the function is also pushed to remote engines.
    import pandas as pd
    key = key.lower()
    df = pd.read_csv(f, sep="\t", header=None, index_col=0)
    if key == 'xtx':
        df.columns = ['xtx']
    elif key == "bf":
        df = df.drop(df.columns[-1], axis=1)
    df.index = [x.replace(".txt", "") for x in df.index]
    df.index.name = "SNP"
    if cols:
        df.columns=cols
    # Positional first row; `.ix` was removed in pandas 1.0 and fell back to
    # positional access here anyway, because the row labels are strings.
    return df.iloc[0, :]

In [0]:
dv['read_bayenv'] = read_bayenv

In [0]:
xtx_args = [(x, 'xtx', None) for x in xtxs]
bf_args = [(x, 'bf', bf_cols) for x in bfs]

In [0]:
#xtx_dfs = [read_bayenv(x, "xtx") for x in xtxs]
xtx_dfs = dv.map_async(read_bayenv, xtx_args)

In [0]:
xtx_dfs = xtx_dfs.r

In [0]:
bf_dfs = dv.map_async(read_bayenv, bf_args)

In [0]:
# Poll how far the asynchronous map has progressed.
bf_dfs.progress

In [0]:
bf_dfs = bf_dfs.r

In [0]:
analysis_dir[1]

In [0]:
bf = pd.concat(bf_dfs)
xtx = pd.concat(xtx_dfs)

In [0]:
xtx.shape, bf.shape

In [0]:
def run_makeup_bayenv(f):
    """Run bayenv2 for a single SNP file through the shell.

    The command changes into the directory of ``f`` and runs bayenv2 on the
    file's base name with ``matrix_last.out`` and ``environfile`` from that
    directory, a fixed ``-r 2573`` value, and the base name without
    ``.txt`` as the ``-o`` output prefix.

    Parameters
    ----------
    f : str
        Path of the SNP input file.

    Returns
    -------
    tuple
        ``(f, cmd)`` -- the input path and the command string that was run.
        The command's exit status is not checked.
    """
    # Imported locally because the function is also pushed to remote engines.
    import os
    # One shell line; the trailing backslashes continue the string literal.
    cmd = "cd %s && /home/cfriedline/g/src/bayenv2_public/bayenv2 -i %s \
-m %s -e %s -p 6 -k 100000 \
-n 19 -t -X -c -f -r 2573 \
-o %s" % (os.path.dirname(f),
          os.path.basename(f),
          "matrix_last.out", 
          "environfile",
          os.path.basename(f).replace(".txt", ""))
    # IPython shell escape: expands $cmd and executes it.
    !$cmd
    return f, cmd

In [0]:
bayenv_makeup = []
for f in bayenv[1]:
    name = os.path.basename(f).split(".")[0]
    if not name in xtx.index:
        bayenv_makeup.append(f)

In [0]:
dv['run_makeup_bayenv'] = run_makeup_bayenv

In [0]:
run_makeup_bayenv(bayenv_makeup[0])

In [0]:
len(makeup), len(bayenv_makeup)

In [0]:
makeup = dv.map_async(run_makeup_bayenv, bayenv_makeup)

In [0]:
makeup.progress

In [0]:
for snp in xtx.index:
    d = os.path.join(analysis_dir[1], "bayenv")
    f = os.path.join(d, "%s.txt" % snp)
    if not os.path.exists(f):
        print(f)

In [0]:
sns.set_context("notebook")

In [0]:
ax = sns.distplot(xtx)
ax.set_xlabel("XtX")
plt.show()

In [0]:
joined = xtx.join(bf)

In [0]:
joined.head()

In [0]:
def plot_bf_vs_xtx(df, imputed, dir_name):
    """Scatter every remaining column of ``df`` against its first column.

    One panel is drawn per column after the first (x = that column, labelled
    "BF"; y = the first column), on a grid four panels wide. The figure is
    saved as ``<dir_name>/bf_vs_xtx_<imputed>.pdf`` and then shown.

    Parameters
    ----------
    df : pandas.DataFrame
        First column goes on the y axis; each other column gets a panel
        titled with the part of its name before the first "_".
    imputed : str
        Label appended to every panel title and to the output file name.
    dir_name : str
        Directory the PDF is written to.
    """
    n_cols = 4
    num_figs = len(df.columns)-1
    print(num_figs)
    # Enough rows for all panels (ceiling division); 19 panels -> 5 rows.
    n_rows = max(1, -(-num_figs // n_cols))
    plt.gcf().set_size_inches(20,20)
    # Columns 1..num_figs inclusive: the previous range(num_figs) loop
    # skipped 0 and stopped at num_figs-1, so the last column was never drawn.
    for i in range(1, num_figs + 1):
        plt.subplot(n_rows, n_cols, i)
        plt.title("%s (%s)" % (df.columns[i].split("_")[0], imputed))
        plt.xlabel("BF")
        plt.ylabel(df.columns[0])
        # Positional access (`.ix` was removed in pandas 1.0).
        plt.scatter(df.iloc[:, i], df.iloc[:, 0])

    plt.subplots_adjust(left=None, bottom=None, right=None, top=None,
                    wspace=.5, hspace=.5)
    plt.savefig("%s.pdf" % os.path.join(dir_name, "bf_vs_xtx_%s" % imputed))
    plt.show()
plot_bf_vs_xtx(pd.DataFrame(xtx).join(bf[[x for x in bf if 'bf' in x]]), "imputed", analysis_dir[1])

In [0]:
# Summarise XtX outliers per data set: SNPs further than 3 MAD from the median
# (both directions) and SNPs above the 99th percentile. One single-row frame
# is appended to xtx_outliers per input table.
xtx_outliers = []
impute_status = ["imputed"]
for i, res in enumerate([joined]):
    XtX = res.xtx
    M = np.median(XtX)    
    # Median absolute deviation as computed by statsmodels' sm.robust.mad.
    MAD = sm.robust.mad(XtX)
    lower_cutoff = M-(3*MAD)
    upper_cutoff = M+(3*MAD)
    # Sorted index labels of the values strictly outside the cutoffs.
    lower_snps = sorted(XtX[XtX<lower_cutoff].index.tolist())
    upper_snps = sorted(XtX[XtX>upper_cutoff].index.tolist())
    percent_cutoff = np.percentile(XtX, 99)
    percent_cutoff_outliers = sorted(XtX[XtX > percent_cutoff].index.tolist())
    # Build a one-column frame from the values and transpose it into one row;
    # the order here must match the column names assigned below.
    o = pd.DataFrame([impute_status[i], 
                      M,
                      MAD,
                      lower_cutoff, 
                      upper_cutoff, 
                      len(lower_snps), 
                      len(upper_snps), 
                      lower_snps, 
                      upper_snps,
                     percent_cutoff, 
                      len(percent_cutoff_outliers),
                      percent_cutoff_outliers]).T
    o.columns = ['state', 'median_XtX', 'MAD', 'lower_cutoff', 'upper_cutoff', 'n_lower', 'n_upper', 
                'lower_snps', 'upper_snps', 'percent_cutoff_XtX', 'n_percent_cutoff', '99th_snps']
    xtx_outliers.append(o)

In [0]:
pd.concat(xtx_outliers)

In [0]:
joined.head()

In [0]:
xtx.sort_index().head()