In [0]:
import scandir
import os
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import scipy as sp
import dill
import random
import vcf
from hdfstorehelper import HDFStoreHelper
import statsmodels.api as sm
import statsmodels.formula.api as smf
import operator
import traceback
import warnings
import sklearn
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%load_ext rpy2.ipython
r = ro.r
%matplotlib inline
from utils import save_df, read_df
from IPython.display import display
import collections
from collections import defaultdict
import pickle

In [0]:
# Directory handed to read_df/save_df throughout this notebook.
analysis_dir = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/beagle40/"
# File name of a bgzipped VCF; not referenced by any of the visible cells.
snp_file_gz = "isect_snps.recode.vcf.gz_sorted.vcf.gz"
# Path to the tabix executable; not referenced by any of the visible cells.
tabix = "/home/cfriedline/g/src/htslib-1.3/tabix"

## Write GEMMA input files

In [0]:
# Load the stored 'pca_std_pheno' table from analysis_dir.
pca_std_pheno = read_df(analysis_dir, 'pca_std_pheno')

In [0]:
# Preview the first rows.
pca_std_pheno.head()

In [0]:
# (rows, columns) of the loaded table.
pca_std_pheno.shape

In [0]:
# Sample identifiers plus the three phenotype columns carried forward for GEMMA.
# An explicit copy is taken because a later cell adds residual columns to
# gemma_pheno; assigning into a bare column selection of pca_std_pheno raises
# pandas' SettingWithCopyWarning and leaves it ambiguous which frame is modified.
# "Pupual Duration" is the spelling used by the stored table, so it must stay as is.
gemma_pheno = pca_std_pheno[["Population",
                             "Number",
                             "Mass",
                             "Pupual Duration",
                             "Total Dev Time"]].copy()
gemma_pheno.head()

In [0]:
# Persist the phenotype subset under the name 'gemma_pheno' in analysis_dir.
save_df(analysis_dir, 'gemma_pheno', gemma_pheno)

In [0]:
# Load the stored 'pca_x' table; it supplies the PC1..PC15 columns used as
# covariates in the regression cell below.
pca_x = read_df(analysis_dir, 'pca_x')

In [0]:
pca_x.head()

In [0]:
# NOTE: this rebinds pca_std_pheno. From here on it is gemma_pheno joined with
# pca_x on the index, keeping only rows present in both (how="inner").
pca_std_pheno = gemma_pheno.join(pca_x, how="inner")

In [0]:
pca_std_pheno.head()

In [0]:
# Row count after the inner join (compare with the shape shown earlier).
pca_std_pheno.shape

In [0]:
# Keep the PC score columns and the three phenotype columns, then replace spaces
# in the column names so they can be used as terms in a formula.
keep_cols = [name for name in pca_std_pheno
             if "PC" in name or 'Mass' in name or 'Pupual' in name or 'Total Dev' in name]
gemma_pheno_pca = pca_std_pheno[keep_cols]
gemma_pheno_pca.columns = [name.replace(" ", "_") for name in gemma_pheno_pca.columns]
gemma_pheno_pca.index = list(gemma_pheno_pca.index)

phenos = ["Mass", "Pupual_Duration", "Total_Dev_Time"]
# Right-hand side shared by all three models: PC1+PC2+...+PC15.
pc_terms = "+".join("PC%d" % i for i in range(1, 16))
for p in phenos:
    # Regress the phenotype on the first 15 PCs and store the OLS residuals on
    # gemma_pheno as "<phenotype>_resid" (lower-cased).
    mod = smf.ols(formula="%s~%s" % (p, pc_terms), data=gemma_pheno_pca)
    res = mod.fit()
    col = ("%s_resid" % p).lower()
    gemma_pheno[col] = res.resid

In [0]:
# Inspect the residual columns added by the regression loop above.
gemma_pheno.head()

In [0]:
# Load the stored 'z12_swapped' table (not referenced again in the visible cells).
z12_swapped = read_df(analysis_dir, 'z12_swapped')

In [0]:
z12_swapped.head()

In [0]:
# Load the stored 'z12_df' table (not referenced again in the visible cells).
z12_df = read_df(analysis_dir, 'z12_df')

In [0]:
z12_df.head()

In [0]:
z12_df.shape

In [0]:
# Load the stored genotype tables (not referenced again in the visible cells).
gt_base_df = read_df(analysis_dir, "gt_base_df")
gt_ref_alt_df = read_df(analysis_dir, 'gt_ref_alt_df')

In [0]:
gt_base_df_swapped = read_df(analysis_dir, "gt_base_df_swapped")

In [0]:
gt_ref_alt_minor_major = read_df(analysis_dirlysis_dir, "gt_ref_alt_minor_major")

In [0]:
gemma_gt = read_df(analysis_dirlysis_dir, "_pimass_gt")

In [0]:
# NOTE: this rebinds gemma_pheno to the stored '_pimass_pheno' table, replacing
# the frame (including the *_resid columns) built in the cells above. The
# massx / pdx / tdtx columns written to disk below come from this table.
gemma_pheno = read_df(analysis_dir, "_pimass_pheno")

In [0]:
gemma_pheno.shape

In [0]:
# Store the genotype table again under the name '_gemma_gt'.
save_df(analysis_dir, "_gemma_gt", gemma_gt)

In [0]:
# Store the phenotype table again under the name '_gemma_pheno'.
save_df(analysis_dir, "_gemma_pheno", gemma_pheno)

In [0]:
# Working directory for all GEMMA input files. makedirs(exist_ok=True) replaces
# the exists()/mkdir() pair: it is safe to re-run, has no check-then-create race,
# and also creates missing parent directories.
gemma_dir = os.path.join(analysis_dir, "gemma_run")
os.makedirs(gemma_dir, exist_ok=True)

In [0]:
# Write the files GEMMA reads. GEMMA expects missing values to be spelled "NA";
# pandas' default writes NaN as an empty field, which GEMMA cannot interpret, so
# every GEMMA-consumed file is written with na_rep="NA".

# One single-column, header-less phenotype file per trait.
for pheno_col, pheno_file in [("massx", "gemma_mass.txt"),
                              ("tdtx", "gemma_tdt.txt"),
                              ("pdx", "gemma_pd.txt")]:
    gemma_pheno[pheno_col].to_csv(os.path.join(gemma_dir, pheno_file),
                                  index=False,
                                  header=False,
                                  na_rep="NA")

# Full phenotype table with index and header, kept for reference; this one is
# not passed to GEMMA by the commands in this notebook, so its format is unchanged.
gemma_pheno.to_csv(os.path.join(gemma_dir, "gemma_pheno.txt"),
                   index=True,
                   header=True)

# Three-column phenotype file; the column order (mass, pd, tdt) is what the
# `-n 1|2|3` options in the GEMMA commands refer to.
gemma_pheno[['massx', 'pdx', 'tdtx']].to_csv(os.path.join(gemma_dir, "gemma_all_pheno.txt"),
                                             index=False,
                                             header=False,
                                             na_rep="NA")

# Genotype file: the index is written as the first column, without a header row.
gemma_gt.to_csv(os.path.join(gemma_dir, "gemma_gt.txt"),
                index=True,
                header=False,
                na_rep="NA")

In [0]:
# Build the SNP location file. Every genotype index entry is read as
# "<contig>_<position>", split at the LAST underscore because contig names may
# themselves contain underscores.
gemma_contigs = {}
with open(os.path.join(gemma_dir, "gemma_loc.txt"), "w") as o:
    # Group positions by contig, preserving first-seen contig order.
    for snp_id in gemma_gt.index:
        contig, sep, pos = snp_id.rpartition("_")
        gemma_contigs.setdefault(contig, []).append(pos)

    # One line per SNP: id, position, and a 1-based integer standing in for the
    # contig (numbered in the order contigs were first seen).
    for chrom_id, (contig, positions) in enumerate(gemma_contigs.items(), start=1):
        for p in positions:
            o.write("%s_%s\t%s\t%d\n" % (contig, p, p, chrom_id))

# GEMMA commands for Gypsy Moth

## Estimate Relatedness Matrix from Genotypes

`./gemma -g [filename] -p [filename] -gk [num] -o [prefix]`

```bash
~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_all_pheno.txt \
-gk 1 -o gm
```

## Perform Eigen-Decomposition of the Relatedness Matrix

`./gemma -g [filename] -p [filename] -k [filename] -eigen -o [prefix]`

```bash
~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_all_pheno.txt \
-k output/gm.cXX.txt -eigen -o gm
```

## Association Tests with Univariate Linear Mixed Models

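`./gemma -g [filename] -p [filename] -n [num] -a [filename] -k [filename] -lmm [num] -o [prefix]`

Here `-n [num]` selects which column of the combined phenotype file to test (1 = mass, 2 = pd, 3 = tdt).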

```bash
~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_all_pheno.txt \
-n 1 -a gemma_loc.txt -k output/gm.cXX.txt -lmm 4 -o mass_lmm

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_all_pheno.txt \
-n 2 -a gemma_loc.txt -k output/gm.cXX.txt -lmm 4 -o pd_lmm

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_all_pheno.txt \
-n 3 -a gemma_loc.txt -k output/gm.cXX.txt -lmm 4 -o tdt_lmm
```

## Association Tests with Multivariate Linear Mixed Models

For this test, all three phenotype files were combined into a single file, with 3 columns in this order: mass, pd, tdt.

`./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -n [num1] [num2] [num3] -o [prefix]`

```bash
~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_all_pheno.txt \
-a gemma_loc.txt -k output/gm.cXX.txt -lmm 4 -n 1 2 -o mass_pd_mlmm

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_all_pheno.txt \
-a gemma_loc.txt -k output/gm.cXX.txt -lmm 4 -n 1 3 -o mass_tdt_mlmm

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_all_pheno.txt \
-a gemma_loc.txt -k output/gm.cXX.txt -lmm 4 -n 2 3 -o pd_tdt_mlmm

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_all_pheno.txt \
-a gemma_loc.txt -k output/gm.cXX.txt -lmm 4 -n 1 2 3 -o mass_pd_tdt_mlmm
```
## Fit a Bayesian Sparse Linear Mixed Model

First, set up a qsub script, `bslmm.sh`, and make it executable with `chmod +x bslmm.sh`:

```bash
#!/bin/bash
#$ -N BSLMM
#$ -V
#$ -cwd
#$ -j y
#$ -l mem_free=20G
$HOME/g/src/gemma-0.94.1/gemma -g $1 -p $2 -a $3 -k $4 -bslmm $5 -o $6
```

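Now submit the jobs. Each `qsub` call below passes the extra MCMC options (`-w`, `-s`, `-smin`, ...) inside the final quoted argument together with the output prefix; because `bslmm.sh` expands `$6` unquoted, they reach GEMMA as separate flags after `-o [prefix]`. The underlying GEMMA command is: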

`./gemma -g [filename] -p [filename] -a [filename] -k [filename] -bslmm [num] -o [prefix]`

### Standard BSLMM (4 chains)

```bash 
qsub bslmm.sh "gemma_gt.txt" "gemma_mass.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "mass_bslmm_std \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

qsub bslmm.sh "gemma_gt.txt" "gemma_pd.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "pd_bslmm_std \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

qsub bslmm.sh "gemma_gt.txt" "gemma_tdt.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "tdt_bslmm_std \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

```
---

```bash 
qsub bslmm.sh "gemma_gt.txt" "gemma_mass.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "mass_bslmm_std_1 \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

qsub bslmm.sh "gemma_gt.txt" "gemma_pd.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "pd_bslmm_std_1 \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

qsub bslmm.sh "gemma_gt.txt" "gemma_tdt.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "tdt_bslmm_std_1 \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

```
---

```bash 
qsub bslmm.sh "gemma_gt.txt" "gemma_mass.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "mass_bslmm_std_2 \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

qsub bslmm.sh "gemma_gt.txt" "gemma_pd.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "pd_bslmm_std_2 \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

qsub bslmm.sh "gemma_gt.txt" "gemma_tdt.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "tdt_bslmm_std_2 \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

```
---

```bash 
qsub bslmm.sh "gemma_gt.txt" "gemma_mass.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "mass_bslmm_std_3 \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

qsub bslmm.sh "gemma_gt.txt" "gemma_pd.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "pd_bslmm_std_3 \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

qsub bslmm.sh "gemma_gt.txt" "gemma_tdt.txt" "gemma_loc.txt" \
"output/gm.cXX.txt" "1" "tdt_bslmm_std_3 \
-w 1000000 -s 100000000 -smin 1 -smax 300 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0 -rpace 1000"

```

## Analyze and process GEMMA output

In [0]:
# Path to the assembly FASTA; not referenced by any of the visible cells.
assembly = "/home/cfriedline/eckertlab/projects/gypsy_moth/assemblies/masurca3/CA/10-gapclose/genome.ctg.fasta"

In [0]:
# Directory holding GEMMA's result files (the "output" folder under gemma_run).
filedir = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/beagle40/gemma_run/output"

In [0]:
# Capture the paths of all BSLMM result files through the shell; `bslmm` holds
# one path string per matching file, in whatever order `ls` printed them.
bslmm = !ls {filedir}/*bslmm*.txt 

In [0]:
# Display only: the sorted copy is shown but not stored.
sorted(bslmm)

In [0]:
%%R
# Load the R packages used by the cells below: coda (mcmc, mcmc.list,
# effectiveSize, gelman.diag) and data.table (fread).
library(coda)
library(data.table)

In [0]:
# Python handle to R's plot function; not called in the visible cells.
plot_mcmc = r('plot')

In [0]:
def makehash():
    """Return a defaultdict whose missing keys are themselves such defaultdicts.

    This gives an arbitrarily deep nested mapping that can be filled with
    ``h[a][b][c] = value`` without creating the intermediate levels first.
    """
    return collections.defaultdict(makehash)

def split_bslmm_by_pheno(bslmm):
    """Index BSLMM result paths as ``result[pheno][output][run] = path``.

    The file name is split on "_": the first piece is the phenotype. The last
    piece is split on ".": its first part is the run id and its second part is
    the output type (e.g. "hyp" or "param"). A run id of 'std' (the file without
    a numeric suffix) is stored under the integer 0; every other run id is kept
    as the string found in the file name.

    Parameters
    ----------
    bslmm : iterable of str
        Paths to BSLMM result files.

    Returns
    -------
    collections.defaultdict
        Nested mapping created by ``makehash``.
    """
    grouped = makehash()
    for path in bslmm:
        name_parts = os.path.basename(path).split("_")
        tail_parts = name_parts[-1].split(".")
        output_kind = tail_parts[1]
        run_id = tail_parts[0]
        run_key = 0 if run_id == 'std' else run_id
        grouped[name_parts[0]][output_kind][run_key] = path
    return grouped
# Nested lookup pheno -> output type -> run id -> path for every listed BSLMM file.
bslmm_dict = split_bslmm_by_pheno(bslmm)

In [0]:
def collect_files(key, source=None):
    """Collect, per phenotype, the result files of one output type in run order.

    The paths are returned sorted by run number (0, 1, 2, ...). Previously the
    order was whatever order the files had been inserted in, i.e. the order
    printed by ``ls``, which is locale dependent; the functions that consume
    these lists need the run-0 file to come first.

    Parameters
    ----------
    key : str
        Output type to collect, e.g. "hyp" or "param".
    source : mapping, optional
        Mapping ``pheno -> output type -> run id -> path``. Defaults to the
        notebook-level ``bslmm_dict``.

    Returns
    -------
    collections.defaultdict
        ``pheno -> list of paths``. Phenotypes without any file of the requested
        type are absent.
    """
    if source is None:
        source = bslmm_dict

    def run_order(run):
        # Run ids are a mix of the int 0 and digit strings such as '1'; compare
        # them numerically and push anything non-numeric to the end.
        try:
            return (0, int(run), "")
        except (TypeError, ValueError):
            return (1, 0, str(run))

    d = defaultdict(list)
    for pheno, data in source.items():
        # .get() avoids inserting an empty entry into a nested defaultdict when
        # this phenotype has no files of the requested type.
        runs = data.get(key, {})
        for n in sorted(runs, key=run_order):
            d[pheno].append(runs[n])
    return d
    
# Per-phenotype lists of the "hyp" and "param" result files.
hyp_files = collect_files("hyp")
param_files = collect_files("param")

In [0]:
hyp_files

In [0]:
param_files

In [0]:
r('eff_size=list()')
r('mcmc_summary=list()')
r('mcmc_lists=list()')
for pheno, files in hyp_files.items():
    print(pheno)
    r("m_list=list()")
    %R -i pheno
    for i, hyp_file in enumerate(files):
        lines = open(hyp_file).readlines()
        if len(lines) < 10:
            continue
        data = []
        for l in lines:
            l = l.strip().split("\t")
            data.append(l)
        hyp = pd.DataFrame(data[1:], columns=data[0], dtype=float)
        hyp.columns = [x.strip() for x in hyp.columns]
        hyp.to_csv(hyp_file, sep="\t", header=True, index=False)
        %R -i hyp_file
        r("m = mcmc(fread('%s', sep='\t', , header=T, data.table=F), thin=1000)" % hyp_file)
        r("m_list$%s = m" % os.path.basename(hyp_file))
    r("mcmc_list = mcmc.list(m_list)")
    r("mcmc_lists$%s = mcmc_list" % pheno) 
    r("eff_size$%s = effectiveSize(mcmc_list)" % pheno)
    r("mcmc_summary$%s = summary(mcmc_list)" % pheno)

In [0]:
# Push filedir into the R session (a later %%R cell reads it), then save the
# per-phenotype chains and summaries as RDS files in that directory.
%R -i filedir
r("saveRDS(mcmc_lists, file='%s')" % os.path.join(filedir, "mcmc_lists.rds"));
r("saveRDS(mcmc_summary, file='%s')" % os.path.join(filedir, "mcmc_summary.rds"));

In [0]:
# Print effective sample sizes and summaries, then the Gelman-Rubin diagnostic
# for each phenotype's set of chains.
print(r('eff_size'))
print(r('mcmc_summary'))
for pheno in hyp_files:
    print(pheno, r('gelman.diag(mcmc_lists$%s, autoburnin=F)' % pheno))

In [0]:
%%R
eff_size$mass

In [0]:
!ls {filedir}/*.mcmc

In [0]:
filedir

In [0]:
%%R
# Reload the saved chains from filedir and draw coda's plots for each phenotype.
print("mass")
mcmc_lists = readRDS(paste(filedir, '/mcmc_lists.rds', sep=''))
plot(mcmc_lists$mass)
print("pd")
plot(mcmc_lists$pd)
print("tdt")
plot(mcmc_lists$tdt)

In [0]:
param_files

In [0]:
def get_param_dfs(files):
    """Load each phenotype's param files into one wide DataFrame.

    The files of a phenotype are processed in run-number order. The run-0 file
    (the one without a numeric suffix) provides the unsuffixed columns; each
    further run is joined on the index with its columns suffixed ``_<run>``.
    Column 1 of every file is used as the index.

    Compared with the previous version this no longer depends on the order of
    the incoming list (it used to fail with ``None.join`` when run 0 was not
    first), returns an empty DataFrame for a phenotype without files, and
    avoids the deprecated ``errors="ignore"`` option of ``pd.to_numeric``.

    Parameters
    ----------
    files : mapping
        ``pheno -> list of file paths``.

    Returns
    -------
    dict
        ``pheno -> DataFrame``.
    """
    def run_number(path):
        # "<pheno>_bslmm_std.param.txt" has 3 "_"-separated parts -> run 0;
        # "<pheno>_bslmm_std_<n>.param.txt" has 4 -> run n.
        fdata = os.path.basename(path).split(".")[0].split("_")
        return int(fdata[-1]) if len(fdata) > 3 else 0

    def to_numeric_if_possible(column):
        # Same effect as pd.to_numeric(..., errors="ignore"): convert when the
        # whole column parses, otherwise leave it untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    dfs = {}
    for pheno, filelist in files.items():
        df = None
        for f in sorted(filelist, key=run_number):
            num = run_number(f)
            chain = pd.read_csv(f, sep="\t", index_col=1)
            if df is None or num == 0:
                df = chain
            else:
                df = df.join(chain, rsuffix="_%d" % num)
        dfs[pheno] = pd.DataFrame() if df is None else df.apply(to_numeric_if_possible)
    return dfs
# One wide DataFrame per phenotype holding the param columns of all runs.
param_dfs = get_param_dfs(param_files)

In [0]:
def get_hyp_dfs(files):
    """Load each phenotype's hyp files into one wide DataFrame.

    The files of a phenotype are processed in run-number order. The run-0 file
    (the one without a numeric suffix) provides the unsuffixed columns; each
    further run is joined on the default row index with its columns suffixed
    ``_<run>``.

    Compared with the previous version this no longer depends on the order of
    the incoming list (it used to fail with ``None.join`` when run 0 was not
    first), returns an empty DataFrame for a phenotype without files, and
    avoids the deprecated ``errors="ignore"`` option of ``pd.to_numeric``.

    Parameters
    ----------
    files : mapping
        ``pheno -> list of file paths``.

    Returns
    -------
    dict
        ``pheno -> DataFrame``.
    """
    def run_number(path):
        # "<pheno>_bslmm_std.hyp.txt" has 3 "_"-separated parts -> run 0;
        # "<pheno>_bslmm_std_<n>.hyp.txt" has 4 -> run n.
        fdata = os.path.basename(path).split(".")[0].split("_")
        return int(fdata[-1]) if len(fdata) > 3 else 0

    def to_numeric_if_possible(column):
        # Same effect as pd.to_numeric(..., errors="ignore"): convert when the
        # whole column parses, otherwise leave it untouched.
        try:
            return pd.to_numeric(column)
        except (ValueError, TypeError):
            return column

    dfs = {}
    for pheno, filelist in files.items():
        df = None
        for f in sorted(filelist, key=run_number):
            num = run_number(f)
            chain = pd.read_csv(f, sep="\t")
            if df is None or num == 0:
                df = chain
            else:
                df = df.join(chain, rsuffix="_%d" % num)
        dfs[pheno] = pd.DataFrame() if df is None else df.apply(to_numeric_if_possible)
    return dfs
# One wide DataFrame per phenotype holding the hyp columns of all runs.
hyp_dfs = get_hyp_dfs(hyp_files)

In [0]:
def get_hmean_row(row):
    """Return the harmonic mean of ``row``, or NaN if SciPy rejects the values.

    Parameters
    ----------
    row : array-like
        Values to average.

    Returns
    -------
    float
        ``scipy.stats.hmean(row)``, or ``np.nan`` when it raises ValueError.
    """
    # Import the submodule explicitly: `import scipy as sp` alone does not
    # guarantee that `sp.stats` is available as an attribute.
    from scipy import stats
    try:
        return stats.hmean(row)
    except ValueError:
        return np.nan

def get_hmean(param):
    """Average one parameter across runs for every phenotype in ``param_dfs``.

    For each phenotype, the columns whose name contains ``param`` are copied
    into a new DataFrame and a ``"<param>_hmean"`` column holding their
    row-wise ARITHMETIC mean (``np.mean``) is appended. The "hmean" in the
    column name is historical: the harmonic-mean variant is not used.

    Parameters
    ----------
    param : str
        Substring selecting the columns, e.g. 'alpha', 'beta' or 'gamma'.

    Returns
    -------
    dict
        ``pheno -> DataFrame`` with the selected columns plus the mean column.
    """
    averaged = {}
    for pheno, df in param_dfs.items():
        matching_cols = [name for name in df if param in name]
        subset = pd.DataFrame(df[matching_cols])
        subset['%s_hmean' % param] = subset.apply(np.mean, axis=1)
        averaged[pheno] = subset
    return averaged

# Across-run means of the gamma, beta and alpha columns, per phenotype.
gamma_dfs = get_hmean('gamma')
beta_dfs = get_hmean('beta')
alpha_dfs = get_hmean('alpha')

In [0]:
# For each phenotype: plot mean alpha against mean beta, then store the absolute
# values of the three across-run means together with a combined effect column.
combined_dfs = {}
for pheno in gamma_dfs:
    a = alpha_dfs[pheno]['alpha_hmean']
    b = beta_dfs[pheno]['beta_hmean']
    g = gamma_dfs[pheno]['gamma_hmean']
    t = pd.concat((a, b, g), axis=1)
    # The scatter uses the signed values; abs() is applied only afterwards.
    plt.scatter(t['alpha_hmean'], t['beta_hmean'])
    plt.xlim(np.min(a), np.max(a))
    plt.ylim(np.min(b), np.max(b))
    plt.ylabel("beta mean")
    plt.xlabel("alpha mean")
    plt.title(pheno)
    plt.show()
    t = np.abs(t)
    # Vectorized column sum; gives the same values as the former row-wise
    # apply(lambda ...) without a Python-level call per row.
    # NOTE(review): this is |alpha| + |beta|. Confirm against the GEMMA manual
    # whether the intended total effect should weight beta by gamma.
    t['total_effect'] = t['alpha_hmean'] + t['beta_hmean']
    combined_dfs[pheno] = t

In [0]:
# Show the first rows of each phenotype's combined table under its name.
for pheno, combined in combined_dfs.items():
    print(pheno)
    display(combined.head())

In [0]:
from matplotlib_venn import venn3, venn3_unweighted, venn3_circles

In [0]:
# For each phenotype and quantile q, find the SNPs that are in the top (1 - q)
# tail of mean gamma AND in the top tail of an effect measure. The result is
# keyed by (pheno, 'gamma_hmean', effect column, q).
effect_snps = {}
for pheno, d in combined_dfs.items():
    x = 'gamma_hmean'
    for q in (0.995, 0.999):
        gamma_scores = d[x]
        top_gamma = set(gamma_scores[gamma_scores >= gamma_scores.quantile(q)].index)
        for y in ('alpha_hmean', 'beta_hmean', 'total_effect'):
            effect_scores = d[y]
            top_effect = set(effect_scores[effect_scores >= effect_scores.quantile(q)].index)
            isect = top_gamma & top_effect
            effect_snps[pheno, x, y, q] = isect
            print(pheno, x, y, q, len(isect))
        print()

In [0]:
# Pickle the effect_snps dictionary into filedir for reuse outside this notebook.
with open(os.path.join(filedir, "effect_snps.pkl"), "wb") as o:
    pickle.dump(effect_snps, o, pickle.HIGHEST_PROTOCOL)

In [0]:
# Write one text file per (pheno, x, y, q) key with one SNP id per line. The ids
# are sorted because sets iterate in arbitrary order, which made the files
# differ from run to run even when their content was the same.
for key in effect_snps:
    k = [str(x) for x in key]
    out = os.path.join(filedir, "%s_effect.txt" % "-".join(k))
    with open(out, "w") as o:
        o.write("\n".join(sorted(effect_snps[key])))

In [0]:
# Pickle the per-phenotype combined tables into filedir.
with open(os.path.join(filedir, "combined_dfs.pkl"), "wb") as o:
    pickle.dump(combined_dfs, o, pickle.HIGHEST_PROTOCOL)

In [0]:
# For each phenotype and quantile, collect three sets of SNP ids, in the order
# gamma, alpha, beta: the SNPs at or above that quantile of the respective column.
venn_data = {}
for pheno, d in combined_dfs.items():
    sets_by_quantile = {}
    for q in (0.995, 0.999):
        sets_by_quantile[q] = [
            set(d[col][d[col] >= d[col].quantile(q)].index)
            for col in ('gamma_hmean', 'alpha_hmean', 'beta_hmean')
        ]
    venn_data[pheno] = sets_by_quantile

# One three-way Venn diagram per phenotype and quantile.
sns.set_context("talk")
for pheno, sets_by_quantile in venn_data.items():
    for q, top_sets in sets_by_quantile.items():
        venn3(top_sets, ("gamma", "alpha", "beta"))
        plt.title("%s_%.3f" % (pheno, q))
        plt.show()

In [0]:
# Regression plot of total effect against mean gamma for each phenotype, with
# the axes clipped to the observed data range.
for pheno in combined_dfs:
    d = combined_dfs[pheno]
    x = 'gamma_hmean'
    y = 'total_effect'
    # Pass x, y and data by keyword: newer seaborn releases accept only `data`
    # positionally, so the former lmplot(x, y, d) call fails there.
    sns.lmplot(x=x, y=y, data=d)
    plt.xlim(np.min(d[x]), np.max(d[x]))
    plt.ylim(np.min(d[y]), np.max(d[y]))
    plt.title(pheno)
    plt.show()

In [0]:
# Load the stored 'loci_fst' table and name its single column "Fst" so it can be
# joined onto the per-phenotype effect tables below.
loci_fst = read_df(analysis_dir, "loci_fst")
loci_fst.columns = ["Fst"]

In [0]:
# Regression plot of total effect against Fst for each phenotype, restricted to
# the SNPs present in both tables (inner join on the index).
for pheno in combined_dfs:
    d = combined_dfs[pheno]
    j = d.join(loci_fst, how="inner")
    x = 'Fst'
    y = 'total_effect'
    # Pass x, y and data by keyword: newer seaborn releases accept only `data`
    # positionally, so the former lmplot(x, y, j) call fails there.
    sns.lmplot(x=x, y=y, data=j)
    plt.xlim(np.min(j[x]), np.max(j[x]))
    plt.ylim(np.min(j[y]), np.max(j[y]))
    plt.title(pheno)
    plt.show()