In [0]:
import scandir
import os
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import scipy as sp
import dill
import random
import vcf
from hdfstorehelper import HDFStoreHelper
import statsmodels.api as sm
import statsmodels.formula.api as smf
import operator
import traceback
import warnings
import sklearn
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%load_ext rpy2.ipython
r = ro.r
%matplotlib inline
from utils import save_df, read_df

In [0]:
analysis_dir = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/beagle40/"
snp_file_gz = "isect_snps.recode.vcf.gz_sorted.vcf.gz"
tabix = "/home/cfriedline/g/src/htslib-1.3/tabix"

## write GEMMA files

In [0]:
pca_std_pheno = read_df(analysis_dir, 'pca_std_pheno')

In [0]:
pca_std_pheno.head()

In [0]:
# Select the identifier and phenotype columns used for the GEMMA inputs.
# The explicit .copy() makes gemma_pheno an independent frame: a later cell
# assigns residual columns into it, which on a slice of pca_std_pheno would
# trigger pandas' SettingWithCopyWarning.
gemma_pheno = pca_std_pheno[["Population",
                              "Number",
                              "Mass",
                              "Pupual Duration",
                              "Total Dev Time"]].copy()
gemma_pheno.head()

In [0]:
save_df(analysis_dir, 'gemma_pheno', gemma_pheno)

In [0]:
pca_x = read_df(analysis_dir, 'pca_x')

In [0]:
pca_x.head()

In [0]:
pca_std_pheno = gemma_pheno.join(pca_x, how="inner")

In [0]:
pca_std_pheno.head()

In [0]:
# Keep the PC covariates and the three phenotype columns (matched by
# substring), then make the column names usable in a formula by replacing
# spaces with underscores.
wanted_tokens = ("PC", "Mass", "Pupual", "Total Dev")
keep_cols = [name for name in pca_std_pheno
             if any(token in name for token in wanted_tokens)]
gemma_pheno_pca = pca_std_pheno[keep_cols]
gemma_pheno_pca.columns = [name.replace(" ", "_") for name in gemma_pheno_pca.columns]
gemma_pheno_pca.index = list(gemma_pheno_pca.index)

# Regress each phenotype on PC1..PC16 and store the OLS residuals on
# gemma_pheno under the lower-cased "<phenotype>_resid" column name.
phenos = ["Mass", "Pupual_Duration", "Total_Dev_Time"]
pc_terms = "+".join("PC%d" % k for k in range(1, 17))
for p in phenos:
    fitted = smf.ols(formula="%s~%s" % (p, pc_terms), data=gemma_pheno_pca).fit()
    gemma_pheno[("%s_resid" % p).lower()] = fitted.resid

In [0]:
gemma_pheno.head()

In [0]:
z12_swapped = read_df(analysis_dir, 'z12_swapped')

In [0]:
z12_swapped.head()

In [0]:
z12_df = read_df(analysis_dir, 'z12_df')

In [0]:
z12_df.head()

In [0]:
translation_df = pd.read_csv("translation_table.csv", sep="\t", index_col=0)

def get_correct_name(row, trans):
    """Store the corrected sample name for one translation-table row.

    The name is "<pop>_<indiv>_<dup>" and is written into ``trans`` under
    the row's label (``row.name``).  Nothing is returned; ``trans`` is
    mutated in place.
    """
    # 'pop' is read by item access; on a pandas Series, attribute access
    # would resolve to the Series.pop method instead of the column value.
    population = row['pop']
    individual = row.indiv
    duplicate = row.dup
    trans[row.name] = "%s_%d_%d" % (population, individual, duplicate)

name_translation = {}
translation_df.apply(get_correct_name, args=(name_translation,), axis=1);

In [0]:
# Parse every VCF record into two dictionaries:
#   gt_base_data[snp_id][sample_name] -> genotype bases with "|" replaced by "/"
#   gt_ref_alt[snp_id]                -> {'ref': REF, 'alt': first ALT allele}
# snp_id is "<CHROM>_<POS>"; sample names go through name_translation.
# The context manager closes the file even if parsing fails part-way through.
gt_base_data = {}
gt_ref_alt = {}
at = 0
with open(os.path.join(analysis_dir, snp_file_gz), "rb") as readvcf:
    reader = vcf.VCFReader(readvcf)
    for snp in reader:
        snp_id = "%s_%d" % (snp.CHROM, snp.POS)
        gt_ref_alt[snp_id] = {'ref': snp.REF, 'alt': snp.ALT[0]}
        for sample in snp.samples:
            sample_name = name_translation[sample.sample]
            bases = sample.gt_bases
            if bases is not None:
                # PyVCF reports an uncalled genotype as None; keep it as a
                # missing value instead of failing on None.replace.
                bases = bases.replace("|", "/")
            gt_base_data.setdefault(snp_id, {})[sample_name] = bases
        at += 1
        if at % 1000 == 0:
            print(at)  # progress: number of records parsed so far
gt_base_df = pd.DataFrame(gt_base_data)

In [0]:
gt_base_df.head()

In [0]:
gt_ref_alt_df = pd.DataFrame(gt_ref_alt)
gt_ref_alt_df.head()

In [0]:
save_df(analysis_dir, 'gt_base_df', gt_base_df)
save_df(analysis_dir, 'gt_ref_alt_df', gt_ref_alt_df)

In [0]:
gt_base_df.shape

In [0]:
def swap_gt_alleles(gt, het):
    """Normalise one genotype call to a single heterozygote spelling.

    Parameters
    ----------
    gt : str, float or None
        Genotype bases such as "A/G".  A float (NaN) or None marks a
        missing call.
    het : str
        The heterozygote string to substitute for any heterozygous call.

    Returns
    -------
    str or float
        ``np.nan`` for a missing call; ``gt`` unchanged when its first and
        last characters are equal (homozygote); otherwise ``het``.
    """
    # np.nan works on every NumPy release; the np.NaN alias was removed in
    # NumPy 2.0 and raises AttributeError there.
    if gt is None or isinstance(gt, float):
        return np.nan
    if gt[0] == gt[-1]:
        return gt
    return het  # heterozygote: use the caller's minor/major spelling
    
def swap_gt(snp):
    """Rewrite one SNP's genotype column so heterozygotes read "minor/major".

    Alleles are counted from the first and last character of every observed
    genotype string (so a homozygote contributes two copies).  The least
    frequent allele is the minor allele and the most frequent the major
    allele; both are recorded in the module-level ``gt_ref_alt[snp.name]``
    entry, which must already exist.

    Parameters
    ----------
    snp : pandas.Series
        Genotype strings for one SNP; ``snp.name`` is the SNP identifier.

    Returns
    -------
    pandas.Series
        ``snp`` with every value passed through ``swap_gt_alleles``.

    Raises
    ------
    ValueError
        If the column contains no called genotype at all.
    """
    vc = snp.value_counts()
    counts = {}
    for v in vc.index:
        for allele in (v[0], v[-1]):
            counts[allele] = counts.get(allele, 0.0) + vc[v]
    if not counts:
        raise ValueError("no called genotypes for SNP %s" % snp.name)
    # ascending by count, e.g. [('A', 110.0), ('G', 236.0)]
    ranked = sorted(counts.items(), key=operator.itemgetter(1))
    minor = ranked[0][0]
    # Take the major allele from the END of the ranking: identical to
    # ranked[1] for a biallelic site, but does not raise IndexError on a
    # monomorphic site and picks the most frequent allele if more than two
    # alleles are present.
    major = ranked[-1][0]
    het = "%s/%s" % (minor, major)
    gt_ref_alt[snp.name]['minor'] = minor
    gt_ref_alt[snp.name]['major'] = major
    return snp.apply(swap_gt_alleles, args=(het,))

In [0]:
gt_base_df_swapped = gt_base_df.apply(swap_gt)
gt_base_df_swapped.head()

In [0]:
save_df(analysis_dir, 'gt_base_df_swapped', gt_base_df_swapped)

In [0]:
def convert_GP_to_L(q):
    """Return 10 raised to the power ``-q / 10``."""
    exponent = -q / 10.0
    return 10 ** exponent

def get_dosage(gp, index):
    """Compute an allele dosage from a sequence of genotype probabilities.

    Parameters
    ----------
    gp : sequence of numbers, or a falsy value
        Probabilities indexed by position; element 1 is added once and
        element ``index`` is added twice.
    index : int
        Position in ``gp`` of the probability that counts double.

    Returns
    -------
    list or tuple
        ``["NA"]`` when ``gp`` is falsy; otherwise the tuple
        ``(gp, copy_of_gp_as_list, dosage)``.  Callers that only need the
        dosage take the last element in either case.

    Raises
    ------
    AssertionError
        If the computed dosage is outside the closed interval [0, 2].
    """
    if not gp:
        return ["NA"]
    probs = list(gp)
    single_copy = probs[1]
    double_copy = probs[index]
    dosage = single_copy + 2 * double_copy
    assert 0 <= dosage <= 2
    return gp, probs, dosage

def get_GP(sample):
    """Return ``(GT, GP)`` for one call, or ``(None, None)`` when GT is None.

    ``sample`` only needs to support item access with the keys 'GT' and 'GP'.
    """
    genotype = sample['GT']
    if genotype is None:
        return None, None
    return genotype, sample['GP']

def get_major_minor(snp, reader):
    """Build one genotype row: minor allele, major allele, per-sample dosage.

    The SNP identifier ``snp.name`` is "<contig>_<position>".  The record is
    re-read from ``reader`` with ``fetch(contig, position - 1, position)``
    and the first record returned is used.  Minor/major/alt alleles come
    from the module-level ``gt_ref_alt`` entry for this SNP.

    Parameters
    ----------
    snp : pandas.Series
        Column for one SNP; only ``snp.name`` is read here.
    reader : VCF reader supporting ``fetch``

    Returns
    -------
    pandas.Series
        Values ``[minor, major, dosage...]`` indexed by
        ``["minor", "major", <sample ids>...]``.
    """
    snp_id = snp.name
    contig, _sep, position = snp_id.rpartition("_")
    loc = int(position)
    alleles = gt_ref_alt[snp_id]
    minor, major = alleles['minor'], alleles['major']
    # Index of the probability that counts double in get_dosage: 0 while the
    # minor allele is assumed to be the reference, 2 when it equals ALT.
    minor_index = 2 if minor == alleles['alt'] else 0
    thesnp = list(reader.fetch(contig, loc - 1, loc))[0]
    dosages = []
    samples = []
    for sample in thesnp.samples:
        _gt, gp = get_GP(sample)
        dosages.append(get_dosage(gp, minor_index)[-1])
        # NOTE(review): these are the raw VCF sample ids, not the names
        # produced through name_translation when gt_base_df was built --
        # confirm the sample order matches the phenotype files.
        samples.append(sample.sample)
    return pd.Series([minor, major] + dosages,
                     index=["minor", "major"] + samples)

In [0]:
gt_base_df_swapped.head()

In [0]:
gt_ref_alt_minor_major = pd.DataFrame(gt_ref_alt)
save_df(analysis_dir, 'gt_ref_alt_minor_major', gt_ref_alt_minor_major)

In [0]:
# Re-open the VCF for random access.  The handle is deliberately left open:
# the reader is passed to get_major_minor in a later cell, which calls
# reader.fetch for every SNP.
# NOTE(review): `h` is never closed in the visible cells.
h = open(os.path.join(analysis_dir, snp_file_gz), "rb")
reader = vcf.VCFReader(h)

In [0]:
[(x.CHROM,x.POS) for x in list(reader.fetch('ctg7180005039298', 121, 122))]

In [0]:
gemma_gt = gt_base_df_swapped.apply(get_major_minor, args=(reader,)).T
gemma_gt.head()

In [0]:
gemma_pheno.head()

In [0]:
gemma_pheno = gemma_pheno.reindex(index=gt_base_df.index)

In [0]:
gemma_pheno.head()

In [0]:
%R -i gemma_pheno

In [0]:
%%R
massx = qqnorm(gemma_pheno$mass_resid, plot.it=F)$x
tdtx = qqnorm(gemma_pheno$total_dev_time_resid, plot.it=F)$x
pdx = qqnorm(gemma_pheno$pupual_duration_resid, plot.it=F)$x

In [0]:
gemma_pheno['massx'] = r('massx')
gemma_pheno['tdtx'] = r('tdtx')
gemma_pheno['pdx'] = r('pdx')

In [0]:
gemma_pheno.head()

In [0]:
save_df(analysis_dir, "_gemma_gt", gemma_gt)

In [0]:
save_df(analysis_dir, "_gemma_pheno", gemma_pheno)

In [0]:
# One single-column phenotype file per transformed trait, written without
# index or header.
for trait_col, file_name in (("massx", "gemma_mass.txt"),
                             ("tdtx", "gemma_tdt.txt"),
                             ("pdx", "gemma_pd.txt")):
    gemma_pheno[trait_col].to_csv(os.path.join(analysis_dir, file_name),
                                  index=False,
                                  header=False)

# Full phenotype table, with index and header.
gemma_pheno.to_csv(os.path.join(analysis_dir, "gemma_pheno.txt"),
                   index=True,
                   header=True)

# Genotype table: the index (SNP id) is kept as the first column, no header.
gemma_gt.to_csv(os.path.join(analysis_dir, "gemma_gt.txt"),
                index=True,
                header=False)

In [0]:
# Write gemma_loc.txt: one tab-separated line per SNP holding
# "<contig>_<pos>", "<pos>" and an integer id shared by all SNPs of a contig
# (ids start at 1 and follow first appearance of the contig in gemma_gt.index).
gemma_contigs = {}
with open(os.path.join(analysis_dir, "gemma_loc.txt"), "w") as o:
    # Group positions by contig; the SNP id is split on its LAST underscore.
    for x in gemma_gt.index:
        contig, _sep, pos = x.rpartition("_")
        gemma_contigs.setdefault(contig, []).append(pos)

    for chrom_id, (contig, positions) in enumerate(gemma_contigs.items(), start=1):
        for p in positions:
            o.write("%s_%s\t%s\t%d\n" % (contig, p, p, chrom_id))

In [0]:
import random

gemma = "/home/cfriedline/g/src/gemma-0.94.1/gemma"

def create_gemma_run_files(num_runs, seed=None):
    """Write one command file per phenotype with ``num_runs`` commands each.

    For each of "mass", "tdt" and "pd" the file
    ``<analysis_dir>/gemma_<pheno>_run.txt`` receives one command line per
    run.  Every line invokes the ``gemma`` executable with a fixed option
    set, an output prefix ``gemma_<pheno>_out_<run>`` and a 32-bit random
    value passed to ``-r``.

    Parameters
    ----------
    num_runs : int
        Number of command lines written per phenotype.
    seed : int, optional
        When given, the ``-r`` values are drawn from a private generator
        seeded with it, so the files are reproducible.  When omitted the
        module-level ``random`` generator is used, as before.
    """
    # A private generator keeps seeded output reproducible without touching
    # the global random state.
    rng = random if seed is None else random.Random(seed)
    template = ("%s "
                "-g gemma_gt.txt "
                "-p gemma_%s.txt -pos gemma_loc.txt "
                "-o gemma_%s_out_%d "
                "-w 1000000 "
                "-s 10000000 "
                "-num 500 "
                "-smin 1 "
                "-smax 100 "
                "-hmin 0.01 "
                "-hmax 0.9 "
                "-pmin 1 "
                "-pmax 1000 "
                "-r %d")
    for p in ("mass", "tdt", "pd"):
        with open(os.path.join(analysis_dir, "gemma_%s_run.txt" % p), "w") as o:
            for i in range(num_runs):
                cmd = template % (gemma, p, p, i, rng.getrandbits(32))
                o.write("%s\n" % cmd)
                


def create_qsub_files():
    """Write a ``<run file>_qsub.sh`` submission script next to each run file.

    Run files are the paths matching ``*run.txt`` in ``analysis_dir``.  The
    job name is ``gemma_<token>``, where ``<token>`` is the second
    underscore-separated field of the run file's base name; the script body
    passes the run file to ``parallel -a``.
    """
    # Local import so this block is self-contained.  glob returns an empty
    # list when nothing matches, whereas the captured output of `!ls` would
    # contain the shell's error text and be treated as a file name.
    import glob
    files = sorted(glob.glob(os.path.join(analysis_dir, "*run.txt")))
    for f in files:
        with open("%s_qsub.sh" % f, "w") as o:
            o.write("""#!/bin/bash
#$ -j y
#$ -V
#$ -N gemma_%s
#$ -cwd
unset module
parallel -a %s
""" % (os.path.basename(f).split("_")[1], f))
            
# Generate 10 runs per phenotype, then the matching qsub scripts.
# create_gemma_run_files is the generator defined above in this cell;
# no create_pimass_run_files exists in the visible notebook, so calling that
# name raises NameError on a fresh kernel.
create_gemma_run_files(10)
create_qsub_files()

# GEMMA commands for Gypsy Moth

## Estimate Relatedness Matrix from Genotypes

`./gemma -g [filename] -p [filename] -gk [num] -o [prefix]`

```bash
~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_mass.txt \
-gk 1 -o mass

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_pd.txt \
-gk 1 -o pd

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_tdt.txt \
-gk 1 -o tdt
```

## Perform Eigen-Decomposition of the Relatedness Matrix

`./gemma -g [filename] -p [filename] -k [filename] -eigen -o [prefix]`

```bash
~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_mass.txt \
-k output/mass.cXX.txt -eigen -o mass

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_pd.txt \
-k output/pd.cXX.txt -eigen -o pd

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_tdt.txt \
-k output/tdt.cXX.txt -eigen -o tdt
```

## Association Tests with Univariate Linear Mixed Models

`./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -o [prefix]`

```bash
~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_mass.txt \
-a gemma_loc.txt -k output/mass.cXX.txt -lmm 4 -o mass_lmm

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_pd.txt \
-a gemma_loc.txt -k output/pd.cXX.txt -lmm 4 -o pd_lmm

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_tdt.txt \
-a gemma_loc.txt -k output/tdt.cXX.txt -lmm 4 -o tdt_lmm
```

## Association Tests with Multivariate Linear Mixed Models

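For this test, all three phenotype files were combined into a single file, with 3 columns in this order: mass, pd, tdt. Note: the example commands below still pass the single-trait files (`gemma_mass.txt`, `gemma_pd.txt`, `gemma_tdt.txt`) to `-p`; because `-n 1 2 3` selects phenotype columns 1–3, `-p` should point at the combined three-column file instead.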

`./gemma -g [filename] -p [filename] -a [filename] -k [filename] -lmm [num] -n [num1] [num2] [num3] -o [prefix]`

```bash
~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_mass.txt \
-a gemma_loc.txt -k output/mass.cXX.txt -lmm 4 -n 1 2 3 -o mass_mlmm

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_pd.txt \
-a gemma_loc.txt -k output/pd.cXX.txt -lmm 4 -n 1 2 3 -o pd_mlmm

~/g/src/gemma-0.94.1/gemma -g gemma_gt.txt -p gemma_tdt.txt \
-a gemma_loc.txt -k output/tdt.cXX.txt -lmm 4 -n 1 2 3 -o tdt_mlmm
```
## Fit a Bayesian Sparse Linear Mixed Model

First, set up a qsub script: `bslmm.sh` and `chmod +x` it

```bash
#!/bin/bash
#$ -N BSLMM
#$ -V
#$ -cwd
#$ -j y
#$ -l mem_free=20G
$HOME/g/src/gemma-0.94.1/gemma -g $1 -p $2 -a $3 -k $4 -bslmm $5 -o $6
```

Now, the scripts:

`./gemma -g [filename] -p [filename] -a [filename] -k [filename] -bslmm [num] -o [prefix]`

### Standard BSLMM

```bash 
qsub bslmm.sh "gemma_gt.txt" "gemma_mass.txt" "gemma_loc.txt" \
"output/mass.cXX.txt" "1" "mass_bslmm_std \
-w 1000000 -s 10000000 -smin 1 -smax 100 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0"

qsub bslmm.sh "gemma_gt.txt" "gemma_pd.txt" "gemma_loc.txt" \
"output/pd.cXX.txt" "1" "pd_bslmm_std \
-w 1000000 -s 10000000 -smin 1 -smax 100 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0"

qsub bslmm.sh "gemma_gt.txt" "gemma_tdt.txt" "gemma_loc.txt" \
"output/tdt.cXX.txt" "1" "tdt_bslmm_std \
-w 1000000 -s 10000000 -smin 1 -smax 100 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0"

```

### Ridge regression/GBLUP

```bash
qsub bslmm.sh "gemma_gt.txt" "gemma_mass.txt" "gemma_loc.txt" \
"output/mass.cXX.txt" "2" "mass_bslmm_ridge"

qsub bslmm.sh "gemma_gt.txt" "gemma_pd.txt" "gemma_loc.txt" \
"output/pd.cXX.txt" "2" "pd_bslmm_ridge"

qsub bslmm.sh "gemma_gt.txt" "gemma_tdt.txt" "gemma_loc.txt" \
"output/tdt.cXX.txt" "2" "tdt_bslmm_ridge"

```

### Probit BSLMM

```bash
qsub bslmm.sh "gemma_gt.txt" "gemma_mass.txt" "gemma_loc.txt" \
"output/mass.cXX.txt" "3" "mass_bslmm_probit \
-w 1000000 -s 10000000 -smin 1 -smax 100 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0"

qsub bslmm.sh "gemma_gt.txt" "gemma_pd.txt" "gemma_loc.txt" \
"output/pd.cXX.txt" "3" "pd_bslmm_probit \
-w 1000000 -s 10000000 -smin 1 -smax 100 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0"

qsub bslmm.sh "gemma_gt.txt" "gemma_tdt.txt" "gemma_loc.txt" \
"output/tdt.cXX.txt" "3" "tdt_bslmm_probit \
-w 1000000 -s 10000000 -smin 1 -smax 100 -hmin 0.01 -hmax 0.9 -pmin -3 -pmax 0"

```



## analyze and process GEMMA

In [0]:
assembly = "/home/cfriedline/eckertlab/projects/gypsy_moth/assemblies/masurca3/CA/10-gapclose/genome.ctg.fasta"

In [0]:
filedir = "/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/beagle40/output_comeault_isect"

In [0]:
def dump_session():
    """Dump the interpreter session with dill to ``pimass.dill`` in ``filedir``.

    Sets dill's 'recurse' setting to True and its 'fmode' setting to
    ``dill.HANDLE_FMODE`` before dumping.
    """
    dill.settings.update(recurse=True, fmode=dill.HANDLE_FMODE)
    session_path = os.path.join(filedir, "pimass.dill")
    dill.dump_session(filename=session_path)

In [0]:
# Index every output file under filedir by phenotype and by kind.
# The phenotype is the second underscore-separated field of the file name;
# the kind is the first of 'path', 'mcmc', 'gamma', 'snp' found in the name.
path_files = {}
mcmc_files = {}
gamma_files = {}
snp_files = {}
# Checked in this order; the first matching token wins.
file_kinds = (('path', path_files),
              ('mcmc', mcmc_files),
              ('gamma', gamma_files),
              ('snp', snp_files))
for root, dirs, files in scandir.walk(filedir):
    for f in files:
        d = f.split("_")
        if len(d) < 2:
            # No "<prefix>_<pheno>..." structure, so there is no phenotype
            # field to key on; skip instead of raising IndexError.
            continue
        pheno = d[1]
        # Every phenotype seen gets an entry in all four registries, even
        # when a particular kind has no file for it.
        for _token, registry in file_kinds:
            registry.setdefault(pheno, [])
        for token, registry in file_kinds:
            if token in f:
                registry[pheno].append(os.path.join(root, f))
                break

In [0]:
%%R
library(coda)

In [0]:
mcmc = r('mcmc')
mcmc_list = r('mcmc.list')

In [0]:
# Load every "path" file of each phenotype into a DataFrame, drop the last
# column and strip whitespace from the column names.
dfs = {}
phenos = ["mass", "pd", "tdt"]
for pheno in phenos:
    frames = []
    for path in path_files[pheno]:
        # Positional slice with .iloc: the former .ix indexer was deprecated
        # and then removed from pandas.  .copy() gives each frame its own
        # data before the column labels are replaced.
        # (presumably the dropped last column is an artefact of a trailing
        # tab in the file -- TODO confirm)
        frame = pd.read_csv(path, sep="\t").iloc[:, :-1].copy()
        frame.columns = [name.strip() for name in frame.columns]
        frames.append(frame)
    dfs[pheno] = frames

In [0]:
dfs['mass'][0].head()

In [0]:
# Thin every chain and keep it twice: as a pandas frame (path_mcmc) and as
# an R mcmc object (path_mcmc_r).  `mcmc` here is the R function bound in an
# earlier cell.
path_mcmc_r = {}
path_mcmc = {}
thin = 1  # fraction of rows kept per chain; 1 keeps every row
for key, dflist in list(dfs.items()):
    # Sample ONCE per chain and reuse the result, so both containers hold
    # the same rows.  Two independent .sample() calls would select different
    # random subsets whenever thin < 1.
    thinned = [x.sample(frac=thin).sort_index() for x in dflist]
    path_mcmc[key] = thinned
    path_mcmc_r[key] = [mcmc(pandas2ri.DataFrame(x)) for x in thinned]

In [0]:
path_mcmc_list_mass = mcmc_list(path_mcmc_r['mass'])
path_mcmc_list_pd = mcmc_list(path_mcmc_r['pd'])
path_mcmc_list_tdt = mcmc_list(path_mcmc_r['tdt'])

In [0]:
%R -i path_mcmc_list_mass -i path_mcmc_list_pd -i path_mcmc_list_tdt

In [0]:
%%R
effective_sizes_mass = lapply(path_mcmc_list_mass,effectiveSize)
effective_sizes_pd = lapply(path_mcmc_list_pd,effectiveSize)
effective_sizes_tdt = lapply(path_mcmc_list_tdt,effectiveSize)

In [0]:
def get_effective_sizes(r_name):
    """Convert the R list named ``r_name`` into a DataFrame.

    Each element of the R object becomes one row (converted with
    ``pandas2ri.ri2py``); the column labels are the R ``names`` of the
    object's first element.
    """
    r_obj = r[r_name]
    rows = [pandas2ri.ri2py(element) for element in r_obj]
    df = pd.DataFrame(rows)
    first_element = r[r_name].rx2(1)
    df.columns = r('names')(first_element)
    return df
ne_tdt = get_effective_sizes('effective_sizes_tdt')
ne_pd= get_effective_sizes('effective_sizes_pd')
ne_mass= get_effective_sizes('effective_sizes_mass')


In [0]:
print(ne_tdt.mean())
print(ne_tdt.std())


In [0]:
ne_pd.mean()

In [0]:
ne_mass.mean()

In [0]:
print("MASS", r("summary")(path_mcmc_list_mass))
print("PD", r("summary")(path_mcmc_list_pd))
print("TDT", r("summary")(path_mcmc_list_tdt))

In [0]:
%%R
plot(path_mcmc_list_mass)
plot(path_mcmc_list_pd)
plot(path_mcmc_list_tdt)

In [0]:
# Combine the per-run "mcmc" tables of each phenotype side by side; every
# column is suffixed with the run index taken from the file name
# (text after the last "_" and before the first ".").
# NOTE: this rebinds `mcmc`, which an earlier cell bound to the R `mcmc`
# function -- cells that call mcmc(...) must be run before this one.
mcmc = {}
for pheno, files in list(mcmc_files.items()):
    run_frames = []
    for f in files:
        index = os.path.basename(f).split("_")[-1].split(".")[0]
        testdf = pd.read_csv(f, sep="\t")
        testdf.columns = ["%s_%s" % (x.strip(), index) for x in testdf.columns]
        run_frames.append(testdf)
    # Concatenate once per phenotype instead of growing the frame inside the
    # loop (each incremental concat copies everything accumulated so far).
    mcmc[pheno] = pd.concat(run_frames, axis=1) if run_frames else pd.DataFrame()

In [0]:
mcmc_mass = mcmc['mass']
mcmc_pd = mcmc['pd']
mcmc_tdt = mcmc['tdt']

In [0]:
def get_hmean_row(row):
    """Return the harmonic mean of ``row``, or NaN if scipy raises ValueError."""
    try:
        result = sp.stats.hmean(row)
    except ValueError:
        result = np.nan
    return result
    
def get_hmean(df, col_pattern):
    """Summarise replicate columns by the harmonic mean of their absolute values.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``rs_1`` and ``chr_1`` plus, for every pattern, the
        columns whose name contains that pattern.
    col_pattern : list of str
        Substrings selecting the column groups to summarise.

    Returns
    -------
    pandas.DataFrame
        Indexed like ``df`` with columns ``rs``, ``chr`` and one
        ``<pattern>_hmean`` column per pattern.
    """
    hmean_cols = ["%s_hmean" % pattern for pattern in col_pattern]
    d = pd.DataFrame(columns=['rs', 'chr'] + hmean_cols, index=df.index)
    d['rs'] = df.rs_1.values
    d['chr'] = df.chr_1.values
    for pattern, out_col in zip(col_pattern, hmean_cols):
        matching = [name for name in df if pattern in name]
        row_means = np.abs(df[matching]).apply(get_hmean_row, axis=1)
        d[out_col] = row_means.values
    return d
mcmc_mass_hmean = get_hmean(mcmc_mass, ["postrb", "betarb"])
mcmc_tdt_hmean = get_hmean(mcmc_tdt, ["postrb", "betarb"])
mcmc_pd_hmean = get_hmean(mcmc_pd, ["postrb", "betarb"])

In [0]:
save_df(analysis_dir, 'mcmc_mass_hmean', mcmc_mass_hmean)
save_df(analysis_dir, 'mcmc_tdt_hmean', mcmc_tdt_hmean)
save_df(analysis_dir, 'mcmc_pd_hmean', mcmc_pd_hmean)

In [0]:
mcmc_hmean = {'mass': mcmc_mass_hmean,
             'tdt': mcmc_tdt_hmean,
             'pd': mcmc_pd_hmean}

In [0]:
def percent_difference(x, y):
    """Return |x - y| divided by the mean of x and y, times 100.

    Both arguments are converted with ``float`` first.
    """
    a, b = float(x), float(y)
    gap = np.abs(a - b)
    midpoint = np.mean([a, b])
    return (gap / midpoint) * 100

def get_quant(name, data, q):
    """Build a named summary Series of quantiles and derived statistics.

    Parameters
    ----------
    name : str
        Assigned to the returned Series' ``name``.
    data : pandas.Series
        Values to summarise.
    q : list of float
        Quantile levels; must include 0.99 and 0.95, which are read back
        under the string labels '0.99' and '0.95'.

    Returns
    -------
    pandas.Series
        The quantiles (labelled by ``str(level)``) followed by
        median_val, mean_val, cutoff (fixed at 0.01), the percent
        differences x99_cutoff / x99_median / x95_cutoff / x95_median,
        relaxed_cutoff (the 0.99 quantile), min and max.
    """
    d = data.quantile(q)
    d.index = list(map(str, d.index))
    d['median_val'] = data.median()
    d['mean_val'] = data.mean()
    d['cutoff'] = 0.01
    for level, tag in (('0.99', 'x99'), ('0.95', 'x95')):
        d["%s_cutoff" % tag] = percent_difference(d[level], d['cutoff'])
        d["%s_median" % tag] = percent_difference(d[level], d['median_val'])
    d['relaxed_cutoff'] = d['0.99']
    d['min'] = data.min()
    d['max'] = data.max()
    d.name = name
    return d

mass_quant = get_quant("mass", mcmc_mass_hmean.postrb_hmean, [0.1, 0.2, 0.6, 0.75, 0.90, 0.95,0.99])
pd_quant = get_quant("pd", mcmc_pd_hmean.postrb_hmean, [0.1, 0.2, 0.6, 0.75, 0.90, 0.95,0.99])
tdt_quant =get_quant("tdt", mcmc_tdt_hmean.postrb_hmean, [0.1, 0.2, 0.6, 0.75, 0.90, 0.95,0.99]) 

In [0]:
print("%s\n\n%s\n\n%s\n" % (mass_quant, pd_quant, tdt_quant))

In [0]:
sns.set_context("talk")
plt.hist(mcmc_mass_hmean.postrb_hmean, bins=100, alpha=0.3, label="mass")
plt.hist(mcmc_pd_hmean.postrb_hmean, bins=100, alpha=0.3, label="pd")
plt.hist(mcmc_tdt_hmean.postrb_hmean, bins=100, alpha=0.3, label="tdt")
plt.legend()
plt.show()

In [0]:
save_df(analysis_dir, 'mass_quant', mass_quant)
save_df(analysis_dir, 'pd_quant', pd_quant)
save_df(analysis_dir, 'tdt_quant', tdt_quant)

In [0]:
mass_quant

In [0]:
def get_quant_range(quant):
    """Return 10 evenly spaced values from ``quant['0.9']`` to ``quant['cutoff']``."""
    start = quant['0.9']
    stop = quant['cutoff']
    return pd.Series(np.linspace(start, stop, num=10))

mass_quant_range = get_quant_range(mass_quant)
pd_quant_range = get_quant_range(pd_quant)
tdt_quant_range = get_quant_range(tdt_quant)

In [0]:
save_df(analysis_dir, "mass_quant_range", mass_quant_range)
save_df(analysis_dir, "pd_quant_range", pd_quant_range)
save_df(analysis_dir, "tdt_quant_range", tdt_quant_range)

In [0]:
sig_snps_mass = mcmc_mass_hmean[mcmc_mass_hmean.postrb_hmean > mass_quant.cutoff]
sig_snps_tdt = mcmc_tdt_hmean[mcmc_tdt_hmean.postrb_hmean > tdt_quant.cutoff]
sig_snps_pd = mcmc_pd_hmean[mcmc_pd_hmean.postrb_hmean > pd_quant.cutoff]

relaxed_sig_snps_mass = mcmc_mass_hmean[mcmc_mass_hmean.postrb_hmean > mass_quant.relaxed_cutoff]
relaxed_sig_snps_tdt = mcmc_tdt_hmean[mcmc_tdt_hmean.postrb_hmean > tdt_quant.relaxed_cutoff]
relaxed_sig_snps_pd = mcmc_pd_hmean[mcmc_pd_hmean.postrb_hmean > pd_quant.relaxed_cutoff]

In [0]:
def get_range_snps(ranges, hmean_df):
    """Return one filtered frame per cutoff in ``ranges``.

    Each frame holds the rows of ``hmean_df`` whose ``postrb_hmean`` is
    strictly greater than the corresponding cutoff; order follows ``ranges``.
    """
    return [hmean_df[hmean_df.postrb_hmean > cutoff] for cutoff in ranges]

range_snps_mass = get_range_snps(mass_quant_range, mcmc_mass_hmean)
range_snps_pd = get_range_snps(pd_quant_range, mcmc_pd_hmean)
range_snps_tdt = get_range_snps(tdt_quant_range, mcmc_tdt_hmean)

In [0]:
sig_snps_mass.shape, sig_snps_tdt.shape, sig_snps_pd.shape

In [0]:
relaxed_sig_snps_mass.shape, relaxed_sig_snps_tdt.shape, relaxed_sig_snps_pd.shape

In [0]:
relaxed_sig_snps_mass.head()

In [0]:
relaxed_sig_snps_tdt.head()

In [0]:
relaxed_sig_snps_pd.head()

In [0]:
# contig_pips[pheno][contig] -> running totals {'betarb': ..., 'postrb': ...}
contig_pips = {}
def get_contig_pip(row, pheno):
    """Add one SNP row's values to its contig's running totals.

    The contig is ``row.rs`` with everything from the last underscore
    onwards removed.  ``row.postrb_hmean`` and ``row.betarb_hmean`` are
    added to the module-level ``contig_pips[pheno][contig]`` entry, which is
    created with zero totals on first use.  Returns None.
    """
    contig = row.rs.rpartition("_")[0]
    per_pheno = contig_pips.setdefault(pheno, {})
    totals = per_pheno.setdefault(contig, {'betarb': 0, 'postrb': 0})
    totals['postrb'] += row.postrb_hmean
    totals['betarb'] += row.betarb_hmean

for pheno, df in list(mcmc_hmean.items()):
    print(pheno)
    df.apply(get_contig_pip, args=(pheno,), axis=1)


In [0]:
contig_pip_dfs = {}
for pheno, data in list(contig_pips.items()):
    contig_pip_dfs[pheno] = pd.DataFrame(data).T

In [0]:
from Bio import SeqIO
# Map each record name in the assembly FASTA to {"length": record length}.
contig_lengths = {
    rec.name: {"length": len(rec)}
    for rec in SeqIO.parse(assembly, "fasta")
}

In [0]:
contig_length_df = pd.DataFrame(contig_lengths).T

In [0]:
contig_length_df.head()

In [0]:
contig_pip_mass = contig_pip_dfs['mass'].join(contig_length_df)
contig_pip_tdt = contig_pip_dfs['tdt'].join(contig_length_df)
contig_pip_pd = contig_pip_dfs['pd'].join(contig_length_df)

In [0]:
save_df(analysis_dir, 'contig_pip_mass', contig_pip_mass)
save_df(analysis_dir, 'contig_pip_tdt', contig_pip_tdt)
save_df(analysis_dir, 'contig_pip_pd', contig_pip_pd)

In [0]:
contig_pip_mass.head()

In [0]:
def plot_contig_length_vs_pip(df, title):
    """Scatter ``df.length`` (x) against ``df.postrb`` (y) and show the plot.

    Draws on the current matplotlib axes, sets the given title and the
    fixed axis labels, then calls ``plt.show()``.
    """
    ax = plt.gca()
    ax.scatter(df.length, df.postrb)
    ax.set_title(title)
    ax.set_xlabel("length of contig")
    ax.set_ylabel("postrb")
    plt.show()
for key, df in list({'mass':contig_pip_mass, 
                'tdt': contig_pip_tdt, 
                'pd': contig_pip_pd}.items()):
    plot_contig_length_vs_pip(df[df.postrb < 0.10], key)

In [0]:
def save_fig(key, ext):
    """Save the current matplotlib figure as ``<analysis_dir>/<key>.<ext>``."""
    file_name = "%s.%s" % (key, ext)
    plt.savefig(os.path.join(analysis_dir, file_name))

In [0]:
# One line plot of per-contig postrb totals for each phenotype, saved as
# "<label>.pdf" through save_fig before being shown.
for pheno_key, label in (('tdt', "TDT"), ('mass', "Mass"), ('pd', "PD")):
    plt.plot(contig_pip_dfs[pheno_key].postrb.values, label="PIP")
    plt.title("%s contigs" % label)
    plt.legend()
    save_fig(label, "pdf")
    plt.show()



In [0]:
sns.set_context("talk")

In [0]:
plt.xlim(0, len(mcmc_mass))
plt.plot(mcmc_mass_hmean.postrb_hmean, alpha=0.5, label="PIP (RB)")
plt.plot(mcmc_mass_hmean.betarb_hmean, alpha=0.5, label="Beta (RB)")
plt.title("Mass")
plt.xlabel("SNP")
plt.legend()
save_fig("mass_pip_beta", "pdf")
plt.show()

In [0]:
plt.xlim(0, len(mcmc_pd))
plt.plot(mcmc_pd_hmean.postrb_hmean, alpha=0.5, label="PIP (RB)")
plt.plot(mcmc_pd_hmean.betarb_hmean, alpha=0.5, label="Beta (RB)")
plt.title("PD")
plt.xlabel("SNP")
plt.legend()
save_fig("pd_snp_beta", "pdf")
plt.show()

In [0]:
plt.xlim(0, len(mcmc_tdt))
plt.plot(mcmc_tdt_hmean.postrb_hmean, alpha=0.5, label="PIP (RB)")
plt.plot(mcmc_tdt_hmean.betarb_hmean, alpha=0.5, label="Beta (RB)")
plt.title("TDT")
plt.xlabel("SNP")
plt.legend()
save_fig("tdt_pip_beta", "pdf")
plt.show()

In [0]:
# Combine the per-run "snp" tables of each phenotype side by side.  In every
# file the first line is skipped, the second line holds the column names and
# the remaining lines are whitespace-separated values (kept as strings).
# Columns are suffixed with the run index taken from the file name.
snps = {}
for pheno, files in list(snp_files.items()):
    run_frames = []
    for f in files:
        index = os.path.basename(f).split("_")[-1].split(".")[0]
        # The context manager closes each file once it has been parsed.
        with open(f) as snp_handle:
            snp_handle.readline() ##skip header
            header = snp_handle.readline().strip().split()
            data = [line.strip().split() for line in snp_handle]

        testdf = pd.DataFrame(data, columns=header)
        testdf.columns = ["%s_%s" % (x.strip(), index) for x in testdf.columns]
        run_frames.append(testdf)
    # Concatenate once per phenotype rather than growing the frame in the loop.
    snps[pheno] = pd.concat(run_frames, axis=1) if run_frames else pd.DataFrame()

In [0]:
# Columns of run 1 only.  Column names carry a "_<run>" suffix, so the match
# is anchored at the end of the name; a plain substring test for '_1' would
# also pick up runs 10-19 and any name that merely contains "_1".
snps_mass = snps['mass'][[x for x in snps['mass'] if x.endswith('_1')]]

In [0]:
snps_mass.head()

In [0]:
def read_gamma(f):
    """Read a whitespace-separated table into a float DataFrame.

    The first line of the file supplies the column names; every following
    line is one row.  The literal string 'NA' is replaced by NaN before the
    whole frame is cast to float.

    Parameters
    ----------
    f : str
        Path of the file to read.

    Returns
    -------
    pandas.DataFrame
    """
    # The context manager closes the file even if parsing raises.
    with open(f) as h:
        header = h.readline().strip().split()
        d = [line.strip().split() for line in h]
    df = pd.DataFrame(d, columns=header)
    return df.replace('NA', np.nan).astype(float)
gamma_mass = read_gamma(gamma_files['mass'][0])
gamma_pd = read_gamma(gamma_files['pd'][0])
gamma_tdt = read_gamma(gamma_files['tdt'][0])

In [0]:
save_df(analysis_dir, 'gamma_mass', gamma_mass)
save_df(analysis_dir, 'gamma_pd', gamma_pd)
save_df(analysis_dir, 'gamma_tdt', gamma_tdt)

In [0]:
save_df(analysis_dir, "sig_snps_mass", sig_snps_mass)
save_df(analysis_dir, "sig_snps_tdt", sig_snps_tdt)
save_df(analysis_dir, "sig_snps_pd", sig_snps_pd)
save_df(analysis_dir, "relaxed_sig_snps_mass", relaxed_sig_snps_mass)
save_df(analysis_dir, "relaxed_sig_snps_tdt", relaxed_sig_snps_tdt)
save_df(analysis_dir, "relaxed_sig_snps_pd", relaxed_sig_snps_pd)


In [0]:
def add_range_snps_to_hdf(snps, ranges, pheno):
    """Save ``snps[i]`` under the key ``range_snps_<pheno>_<i>`` for each cutoff.

    ``ranges`` only determines how many entries are saved; its values are
    not used.  Frames are written with ``save_df`` into ``analysis_dir``.
    """
    for position, _cutoff in enumerate(ranges):
        store_key = 'range_snps_%s_%d' % (pheno, position)
        save_df(analysis_dir, store_key, snps[position])
add_range_snps_to_hdf(range_snps_mass, mass_quant_range, "mass")
add_range_snps_to_hdf(range_snps_pd, pd_quant_range, "pd")
add_range_snps_to_hdf(range_snps_tdt, tdt_quant_range, "tdt")

In [0]:
[x.shape for x in range_snps_mass]

In [0]:
plt.hist(np.abs(sig_snps_mass.betarb_hmean.values))
plt.text(0.02, 1.5, r"$n = %d$" % len(sig_snps_mass))
plt.title(r"Mass ($> %.2f$)" % mass_quant.cutoff)
plt.xlabel(r"$\beta$")
save_fig("mass_beta_sig", "pdf")
plt.show()
plt.hist(np.abs(relaxed_sig_snps_mass.betarb_hmean.values))
plt.text(0.02, 700, r"$n = %d$" % len(relaxed_sig_snps_mass))
plt.title(r"Mass 99th($> %.5f$)" % mass_quant.relaxed_cutoff)
plt.xlabel(r"$\beta$")
save_fig("mass_beta_relaxed", "pdf")
plt.show()

In [0]:
plt.hist(np.abs(sig_snps_tdt.betarb_hmean.values))
plt.text(0.1, 10, r"$n = %d$" % len(sig_snps_tdt))
plt.title(r"TDT ($> %.2f$)" % tdt_quant.cutoff)
plt.xlabel(r"$\beta$")
save_fig("tdt_beta_sig", "svg")

plt.show()
plt.hist(np.abs(relaxed_sig_snps_tdt.betarb_hmean.values))
plt.text(0.1, 700, r"$n = %d$" % len(relaxed_sig_snps_tdt))
plt.title(r"TDT 99th ($> %.5f$)" % tdt_quant.relaxed_cutoff)
plt.xlabel(r"$\beta$")
save_fig("tdt_beta_relaxed", "svg")

plt.show()

In [0]:
plt.hist(np.abs(sig_snps_pd.betarb_hmean.values))
plt.text(0.005, 1.5, r"$n = %d$" % len(sig_snps_pd))
plt.title(r"PD ($> %.2f$)" % pd_quant.cutoff)
plt.xlabel(r"$\beta$")
save_fig("pd_beta_sig", "pdf")
plt.show()
plt.hist(np.abs(relaxed_sig_snps_pd.betarb_hmean.values))
plt.text(0.005, 600, r"$n = %d$" % len(relaxed_sig_snps_pd))
plt.title(r"PD 99th ($> %.5f$)" % pd_quant.relaxed_cutoff)
plt.xlabel(r"$\beta$")
save_fig("pd_beta_relaxed", "pdf")

plt.show()