In [1]:
import sys, os
sys.path.append("../include_utils/")

In [2]:
# Directory that holds the analysis VCF, the HDF5 store and every piMASS
# input/output file written by this notebook.
analysis_dir = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/notimputed"
# Name of the analysis VCF; it is also embedded in the HDF5 store's file name.
analysis_vcf = "snps.vcf.gz"
# VCF opened by get_pimass_gt(). Derived from analysis_dir (instead of repeating
# the absolute path) so the two locations cannot drift apart.
good_snp_file_gz = os.path.join(analysis_dir, "goodsnps.vcf.gz")

In [104]:
import pandas as pd
import numpy as np
import vcf
import pysam
import cyvcf
import include_utils as u
from hdfstorehelper import HDFStoreHelper
%load_ext autoreload
from pandas import HDFStore
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%load_ext rpy2.ipython
import rpy2.robjects as ro
import random

In [99]:
# Shortcut to rpy2's embedded R session; r('name') is used further down to
# fetch objects created inside %%R cells.
r = ro.r

In [5]:
%autoreload 2

In [13]:
files = hdf.get('files')

In [25]:
files

In [59]:
hdf = HDFStoreHelper(os.path.join(analysis_dir, "gypsy_samtools12_%s.hd5" % analysis_vcf))

In [117]:
# Genotype table consumed by get_pimass_gt(): it is processed one column at a
# time and each column name is parsed as "<contig>_<position>".
gt_base_df_swapped = hdf.get('gt_base_df_swapped')
# Table whose "PC*", "Mass", "Pupual*" and "Total Dev*" columns feed the
# residual regressions further down.
pca_std_pheno = hdf.get('pca_std_pheno')

In [62]:
gt_base_df_swapped.head()

In [67]:
def convert_GQ_to_p(q):
    """Turn a phred-scaled value ``q`` into ``10 ** (-q / 10)``."""
    exponent = q / -10.0
    return 10 ** exponent

def get_dosage(GP):
    """Collapse per-genotype phred-scaled values into a single dosage.

    Each entry of ``GP`` is converted with ``convert_GQ_to_p``, the results are
    normalised to sum to 1, and the dosage is ``sum(i * p_i)`` over the
    positions ``i`` of ``GP``, rounded to three decimals.

    Returns the string ``"NA"`` when the raw values sum to zero (this includes
    an empty ``GP``).
    """
    if sum(GP) == 0:
        return "NA"

    raw = [convert_GQ_to_p(score) for score in GP]
    norm = np.sum(raw)
    dosage = 0
    # Accumulate left to right so the floating-point result is unchanged.
    for position, weight in enumerate([value / norm for value in raw]):
        dosage += weight * position
    return np.round(dosage, 3)

def get_GP(sample, flip):
    """Return ``(GT, GP, tag)`` for one sample call.

    Parameters
    ----------
    sample : mapping-like
        Anything supporting ``sample['GT']`` and ``sample['GP']``.
    flip : bool
        When true, ``GP`` is returned in reversed order and ``tag`` is
        ``"flipped"``; otherwise ``GP`` is returned as stored and ``tag`` is
        the empty string.

    The first element is always ``sample['GT']`` exactly as stored (it is not
    recoded when flipping). Previously the flipped branch returned the whole
    ``sample`` object in that slot, which was inconsistent with the other
    branch.
    """
    gp = sample['GP']
    if flip:
        return sample['GT'], gp[::-1], "flipped"
    return sample['GT'], gp, ""


def get_major_minor(snp, reader):
    """Build one piMASS genotype row for a single SNP.

    Parameters
    ----------
    snp : pandas.Series
        One SNP's genotypes. ``snp.name`` must look like
        ``"<contig>_<position>"``. Values are genotype strings whose first and
        last characters are the two alleles; float entries (e.g. NaN) are
        skipped.
    reader : object
        Must provide ``fetch(contig, start, end)`` yielding VCF records with
        ``REF``, ``ALT`` and ``samples`` attributes.

    Returns
    -------
    pandas.Series
        Indexed by ``"minor"``, ``"major"`` and then one ``"<sample>_0"`` label
        per VCF sample. The first two values are the alleles of the first
        heterozygous genotype found in ``snp``; the rest are per-sample dosages
        from ``get_dosage``.

    Raises
    ------
    IndexError
        If ``snp`` contains no heterozygous genotype.
    ValueError
        If ``reader`` returns no record for the SNP's position. Previously this
        case silently reused the record of the SNP processed before it.
    """
    # Kept so the most recent VCF record stays inspectable from the notebook.
    global thesnp

    contig, _, pos = snp.name.rpartition("_")
    loc = int(pos)

    # The first heterozygous genotype defines the allele pair, in the order
    # in which it is written in the genotype string.
    het = None
    for gt in snp:
        if isinstance(gt, float):
            continue
        pair = "%s%s" % (gt[0], gt[-1])
        if pair[0] != pair[1]:
            het = pair
            break
    if het is None:
        raise IndexError("SNP %s has no heterozygous genotype to define "
                         "the minor/major alleles" % snp.name)

    # As before, the last record returned for the position is the one used.
    record = None
    for record in reader.fetch(contig, loc - 1, loc):
        pass
    if record is None:
        raise ValueError("no VCF record found for %s:%d" % (contig, loc))
    thesnp = record

    # Reverse the per-genotype values when the allele pair does not match the
    # record's REF followed by its first ALT.
    flip = het != "%s%s" % (record.REF, record.ALT[0])

    dosages = []
    samples = []
    for sample in record.samples:
        _, gp, _ = get_GP(sample, flip)
        dosages.append(get_dosage(gp))
        samples.append("%s_0" % "_".join(sample.sample.split("_")[0:-1]))

    return pd.Series([het[0], het[1]] + dosages,
                     index=["minor", "major"] + samples)

def get_pimass_gt(df):
    """Apply ``get_major_minor`` to every column of ``df`` and transpose.

    A single ``cyvcf.VCFReader`` on ``good_snp_file_gz`` is shared by all
    calls. The returned frame is the transpose of the ``apply`` result.
    """
    vcf_reader = cyvcf.VCFReader(filename=good_snp_file_gz)
    per_snp = df.apply(get_major_minor, args=(vcf_reader,))
    return per_snp.T

# Build the piMASS genotype matrix (one reader.fetch() per SNP column) and
# preview the first rows.
pimass_gt = get_pimass_gt(gt_base_df_swapped)
pimass_gt.head()

In [68]:
gt_base_df_swapped.head()

In [84]:
# Save the genotype matrix into the HDF5 store under the key 'pimass_gt'.
hdf.put('pimass_gt', pimass_gt)

In [85]:
# Genotype file handed to piMASS via "-g pimass_gt.txt": row labels are
# written (index=True) but no header line (header=False).
pimass_gt.to_csv(os.path.join(analysis_dir, "pimass_gt.txt"),
                index=True,
                header=False)

In [86]:
pheno = hdf.get('pheno')

In [87]:
pimass_samples = ["_".join(x.split("_")[0:-1]) for x in pimass_gt.columns if "_" in x]

In [88]:
pheno.index=pheno.sample_pheno
pimass_pheno = pheno.ix[pimass_samples]

In [89]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

# Keep only the columns whose name contains "PC", "Mass", "Pupual" or
# "Total Dev" (presumably "Pupual" is the spelling used in the stored column
# names - the formula below relies on it).
pimass_pheno_pca = pca_std_pheno[[x for x in pca_std_pheno if "PC" in x or 'Mass' in x or 'Pupual' in x or 'Total Dev' in x]]

# Replace spaces so the column names can be used as formula terms, and drop the
# last "_"-separated piece of every row label.
pimass_pheno_pca.columns = [x.replace(" ", "_") for x in pimass_pheno_pca.columns]
pimass_pheno_pca.index = ["_".join(x.split("_")[0:-1]) for x in pimass_pheno_pca.index]
phenos = ["Mass", "Pupual_Duration", "Total_Dev_Time"]
# Regress each phenotype on PC1..PC8 by OLS and store the residuals on
# pimass_pheno as "<phenotype>_resid" (lower-cased); the assignment aligns on
# the row labels of the two frames.
for p in phenos:
    mod = smf.ols(formula="%s~PC1+PC2+PC3+PC4+PC5+PC6+PC7+PC8" % p, data=pimass_pheno_pca)
    res = mod.fit()
    col = "%s_resid" % p
    col = col.lower()
    pimass_pheno[col] = res.resid

In [95]:
%R -i pimass_pheno

In [96]:
%%R
# qqnorm(...)$x is the theoretical normal quantile matched to each residual,
# i.e. a rank-based normal-quantile transform; plot.it=F suppresses the plot.
massx = qqnorm(pimass_pheno$mass_resid, plot.it=F)$x
tdtx = qqnorm(pimass_pheno$total_dev_time_resid, plot.it=F)$x
pdx = qqnorm(pimass_pheno$pupual_duration_resid, plot.it=F)$x

In [100]:
# Copy the three vectors computed in the %%R cell back onto pimass_pheno as new
# columns (assigned by position, in the row order that was sent to R).
pimass_pheno['massx'] = r('massx')
pimass_pheno['tdtx'] = r('tdtx')
pimass_pheno['pdx'] = r('pdx')

In [101]:
# One bare column (no row labels, no header) per transformed phenotype; these
# are the files named by "-p" in the piMASS run files.
for column, file_name in (("massx", "pimass_mass.txt"),
                          ("tdtx", "pimass_tdt.txt"),
                          ("pdx", "pimass_pd.txt")):
    pimass_pheno[column].to_csv(os.path.join(analysis_dir, file_name),
                                index=False,
                                header=False)

# Full phenotype table, with row labels and header, kept for reference.
pimass_pheno.to_csv(os.path.join(analysis_dir, "pimass_pheno.txt"),
                    index=True,
                    header=True)

In [102]:
# Group SNP positions by contig. contig_order records the order in which
# contigs first appear in pimass_gt.index, so the chromosome numbers and the
# line order of pimass_loc.txt are reproducible instead of depending on dict
# iteration order (arbitrary on Python 2).
pimass_contigs = {}
contig_order = []
for x in pimass_gt.index:
    # Row labels are "<contig>_<position>"; the contig itself may contain "_".
    contig, _, pos = x.rpartition("_")
    if contig not in pimass_contigs:
        pimass_contigs[contig] = []
        contig_order.append(contig)
    pimass_contigs[contig].append(pos)

# Position file passed to piMASS via "-pos": SNP id, position, and a running
# integer id shared by all SNPs of one contig (tab separated).
with open(os.path.join(analysis_dir, "pimass_loc.txt"), "w") as o:
    for chrom_id, contig in enumerate(contig_order, 1):
        for p in pimass_contigs[contig]:
            o.write("%s_%s\t%s\t%d\n" % (contig, p, p, chrom_id))

In [116]:
def create_pimass_run_files(num_runs, out_dir=None, seed=None):
    """Write one piMASS command file per phenotype.

    For each of the phenotypes ``"mass"``, ``"tdt"`` and ``"pd"`` the file
    ``pimass_<pheno>_run.txt`` is (over)written with ``num_runs`` command
    lines. Line ``i`` uses the output prefix ``pimass_<pheno>_out_<i>`` and its
    own 32-bit random value for ``-r``.

    Parameters
    ----------
    num_runs : int
        Number of command lines written per phenotype.
    out_dir : str, optional
        Directory that receives the run files; defaults to ``analysis_dir``.
    seed : optional
        When given, the ``-r`` values are drawn from ``random.Random(seed)`` so
        the generated files are reproducible. With the default ``None`` the
        module-level ``random`` state is used, as before.
    """
    if out_dir is None:
        out_dir = analysis_dir
    rng = random if seed is None else random.Random(seed)

    # Same single-line command as before, assembled from its arguments.
    template = " ".join([
        "~/g/src/pimass/pimass-lin",
        "-g pimass_gt.txt",
        "-p pimass_%s.txt -pos pimass_loc.txt",
        "-o pimass_%s_out_%d",
        "-w 1000000",
        "-s 10000000",
        "-num 500",
        "-smin 1",
        "-smax 100",
        "-hmin 0.01",
        "-hmax 0.9",
        "-pmin 1",
        "-pmax 1000",
        "-r %d",
    ])

    for p in ["mass", 'tdt', 'pd']:
        with open(os.path.join(out_dir, "pimass_%s_run.txt" % p), "w") as o:
            # range instead of xrange: works on both Python 2 and Python 3.
            for i in range(num_runs):
                o.write("%s\n" % (template % (p, p, i, rng.getrandbits(32))))
                


def create_qsub_files(out_dir=None):
    """Write a qsub wrapper script next to every ``*run.txt`` file.

    For each file matching ``<out_dir>/*run.txt`` (in sorted order) a script
    ``<run file>_qsub.sh`` is (over)written. The job name is ``pimass_`` plus
    the second ``_``-separated piece of the run file's base name, and the
    script runs ``parallel -a <run file>``.

    Parameters
    ----------
    out_dir : str, optional
        Directory searched for run files; defaults to ``analysis_dir``.

    Returns
    -------
    list of str
        Paths of the scripts written (empty if no run file matched).
    """
    # glob replaces the former "!ls" shell call: it is plain Python, copes with
    # spaces in the path, and returns nothing (instead of ls's error text) when
    # no file matches.
    import glob

    if out_dir is None:
        out_dir = analysis_dir

    script = "#!/bin/bash\n#$ -j y\n#$ -V\n#$ -N pimass_%s\n#$ -cwd\nparallel -a %s\n"
    ret = []
    for f in sorted(glob.glob(os.path.join(out_dir, "*run.txt"))):
        job = os.path.basename(f).split("_")[1]
        with open("%s_qsub.sh" % f, "w") as o:
            o.write(script % (job, f))
            ret.append(o.name)
    return ret
            
# Ten command lines (each with its own -r value) per phenotype, then one qsub
# wrapper script per generated run file; the last line displays their paths.
create_pimass_run_files(10)
qsub_files = create_qsub_files()
qsub_files

## Run and save piMASS
```bash

./run_pimass.sh
mv output output_comeault
```