In [0]:
import sys

sys.path.append("../include_utils/")

#from IPython.parallel import Client
import ipyparallel as ipp
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE
import seaborn as sns
from IPython.display import FileLink
import urllib.request as urllib2
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
import pysam
from collections import OrderedDict, namedtuple
import operator
import multiprocessing as mp

In [0]:
def setup_r():
    """Point the process environment at the local R installation.

    Sets ``R_HOME`` and rebuilds ``LD_LIBRARY_PATH`` as
    ``<R_HOME>/lib:<previous LD_LIBRARY_PATH>:/home/cfriedline/lib64``.
    A missing or empty ``LD_LIBRARY_PATH`` is tolerated: it is simply left
    out instead of raising ``KeyError`` or producing an empty path entry.
    """
    r_home = '/home/cfriedline/g/R3/lib64/R'
    os.environ['R_HOME'] = r_home
    search_path = [
        "%s/lib" % r_home,
        # LD_LIBRARY_PATH is frequently unset in a fresh kernel environment
        os.environ.get('LD_LIBRARY_PATH', ''),
        "/home/cfriedline/lib64",
    ]
    # Skip empty entries: an empty element in LD_LIBRARY_PATH means "current directory"
    os.environ['LD_LIBRARY_PATH'] = ":".join(p for p in search_path if p)

In [0]:
# Configure R_HOME / LD_LIBRARY_PATH first; the rpy2 imports are placed after
# this call (presumably because rpy2 locates R from the environment at import
# time — TODO confirm).
setup_r()
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Turn on rpy2's pandas conversion layer.
pandas2ri.activate()
# Short handle to the embedded R interpreter.
r = robjects.r

In [0]:
# Notebook configuration: automatic module reloading, inline matplotlib
# figures, and the rpy2 extension that provides the %R / %%R magics used below.
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%reload_ext rpy2.ipython

In [0]:
def convert_GQ_to_p(q):
    """Convert a Phred-scaled quality ``q`` into a probability, 10^(-q/10)."""
    exponent = q / -10.0
    return 10 ** exponent

In [0]:
# Command lines / absolute paths of the external tools invoked through `!` below.
# NOTE(review): vcfutils lives under ".../g/src/" while the other tools live
# under ".../gpfs/src/" — presumably "g" is a link to "gpfs"; confirm.
vcfutils = "perl /home/cfriedline/g/src/bcftools-1.3/vcfutils.pl"
vcftools = "/home/cfriedline/bin/vcftools"
bcftools = "/home/cfriedline/gpfs/src/bcftools-1.3/bcftools"
tabix = "/home/cfriedline/gpfs/src/htslib-1.3/tabix"
bgzip = "/home/cfriedline/gpfs/src/htslib-1.3/bgzip"

In [0]:
# Working directory for this analysis and the input VCF inside it.
analysis_dir = '/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/'
vcf_file = os.path.join(analysis_dir, "samtools_1.3.vcf.gz")
# Fail fast if the input VCF is missing.
assert os.path.exists(vcf_file)
vcf_file

In [0]:
# Names of the "<input>.recode.vcf" file and its gzipped counterpart.
# NOTE(review): no visible cell creates the .recode.vcf file — presumably it
# comes from a vcftools --recode run outside this notebook; confirm.
vcf_filtered = "%s.recode.vcf" % vcf_file
vcf_filtered_gz = "%s.gz" % vcf_filtered

In [0]:
!bgzip -c $vcf_filtered > {vcf_filtered_gz}
!tabix {vcf_filtered_gz}

In [0]:
# Output names for the varFilter step: plain VCF and its gzipped counterpart.
vcf_varfilter = "%s_filtered.vcf" % vcf_filtered
vcf_varfilter_gz = "%s_filtered.vcf.gz" % vcf_filtered

In [0]:
# Filter by RMS mapping quality MQ (-Q 10).
# Filter by depth DP to remove artefacts due to high coverage (-D 1500) and low coverage (-d 5).
# Reads the uncompressed recoded VCF and writes the result to vcf_varfilter.

!$vcfutils varFilter -Q 10 -D 1500 -d 5 {vcf_filtered} > {vcf_varfilter}

In [0]:
# Compress the varFilter output with bgzip and build its tabix index.
!$bgzip -c $vcf_varfilter > $vcf_varfilter_gz
!$tabix $vcf_varfilter_gz

In [0]:
# Stream the gzipped VCF body in 10,000-row chunks; '#' header lines are skipped.
# The path is handed to pandas directly (instead of an open gzip handle that was
# never closed) so pandas owns and releases the file handle itself.
# At this point vcf_df is a chunk iterator, not a DataFrame.
vcf_df = pd.read_csv(vcf_filtered_gz,
                     compression="gzip",
                     iterator=True,
                     chunksize=10000,
                     sep="\t",
                     comment="#",
                     header=None)

In [0]:
# Materialize the chunk iterator into one DataFrame.
# NOTE: this rebinds vcf_df from an iterator to a DataFrame, so the cell
# cannot be re-run without re-running the read_csv cell first.
vcf_df = pd.concat(vcf_df)
vcf_df.head()

In [0]:
# Capture the output lines of `tabix -H` (the VCF header) as a list.
header = !$tabix -H {vcf_filtered_gz}

In [0]:
# Use the last header line, split on tabs, as the column names
# (presumably the "#CHROM POS ..." line — the code below indexes "#CHROM").
vcf_df.columns = header[-1].strip().split("\t")

In [0]:
# Preview the frame with its column names applied.
vcf_df.head()

In [0]:
# Coerce the row index to plain Python ints.
# NOTE(review): the index is replaced again two cells below, so this may be redundant.
vcf_df.index = [int(x) for x in vcf_df.index]

In [0]:
# Build "<CHROM>_<POS>" labels for every record. Vectorized string operations
# give the same labels as formatting each row with "%s_%d", without the cost of
# a Python-level row-wise apply over the whole VCF.
index = vcf_df["#CHROM"].astype(str) + "_" + vcf_df["POS"].astype(int).astype(str)

In [0]:
# Replace the positional row index with the "<CHROM>_<POS>" labels.
vcf_df.index = index

In [0]:
# Maps row label -> {INFO key: value}; filled as a side effect of separate_info.
info_dict = {}
def separate_info(row):
    """Parse one record's semicolon-delimited INFO string into ``info_dict``.

    ``row`` must expose ``.INFO`` (the raw INFO string) and ``.name`` (the key
    under which the parsed dict is stored). ``key=value`` entries are stored as
    strings. Entries without ``=`` (flag entries such as ``INDEL``) are stored
    as ``True`` instead of raising ``ValueError``. Only the first ``=`` splits
    key from value, and empty fields (e.g. from a trailing ``;``) are skipped.
    """
    parsed = {}
    for field in row.INFO.strip().split(";"):
        if not field:
            continue
        key, sep, val = field.partition("=")
        # sep is "" when the field has no "=", i.e. it is a bare flag
        parsed[key] = val if sep else True
    info_dict[row.name] = parsed
vcf_df.apply(separate_info, axis=1);


In [0]:
# One row per record label, one column per INFO key seen in any record.
vcf_info = pd.DataFrame(info_dict).T

In [0]:
# Preview the frame again, now indexed by the "<CHROM>_<POS>" labels.
vcf_df.head()

In [0]:
# FORMAT keys of the first record, kept as a module-level name for later cells.
# .iloc replaces the removed .ix accessor and is unambiguous on a string index.
formats = vcf_df.iloc[0].FORMAT.split(":")
# Nested mapping filled by get_formats: row label -> FORMAT key -> column name -> value.
format_dict = {}
def get_formats(row):
    """Split one record's per-sample fields into ``format_dict``.

    Columns from position 9 onward are treated as sample columns. Each cell is
    split on ``:`` and paired with the keys of this row's own FORMAT column, so
    records whose FORMAT differs from the first record are decoded correctly.
    The result is stored under ``format_dict[<row label>][<key>][<column>]``.
    """
    start = 9
    data = row.iloc[start:]
    cols = vcf_df.columns[start:]
    row_formats = row.FORMAT.split(":")
    per_key = format_dict[data.name] = {}
    for i, elem in enumerate(data):
        for key, val in zip(row_formats, elem.split(":")):
            per_key.setdefault(key, {})[cols[i]] = val
vcf_df.apply(get_formats, axis=1);

In [0]:
# Wrap the nested dict collected by get_formats in a pandas Panel.
# NOTE(review): pd.Panel has been removed from current pandas releases, so this
# cell only runs on an old pandas that still ships it — confirm the pinned version.
vcf_panel = pd.Panel(format_dict)

In [0]:
# Print every item of the panel (exploratory; prints one entry per panel item).
for snp in vcf_panel.iteritems():
    print(snp)

In [0]:
# FIXME(review): this cell was a dangling attribute access (`vcf_format.`),
# which is a SyntaxError; the trailing dot is removed here.
# `vcf_format` is not defined in any visible cell above — presumably it was
# meant to be derived from vcf_panel; define it before relying on this cell.
vcf_format

In [0]:
# Column-wise join of the raw records, the parsed INFO table and vcf_format.
# NOTE(review): vcf_format is not defined in any visible cell above — confirm where it comes from.
vcf_data = pd.concat([vcf_df, vcf_info, vcf_format], axis=1)

In [0]:
# Display vcf_format.
vcf_format

In [0]:
# Display the combined table.
vcf_data

In [0]:
!$vcftools --012 \
--gzvcf \
$vcf_varfilter \
--out $vcf_varfilter

In [0]:
# Destination for the SNP-only VCF written by the next cell.
snp_vcf = os.path.join(analysis_dir, "snps.vcf")

In [0]:
# Start only with true SNPs: copy records with a single-base REF and exactly
# one single-base ALT allele from the varFilter output into snp_vcf.
# rec.is_snp additionally rejects records such as REF=A / ALT=AT (an insertion)
# or a missing ALT, which the bare length checks on REF and the ALT list let through.
# The with-block guarantees both files are closed even if parsing fails midway.
with open(vcf_varfilter) as h, open(snp_vcf, "w") as o:
    reader = vcf.VCFReader(h)
    # The reader serves as the template supplying the header for the writer.
    writer = vcf.VCFWriter(o, reader)
    for i, rec in enumerate(reader):
        if rec.is_snp and len(rec.ALT) == 1:
            writer.write_record(rec)
        # Progress marker every 1000 input records.
        if i % 1000 == 0:
            print("at %d" % i)

In [0]:
# Name of the gzipped SNP-only VCF.
snp_vcf_gz = "%s.gz" % snp_vcf

In [0]:
# Compress the SNP-only VCF with bgzip and build its tabix index.
!$bgzip -c $snp_vcf > $snp_vcf_gz
!$tabix $snp_vcf_gz

In [0]:
# Count the non-header lines (lines not starting with '#') in the SNP VCF.
!zcat $snp_vcf_gz | grep -c -v '^#'

## Impute genotypes

```bash
$HOME/jdk1.7.0_25/bin/java -jar ~/g/src/BEAGLE4/beagle.r1399.jar \
gl=snps.vcf.gz \
out=imputed40 \
nthreads=30 \
phase-its=20 \
burnin-its=20 \
impute-its=20
```

In [0]:
%%R
# Load the VariantAnnotation package, which provides readVcf used below.
library(VariantAnnotation)

In [0]:
# Show the path that is about to be handed to R.
vcf_filtered_gz

In [0]:
# Copy the Python variable vcf_filtered_gz into the R session (-i = input).
%R -i vcf_filtered_gz

In [0]:
%%R
# Read the gzipped, recoded VCF into R; "gypsy_moth" is passed as the genome label.
vcf = readVcf(vcf_filtered_gz, "gypsy_moth")