In [0]:
import sys

sys.path.append("../include_utils")

#from IPython.parallel import Client
import ipyparallel as ipp
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE
import seaborn as sns
from IPython.display import FileLink
import urllib.request as urllib2
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
import pysam
from collections import OrderedDict, namedtuple
import operator
import multiprocessing as mp

In [0]:
def setup_r():
    """Point this process at the cluster's R installation.

    Sets ``R_HOME`` and rebuilds ``LD_LIBRARY_PATH`` as
    ``<R_HOME>/lib : <existing value> : /home/cfriedline/lib64``.
    The notebook calls this before importing rpy2.
    """
    r_home = '/home/cfriedline/g/R3/lib64/R'
    os.environ['R_HOME'] = r_home
    # LD_LIBRARY_PATH may be unset (indexing os.environ would raise KeyError);
    # skip empty components so no blank entry ends up in the search path.
    existing = os.environ.get('LD_LIBRARY_PATH', '')
    components = ["%s/lib" % r_home, existing, "/home/cfriedline/lib64"]
    os.environ['LD_LIBRARY_PATH'] = ":".join(c for c in components if c)

In [0]:
# setup_r() hard-codes R paths for the Linux cluster, so skip it on macOS
# automatically instead of relying on the reader to skip this cell by hand.
if sys.platform != "darwin":
    setup_r()

In [0]:
# rpy2 is imported here rather than in the top import cell; presumably so that
# R_HOME / LD_LIBRARY_PATH (set by setup_r() in the previous cell) are in place
# before rpy2 loads R -- TODO confirm.
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
# Turn on rpy2's pandas <-> R object conversion.
pandas2ri.activate()
# Short handle for evaluating R code from Python.
r = robjects.r

In [0]:
# Reload edited modules automatically before executing each cell.
%reload_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the notebook.
%matplotlib inline
# Load rpy2's IPython extension.
%reload_ext rpy2.ipython

In [0]:
def convert_GQ_to_p(q):
    """Return 10 ** (q / -10.0), the Phred-scale-to-probability conversion of ``q``."""
    exponent = q / -10.0
    return 10 ** exponent

In [0]:
# Locations of the external command-line tools used by the shell cells below.
# Previously the Linux values were always overwritten by the Mac values;
# pick the right set from the platform so neither block has to be edited.
if sys.platform == "darwin":
    # For Mac
    vcfutils = "perl /Users/chris/src/bcftools-1.3/vcfutils.pl"
    vcftools = "/Users/chris/bin/vcftools"
    bcftools = "/Users/chris/src/bcftools-1.3/bcftools"
    tabix = "/Users/chris/src/htslib-1.3/tabix"
    bgzip = "/Users/chris/src/htslib-1.3/bgzip"
else:
    # Linux cluster
    vcfutils = "perl /home/cfriedline/g/src/bcftools-1.3/vcfutils.pl"
    vcftools = "/home/cfriedline/bin/vcftools"
    bcftools = "/home/cfriedline/gpfs/src/bcftools-1.3/bcftools"
    tabix = "/home/cfriedline/gpfs/src/htslib-1.3/tabix"
    bgzip = "/home/cfriedline/gpfs/src/htslib-1.3/bgzip"

In [0]:
# Directory holding the VCF and every derived file. The Linux value used to be
# overwritten unconditionally by the Mac value; choose by platform instead.
if sys.platform == "darwin":
    analysis_dir = '/Volumes/backup/gypsy_moth'
else:
    analysis_dir = '/home/cfriedline/eckertlab/gypsy_indiv/raw_demult/analysis/samtools1.3_masurca3/'

vcf_file = os.path.join(analysis_dir, "samtools_1.3.vcf.gz")
# Fail early, and say which path was tried, if the input VCF is missing.
assert os.path.exists(vcf_file), "VCF not found: %s" % vcf_file
vcf_file

In [0]:
# First-pass vcftools filter of the raw calls: remove indels, apply
# --max-missing 0.5, drop sites carrying a FILTER flag, and write a recoded VCF
# (INFO fields kept). The output prefix is the input path itself, so the result
# is "<vcf_file>.recode.vcf".
!$vcftools --remove-indels \
--max-missing 0.5 \
--remove-filtered-all \
--recode \
--recode-INFO-all \
--gzvcf \
$vcf_file \
--out $vcf_file

In [0]:
# Path of the recoded VCF written by the vcftools filter, and of its compressed copy.
vcf_filtered = vcf_file + ".recode.vcf"
vcf_filtered_gz = vcf_filtered + ".gz"

In [0]:
# bgzip the filtered VCF to a new .gz file (the uncompressed file is kept),
# then build a tabix index for it.
!$bgzip -c $vcf_filtered > {vcf_filtered_gz}
!$tabix {vcf_filtered_gz}

In [0]:
# Every command below uses the filtered VCF as both input and output prefix,
# so each statistics file lands next to it as "<vcf_filtered_gz>.<suffix>".

# Depth statistics (--depth).
!$vcftools --gzvcf $vcf_filtered_gz \
--out $vcf_filtered_gz \
--depth 

# Depth statistics per site (--site-depth).
!$vcftools --gzvcf $vcf_filtered_gz \
--out $vcf_filtered_gz \
--site-depth 

# Mean depth per site (--site-mean-depth).
!$vcftools --gzvcf $vcf_filtered_gz \
--out $vcf_filtered_gz \
--site-mean-depth

In [0]:
# Per-site quality (--site-quality).
!$vcftools --gzvcf $vcf_filtered_gz \
--site-quality \
--out $vcf_filtered_gz

# Missingness per individual (--missing-indv).
!$vcftools --gzvcf $vcf_filtered_gz \
--out $vcf_filtered_gz \
--missing-indv

# Missingness per site (--missing-site).
!$vcftools --gzvcf $vcf_filtered_gz \
--out $vcf_filtered_gz \
--missing-site

In [0]:
# Allele frequencies (--freq); parsed later from the *.frq file.
!$vcftools --gzvcf $vcf_filtered_gz \
--out $vcf_filtered_gz \
--freq

# Raw allele counts (--counts).
!$vcftools --gzvcf $vcf_filtered_gz \
--out $vcf_filtered_gz \
--counts

In [0]:
# Hardy-Weinberg statistics (--hardy); read later from the *.hwe file.
!$vcftools --gzvcf $vcf_filtered_gz \
--out $vcf_filtered_gz \
--hardy

# Heterozygosity statistics (--het).
!$vcftools --gzvcf $vcf_filtered_gz \
--out $vcf_filtered_gz \
--het

In [0]:
# Find the vcftools --hardy output under analysis_dir (where the filtered VCF and
# its statistics files are written) instead of a hard-coded machine-specific path.
hardy_files = !ls {analysis_dir}/*.hwe
# When nothing matches, `ls` returns its error message as the only "file name";
# catch that here rather than inside read_csv.
assert hardy_files and os.path.exists(hardy_files[0]), "no .hwe file found in %s" % analysis_dir
hardy = pd.read_csv(hardy_files[0], sep="\t")

In [0]:
# Rename the HWE table's columns positionally so the first two are CHROM and POS.
hardy.columns = [
    'CHROM',
    'POS',
    'OBS(HOM1/HET/HOM2)',
    'E(HOM1/HET/HOM2)',
    'ChiSq_HWE',
    'P_HWE',
    'P_HET_DEFICIT',
    'P_HET_EXCESS',
]
# Label every row "CHROM-POS" so it can be aligned with the other per-locus tables.
hardy.index = ["%s-%d" % (chrom, pos) for chrom, pos in zip(hardy["CHROM"], hardy["POS"])]

In [0]:
# Per-site ("*.l*") vcftools outputs under analysis_dir, excluding log files;
# uses analysis_dir instead of a hard-coded machine-specific path.
loci_files = !ls {analysis_dir}/*.l* | grep -v log

In [0]:
# Allele-frequency ("*.frq*") outputs under analysis_dir, excluding the count file;
# uses analysis_dir instead of a hard-coded machine-specific path.
frq_files = !ls {analysis_dir}/*.frq* | grep -v count

In [0]:
# Read every per-site table and place them side by side.
# NOTE(review): this relies on all files listing the same sites in the same
# row order (rows are matched by position) -- confirm.
loci_df = pd.concat([pd.read_csv(x, sep="\t") for x in loci_files], axis=1)
# Keep the first two columns as the locus identifier. `.ix` was removed in
# pandas 1.0; `.iloc` is the explicit positional equivalent.
chrom_pos = loci_df.iloc[:, 0:2]

In [0]:
# Parse the frequency file by hand: one list of tab-separated fields per data line.
# The context manager makes sure the file handle is closed afterwards.
with open(frq_files[0]) as h:
    header = h.readline().strip().split()
    frq_data = [line.strip().split('\t') for line in h]

In [0]:
header = ['CHROM', 'POS', 'N_ALLELES', 'N_CHR', 'A1_FREQ', "A2_FREQ"]
frq_df = pd.DataFrame(frq_data)
# Keep only the first six fields (locus info plus the first two allele:frequency
# entries). Selecting by position works however many extra allele columns the
# file has, whereas drop([6, 7]) raised KeyError unless there were exactly eight.
frq_df = frq_df.iloc[:, :len(header)]
frq_df.columns = header
# Label every row "CHROM-POS"; both fields are strings here, so plain
# concatenation replaces the slow row-wise apply. `.values` keeps the index unnamed.
frq_df.index = (frq_df["CHROM"] + "-" + frq_df["POS"]).values

In [0]:
# Remove every locus-identifier column from the combined table (presumably one
# CHROM/CHR + POS pair per input file after the side-by-side concat).
# NOTE(review): not re-runnable -- drop() raises once the columns are gone.
loci_df = loci_df.drop(['CHROM','CHR','POS'], axis=1)

In [0]:
# Put a single identifier pair (saved earlier in chrom_pos) back in front.
loci_df = pd.concat([chrom_pos, loci_df], axis=1)

In [0]:
# Label every row "CHROM-POS", the same key used for the frequency and HWE tables.
loci_df.index = loci_df.apply(lambda x: "%s-%d" % (x.CHROM, x.POS), axis=1)

In [0]:
# Preview the combined per-site table.
loci_df.head()

In [0]:
# Summary statistics of the per-site summed depth.
loci_df.SUM_DEPTH.describe()

In [0]:
# Histogram of site quality; the trailing ';' suppresses the text repr.
loci_df.QUAL.plot(kind="hist");

In [0]:
# Preview the Hardy-Weinberg table.
hardy.head()

In [0]:
# Join site statistics, allele frequencies and HWE results column-wise; rows are
# matched on the "CHROM-POS" index labels all three tables share.
# NOTE(review): re-running this cell appends the same columns again.
loci_df = pd.concat([loci_df, frq_df, hardy], axis=1)

In [0]:
# Each *_FREQ value has the form "allele:frequency"; take the allele part.
# The vectorised .str accessor avoids a Python-level call per row and leaves
# missing values as NaN instead of raising on them.
loci_df["A1_allele"] = loci_df["A1_FREQ"].str.split(":").str[0]
loci_df["A2_allele"] = loci_df["A2_FREQ"].str.split(":").str[0]

In [0]:
# Each *_FREQ value has the form "allele:frequency"; take the frequency part as
# a float. The vectorised .str accessor avoids a Python-level call per row and
# leaves missing values as NaN instead of raising on them.
loci_df["A1_freq"] = loci_df["A1_FREQ"].str.split(":").str[1].astype(float)
loci_df["A2_freq"] = loci_df["A2_FREQ"].str.split(":").str[1].astype(float)

In [0]:
def get_MAF(row):
    """Return the minor allele frequency of one locus.

    Parameters
    ----------
    row : object with ``A1_freq`` and ``A2_freq`` attributes (e.g. a DataFrame row)

    Returns
    -------
    The smaller of the two frequencies. If a frequency is absent or cannot be
    compared, the row is printed for inspection and NaN is returned explicitly
    (previously a bare ``except`` hid every error and returned ``None``).
    """
    try:
        return np.min([row.A1_freq, row.A2_freq])
    except (AttributeError, TypeError, ValueError):
        print(row)
        return np.nan

In [0]:
# Minor allele frequency per locus (row-wise call to get_MAF).
loci_df['MAF'] = loci_df.apply(get_MAF, axis=1)

In [0]:
# Drop the CHROM/POS columns; the row labels already carry "CHROM-POS".
# NOTE(review): not re-runnable -- drop() raises once the columns are gone.
loci_df = loci_df.drop(['CHROM', 'POS'], axis=1)

In [0]:
# Show up to 100 columns when displaying wide frames.
pd.set_option('display.max_columns', 100)

In [0]:
def get_correction(n):
    """Return the finite-sample-size correction 2n / (2n - 1) for ``n`` individuals."""
    return (2*n)/(2*n-1)


def calculate_Fis(vals):
    """Compute Fis = 1 - Ho/He from observed genotype counts.

    Parameters
    ----------
    vals : str
        Observed counts formatted as "HOM1/HET/HOM2".

    Returns
    -------
    float or int
        Fis, with He = 2*p*q multiplied by ``get_correction``.
        ``-9`` is the sentinel for input that cannot be used: not a string,
        not exactly three numeric fields, no individuals, or allele
        frequencies that do not sum to 1.
        NaN when He is zero (only one allele observed), where Fis is undefined.
    """
    try:
        hom1, het_count, hom2 = (float(x) for x in vals.split("/"))
    except (AttributeError, ValueError):
        # Non-string input (e.g. NaN for a missing locus), wrong field count,
        # or a non-numeric field.
        return -9

    num_individuals = hom1 + het_count + hom2
    # Written as "not > 0" so that a NaN total is rejected as well.
    if not num_individuals > 0:
        return -9

    total_alleles = 2 * num_individuals
    a1_freq = (2 * hom1 + het_count) / total_alleles
    a2_freq = (2 * hom2 + het_count) / total_alleles
    # Sanity check with a tolerance: exact float equality can fail from rounding
    # alone, and an `assert` disappears under `python -O`.
    if not np.isclose(a1_freq + a2_freq, 1.0):
        return -9

    He = 2 * a1_freq * a2_freq * get_correction(num_individuals)
    if He == 0:
        return float("nan")
    Ho = het_count / num_individuals
    return 1 - (Ho / He)

# Fis per locus, computed from the observed genotype-count string.
loci_df['Fis'] = loci_df['OBS(HOM1/HET/HOM2)'].apply(calculate_Fis)

In [0]:
# Save the per-locus statistics as a tab-separated file.
# NOTE(review): index=False omits the "CHROM-POS" row labels, and the CHROM/POS
# columns were dropped earlier, so the file seems to carry no locus identifier
# -- confirm this is intended.
loci_df.to_csv(os.path.join(analysis_dir, "loci_stats.txt"),
              sep="\t",
              index=False)

In [0]:
# Number of loci where calculate_Fis returned its -9 sentinel.
len(loci_df[loci_df.Fis == -9])

In [0]:
# Number of loci with 10 <= QUAL < 20.
len(loci_df[loci_df.QUAL >= 10]) - len(loci_df[loci_df.QUAL >= 20])

In [0]:
# Number of loci with QUAL below 20.
len(loci_df[loci_df.QUAL < 20])

In [0]:
# Loci with Fis >= 0.5, with Fis <= -0.5, and with MAF < 0.01, respectively.
len(loci_df[loci_df.Fis >= 0.5]), len(loci_df[loci_df.Fis <= -0.5]), len(loci_df[loci_df.MAF < 0.01])

In [0]:
# Stage-1 locus filter: summed depth in [100, 1500), QUAL >= 20, MAF >= 0.01 and
# Fis strictly between -0.5 and 0.5.
# The lower Fis bound was written as -0.05, which matched neither the 0.5 upper
# bound nor the `Fis <= -0.5` count in the exploratory cell earlier in the
# notebook; it is treated here as a typo for -0.5.
# The -9 sentinel and NaN Fis values fail these comparisons and are excluded.
depth_ok = (loci_df.SUM_DEPTH >= 100) & (loci_df.SUM_DEPTH < 1500)
qual_ok = loci_df.QUAL >= 20
maf_ok = loci_df.MAF >= 0.01
fis_ok = (loci_df.Fis < 0.5) & (loci_df.Fis > -0.5)
loci_stage1 = loci_df[depth_ok & qual_ok & maf_ok & fis_ok]
loci_stage1.shape

In [0]:
# Write the retained loci as "CHROM<TAB>POS" lines for vcftools --positions.
with open(os.path.join(analysis_dir, "stage1_positions.txt"), "w") as o:
    for elem in loci_stage1.index:
        # Row labels are "CHROM-POS": split on the LAST hyphen only, so a
        # sequence name that itself contains '-' is not broken into extra fields.
        o.write("%s\n" % "\t".join(elem.rsplit("-", 1)))
    

In [0]:
# Extract the stage-1 loci (listed in stage1_positions.txt) from the filtered VCF
# into "<analysis_dir>/good_snps.recode.vcf", keeping INFO fields.
!$vcftools --gzvcf $vcf_filtered_gz \
--remove-indels  \
--remove-filtered-all \
--recode \
--recode-INFO-all \
--positions {os.path.join(analysis_dir, "stage1_positions.txt")} \
--out {os.path.join(analysis_dir, "good_snps")}

In [0]:
# Path of the recoded stage-1 VCF and of its compressed copy.
good_snps = os.path.join(analysis_dir, "good_snps.recode.vcf")
good_snps_gz = good_snps + ".gz"
# bgzip to a new .gz file (the uncompressed file is kept), then tabix-index it.
!$bgzip -c {good_snps} > {good_snps_gz}
!$tabix {good_snps_gz}

## Impute genotypes

```bash
$HOME/jdk1.7.0_25/bin/java -jar ~/g/src/BEAGLE4/beagle.r1399.jar \
gl=good_snps.recode.vcf.gz \
out=imputed40 \
nthreads=30 \
phase-its=20 \
burnin-its=20 \
impute-its=20
```