In [0]:
import sys

sys.path.append("../include_utils/")

import ipyparallel as ipp
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import cyvcf
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE
import seaborn as sns
from IPython.display import FileLink
import urllib2
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
import pysam
from collections import OrderedDict, namedtuple
import operator
import multiprocessing as mp
from hdfstorehelper import HDFStoreHelper
import dill

# Absolute locations of the external command-line tools used by this notebook.

# samtools 1.2 and the htslib utilities that ship inside its source tree
samtools = "/home/cfriedline/gpfs/src/samtools-1.2/samtools"
tabix = "/home/cfriedline/gpfs/src/samtools-1.2/htslib-1.2.1/tabix"
bgzip = "/home/cfriedline/gpfs/src/samtools-1.2/htslib-1.2.1/bgzip"

# bcftools 1.2 and related VCF utilities
bcftools = "/home/cfriedline/gpfs/src/bcftools-1.2/bcftools"
# Note: this command string starts with a bare "perl", not the `perl` path defined below.
vcfutils = "perl /home/cfriedline/g/src/bcftools-1.2/vcfutils.pl"
vcftools = "/home/cfriedline/bin/vcftools"

# picard jar and the java binary
picard = "/home/cfriedline/gpfs/src/broadinstitute-picard-03a1d72/dist/picard.jar"
java = "/home/cfriedline/g/src/jdk1.8.0_60/bin/java"

# perl interpreter
perl = "/home/cfriedline/gpfs/opt/ActivePerl-5.16/bin/perl"


def setup_r():
    """Set the environment variables that locate the R installation.

    Sets ``R_HOME`` and rewrites ``LD_LIBRARY_PATH`` to
    ``<R_HOME>/lib:<previous LD_LIBRARY_PATH>:/home/cfriedline/lib64``.
    When ``LD_LIBRARY_PATH`` is unset or empty, the middle entry is omitted.

    Called before ``rpy2`` is imported in this cell.
    """
    r_home = '/home/cfriedline/g/R3/lib64/R'
    os.environ['R_HOME'] = r_home

    # LD_LIBRARY_PATH may be absent from the environment; indexing os.environ
    # directly would raise KeyError in that case.
    previous = os.environ.get('LD_LIBRARY_PATH', '')

    search_path = ["%s/lib" % r_home]
    if previous:
        # Skip an empty value so no empty entry ("::") ends up in the path.
        search_path.append(previous)
    search_path.append("/home/cfriedline/lib64")

    os.environ['LD_LIBRARY_PATH'] = ":".join(search_path)

setup_r()
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
r = robjects.r

%reload_ext autoreload
%autoreload 2
%matplotlib inline
%reload_ext rpy2.ipython

In [0]:
# Input VCF paths for the two data sets handled side by side in this notebook:
# the not-imputed calls and the imputed calls (beagle40 directory).
notimputed_vcf_gz = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/notimputed/isect.vcf.gz.sorted.gz"
imputed_vcf_gz = '/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle40/isect.vcf.gz.sorted.gz'

In [0]:
# One "isect.hd5" path per VCF, located in the same directory as that VCF
# (order: not imputed, imputed).
hdf_files = [os.path.join(os.path.dirname(x), "isect.hd5") for x in [notimputed_vcf_gz, imputed_vcf_gz]]

In [0]:
# One HDFStoreHelper per HDF5 path, in the same order as hdf_files
# (index 0 = not imputed, index 1 = imputed).
hdfs = [HDFStoreHelper(x) for x in hdf_files]

In [0]:
hdfs[0].get_group_names()

In [0]:
# Recoding table applied to the genotype columns before export for hierfstat.
hierf_trans = {
    0: 11,
    1: 12,
    2: 22,
    -1: 'NA',
}


def apply_hierf_trans(series):
    """Return a list with every element of ``series`` recoded via ``hierf_trans``.

    Elements that are not keys of ``hierf_trans`` are passed through unchanged.
    """
    recode = hierf_trans.get
    return [recode(value, value) for value in series]

In [0]:
# Look up the 'z12_swapped' entry in each store (not imputed first, imputed second).
z12_swapped = [x['z12_swapped'] for x in hdfs]

In [0]:
# Recode every column except the last two of each z12_swapped table.
# The slice is positional, so use `.iloc`: the `.ix` indexer is deprecated
# (and removed in pandas 1.0), and it would silently switch to label-based
# slicing if the column labels were integers.
hierf_df = [x.iloc[:, :-2].apply(apply_hierf_trans) for x in z12_swapped]

In [0]:
hierf_df[0].shape

In [0]:
# Put the 'popid' column in front of each recoded genotype table.
# The tables are replaced in place, so guard against re-running this cell:
# a table whose first column is already 'popid' is left alone (joining it a
# second time would fail on the overlapping column name).
for i, df in enumerate(hierf_df):
    if list(df.columns[:1]) != ['popid']:
        hierf_df[i] = z12_swapped[i][['popid']].join(df)

In [0]:
# Write each hierfstat table as tab-separated text next to its HDF5 file
# (same file name, isect_hierfstat.txt, in each data set's own directory).
for i, elem in enumerate(hdf_files):
    filedir = os.path.dirname(elem)
    outfile = os.path.join(filedir, "isect_hierfstat.txt")
    # Header row is written; the DataFrame index is not.
    hierf_df[i].to_csv(outfile, header=True, index=False, sep="\t")

### Run hierfstat in a separate R session (the computation can be slow)

```R
# Reads isect_hierfstat.txt from the working directory and writes two .rds
# result files next to it.
library(hierfstat)

# Keep the variable name `data`: data.frame(data$popid) derives its column
# name ("data.popid") from this expression.
data <- read.table("isect_hierfstat.txt", header = TRUE, sep = "\t")
levels <- data.frame(data$popid)
loci <- data[, 2:ncol(data)]

# Basic statistics on the full table (popid in the first column).
bs <- basic.stats(data)
saveRDS(bs, "isect_hierfstat_basic_stats.rds")

# Variance components with popid as the only level.
res <- varcomp.glob(levels = levels, loci = loci, diploid = TRUE)
saveRDS(res, "isect_hierfstat_varcomp.rds")

```

In [0]:
%%R
# Load the saved hierfstat results for both data sets into the embedded R session
# (*_not = notimputed directory, *_imp = beagle40 directory).
# NOTE(review): the varcomp files are read via /gpfs_fs/home/eckertlab/... while the
# basic-stats files are read via /home/cfriedline/eckertlab/... - presumably two
# routes to the same directory; confirm.
varcomp_not = readRDS("/gpfs_fs/home/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/notimputed/isect_hierfstat_varcomp.rds")
bs_not = readRDS("/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/notimputed/isect_hierfstat_basic_stats.rds")
varcomp_imp = readRDS("/gpfs_fs/home/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle40/isect_hierfstat_varcomp.rds")
bs_imp = readRDS("/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle40/isect_hierfstat_basic_stats.rds")

In [0]:
def get_r_series(key):
    """Evaluate the R expression ``key`` and return it as a pandas Series.

    The Series index is replaced with the result of R's ``names(<key>)``.
    """
    names_expr = "names(%s)" % key
    series = pd.Series(get_r(key))
    series.index = get_r(names_expr)
    return series

def get_r_df(key):
    """Evaluate the R expression ``key`` and return it as a pandas DataFrame.

    Row and column labels are taken from R's ``rownames(<key>)`` and
    ``colnames(<key>)`` on a best-effort basis: if fetching or assigning
    either set of labels fails, the DataFrame keeps its existing labels for
    that axis.

    :param key: R expression, as a string, evaluated via ``get_r``.
    :return: DataFrame built from the evaluated expression.
    """
    df = pd.DataFrame(get_r(key))

    for r_function, axis_attr in (("rownames", "index"), ("colnames", "columns")):
        try:
            setattr(df, axis_attr, get_r("%s(%s)" % (r_function, key)))
        except Exception:
            # Labelling is deliberately best-effort, but only ordinary errors
            # are ignored: a bare ``except:`` would also swallow
            # KeyboardInterrupt and SystemExit.
            pass

    return df

def get_r(key):
    """Evaluate ``key`` with the notebook-level rpy2 ``r`` object and return the result."""
    value = r(key)
    return value

In [0]:
# Convert the components of the not-imputed basic.stats result (R object bs_not)
# to pandas objects; `overall` goes through get_r_series, the rest through get_r_df.
perloc_not = get_r_df("bs_not$perloc")
Ho_not = get_r_df("bs_not$Ho")
Hs_not = get_r_df("bs_not$Hs")
Fis_not = get_r_df("bs_not$Fis")
overall_not = get_r_series("bs_not$overall")
n_ind_samp_not = get_r_df("bs_not$n.ind.samp")

In [0]:
# Convert the components of the imputed basic.stats result (R object bs_imp)
# to pandas objects; `overall` goes through get_r_series, the rest through get_r_df.
perloc_imp = get_r_df("bs_imp$perloc")
Ho_imp = get_r_df("bs_imp$Ho")
Hs_imp = get_r_df("bs_imp$Hs")
Fis_imp = get_r_df("bs_imp$Fis")
overall_imp = get_r_series("bs_imp$overall")
n_ind_samp_imp = get_r_df("bs_imp$n.ind.samp")

In [0]:
# Save the not-imputed basic.stats tables into the first store (hdfs[0]).
# The store key is "n_ind_samp" although the R component is named n.ind.samp.
hdfs[0]['perloc'] = perloc_not
hdfs[0]['Ho'] = Ho_not
hdfs[0]["Hs"] = Hs_not
hdfs[0]["Fis"] = Fis_not
hdfs[0]["overall"] = overall_not
hdfs[0]["n_ind_samp"] = n_ind_samp_not

In [0]:
# Save the imputed basic.stats tables into the second store (hdfs[1]).
# The store key is "n_ind_samp" although the R component is named n.ind.samp.
hdfs[1]['perloc'] = perloc_imp
hdfs[1]['Ho'] = Ho_imp
hdfs[1]["Hs"] = Hs_imp
hdfs[1]["Fis"] = Fis_imp
hdfs[1]["overall"] = overall_imp
hdfs[1]["n_ind_samp"] = n_ind_samp_imp

In [0]:
# Components ($loc, $F, $overall) of the not-imputed varcomp.glob result as DataFrames.
loc_df_not = get_r_df('varcomp_not$loc')
F_df_not = get_r_df('varcomp_not$F')
overall_df_not = get_r_df('varcomp_not$overall')

In [0]:
# Components ($loc, $F, $overall) of the imputed varcomp.glob result as DataFrames.
loc_df_imp = get_r_df('varcomp_imp$loc')
F_df_imp = get_r_df('varcomp_imp$F')
overall_df_imp = get_r_df('varcomp_imp$overall')

In [0]:
loc_df_imp.head()

In [0]:
F_df_not

In [0]:
F_df_imp

In [0]:
# Save the varcomp $F table of each data set under the key 'F_df' in its own store.
hdfs[0]['F_df'] = F_df_not
hdfs[1]['F_df'] = F_df_imp

In [0]:
def compute_fst(series):
    """Return the first element of ``series`` divided by the sum of all its elements.

    In this notebook it is applied row by row to the ``$loc`` table of the
    varcomp result.
    NOTE(review): presumably each row holds one locus's variance components
    with the among-population component first - confirm against hierfstat.

    The first element is taken by position. ``series[0]`` on a pandas Series
    is a label lookup, which fails for an integer index without a ``0`` label
    and relies on a deprecated positional fallback for string labels.

    :param series: pandas Series (one row of the table) or a plain sequence.
    :return: first element / sum of elements.
    """
    if hasattr(series, "iloc"):
        first_component = series.iloc[0]
    else:
        # Plain lists/tuples keep working exactly as before.
        first_component = series[0]
    total = sum(series)
    return first_component / total

In [0]:
# Apply compute_fst to every row (axis=1) of each $loc table: one value per row.
loci_fst_not = loc_df_not.apply(compute_fst, axis=1)
loci_fst_imp = loc_df_imp.apply(compute_fst, axis=1)

In [0]:
loci_fst_not.describe()

In [0]:
loci_fst_imp.describe()

In [0]:
# Save the per-row compute_fst results under the key 'loci_fst' in each store.
hdfs[0]['loci_fst'] = loci_fst_not
hdfs[1]['loci_fst'] = loci_fst_imp

In [0]:
def _plot_fst_hist(values, label):
    """Show a 50-bin histogram of ``values`` with summary statistics in the title.

    The title reads ``"<label> n=<count> mean=<mean> +/- <std> [<min>, <max>]"``,
    with the statistics computed by ``len`` and numpy's ``mean``/``std``/``min``/``max``.

    :param values: the values to plot (here the loci_fst_* results).
    :param label: text placed at the start of the title.
    """
    plt.hist(values, bins=50)
    plt.title("%s n=%d mean=%.4f +/- %.4f [%.4f, %.4f]" % (label,
                                                           len(values),
                                                           np.mean(values),
                                                           np.std(values),
                                                           np.min(values),
                                                           np.max(values)))
    # Axis labels so each figure can be read on its own.
    plt.xlabel("per-locus Fst")
    plt.ylabel("number of loci")
    plt.show()


# Same figure for both data sets; only the data and the title prefix differ.
_plot_fst_hist(loci_fst_not, "not imputed")
_plot_fst_hist(loci_fst_imp, "imputed")