In [0]:
import vcf, os, sys
from IPython.parallel import Client
import time
import vcf
import pandas as pd
from collections import defaultdict
import matplotlib.pyplot as plt
import numpy as np
import scipy as sp
from IPython.display import FileLink

In [0]:
# Connect to the IPython.parallel cluster running under the "huge" profile.
rc = Client(profile="huge")
# Direct view over every engine the client knows about.
dview = rc[:]
# Load-balanced view; used further down to submit jobs with apply_async.
lview = rc.load_balanced_view()
# Cell output: len(rc), i.e. how many engines the client reports.
len(rc)

In [0]:
def get_idle_engines(rc, wait=10):
    """Return views restricted to the engines that have nothing queued.

    Parameters
    ----------
    rc : IPython.parallel.Client
        Connected client. It must provide ``queue_status()``, indexing by a
        list of engine ids and ``load_balanced_view(targets=...)``.
    wait : int or float, optional
        Seconds to pause before the queue status is sampled. Defaults to 10,
        the delay this function has always used; pass 0 to skip the pause.

    Returns
    -------
    tuple
        ``(direct_view, load_balanced_view)``, both limited to the engine ids
        whose ``'queue'`` count is zero.
    """
    # Pause first and sample afterwards, so the decision is based on the
    # current queue state rather than on a snapshot that is `wait` seconds old.
    if wait:
        time.sleep(wait)
    qs = rc.queue_status()
    # Besides integer engine ids the status dict carries a non-engine summary
    # entry ('unassigned'). Select engine ids by type instead of relying on
    # mixed int/str sort order, which only works on Python 2.
    engine_ids = sorted(eid for eid in qs if isinstance(eid, int))
    active = [eid for eid in engine_ids if not qs[eid]['queue']]
    return rc[active], rc.load_balanced_view(targets=active)

In [0]:
# Replace the all-engine views created earlier with views over idle engines only.
dview, lview = get_idle_engines(rc)
# Cell output: how many engines the direct view targets.
len(dview)

In [0]:
# Run these imports inside dview.sync_imports() so they are also performed on
# the engines in dview. Note pandas is imported under its full name here,
# not as "pd".
with dview.sync_imports():
    import vcf
    import os
    import sys
    import pandas
    from collections import defaultdict

In [0]:
# Directory holding the input VCFs; the shell glob below lists every *.vcf in it.
# NOTE(review): extract_snps_from_vcf writes its "*_snps.vcf" output next to
# its input, so after a first run this glob may also pick up derived files.
# Confirm the listing before re-running.
vcf_dir = "/data7/eckertlab/projects/ethan/analysis/samtools1.1/"
vcfs = !ls {vcf_dir}/*.vcf
vcfs = sorted(vcfs)
vcfs

In [0]:
# Absolute paths of the command-line tools used by later cells.
vcftools = "/home/cfriedline/data7/src/vcftools_0.1.12b/bin/vcftools"
bcftools = "/home/cfriedline/data7/src/bcftools/bcftools"

In [0]:
# Keep only the last file of the sorted listing (still a one-element list).
vcfs = vcfs[-1:]

In [0]:
vcfs

## Extract SNPs from VCFs

In [0]:
def extract_snps_from_vcf(v):
    """Copy the SNP records of VCF file ``v`` into a sibling ``*_snps.vcf`` file.

    The body is deliberately self-contained (no module-level helpers) because
    the function is pushed to the parallel engines and executed there; it only
    relies on the ``vcf`` module being importable where it runs.

    Parameters
    ----------
    v : str
        Path of the input VCF. It must contain ``.vcf``.

    Returns
    -------
    str
        Path of the SNP-only VCF that was written.

    Raises
    ------
    ValueError
        If ``v`` contains no ``.vcf``. Without this guard the output path
        would equal the input path and opening it for writing would wipe the
        input file.
    """
    # Rewrite only the LAST ".vcf" so that a directory component that happens
    # to contain ".vcf" is left untouched.
    head, ext, tail = v.rpartition(".vcf")
    if not ext:
        raise ValueError("expected a path containing '.vcf', got %r" % (v,))
    out = head + "_snps.vcf" + tail
    # Open the input before creating the output, so a missing or unreadable
    # input does not leave an empty output file behind.
    reader = vcf.Reader(filename=v)
    with open(out, "w") as o:
        # The reader is handed to the writer as its template.
        writer = vcf.Writer(o, reader)
        for rec in reader:
            if rec.is_snp:
                writer.write_record(rec)
    return out
# Assign the function into dview under the same name so the engines have it.
dview['extract_snps_from_vcf'] = extract_snps_from_vcf

In [0]:
vcfs

In [0]:
# Local, serial pass over the first input VCF: tally SNP vs. non-SNP records
# and keep each record's ALT field in the matching list for inspection.
snp_count = 0
other_count = 0
snp_alts = []
other_alts = []
reader = vcf.VCFReader(filename=vcfs[0])
for i, rec in enumerate(reader):
    if rec.is_snp:
        snp_count += 1
        snp_alts.append(rec.ALT)
    else:
        other_count += 1
        other_alts.append(rec.ALT)
    # Progress line every 100 records (Python 2 print statement).
    if i % 100 == 0:
        print "at %d" % i
print snp_count, other_count

In [0]:
snps = []
for v in vcfs:
    snps.append(lview.apply_async(extract_snps_from_vcf, v))

In [0]:
[x.ready() for x in snps]

In [0]:
snps = [x.get() for x in snps]

In [0]:
snps

## SNPs on the linkage map

In [0]:
# Read the linkage map spreadsheet into a DataFrame.
linkage_map_file = "/data7/eckertlab/projects/ethan/analysis/final_maps_cleaned_10_28_2014.xlsx"
linkage_map = pd.read_excel(linkage_map_file)

In [0]:
# Add an integer "position" column: the second "-"-separated field of each
# row's TGG_nomenclature value.
# Assumes every value has the form "<prefix>-<integer>[-...]" — TODO confirm;
# a value without "-" or with a non-numeric second field raises here.
linkage_map["position"] = linkage_map.apply(lambda row: int(row.TGG_nomenclature.split("-")[1]), axis=1)

In [0]:
# Cell output: first five rows.
linkage_map[0:5]

In [0]:
def create_map_index(linkage_map):
    """Group the map positions by RAD tag.

    Parameters
    ----------
    linkage_map : pandas.DataFrame
        Frame with ``rad_tag_id`` and ``position`` columns.

    Returns
    -------
    collections.defaultdict
        Maps each ``rad_tag_id`` to the set of ``position`` values found on
        its rows.
    """
    positions_by_tag = defaultdict(set)
    for _, marker in linkage_map.iterrows():
        tag, site = marker.rad_tag_id, marker.position
        positions_by_tag[tag].add(site)
    return positions_by_tag

In [0]:
# Module-level lookup (rad_tag_id -> set of positions); extract_snps_on_map
# reads it as a global.
map_index = create_map_index(linkage_map)

In [0]:
def extract_snps_on_map(v):
    """Write the records of VCF ``v`` whose site is on the linkage map.

    A record is kept when its ``CHROM`` is a key of the module-level
    ``map_index`` and its ``POS`` is in the set stored under that key.

    Parameters
    ----------
    v : str
        Path of the input VCF. It must contain ``.vcf``.

    Returns
    -------
    str
        Path of the ``*_on_map.vcf`` file that was written.

    Raises
    ------
    ValueError
        If ``v`` contains no ``.vcf``. Without this guard the output path
        would equal the input path and opening it for writing would wipe the
        input file.
    """
    # Rewrite only the LAST ".vcf" so that a directory component that happens
    # to contain ".vcf" is left untouched.
    head, ext, tail = v.rpartition(".vcf")
    if not ext:
        raise ValueError("expected a path containing '.vcf', got %r" % (v,))
    out = head + "_on_map.vcf" + tail
    # Open the input in the same `with` as the output so that its handle is
    # closed on exit; opening the input first also means a missing input does
    # not leave an empty output file behind.
    with open(v) as src, open(out, "w") as o:
        reader = vcf.Reader(src)
        # The reader is handed to the writer as its template.
        writer = vcf.Writer(o, reader)
        for rec in reader:
            # .get() looks the contig up without inserting an empty entry,
            # whatever mapping type map_index is.
            if rec.POS in map_index.get(rec.CHROM, ()):
                writer.write_record(rec)
    return out

In [0]:
# NOTE(review): mapped_snps is created here but never filled in this cell.
mapped_snps = defaultdict(list)
# Filter each SNP-only VCF down to linkage-map sites by calling the function
# directly in the notebook kernel; the returned output path is discarded.
for v in snps:
    print v
    extract_snps_on_map(v)

In [0]:
on_map = !ls /data7/eckertlab/projects/ethan/analysis/samtools1.1/*on_map.vcf.gz
on_map

In [0]:
for v in on_map:
    !/home/cfriedline/data7/src/htslib/bgzip $v
    !tabix {v}.gz

In [0]:
snp_isect = !$bcftools isec -n +1 {" ".join(on_map)}

In [0]:
snp_isect = [x.split("\t") for x in snp_isect[1:]]
for x in snp_isect:
    x.extend((list(x[-1])))

In [0]:
columns=["contig",
         "pos",
         "ref",
         "alt",
         "isec"]
columns.extend([os.path.basename(x) for x in on_map])
snp_isect_df = pd.DataFrame(snp_isect, columns=columns)

In [0]:
len(snp_isect_df[snp_isect_df.isec=="1111"])

In [0]:
# One QUAL histogram per on-map VCF; the title carries the file name, mean,
# standard deviation, [min, max] and the number of records.
for v in on_map:
    quals = []
    # Format the list entry into a plain string path.
    v = "%s" % v
    reader = vcf.Reader(filename=v)
    for rec in reader:
        quals.append(rec.QUAL)
    # NOTE(review): if any record has no QUAL value the numpy summaries below
    # may fail — confirm the inputs always carry QUAL.
    plt.hist(quals)
    # The %d conversions truncate min and max to integers in the title.
    title = "%s %.2f %.2f [%d, %d] (n=%d)" % (os.path.basename(v),
                                       np.mean(quals),
                                       np.std(quals),
                                       np.min(quals),
                                       np.max(quals),
                                       len(quals))
    plt.title(title)
    plt.show()

In [0]:
# Peek at the INFO field and the per-sample calls of the first record of the
# first on-map VCF (both loops stop after one iteration).
for v in on_map:
    # on_map already lists the compressed *.vcf.gz files, so the path is used
    # as-is, as in the histogram and vcftools cells, rather than with another
    # ".gz" appended.
    v = "%s" % v
    reader = vcf.Reader(filename=v)
    for rec in reader:
        print(rec.INFO)
        for sample in rec.samples:
            print(sample)
        break
    break

In [0]:
for v in on_map:
    v = "%s" % v
    vcf_string = "--gzvcf %s --012 --minQ %d --out %s" % (v, 20, v)
    !/home/cfriedline/data7/src/vcftools_0.1.12b/bin/vcftools $vcf_string

In [0]:
!/home/cfriedline/data7/src/vcftools_0.1.12b/bin/vcftools --help

In [0]:
z12_files = ["%s.012" % x for x in on_map]

In [0]:
z12_files

In [0]:
# Load the 012 matrix of the LAST file in z12_files only (tab separated, no
# header row).
df = pd.read_csv(z12_files[-1], sep="\t", header=None)
# Companion ".pos" file: read as two columns, contig and pos.
pos = pd.read_csv("%s.pos" % z12_files[-1], header=None, sep="\t", names=["contig","pos"])
# Site label "<contig>-<pos>"; becomes the column names of df below.
pos['name'] = pos.apply(lambda row: "%s-%d" % (row.contig, row.pos), axis=1)
# Companion ".indv" file: its first column is used as the index.
indv = pd.read_csv("%s.indv" % z12_files[-1], header=None, sep="\t", index_col=0)
df.index = indv.index
# Drop the matrix's first column — presumably the row counter vcftools
# writes ahead of the genotypes; TODO confirm.
df = df.drop(0, axis=1)
df.index.name = "sample"
# Assumes the remaining columns line up one-to-one with the rows of pos.
df.columns = pos.name
df[0:5]

In [0]:
# Cell output: number of site columns.
len(df.columns)

In [0]:
# Save the labelled matrix as CSV in the notebook's working directory.
df.to_csv("samtools_1.1_results.txt", index=True)

In [0]:
FileLink("samtools_1.1_results.txt")

In [0]:
def calc_missing_perc(col):
    """Return the percentage of entries in ``col`` that equal -1.

    Parameters
    ----------
    col : sequence
        Any sized iterable of values (a list or a pandas Series, for
        example); -1 is the value counted as missing.

    Returns
    -------
    float
        ``100 * (number of -1 entries) / len(col)``, or ``0.0`` for an empty
        ``col``.
    """
    total = len(col)
    if total == 0:
        # Nothing to count; avoids a ZeroDivisionError.
        return 0.0
    missing = sum(1 for val in col if val == -1)
    # 100.0 forces true division, so the percentage is not truncated to an
    # integer on Python 2.
    return missing * 100.0 / total

In [0]:
# Percentage of -1 entries per column of df (one value per site).
missing_percs = df.apply(calc_missing_perc)

In [0]:
# For each threshold 0, 10, ..., 100: how many sites have at least that
# percentage of missing entries (Python 2: xrange, print statement).
for i in xrange(0, 101, 10):
    print "missing >= %d%% = %d snps" % (i, len(missing_percs[missing_percs>=i]))

In [0]:
# Cell output: number of rows in the linkage map, for comparison.
len(linkage_map)

In [0]:
# Cell output: the per-site missing percentages.
missing_percs