In [0]:
from IPython.parallel import Client
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE
%matplotlib inline
import seaborn as sns

In [0]:
%reload_ext autoreload
%autoreload 2

In [0]:
R_HOME = '/home/cfriedline/R3/lib64/R'
os.environ['LD_LIBRARY_PATH'] = "%s/lib:%s" % (os.environ['R_HOME'], os.environ['LD_LIBRARY_PATH'])
%reload_ext rpy2.ipython

In [0]:
import rpy2.robjects as robjects
import pandas.rpy.common as com
r = robjects.r

In [0]:
rc = u.get_client("sge")

In [0]:
dview, lview = u.get_views(rc)

In [0]:
dview, lview = u.get_idle_engines(rc)

In [0]:
len(dview)

In [0]:
with dview.sync_imports():
    import os
    import sys
    import socket
    import stopwatch
    from subprocess import Popen, PIPE
    import tempfile
    import shutil

In [0]:
hlview = u.get_single_host_lview(rc, "all")
len(hlview)

In [0]:
def shell(cmd):
    p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    stdout, stderr = p.communicate()
    return stdout.split("\n"), stderr.split("\n")

In [0]:
bam_dir = "/data7/eckertlab/gypsy_indiv/analysis2"
analysis_dir = os.path.join(bam_dir, "samtools1.2")
if not os.path.exists(analysis_dir):
    os.makedirs(analysis_dir)
assert os.path.exists(analysis_dir)

In [0]:
bam_files,err = shell("find /data7/eckertlab/gypsy_indiv | grep new | grep 'rg.bam$'")
bam_files = [os.path.abspath(x) for x in bam_files if '.bam' in x]
len(bam_files)

In [0]:
samtools = "/home/cfriedline/data7/src/samtools-1.2/samtools"
bcftools = "/home/cfriedline/data7/src/bcftools-1.2/bcftools"
picard = "/home/cfriedline/data7/src/broadinstitute-picard-03a1d72/dist/picard.jar"
java = "/home/cfriedline/jdk1.7.0_25/bin/java"
perl = "/home/cfriedline/data7/opt/ActivePerl-5.16/bin/perl"

In [0]:
def mark_duplicates(args):
    java, picard, bam_file, analysis_dir = args
    out_bam = os.path.join("%s_dedup.bam" % os.path.basename(bam_file))
    out_bam = os.path.join(analysis_dir, out_bam)
    t = tempfile.NamedTemporaryFile(delete=False)
    cmd = "%s -jar %s MarkDuplicates \
    INPUT=%s OUTPUT=%s METRICS_FILE=%s.metrics" %     (java,
                              picard,
                              bam_file,
                              t.name,
                              out_bam)
    print cmd
    !$cmd
    shutil.move(t.name, out_bam)
    return cmd, out_bam
dview['mark_duplicates'] = mark_duplicates

In [0]:
rmdup_jobs = []
for b in bam_files:
    rmdup_jobs.append(hlview.apply_async(mark_duplicates, (java, picard, b, analysis_dir)))

In [0]:
u.get_async_progress(rmdup_jobs)

In [0]:
assembly = "/home/cfriedline/data7/assemblies/gypsy/masurca_new/CA/10-gapclose/genome.ctg.fasta"

In [0]:
bam_rmdup_files = sorted([x.r[1] for x in rmdup_jobs])
bam_rmdup_files[0:5]

In [0]:
for b in bam_rmdup_files:
    rg = !$samtools view -H {b} | grep '^@RG'
    print "\t".join(rg)
    break

In [0]:
def create_ploidy_file(args):
    bam_files, analysis_dir = args
    ploidy_file = os.path.join(analysis_dir, "%s.ploidy" % "all")
    with open(ploidy_file, "w") as o:
        for b in bam_files:
            name = "%s" % os.path.basename(b).split(".fastq")[0]
            ploidy = 2
            o.write("%s\t%d\n" % (name, ploidy))
    return ploidy_file
dview['create_ploidy_file'] = create_ploidy_file

In [0]:
def call_snps(args):
    print socket.gethostname()
    timer = stopwatch.Timer()
    samtools, reference, bam_sorted, bcftools, raw_vcf, out_dir = args 
    if not out_dir:
        out_dir = os.environ['TMPDIR']
    raw_vcf = os.path.join(out_dir, raw_vcf)
    ploidy_file = create_ploidy_file((bam_sorted, out_dir))
    pileup = "%s mpileup -ugf %s %s | %s call -S %s -vmO z -o %s" % (samtools, 
                                                                     reference, 
                                                                     ' '.join(bam_sorted), 
                                                                     bcftools, 
                                                                     ploidy_file, 
                                                                     raw_vcf) 
    
    print pileup
    !$pileup
    timer.stop()
    return pileup, timer.elapsed
dview['call_snps'] = call_snps

In [0]:
args = [samtools, 
        assembly, 
        bam_rmdup_files, 
        bcftools, 
        "samtools_1.2.vcf.gz", 
        analysis_dir]

In [0]:
samtools_job = lview.apply_async(call_snps, args)

In [0]:
print samtools_job.stdout

In [0]:
samtools_job.ready()

In [0]:
%connect_info

##some snp analysis

In [0]:
vcf_file = os.path.join(analysis_dir, "samtools_1.2.vcf.gz")

In [0]:
vcftools = "/home/cfriedline/data7/src/vcftools_0.1.12b/bin/vcftools"
bcftools = "/home/cfriedline/data7/src//bcftools-1.2/bcftools"
tabix = "/home/cfriedline/data7/src/samtools-1.2/htslib-1.2.1/tabix"
bgzip = "/home/cfriedline/data7/src/samtools-1.2/htslib-1.2.1//bgzip"

In [0]:
!$vcftools --remove-indels \
--min-alleles 2 \
--max-alleles 2 \
--mac 1 \
--remove-filtered-all \
--012 \
--gzvcf \
$vcf_file \
--out $vcf_file

## snp filtering

In [0]:
from hdfstorehelper import HDFStoreHelper

In [0]:
hdf = HDFStoreHelper("gypsy_samtools12.hd5")

In [0]:
z12_file = os.path.join(analysis_dir, "%s.012" % vcf_file)
z12_file

In [0]:
z12_data = []
for i, line in enumerate(open(z12_file)):
    line = line.strip()
    line = [int(x) for x in line.split("\t")]
    z12_data.append(np.array(line))
    if i % 10 == 0:
        print i
z12_data = np.array(z12_data)

In [0]:
z12_df = pd.DataFrame(z12_data)
z12_df = z12_df.drop(0, axis=1)
z12_df.columns = pd.Series(z12_df.columns)-1

In [0]:
z12_df[0:10]

In [0]:
hdf.put("z12_df", z12_df)

In [0]:
len(z12_df.columns)

In [0]:
def get_percent_missing(col):
    return len(col[col==-1])*1.0/len(col)   

In [0]:
percent_missing = z12_df.apply(get_percent_missing, axis=0)

In [0]:
percent_missing[0:5]

In [0]:
hdf.put('percent_missing', percent_missing)

In [0]:
percent_missing[percent_missing >= 0.5]

In [0]:
z12_df_50_perc = z12_df.ix[:,percent_missing <= 0.5]

In [0]:
z12_df_50_perc[0:5]

In [0]:
hdf.put('z12_df_50_perc', z12_df_50_perc)

In [0]:
def is_monomorphic(col):
    u = col[col != -1].value_counts()
    if len(u) == 1:
        return True
    return False

monomorphic_loci = z12_df_50_perc.apply(is_monomorphic, axis=0)
monomorphic_loci = monomorphic_loci[monomorphic_loci==True]

In [0]:
len(monomorphic_loci)

In [0]:
z12_df_50_perc_polymorphic = z12_df_50_perc.drop(monomorphic_loci.index, axis=1)

In [0]:
hdf.put('z12_df_50_perc_polymorphic', z12_df_50_perc_polymorphic)

In [0]:
z12_df_50_perc_polymorphic[0:5]

In [0]:
translation_df = pd.read_csv("translation_table.csv", sep="\t", index_col=0)
indv = os.path.join(analysis_dir, "%s.indv" % os.path.basename(z12_file))

In [0]:
indv

In [0]:
translation_df.ix["NC_131_ATAATCCA"]

In [0]:
def get_translated_name(n):
    row = translation_df.ix[n.strip()]
    return "%s_%d_%d" % (row['pop'], row.indiv, row.dup)

In [0]:
names = [get_translated_name(x) for x in open(indv).readlines()]

In [0]:
names[0:5]

In [0]:
z12_df_50_perc_polymorphic.index = names

In [0]:
z12_df_50_perc_polymorphic['population'] = z12_df_50_perc_polymorphic.apply(lambda row: row.name.split("_")[0], 
                                                                     axis=1)

In [0]:
z12_df_50_perc_polymorphic['duplicate'] = z12_df_50_perc_polymorphic.apply(lambda row: row.name[-1],
                                                                           axis=1)

In [0]:
z12_df_50_perc_polymorphic[0:5]

In [0]:
hdf.put('z12_df_50_perc_polymorphic', z12_df_50_perc_polymorphic)

In [0]:
def get_correction(n):
    #for finite sample size
    return (2*n)/(2*n-1)

def get_allele_freqs(locus):
    c = locus[locus != -1].value_counts()
    total_alleles = 2.0*sum(c)
    num_individuals = sum(c)
    P = 0
    Q = 0
    PQ = 0
    if 0 in c:
        P = 2*c[0]
    if 2 in c:
        Q = 2*c[2]
    if 1 in c:
        PQ = c[1]
    P += PQ
    Q += PQ
    p = P/total_alleles
    q = Q/total_alleles
    assert p + q == 1.0
    He = 2 * p * q * get_correction(num_individuals)
    Ho = PQ*1.0/num_individuals
    Fis = 1 - (Ho/He)
    #print p, q, He, Ho, Fis
    return pd.Series({"p":p, 
                      "q":q,
                      "P":P,
                      "Q":Q,
                      "He":He,
                      "Ho":Ho, 
                      "Fis":Fis})

In [0]:
allele_freqs = z12_df_50_perc_polymorphic.ix[:,:-2].apply(get_allele_freqs)
mafs = allele_freqs.apply(lambda x: min(x["p"], x["q"]))
mafs[mafs<0.01]

In [0]:
hdf.put('allele_freqs', allele_freqs)

In [0]:
z12_df_50_perc_polymorphic_maf = z12_df_50_perc_polymorphic.drop(mafs[mafs<0.01].index, axis=1)

In [0]:
z12_df_50_perc_polymorphic_maf[0:5]

In [0]:
hdf.put('z12_df_50_perc_polymorphic_maf', z12_df_50_perc_polymorphic_maf)

In [0]:
global_fis = allele_freqs[z12_df_50_perc_polymorphic_maf.columns[:-2]].apply(lambda x: x["Fis"])
fis_outliers = global_fis[(global_fis < -0.5) | (global_fis > 0.5)]
z12_df_50_perc_polymorphic_maf_fis = z12_df_50_perc_polymorphic_maf.drop(fis_outliers.index, axis=1)

In [0]:
z12_df_50_perc_polymorphic_maf_fis[0:5]

In [0]:
hdf.put('z12_df_50_perc_polymorphic_maf_fis', z12_df_50_perc_polymorphic_maf_fis)

In [0]:
duplicates = z12_df_50_perc_polymorphic_maf_fis[z12_df_50_perc_polymorphic_maf_fis.duplicate=="1"]

In [0]:
duplicates

In [0]:
z12_df_50_perc_polymorphic_maf_fis_dedup = z12_df_50_perc_polymorphic_maf_fis.drop(duplicates.index)

In [0]:
z12_df_50_perc_polymorphic_maf_fis_dedup[0:5]

In [0]:
hdf.put('z12_df_50_perc_polymorphic_maf_fis_dedup', z12_df_50_perc_polymorphic_maf_fis_dedup)

In [0]:
working_df = z12_df_50_perc_polymorphic_maf_fis_dedup

In [0]:
working_df.to_csv(os.path.join(analysis_dir,
                               "z12_df_50_perc_polymorphic_maf_fis_dedup.txt"),
                                          header=True,
                                          index=True,
                                          sep="\t")

##Start here for now

In [0]:
working_df = pd.read_csv(os.path.join(analysis_dir, 
                                      "z12_df_50_perc_polymorphic_maf_fis_dedup.txt"),
                                      sep="\t",
                                      index_col=0)
cols = [int(x) for x in working_df.columns.tolist()[:-2]]
cols.extend(working_df.columns.tolist()[-2:])
working_df.columns = cols

In [0]:
working_df = hdf.get('z12_df_50_perc_polymorphic_maf_fis_dedup')

In [0]:
hierf_trans = {0:11, 1:12, 2:22, -1:'NA'}

In [0]:
def apply_hierf_trans(series):
    return [hierf_trans[x] if x in hierf_trans else x for x in series]

In [0]:
hierf_trans_df = working_df.apply(apply_hierf_trans)

In [0]:
hdf.put('hierf_trans_df', hierf_trans_df)

In [0]:
import urllib2
url = 'https://docs.google.com/uc?export=download&id=0B4xHxBFoPCoWT0NneHJadUI0OHM'
response = urllib2.urlopen(url)
pheno = pd.read_excel(response, "Males-forGenomics-final")
pheno=pheno[['Population', 'Number', 'Mass', 'Pupual Duration', 'Total Dev Time']]
for x in pheno.index:
    pheno.ix[x, 'sample_pheno'] = "%s_%d" % (pheno.ix[x, 'Population'], pheno.ix[x, 'Number'])

In [0]:
pop_id = {}
i = 0
for p in sorted(working_df['population'].unique()):
    pop_id[p] = i
    i+=1
pop_id

In [0]:
def assign_popid(series):
    series['popid'] = pop_id[series['population']]
    return series

In [0]:
hierf_trans_df = hierf_trans_df.apply(assign_popid, axis=1)

In [0]:
hierf_trans_df.columns = ["L%d" % x if isinstance(x, numbers.Number) else x for x in hierf_trans_df.columns]

In [0]:
hierf_trans_df[0:5]

In [0]:
hierf_trans_df.to_csv("hierf_trans_df.txt", header=True, index=True, sep="\t")

In [0]:
hierf_trans_df = pd.read_csv("hierf_trans_df.txt", header=0, index_col=0, sep="\t")

In [0]:
hdf.put('hierf_trans_df', hierf_trans_df)

In [0]:
hierf_trans_df = hdf.get('hierf_trans_df')

##Make sure we're only working with SNPs (not DNPs, MNPs, etc)

In [0]:
hierf_trans_df.columns[-3:]

In [0]:
snp_pos = pd.read_csv(os.path.join(analysis_dir, "samtools_1.2.vcf.gz.012.pos"), 
                              sep="\t",
                              header=None,
                              names=['contig', 'pos'])
snps = pd.DataFrame([int(x[1:]) for x in hierf_trans_df.columns[:-3]])
snps.columns = ['snp_id']
snps.index=snps.snp_id
snp_id_pos = pd.merge(snps, snp_pos, how="inner", left_index=True, right_index=True)
snp_id_pos.index = ["L%d" % x for x in snp_id_pos.index]

In [0]:
snp_id_pos[:5]

In [0]:
hdf.put('snp_pos', snp_pos)
hdf.put('snp_id_pos', snp_id_pos)

In [0]:
# remove malformed header in vcf
# ##INFO=<ID=VDB,Number=1,Type=Float,Description="Variant Distance Bias for filtering splice-site artefacts in RNA-seq data (bigger is better)",Version="3">
# gunzip samtools_1.2.vcf.gz
# sed '/ID=VDB/d' samtools_1.2.vcf > samtools_1.2.vcf_novdb.vcf
# bgzip samtools_1.2.vcf_novdb.vcf
# tabix samtools_1.2.vcf_novdb.vcf.gz
reader = vcf.VCFReader(filename=os.path.join(analysis_dir, "samtools_1.2.vcf_novdb.vcf.gz"))
def get_ref_alt_alleles(row):
    alleles = reader.fetch(row.contig, row.pos)
    row['ref'] = alleles.REF
    row['alt'] = alleles.ALT
    return row
snp_id_pos_ref_alt = snp_id_pos.apply(get_ref_alt_alleles, axis=1)
snp_id_pos_ref_alt['ref_len'] = snp_id_pos_ref_alt.apply(lambda x: len(x.ref), axis=1)
snp_id_pos_ref_alt_snps_only = snp_id_pos_ref_alt[snp_id_pos_ref_alt.ref_len==1]

In [0]:
hdf.put('snp_id_pos_ref_alt', snp_id_pos_ref_alt)
hdf.put('snp_id_pos_ref_alt_snps_only', snp_id_pos_ref_alt_snps_only)

In [0]:
snp_id_pos_ref_alt.to_csv("snp_id_pos_ref_alt.csv", 
                          header=True, 
                          index=True, 
                          sep="\t")
snp_id_pos_ref_alt_snps_only.to_csv("snp_id_pos_ref_alt_snps_only.csv",
                                    header=True, 
                                    index=True,
                                    sep="\t")

In [0]:
snp_id_pos_ref_alt_snps_only = pd.read_csv("snp_id_pos_ref_alt_snps_only.csv", 
                                           header=0,
                                           index_col=0,
                                           sep="\t")

In [0]:
snp_id_pos_ref_alt_snps_only = hdf.get('snp_id_pos_ref_alt_snps_only')

In [0]:
snp_id_pos_ref_alt_snps_only[0:5]

In [0]:
cols = ['popid']
# cols.extend(hierf_trans_df.columns[:-3])
cols.extend(snp_id_pos_ref_alt_snps_only.index)

In [0]:
hierf_trans_df2 = hierf_trans_df[cols]

In [0]:
hierf_trans_df2[0:5]

In [0]:
hdf.put('hierf_trans_df2', hierf_trans_df2)

In [0]:
hierf_trans_df2.popid.unique()

In [0]:
hierf_trans_df2.to_csv("hierfstat_samtools.txt", header=True, index=False, sep="\t")

##Put into R (because it can be slow)

    library(hierfstat)
    data = read.table("hierfstat_samtools.txt", header=T, sep="\t")
    levels = data.frame(data$popid)
    loci = data[,2:ncol(data)]
    res = varcomp.glob(levels=levels, loci=loci, diploid=T)
    saveRDS(res, "hierfstat_samtools_new.rds")
    

In [0]:
%%R
res = readRDS("hierfstat_samtools_new.rds")

In [0]:
res = com.convert_robj(robjects.r('res'))

In [0]:
loc_df = res['loc']
F_df = res['F']
overall_df = res['overall']

In [0]:
F_df

In [0]:
def compute_fst(series):
    Hs = series[0]
    Ht = sum(series)
    return Hs/Ht

In [0]:
loci_fst = loc_df.apply(compute_fst, axis=1)

In [0]:
pop_id

In [0]:
plt.hist(loci_fst, bins=20)
plt.title("n=%d mean=%.2f +/- %.2f [%.2f, %.2f]" % (len(loci_fst), 
                                                    np.mean(loci_fst), 
                                                    np.std(loci_fst),
                                                    np.min(loci_fst), 
                                                    np.max(loci_fst)))
plt.show()

In [0]:
working_df = hdf.get('z12_df_50_perc_polymorphic_maf_fis_dedup')

In [0]:
working_df[0:5]

In [0]:
working_df.columns = ["L%d" % x if isinstance(x, numbers.Number) else x for x in working_df.columns]

In [0]:
allele_freqs = working_df.ix[:,:-2].apply(get_allele_freqs)

In [0]:
allele_freqs.to_csv("allele_freqs.txt", header=True,index=True, sep="\t")

In [0]:
allele_freqs

In [0]:
def swap_alleles(locus):
    if locus.name.startswith("L"):
        #locus_id = int(locus.name[1:]) #drop the L and convert
        locus_id = locus.name
        freqs = allele_freqs[locus_id]
        maf = min(freqs["p"], freqs["q"])
        if maf == freqs["p"]:
            return locus.replace({0:2,2:0})
        return locus
    else:
        return locus

In [0]:
working_df[:5]

In [0]:
pca_maf = working_df.apply(swap_alleles)

In [0]:
pca_maf[:5]

In [0]:
def center_and_standardize_value(val, u, var):
    if val == -1:
        return 0.0
    return (val-u)/np.sqrt(var)

In [0]:
def center_and_standardize(locus):
    if locus.name.startswith("L"):
        #locus_id = int(locus.name[1:])
        locus_id = locus.name
        freqs = allele_freqs[locus_id]
        maf = min(freqs["p"], freqs["q"])
        var = np.sqrt(maf*(1-maf))
        u = np.mean(locus)
        return locus.apply(center_and_standardize_value, args=(u, var))
    return locus

In [0]:
for col in pca_maf.ix[:,0:30]:
    print pca_maf[col].unique()

In [0]:
pca_std = pca_maf.apply(center_and_standardize)

In [0]:
pca_std.ix[0:5,-5:]

In [0]:
pca_std_data = pca_std.ix[:,:-2]

In [0]:
pca_std_data_snps = pca_std_data.ix[:,snp_id_pos_ref_alt_snps_only.index]

In [0]:
prcomp = r('prcomp')
summary = r('summary')

In [0]:
pca_std_data_snps[:5]

In [0]:
pca_std_data_snps.to_csv("pca_std_data_snps.txt", header=True, index=True, sep="\t")

In [0]:
prcomp_res = prcomp(pca_std_data_snps, scale=False, center=False)

In [0]:
print summary(prcomp_res)

In [0]:
x = com.convert_robj(prcomp_res.rx2("x"))
x.index = pca_std_data.index
joined = x.join(pca_maf)

In [0]:
hdf.put('joined', joined)

In [0]:
norm = mcolors.Normalize(min(pop_id.values()), max(pop_id.values()))

In [0]:
legend = {}
for row in joined.iterrows():
    pop = row[1]['population']
    n = norm(pop_id[pop])
    color = cm.rainbow(n)
    legend[pop] = color
    plt.scatter(row[1].PC1, 
                row[1].PC2, 
                s=50, 
                c=color)
fig = plt.gcf()
ax = plt.gca()
cmap = plt.get_cmap()
fig.set_size_inches(10,8)
plt.title("PCA of n=%d samples on %d loci" % (len(joined), len(pca_std_data.columns)))
plt.xlabel("PC1 (3.259%)")
plt.ylabel("PC2 (1.983%)")

handles = []
for pop in sorted(legend):
    handles.append(mpatches.Patch(color=legend[pop], label=pop))
plt.legend(handles=sorted(handles))
plt.show()

In [0]:
%%R
source("tw_calc.R")
test=read.table("twtable", header=F)

In [0]:
TWcalc = r('TWcalc')

In [0]:
robjects.globalenv['pca_std_data_snps'] = com.convert_to_r_matrix(pca_std_data_snps)

In [0]:
%%R
saveRDS(pca_std_data_snps, file="pca_std_data_snps.rds")

In [0]:
tw = TWcalc(com.convert_to_r_matrix(pca_std_data_snps),20)

In [0]:
tw_p = com.convert_robj(tw.rx2(2))
tw_e = com.convert_robj(tw.rx2(1))

In [0]:
tw_num = 0
for i, p in enumerate(tw_p):
    print p
    if p > 0.05:
        tw_num = i
        break
print "Tracy-Widom test yields %d axes of pop structure" % tw_num

Tracy-Widom test yields 12 axes of pop structure

In [0]:
pca_cov = x.ix[:,0:12]

In [0]:
pca_cov[:5]

In [0]:
pheno['sample_id'] = pheno.apply(lambda x: "%s_0" % x.sample_pheno, axis=1)
pheno.index = pheno['sample_id']
pheno.drop('sample_id', axis=1)

In [0]:
pca_std_pheno = pheno.join(pca_cov, how="inner").join(pca_maf[snp_id_pos_ref_alt_snps_only.index], how="inner")

In [0]:
set(pca_std_data.index) - set(pca_std_pheno.index)

In [0]:
pca_std_pheno[:5]

In [0]:
translation_df[:5]

In [0]:
snp_id_pos_ref_alt_snps_only[0:5]

In [0]:
reader = vcf.VCFReader(filename=os.path.join(analysis_dir, "samtools_1.2.vcf_novdb.vcf.gz"))

def get_correct_name(name):
    row = translation_df.ix[name,:]
    return "%s_%d_%d" % (row['pop'], row.indiv, row.dup)

gt_base_data = {}
for i, row in enumerate(snp_id_pos_ref_alt_snps_only.iterrows()):
    snp_id = row[0]
    snp = reader.fetch(row[1].contig, row[1].pos)
    for sample in snp.samples:
        if not snp_id in gt_base_data:
            gt_base_data[snp_id] = {}
        sample_name = get_correct_name(sample.sample)
        gt_base_data[snp_id][sample_name] = sample.gt_bases
    if i % 10 == 0:
        print i
gt_base_df = pd.DataFrame(gt_base_data)

In [0]:
for col in gt_base_df.ix[:,0:30]:
    print col, gt_base_df[col].unique()


In [0]:
hdf.put('gt_base_df', gt_base_df)

In [0]:
gt_base_df=hdf.get('gt_base_df')

In [0]:
gt_base_df.to_csv("gt_base_df.csv", header=True, index=True, sep="\t")

In [0]:
gt_base_df = pd.read_csv("gt_base_df.csv", index_col=0, sep="\t", dtype=str)

In [0]:
gt_base_df[:5]

In [0]:
gt_base_df[:5]

In [0]:
%%R
library(SNPassoc)

In [0]:
pheno.columns

In [0]:
pheno_cols = ['Mass','Pupual Duration','Total Dev Time']
pheno_cols.extend(gt_base_df.columns)

In [0]:
pheno_gt_base = pheno.merge(gt_base_df, left_index=True, right_index=True, how="inner")

In [0]:
pheno_gt_base = pheno_gt_base[pheno_cols]

In [0]:
hdf.put('pheno_gt_base', pheno_gt_base)

In [0]:
pheno_gt_base_pca = pheno_gt_base.merge(pca_cov, left_index=True, right_index=True, how="inner")

In [0]:
pheno_gt_base_pca.columns = pheno_gt_base_pca.apply(lambda x: x.name.replace(" ", "_"))

In [0]:
pheno_gt_base_pca.ix[:,0:3] = preprocessing.scale(pheno_gt_base_pca.ix[:,0:3])

In [0]:
pheno_gt_base_pca[:5]

In [0]:
pheno_gt_base_pca.ix[:,0:3].apply(np.mean)

In [0]:
hdf.put('pheno_gt_base_pca', pheno_gt_base_pca)

In [0]:
pheno_gt_base_pca=hdf.get('pheno_gt_base_pca')

In [0]:
pheno_gt_base_pca.to_csv("pheno_gt_base_pca.txt",
                         header=True,
                         index=True,
                         sep="\t")

## Do this in R b/c slow
    
    library(SNPassoc)
    
    pheno_gt_base_pca = read.table("pheno_gt_base_pca.txt", sep="\t", row.names=1, header=T)
    pheno_gt_base_pca[pheno_gt_base_pca=="None"] = NA

    #subtract b/c those are the PCA axes
    snp_cols = 4:(ncol(pheno_gt_base_pca)-12)
    
    snp_data = setupSNP(pheno_gt_base_pca, colSNPs=snp_cols, sep="/")
    
    pca_cols = (ncol(pheno_gt_base_pca)-11):ncol(pheno_gt_base_pca)
    pca_data = pheno_gt_base_pca[,pca_cols]
    
    wg_mass_co = WGassociation(Mass~1+pca_data$PC1+pca_data$PC2+pca_data$PC3+pca_data$PC4+pca_data$PC5+pca_data$PC6+pca_data$PC7+pca_data$PC8+pca_data$PC9+pca_data$PC10+pca_data$PC11+pca_data$PC12, data=snp_data, model="co", genotypingRate=50)
    
    wg_pd_co = WGassociation(Pupual_Duration~1+pca_data$PC1+pca_data$PC2+pca_data$PC3+pca_data$PC4+pca_data$PC5+pca_data$PC6+pca_data$PC7+pca_data$PC8+pca_data$PC9+pca_data$PC10+pca_data$PC11+pca_data$PC12, data=snp_data, model="co", genotypingRate=50)
    
    wg_tdt_co = WGassociation(Total_Dev_Time~1+pca_data$PC1+pca_data$PC2+pca_data$PC3+pca_data$PC4+pca_data$PC5+pca_data$PC6+pca_data$PC7+pca_data$PC8+pca_data$PC9+pca_data$PC10+pca_data$PC11+pca_data$PC12, data=snp_data, model="co", genotypingRate=50)
    
    saveRDS(wg_mass_co, "wg_mass_co.rds")
    ...
    
    wgstats_mass = WGstats(wg_mass_co)    
    saveRDS(wgstats_mass, "wgstats_mass.rds")
    ...

In [0]:
%%R
wg_mass_co = readRDS("wg_mass_co.rds")
wg_pd_co = readRDS("wg_pd_co.rds")
wg_tdt_co = readRDS("wg_tdt_co.rds")

wgstats_mass = readRDS("wgstats_mass.rds")
wgstats_pd = readRDS("wgstats_pd.rds")
wgstats_tdt = readRDS("wgstats_tdt.rds")

In [0]:
wgstats_mass = r['wgstats_mass']
wgstats_mass_labels = r('labels(wg_mass_co)')

wgstats_pd = r['wgstats_pd']
wgstats_pd_labels = r('labels(wg_pd_co)')

wgstats_tdt = r['wgstats_tdt']
wgstats_tdt_labels = r('labels(wg_tdt_co)')

In [0]:
test = com.convert_robj(wgstats_mass)

In [0]:
for x in test:
    print pd.DataFrame(test[x])
    break

In [0]:
alpha_vals = {}

wgstats = {"mass":[wgstats_mass, wgstats_mass_labels],
           "pd":[wgstats_pd, wgstats_pd_labels],
           "tdt":[wgstats_tdt, wgstats_tdt_labels]}

In [0]:
for key, datalist in wgstats.items():
    print "converting %s" % key
    wgstats[key] = [com.convert_robj(x) for x in datalist]

In [0]:
def get_alleles(data):
    a = set()
    for x in data.index:
        for elem in x.split("/"):
            a.add(elem)
    return list(a)  

def get_allele_freqs_wg(data, AA, Aa, aa):
    total = np.sum(data['n'])*2
    A = data.ix[AA, "n"]*2 + data.ix[Aa, "n"]
    a = data.ix[aa, "n"]*2 + data.ix[Aa, "n"]
    return A/total, a/total

def get_genotypes(data, alleles):
    homos = ["%s/%s" % (x,x) for x in alleles]
    Aa = "%s/%s" % (alleles[0], alleles[1])
    if Aa not in data.index:
        Aa = Aa[::-1] #reverse it
    AA, aa = homos
    if data.ix[AA, "n"] < data.ix[aa, "n"]:
        AA, aa = homos[::-1] #reverse it so that major is first
    return AA, Aa, aa

def get_genotypic_values(data, alleles):
    AA, Aa, aa = get_genotypes(data, alleles)
    G_AA = float(data.ix[AA, 'me'])
    G_aa = float(data.ix[aa, 'me'])
    additive = (G_AA-G_aa)/2
    G_Aa = float(data.ix[Aa, 'me'])
    dominance = G_Aa - ((G_AA+G_aa)/2)
    return additive, dominance, AA, Aa, aa
    
def get_alpha(data):
    alleles = get_alleles(data)
    additive, dominance, AA, Aa, aa = get_genotypic_values(data, alleles)
    p, q = get_allele_freqs_wg(data, AA, Aa, aa)
    return additive + (dominance*(q-p))

In [0]:
lt3 = {}
errors = {}
for p in wgstats:
    print "running %s" % p
    df = pd.DataFrame(index=["alpha", "p"])
    alpha_vals[p] = df
    lt3[p] = 0
    errors[p] = set()
    d = wgstats[p][0]
    labels = wgstats[p][1]
    for i, locus in enumerate(d):
        try:
            data = pd.DataFrame(d[locus])
            snp = labels[i]
            genotypes = [g for g in data.index if "/" in g]
            data = data.ix[genotypes,:]
            pvalue = data['p-value'].dropna()[0]
            if len(genotypes) == 3:
                alpha = get_alpha(data)
                df[snp] = [alpha,pvalue]
            elif len(genotypes) < 3:
                lt3[p] += 1
        except Exception as e: #needed for genotypes that are skipped b/c of genotyping rate
            errors[p].add(e.message)

In [0]:
errors

In [0]:
alpha_files = []
for p in alpha_vals:
    d = alpha_vals[p].T
    print len(d), len(d[d['p'] < 0.05])
    
#     f = "alpha_%s.txt" % p
#     alpha_files.append(os.path.abspath(f))
#     d.to_csv(f,
#              index=True,
#              header=False,
#              sep="\t")

In [0]:
squat_dir = "/data7/eckertlab/src/PolygenicAdaptationCode/Scripts/"

In [0]:
def get_squat_vars(pheno):
    d = {"gwas.data.file":"",
         "freqs.file":"",
         "env.var.data.files":"list()",
         "match.pop.file":"",
         "full.dataset.file":"",
         "path":"squat/%s" % pheno,
         "match.categories":"",
         "match.bins":"list(seq(0,0.5,0.02), c(2), seq(0,1000,100))",
         "cov.SNPs.per.cycle":5000,
         "cov.cycles":1,
         "null.phenos.per.cycle":1000,
         "null.cycles":1,
         "load.cov.mat":"F",
         "sim.null":"T",
         "check.allele.orientation":"T"}
    return ',\n'.join("%s=%s" % (key,val) for (key,val) in d.items())

def create_squat_run_file(pheno):
    if not os.path.exists("squat"):
        os.mkdir("squat")
    squat_file = os.path.join("squat", "squat_%s.r" % pheno)
    with open(squat_file, "w") as o:
        o.write("source(%s)\n" % os.path.join(squat_dir, "funtions.R"))
        o.write("PolygenicAdaptationFunction(%s)\n" % get_squat_vars(pheno))
    return squat_file

for f in alpha_files:
    pheno = os.path.basename(f).split(".")[0].split("_")[1]
    squat_file = create_squat_run_file(pheno)
    print squat_file
    !cat $squat_file
    print ""