In [0]:
import os, sys
from IPython.display import Image
import pandas as pd
from __future__ import division
import numpy as np
import rpy2
from rpy2 import robjects as ro
import pandas.rpy.common as com
import matplotlib.pyplot as plt
import seaborn as sns
import operator
import scipy as sp
import traceback
from sklearn import preprocessing
from IPython.parallel import Client
from subprocess import Popen, PIPE
import shutil
from IPython.display import FileLink, FileLinks, Image
import psutil
import multiprocessing
%matplotlib inline

%load_ext rpy2.ipython
pd.set_option('display.width', 80)
pd.set_option('max.columns', 30)

sns.set_context("talk")

In [0]:
%%R
R.home()

In [0]:
data_ai = pd.read_excel("/gdc_home4/cfried/landscape_genetics_data/Genetics_2010/Eckert_Genetics_2010_data.xlsx")

In [0]:
ai_cols = ['AI_Q1','AI_Q2','AI_Q3','AI_Q4']

In [0]:
data_gt = pd.read_excel("/gdc_home4/cfried/landscape_genetics_data/Genetics_2010/Eckert_Genetics_2010_data.xlsx", 
                        sheetname="genotyping_data")

data_loc = pd.read_excel("/gdc_home4/cfried/landscape_genetics_data/Genetics_2010/Eckert_Genetics_2010_data.xlsx",
                         sheetname="county_locality")

results = pd.read_excel("/gdc_home4/cfried/landscape_genetics_data/Genetics_2010/Eckert_Genetics_2010_results.xlsx")

In [0]:
trait_name = "sucrose"

In [0]:
pheno = pd.read_excel("/gdc_home4/cfried/landscape_genetics_data/Pinus_taeda_metabolite_data.xlsx", 
                      sheetname="metabolite_phenotype_data",
                      header=2)
pheno = pheno[['Longitude', 'Latitude','Clone_id',trait_name]]

pheno.index = pheno.Clone_id
#pheno = pheno.drop('Clone_id', axis=1)
pheno[0:5]

In [0]:
def get_phenotype(row):
    """Return the column-wise maximum of the `pheno` rows at this sample's coordinates.

    `row` must expose `long` and `lat`. Rows of the module-level `pheno` table
    whose Longitude and Latitude equal those values are selected and reduced
    with `np.max`.
    """
    same_longitude = pheno.Longitude == row.long
    same_latitude = pheno.Latitude == row.lat
    return np.max(pheno[same_longitude & same_latitude])

In [0]:
data_gt[:5]

In [0]:
data_loc

In [0]:
results.index = results.locus
results = results.drop("locus", axis=1)

In [0]:
results[0:5]

In [0]:
genotypes = data_gt.ix[:,[x for x in data_gt.columns if '-' in x]]

In [0]:
genotypes[:5]

In [0]:
genotypes.shape

In [0]:
def is_homozygous(gt):
    """Tell whether a "/"-separated genotype string carries a single distinct allele."""
    distinct_alleles = {allele.strip() for allele in gt.split("/")}
    return len(distinct_alleles) == 1

def get_allele_counts(counts):
    """Tally alleles and heterozygotes from a genotype-count table.

    `counts` maps "/"-separated genotype strings (its `.index`) to the number
    of individuals carrying them. Returns `(alleles, het)`: `alleles` is a
    list of `(allele, count)` pairs ordered from the largest count to the
    smallest, and `het` is the number of individuals whose genotype is not
    homozygous.
    """
    tally = {}
    n_het = 0
    for genotype in counts.index:
        n_carriers = counts[genotype]
        for part in genotype.split("/"):
            allele = part.strip()
            tally[allele] = tally.get(allele, 0) + n_carriers
        if not is_homozygous(genotype):
            n_het += n_carriers
    ranked = sorted(tally.items(), key=lambda pair: pair[1], reverse=True)
    return ranked, n_het

def get_correction(n):
    """Return the finite-sample-size correction factor 2n / (2n - 1)."""
    two_n = 2*n
    return two_n/(two_n-1)

def get_allele_freqs(locus):
    """Summarise allele frequencies and heterozygosity for one biallelic locus.

    Parameters
    ----------
    locus : pandas.Series
        Genotype strings such as "A/G". Entries equal to '?/?' or 'NA' are
        treated as missing and ignored.

    Returns
    -------
    pandas.Series
        A / a (labels of the most and second most common allele; `a` is ""
        when only one allele is seen), P / Q (their counts), p / q (their
        frequencies), PQ (number of heterozygous individuals), He (expected
        heterozygosity including the finite-sample correction), Ho (observed
        heterozygosity), Fis, total_alleles and num_indiv.

    Raises
    ------
    AssertionError
        If p and q do not sum to 1, e.g. when more than two alleles occur.
    IndexError
        If the locus has no non-missing genotype at all.
    """
    locus = locus[locus != '?/?']
    locus = locus[locus != 'NA']
    # value_counts() already orders genotypes from most to least common, so no
    # additional sort is needed.
    c = locus.value_counts()
    alleles, PQ = get_allele_counts(c)
    num_individuals = sum(c)
    total_alleles = 2.0*num_individuals
    A = alleles[0][0]
    P = alleles[0][1]
    a = ""
    Q = 0
    if len(alleles) == 2:
        a = alleles[1][0]
        Q = alleles[1][1]
    p = P/total_alleles
    q = Q/total_alleles
    # Compare with a tolerance: the two quotients are floats, and exact
    # equality can fail from rounding alone.
    assert abs(p + q - 1.0) < 1e-9, \
        "allele frequencies sum to %r; locus is probably not biallelic" % (p + q)
    He = 2 * p * q * get_correction(num_individuals)
    Ho = PQ*1.0/num_individuals
    # A monomorphic locus has He == 0; report Fis as NaN explicitly instead of
    # relying on a 0/0 floating-point division.
    Fis = 1 - (Ho/He) if He else np.nan
    ret = pd.Series({"p":p,
                     "q":q,
                     "P":P,
                     "Q":Q,
                     "He":He,
                     "Ho":Ho,
                     "Fis":Fis,
                     "PQ": PQ,
                     "total_alleles":total_alleles,
                     "num_indiv":num_individuals,
                     "A":A,
                     "a":a})
    return ret
#genotypes.ix[:,0:2].apply(get_allele_freqs)

In [0]:
af = genotypes.apply(get_allele_freqs)

In [0]:
genotypes

In [0]:
af.ix[:,0:5]

In [0]:
def plot_hist(df, index):
    """Show a 20-bin histogram of row `index` of `df`, titled with its mean, SD, min and max."""
    row = df.ix[index,:]
    plt.hist(row, bins=20)
    summary_stats = (index, np.mean(row), np.std(row), np.min(row), np.max(row))
    plt.title("%s %.2f $\pm$ %.3f [%.2f, %.2f]" % summary_stats)
    plt.show()
plot_hist(af, "Fis")

In [0]:
def convert_to_z12(locus):
    """Recode one locus's genotype strings as numeric codes.

    The major (`A`) and minor (`a`) allele labels are read from the
    module-level `af` table under `locus.name`. A/A maps to 0, the
    heterozygote (in either allele order) to 1, and a/a to 2. The two
    missing-data codes that `get_allele_freqs` ignores, '?/?' and 'NA', both
    map to -1.

    Raises KeyError for a genotype string that matches none of these.
    """
    freq = af[locus.name]
    major, minor = freq["A"], freq["a"]
    trans = {"%s/%s" % (major, major): 0,
             "%s/%s" % (minor, minor): 2,
             "%s/%s" % (major, minor): 1,
             "%s/%s" % (minor, major): 1,
             "?/?": -1,
             # 'NA' is filtered as missing when frequencies are computed, so
             # it must be accepted here as well instead of raising KeyError.
             "NA": -1}
    return locus.apply(lambda x: trans[x])
z12 = genotypes.apply(convert_to_z12)

In [0]:
def center_and_standardize_value(val, u, var):
    """Return `(val - u) / sqrt(var)`, or 0.0 when `val` is the missing-data code -1."""
    is_missing = (val == -1)
    return 0.0 if is_missing else (val-u)/np.sqrt(var)

def center_and_standardize(snp):
    """Centre and scale one SNP column of genotype codes for PCA.

    `snp` is a Series of codes in which -1 marks a missing genotype. The mean
    is taken over the non-missing codes, and the scale comes from the allele
    frequency stored in the "q" row of the module-level `af` table for
    `snp.name`. Each value becomes (value - mean) / sqrt(q * (1 - q));
    missing values become 0.0.
    """
    maf = af.loc["q", snp.name]
    u = np.mean([x for x in snp if x != -1])
    # center_and_standardize_value() takes the square root itself, so pass
    # q*(1-q) un-rooted. Rooting it here as well would scale by the fourth
    # root rather than the square root.
    var = maf*(1-maf)
    return snp.apply(center_and_standardize_value, args=(u, var))

In [0]:
pca_std = z12.apply(center_and_standardize)
pca_std.apply(np.mean)

In [0]:
r = ro.r

In [0]:
prcomp = r('prcomp')
summary = r('summary')

In [0]:
prcomp_res = prcomp(pca_std, scale=False, center=False)

In [0]:
print summary(prcomp_res)

In [0]:
x = com.convert_robj(prcomp_res.rx2("x"))
x.index = pca_std.index
x.ix[0:5,0:10]

In [0]:
plt.scatter(x.PC1, x.PC2)
plt.show()

In [0]:
%%R
source("tw_calc.R")
test=read.table("twtable", header=F)

In [0]:
TWcalc = r('TWcalc')

In [0]:
tw = TWcalc(com.convert_to_r_matrix(pca_std), 25)

In [0]:
tw_p = com.convert_robj(tw.rx2(2))
tw_e = com.convert_robj(tw.rx2(1))

In [0]:
# Number of leading axes with a significant Tracy-Widom p-value (<= 0.05).
# Start from "all tested axes are significant" so that the count is right
# even when no p-value exceeds the threshold; the loop lowers it at the first
# non-significant axis.
tw_num = len(tw_p)
for i, p in enumerate(tw_p):
    print("%s %s" % (i, p))
    if p > 0.05:
        tw_num = i
        break
print("Tracy-Widom test yields %d axes of pop structure" % tw_num)

In [0]:
# Remove samples that lie more than 6 standard deviations from the mean on any
# of the first 10 principal-component columns. Outliers found on one column are
# dropped from `y` before the next column's mean and SD are computed.
y = pd.DataFrame(x)
for col in y.columns[0:10]:
    s_cutoff = np.std(y[col])*6
    u = np.mean(y[col])
    # cutoff[0] is the upper bound, cutoff[1] the lower bound
    cutoff = sorted([u+s_cutoff, u-s_cutoff], reverse=True)
    outliers = y[col][(y[col] > cutoff[0]) | (y[col] < cutoff[1])]
    print col
    print outliers
    y = y.drop(outliers.index)
y.ix[0:5,0:10]

In [0]:
gt_drop = genotypes.ix[y.index,:]

In [0]:
gt_drop[0:10]

In [0]:
gt_drop.shape

In [0]:
z12_drop = gt_drop.apply(convert_to_z12)
z12_drop[0:10]

In [0]:
pca_drop_std = z12_drop.apply(center_and_standardize)

In [0]:
pca_drop_std.shape

In [0]:
pca_drop_std[0:10]

In [0]:
pca_drop_std.describe().ix[:,0:5]

In [0]:
prcomp_res_drop = prcomp(pca_drop_std, scale=False, center=False)

In [0]:
x_drop = com.convert_robj(prcomp_res_drop.rx2("x"))
x_drop.index = pca_drop_std.index
x_drop.ix[0:5,0:5]

In [0]:
plt.scatter(x_drop.PC1, x_drop.PC2)
plt.show()

In [0]:
print summary(prcomp_res_drop)

In [0]:
tw = TWcalc(com.convert_to_r_matrix(pca_drop_std), 25)

In [0]:
tw_p = com.convert_robj(tw.rx2(2))
tw_e = com.convert_robj(tw.rx2(1))

In [0]:
# Number of leading axes with a significant Tracy-Widom p-value (<= 0.05).
# Start from "all tested axes are significant" so that the count is right
# even when no p-value exceeds the threshold; the loop lowers it at the first
# non-significant axis.
tw_num = len(tw_p)
for i, p in enumerate(tw_p):
    print("%s %s" % (i, p))
    if p > 0.05:
        tw_num = i
        break
print("Tracy-Widom test yields %d axes of pop structure" % tw_num)

In [0]:
hierf_trans = {0:11, 1:12, 2:22, -1:'NA'}

In [0]:
def apply_hierf_trans(series):
    """Map each value of `series` through `hierf_trans`; values without an entry pass through unchanged."""
    return [hierf_trans.get(value, value) for value in series]

In [0]:
hierf_df = z12_drop.apply(apply_hierf_trans)

In [0]:
hierf_df.shape

In [0]:
hierf_df.insert(0, "countyid", None)
hierf_df[0:5]

In [0]:
loc_hierf = data_loc.join(hierf_df, how="inner")
bayenv_df = loc_hierf.copy()

In [0]:
print hierf_df.shape, data_loc.shape, loc_hierf.shape, bayenv_df.shape

In [0]:
# Label every sample with a "<county>_<state>" key, print the sample count per
# key, and keep the keys that have at least 5 samples.
loc_hierf['county_state'] = loc_hierf.apply(lambda row: "%s_%s" % (row.county, row.state), axis=1)
usable_counties = set()
county_counts = loc_hierf.county_state.value_counts()
county_counts = county_counts.sort(inplace=False, ascending=False)
for c in county_counts.index:
    print c, county_counts[c]
for c in county_counts.index:
    # minimum of 5 samples for a county to be used
    if county_counts[c] >=5:
        usable_counties.add(c)
# sorted so that the ids assigned from this list are deterministic
usable_counties = sorted(list(usable_counties))

In [0]:
county_id = {}
for i, county in enumerate(usable_counties):
    county_id[county] = i+1
county_id


In [0]:
loc_hierf['usable'] = loc_hierf.apply(lambda row: row.county_state in county_id, axis=1)

In [0]:
drop = loc_hierf[loc_hierf.usable==False]

In [0]:
loc_hierf = loc_hierf.drop(drop.index)

In [0]:
loc_hierf

In [0]:
loc_hierf['countyid'] = loc_hierf.apply(lambda row: county_id[row.county_state], axis=1)

In [0]:
loc_hierf[0:10]

In [0]:
loc_hierf.shape

In [0]:
sorted(loc_hierf.countyid.unique())

In [0]:
loc_hierf.ix[:,4:-2].to_csv("hierf.txt", sep="\t", header=True, index=False)

In [0]:
%%R
# library(hierfstat)
# data = read.table("hierf.txt", header=T, sep="\t")
# data = data[order(data$countyid),]
# levels = data.frame(data$countyid)
# loci = data[,2:ncol(data)]
# bs = basic.stats(data)
# saveRDS(bs, "basic_stats.rds")
# res = varcomp.glob(levels=levels, loci=loci, diploid=T)
# saveRDS(res, "hierf.rds")

In [0]:
%%R
bs = readRDS("basic_stats.rds")
res = readRDS("hierf.rds")

In [0]:
res = com.convert_robj(ro.r('res'))

In [0]:
# Convert the R object `bs` (loaded from basic_stats.rds in the %%R cell) to
# Python and pull out its components by name.
bs = com.convert_robj(ro.r('bs'))
Fis = bs['Fis']
Hs = bs['Hs']
pop_freq_temp = bs['pop.freq']
pop_freq = {}
perloc = bs['perloc']
n_ind_samp = bs['n.ind.samp']
Ho = bs['Ho']
overall = bs['overall']

# Rewrite the row labels: drop the first character and turn "." into "-".
# NOTE(review): presumably this undoes R's renaming of the locus columns
# (leading "X", dashes replaced by dots) -- confirm against hierf.txt.
for df in [Fis, Hs, perloc, n_ind_samp, Ho]:
    df.index = [x[1:].replace(".","-") for x in df.index]

# Label the per-locus frequency rows 'p' (and 'q' when there are two rows) and
# store them under the same rewritten locus name.
for locus, data in pop_freq_temp.items():
    if len(data) == 2:
        data.index = ['p','q']
    else:
        data.index = ['p']
    pop_freq[locus[1:].replace(".", "-")] = data

# Transpose so that loci become columns.
Ho = Ho.T
perloc = perloc.T
n_ind_samp = n_ind_samp.T
Hs = Hs.T
Fis = Fis.T

In [0]:
Ho

In [0]:
perloc['0-10037-01-257']

In [0]:
af['0-10037-01-257']

In [0]:
loc_df = res['loc']
F_df = res['F']
overall_df = res['overall']

In [0]:
F_df

In [0]:
def compute_fst(series):
    """Return the first element of `series` divided by the sum of all its elements."""
    return series[0]/sum(series)

In [0]:
loci_fst = loc_df.apply(compute_fst, axis=1).dropna()
loci_fst.index = [x[1:].replace(".", "-") for x in loci_fst.index]

In [0]:
plt.hist(loci_fst, bins=50)
plt.xlim(-.03, .5)
plt.title("Fst for %d loci ($\mu=%.3f \pm %.4f$) [%.3f, %.3f]" % (len(loci_fst),
                                                                  np.mean(loci_fst),
                                                                  np.std(loci_fst),
                                                                  np.min(loci_fst),
                                                                  np.max(loci_fst)))
plt.show()

In [0]:
loc_hierf.shape

In [0]:
trait = loc_hierf.apply(get_phenotype, axis=1)

In [0]:
trait_loc_hierf = trait.join(loc_hierf, how="inner")

In [0]:
trait_complete = trait_loc_hierf.drop(trait_loc_hierf[np.isnan(trait_loc_hierf[trait_name])].index)

In [0]:
trait_complete[:5]

In [0]:
trait_complete.shape

In [0]:
def convert_to_snpassoc(col):
    """Turn a column of 11/12/22/"NA" codes back into "A/a"-style genotype strings.

    Only columns whose name contains "-" are converted; any other column is
    returned unchanged. The allele labels come from the module-level `af`
    table under the column's name. A code outside the four known ones raises
    KeyError.
    """
    if "-" not in col.name:
        return col
    freqs = af[col.name]
    major, minor = freqs["A"], freqs["a"]
    trans = {11: "%s/%s" % (major, major),
             12: "%s/%s" % (major, minor),
             22: "%s/%s" % (minor, minor),
             "NA": "NA"}
    return col.apply(lambda code: trans[code])
trait_snpassoc = trait_complete.apply(convert_to_snpassoc)

In [0]:
pca_cov = x_drop.ix[:,0:14]

In [0]:
trait_snpassoc_pca = trait_snpassoc.join(pca_cov, how="inner")

In [0]:
trait_snpassoc_pca = trait_snpassoc_pca.drop(['county_state',
                                                   'usable',
                                                   'Longitude',
                                                   'Latitude',
                                                   'Clone_id',
                                                   'county',
                                                   'state',
                                                   'lat',
                                                   'long',
                                                   'countyid'], axis=1)

In [0]:
trait_snpassoc_pca[0:10]

In [0]:
trait_snpassoc.shape

In [0]:
trait_snpassoc_pca.to_csv("snpassoc.txt",
                             header=True,
                             index=True,
                             sep="\t")

In [0]:
def write_snpassoc_file(df, input_file, num_pca_axes):
    """Write an R script that runs a SNPassoc whole-genome association scan.

    One script named ``snpassoc_<trait>.R`` (trait lower-cased) is written to
    the current directory for the first column of `df`, which is taken as the
    phenotype name.

    Parameters
    ----------
    df : pandas.DataFrame
        Only `df.columns[0]` is used (the phenotype name).
    input_file : str
        Tab-separated table the R script reads. Its layout must be: row
        names, phenotype, SNP columns, then `num_pca_axes` columns named
        PC1..PCn.
    num_pca_axes : int
        Number of trailing PCA columns. Every one of them is added to the
        model as a covariate.

    Returns
    -------
    list of str
        Paths of the scripts written.
    """
    # Build one "+pca_data$PCk" term per axis so that the model always matches
    # `num_pca_axes` instead of assuming a fixed number of components.
    covariates = "".join("+pca_data$PC%d" % axis
                         for axis in range(1, num_pca_axes + 1))
    pheno = df.columns[0:1]
    out_files = []
    for p in pheno:
        text = '''
library(SNPassoc)

d = read.table('%s', sep="\\t", row.names=1, header=T)

#subtract b/c those are the PCA axes
snp_cols = 2:(ncol(d)-%d)
snp_data = setupSNP(d, colSNPs=snp_cols, sep="/")
pca_cols = (ncol(d)-%d):ncol(d)
pca_data = d[,pca_cols]

wg = WGassociation(%s~1%s,
data=snp_data,
model="co",
genotypingRate=5)

saveRDS(wg, "wg_%s_co.rds")
stats = WGstats(wg)
saveRDS(stats, "wgstats_%s.rds")
''' % (input_file,
       num_pca_axes,
       num_pca_axes-1,
       p,
       covariates,
       p.lower(),
       p.lower())
        with open("snpassoc_%s.R" % p.lower(), "w") as o:
            print("writing %s" % o.name)
            out_files.append(o.name)
            o.write(text)
    return out_files

In [0]:
write_snpassoc_file(trait_snpassoc_pca, "snpassoc.txt", 14)

## Run the generated script in R
```R
source("snpassoc_<trait>.R")
```

In [0]:
%%R
wg_trait_co.rds = readRDS('wg_sucrose_co.rds')
wgstats_trait.rds = readRDS('wgstats_sucrose.rds')

In [0]:
wgstats_trait = r['wgstats_trait.rds']
wgstats_trait_labels = r('labels(wg_trait_co.rds)')

In [0]:
wgstats = {trait_name:[wgstats_trait, wgstats_trait_labels.rx2(1)]}
for key, datalist in wgstats.items():
    print "converting %s" % key
    wgstats[key] = [com.convert_robj(x) for x in datalist]

In [0]:
def get_alleles(data):
    """Return the distinct alleles found in the "/"-separated labels of `data.index` (order unspecified)."""
    return list({allele for label in data.index for allele in label.split("/")})

def get_allele_freqs_wg(data, AA, Aa, aa):
    """Return the two allele frequencies implied by the genotype counts in `data`.

    `data` holds a count column "n" indexed by genotype label. `AA`, `Aa` and
    `aa` are the labels of the two homozygotes and the heterozygote. Each
    homozygote contributes two copies of its allele and the heterozygote one
    copy of each; both totals are divided by twice the sum of "n".
    """
    n_het = data.ix[Aa, "n"]
    copies_A = data.ix[AA, "n"]*2 + n_het
    copies_a = data.ix[aa, "n"]*2 + n_het
    total = np.sum(data['n'])*2
    return copies_A/total, copies_a/total

def get_genotypes(data, alleles):
    """Work out the genotype labels of a biallelic locus, major homozygote first.

    Parameters
    ----------
    data : table indexed by genotype label with a count column "n"
    alleles : sequence of exactly two allele names

    Returns
    -------
    (AA, Aa, aa) : AA is the homozygote with the larger count (ties keep the
    order of `alleles`), aa the other homozygote, and Aa the heterozygote
    label in whichever allele order appears in `data.index`.
    """
    homos = ["%s/%s" % (x, x) for x in alleles]
    Aa = "%s/%s" % (alleles[0], alleles[1])
    if Aa not in data.index:
        # The table lists the heterozygote with its alleles in the other
        # order. Rebuild the label from the alleles rather than reversing the
        # string, which is only correct for single-character allele names.
        Aa = "%s/%s" % (alleles[1], alleles[0])
    AA, aa = homos
    if data.ix[AA, "n"] < data.ix[aa, "n"]:
        AA, aa = aa, AA  # swap so that the more common homozygote comes first
    return AA, Aa, aa

def get_genotypic_values(data, alleles):
    """Compute the additive and dominance values of a locus from the 'me' column of `data`.

    additive  = half the difference between the two homozygote values
    dominance = heterozygote value minus the midpoint of the homozygote values

    Returns `(additive, dominance, AA, Aa, aa)` with the genotype labels as
    resolved by `get_genotypes`.
    """
    AA, Aa, aa = get_genotypes(data, alleles)
    G_AA, G_aa, G_Aa = [float(data.ix[label, 'me']) for label in (AA, aa, Aa)]
    midpoint = (G_AA+G_aa)/2
    additive = (G_AA-G_aa)/2
    dominance = G_Aa - midpoint
    return additive, dominance, AA, Aa, aa
    
def get_alpha(data):
    """Return `(alpha, AA, aa, p, q)` for one locus.

    alpha = additive + dominance * (q - p), where p and q are the frequencies
    returned by `get_allele_freqs_wg` for the AA and aa alleles.
    """
    additive, dominance, AA, Aa, aa = get_genotypic_values(data, get_alleles(data))
    p, q = get_allele_freqs_wg(data, AA, Aa, aa)
    return additive + (dominance*(q-p)), AA, aa, p, q

In [0]:
# For every trait in `wgstats`, build a table with one column per SNP holding
# alpha, the association p-value, the two homozygote labels and the allele
# frequencies. Only SNPs with exactly three genotype classes are kept.
#
# Local names are chosen so that this cell does not overwrite the notebook's
# `genotypes` DataFrame or the trait loop variable.
alpha_vals = {}
skipped_loci = {}
for trait_key in wgstats:
    print("running %s" % trait_key)
    df = pd.DataFrame(index=["alpha", "p-value", "AA", "aa", "p", "q"])
    alpha_vals[trait_key] = df
    d = wgstats[trait_key][0]
    labels = wgstats[trait_key][1]
    skipped_loci[trait_key] = []
    for i, locus in enumerate(d):
        try:
            data = pd.DataFrame(d[locus])
            snp = labels[i]
            # keep only the rows that are genotype classes ("x/y" labels)
            gt_labels = [g for g in data.index if "/" in g]
            data = data.ix[gt_labels,:]
            pvalue = data['p-value'].dropna()[0]
            if len(gt_labels) == 3:
                alpha, AA, aa, freq_p, freq_q = get_alpha(data)
                df[snp] = [alpha, pvalue, AA, aa, freq_p, freq_q]
        except Exception as e:
            # Best effort: a locus that cannot be processed is left out, but
            # it is recorded so the failure is visible afterwards.
            skipped_loci[trait_key].append((locus, e))
    print("%s: skipped %d loci that raised an error" % (trait_key, len(skipped_loci[trait_key])))

In [0]:
alpha_vals[trait_name].ix[:,0:10]

In [0]:
plt.hist(alpha_vals[trait_name].ix['p-value',:], bins=30)
plt.title("p-values")
plt.show()

In [0]:
plt.hist(alpha_vals[trait_name].ix['alpha',:], bins=30)
plt.title("alpha values $\mu %.4f \pm %.4f \ [%.4f, %.4f]$" % (np.mean(alpha_vals[trait_name].ix['alpha',:]),
                                                            np.std(alpha_vals[trait_name].ix['alpha',:]),
                                                            np.min(alpha_vals[trait_name].ix['alpha',:]),
                                                             np.max(alpha_vals[trait_name].ix['alpha',:])))
plt.show()

In [0]:
# Prepend each sample's county id to the association table and drop the
# samples whose trait value is NaN.
trait_snpassoc_pca_county = pd.concat([loc_hierf.countyid, trait_snpassoc_pca], axis=1)
trait_snpassoc_pca_county = trait_snpassoc_pca_county.drop(trait_snpassoc_pca_county[np.isnan(trait_snpassoc_pca_county[trait_name])].index)
trait_snpassoc_pca_county[0:5]
# Allele frequencies over all remaining samples. The slice 2:-14 skips the
# first two columns and the last 14.
# NOTE(review): presumably those are countyid + trait and the 14 PCA axes --
# confirm the column layout before changing the number of axes.
snpassoc_af = trait_snpassoc_pca_county.ix[:,2:-14].apply(get_allele_freqs)

In [0]:
pop_allele_freqs = {}
for pop,data in trait_snpassoc_pca_county.groupby("countyid"):
    print "getting allele freqs for pop % d" % pop
    pop_allele_freqs[pop] = data.ix[:,2:-14].apply(get_allele_freqs)

In [0]:
def _write_pop_freq_table(out, df, pop_tables):
    """Write a SNP/CLST/A1/A2/FRQ table to `out` and return `out`.

    `pop_tables` is an iterable of `(population, table)` pairs. Each table
    (statistics as rows, SNPs as columns) is transposed and inner-joined with
    `df` on the SNP index; the population id, `A1`, `A2` and the table's `p`
    row are written, sorted by SNP, without a per-population header.

    NOTE(review): FRQ is the population table's own `p`. Presumably it should
    be the frequency of allele A1 -- confirm that `p` refers to the same
    allele in every population.
    """
    with open(out, "w") as o:
        o.write("SNP\tCLST\tA1\tA2\tFRQ\n")
        for pop, data in pop_tables:
            m = data.T.merge(df, how="inner", left_index=True, right_index=True)
            m['population'] = pop
            m.index.name = 'SNP'
            m = m.sort_index()
            o.write(m[['population','A1','A2','p']].to_csv(header=False,
                                                             index=True,
                                                             sep="\t"))
    return out


def write_gwas_data_file(df, pheno, outdir):
    """Write columns A1, A2, EFF and FRQ of `df` (sorted by index) to
    `<outdir>/<pheno>_gwas_data_file.txt` as a tab-separated table with header.

    Returns the path written.
    """
    out = os.path.join(outdir, "%s_gwas_data_file.txt" % pheno)
    df = df.sort_index()
    df[['A1', 'A2', 'EFF', 'FRQ']].to_csv(out,
                                          header=True,
                                          index=True,
                                          sep="\t")
    print(out)
    return out


def write_freqs_file(df, pheno, pop_freqs, outdir):
    """Write the per-population frequencies of the SNPs in `df` for every
    population in `pop_freqs` to `<outdir>/<pheno>_freqs_file.txt`.

    Returns the path written.
    """
    out = os.path.join(outdir, "%s_freqs_file.txt" % pheno)
    print(out)
    return _write_pop_freq_table(out, df, pop_freqs.items())


def write_match_pop_file(df, pheno, pop_freqs, pop, outdir):
    """Write the frequencies of the SNPs in `df` for the single population
    `pop` to `<outdir>/<pheno>_match_pop_file.txt`.

    If `pop` is not a key of `pop_freqs` the file contains only the header.
    Returns the path written.
    """
    out = os.path.join(outdir, "%s_match_pop_file.txt" % pheno)
    print(out)
    selected = [(key, data) for key, data in pop_freqs.items() if key == pop][:1]
    return _write_pop_freq_table(out, df, selected)


def write_full_dataset_file(df, pheno, pop_freqs, outdir):
    """Write the per-population frequencies of the SNPs in `df` for every
    population in `pop_freqs` to `<outdir>/<pheno>_full_dataset_file.txt`.

    Returns the path written.
    """
    out = os.path.join(outdir, "%s_full_dataset_file.txt" % pheno)
    print(out)
    return _write_pop_freq_table(out, df, pop_freqs.items())


def write_env_var_data_file(pheno, pop_freqs, outdir, seed=None):
    """Write `<outdir>/<pheno>_env_var_data_file.txt` with one CLST/ENV/REG row
    per population in `pop_freqs`.

    ENV is a standard-normal random draw (not a measured variable) and REG is
    a running number starting at 1. Pass `seed` to make the draws
    reproducible; with the default `None` the global NumPy generator is used.

    Returns the path written.
    """
    out = os.path.join(outdir, "%s_env_var_data_file.txt" % pheno)
    print(out)
    rng = np.random if seed is None else np.random.RandomState(seed)
    with open(out, "w") as o:
        o.write("CLST\tENV\tREG\n")
        for pop_id, pop in enumerate(pop_freqs, 1):
            o.write("%s\t%f\t%d\n" % (pop, rng.randn(), pop_id))
    return out

In [0]:
pwd

In [0]:
# Directory that receives all SQuaT input files (created if missing).
squat_outdir = "squat_cfried" #change for your username
if not os.path.exists(squat_outdir):
    os.mkdir(squat_outdir)

# For every trait, reshape its alpha table to one row per SNP and write the
# five SQuaT input files.
for p in alpha_vals:
    full = alpha_vals[p].T
    # normalise SNP names: "." -> "-" and strip a leading "X"
    full.index = [x.replace(".", "-") for x in full.index]
    full.index = [x[1:] if x.startswith("X") else x for x in full.index]
    full.index.name = "SNP"
    # keep only the first character of each homozygote label
    full.AA = full.AA.apply(lambda x: x[0])
    full.aa = full.aa.apply(lambda x: x[0])
    full = full.rename(columns={'alpha':'EFF',
                                'AA':'A1',
                                'aa':'A2',
                                'p': 'FRQ'})
    # candidate SNPs: association p-value below 0.001
    candidates = full[full['p-value']<0.001]
    write_gwas_data_file(candidates, p, squat_outdir)
    write_freqs_file(candidates, p, pop_allele_freqs, squat_outdir)
    # population 2 is hard-coded as the matching population
    write_match_pop_file(full, p, pop_allele_freqs, 2, squat_outdir)
    write_full_dataset_file(full, p, pop_allele_freqs, squat_outdir)
    write_env_var_data_file(p, pop_allele_freqs, squat_outdir)

In [0]:
pwd

In [0]:
squat_scripts_dir = "/gdc_home4/cfried/src/PolygenicAdaptationCode/Scripts"
!rm {squat_outdir}/Scripts && ln -s {squat_scripts_dir} {squat_outdir}/Scripts
def get_squat_vars(pheno):
    """Render the `name=value` argument list for the R call to PolygenicAdaptationFunction.

    File-name arguments are derived from `pheno`; the remaining settings are
    fixed. The pairs are joined with ",\\n" in the dict's iteration order.
    """
    settings = {"gwas.data.file":"'%s_gwas_data_file.txt'" % pheno,
                "freqs.file":"'%s_freqs_file.txt'" % pheno,
                "env.var.data.files":"list('%s_env_var_data_file.txt')" % pheno,
                "match.pop.file":"'%s_match_pop_file.txt'" % pheno,
                "full.dataset.file":"'%s_full_dataset_file.txt'" % pheno,
                "path":"'%s'" % pheno,
                "match.categories":"c('MAF')",
                "match.bins":"list(seq(0,0.5,0.02), c(2), seq(0,1000,100))",
                "cov.SNPs.per.cycle":5000,
                "cov.cycles":1,
                "null.phenos.per.cycle":1000,
                "null.cycles":1,
                "load.cov.mat":"F",
                "sim.null":"T",
                "check.allele.orientation":"F"}
    assignments = ["%s=%s" % pair for pair in settings.items()]
    return ',\n'.join(assignments)

def create_squat_run_file(pheno):
    """Write `squat_<pheno>.r` into `squat_outdir` and return its path.

    The script removes any directory named `pheno`, sources
    CreateTraitFile.R and functions.R from `squat_scripts_dir`, and calls
    PolygenicAdaptationFunction with the arguments from `get_squat_vars`.
    """
    squat_file = os.path.join(squat_outdir, "squat_%s.r" % pheno)
    script_lines = ['system("rm -rf %s")\n'% pheno]
    for r_source in ("CreateTraitFile.R", "functions.R"):
        script_lines.append("source('%s')\n" % os.path.join(squat_scripts_dir, r_source))
    script_lines.append("PolygenicAdaptationFunction(%s)\n" % get_squat_vars(pheno))
    with open(squat_file, "w") as o:
        o.writelines(script_lines)
    return squat_file

for pheno in alpha_vals:
    squat_file = create_squat_run_file(pheno)
    print squat_file
    !cat $squat_file
    print ""

In [0]:
def run_squat(p):
    """Run the generated `squat_<p>.r` script inside `squat_outdir` through R.

    Any previous output at `<squat_outdir>/<p>` is deleted first. R's working
    directory is switched to `squat_outdir` for the run and is always put back
    afterwards, even if the script raises, so a failed run does not leave
    later R calls in the wrong directory.
    """
    print("running %s" % p)
    output = "%s/%s" % (squat_outdir, p)
    if os.path.isdir(output) and not os.path.islink(output):
        shutil.rmtree(output)
    elif os.path.lexists(output):
        os.remove(output)
    # remember where R currently is so it can be restored exactly
    original_wd = r("getwd()")[0]
    cmds = ["setwd('%s')" % squat_outdir,
            'source("squat_%s.r")' % (p)]
    try:
        for cmd in cmds:
            print(cmd)
            r(cmd)
    finally:
        restore_cmd = "setwd('%s')" % original_wd
        print(restore_cmd)
        r(restore_cmd)
    
run_squat(trait_name)

In [0]:
rfiles = !find {squat_outdir} | grep Robj | grep Output | grep {trait_name}
bc = {}
for f in rfiles:
    d = f.split("/")
    if not d[1] in bc:
        bc[d[1]] = []
    bc[d[1]].append(f)
bc

In [0]:
for pheno in bc:
    print pheno
    for obj in bc[pheno]:
        r('load("%s")' % obj)
    print r("the.stats")
    print("------------------")
    print r("p.vals")

# Bayenv

## Set up Bayenv input files

In [0]:
bayenv_df['county_state'] = bayenv_df.apply(lambda row: "%s_%s" % (row.county, row.state), axis=1)
bayenv_df = bayenv_df.drop(drop.index)
bayenv_df['countyid'] = bayenv_df.apply(lambda row: county_id[row.county_state], axis=1)

In [0]:
bayenv_df[:5]

In [0]:
bayenv_dir = "bayenv"
snp_names = [x for x in bayenv_df.columns if "-" in x]
popids = sorted(trait_snpassoc.countyid.unique())

if not os.path.exists(bayenv_dir):
    os.mkdir(bayenv_dir)

In [0]:
def get_bayenv_snp(snp_name, popids):
    """Collect the per-population allele counts of one SNP.

    For every population id in `popids` (in that order) the "P" and "Q" rows
    of `pop_allele_freqs[popid]` are read for column `snp_name`.

    Returns `(P, Q)`, two lists aligned with `popids`.
    """
    # Look the SNP up by the `snp_name` argument rather than by a global
    # `name` variable that merely happens to hold the same value in the
    # calling loops.
    P = [pop_allele_freqs[popid].loc["P", snp_name] for popid in popids]
    Q = [pop_allele_freqs[popid].loc["Q", snp_name] for popid in popids]
    return P, Q

def write_bayenv_snp(fh_snp, fh_names, name, P, Q):
    """Append one SNP to a bayenv count file as two tab-separated lines.

    Nothing is written unless `sum(Q) > 0` (monomorphic loci are skipped).
    Otherwise `name` is written to `fh_names` when that handle is given, then
    the Q counts and the P counts are written to `fh_snp`, one line each,
    every line ending in a tab and a newline.
    """
    if not sum(Q) > 0: #exclude monomorphic loci
        return
    if fh_names:
        fh_names.write("%s\n" % name)
    p_fields = [str(count) for count in P]
    q_fields = [str(count) for count in Q]
    for fields in (q_fields, p_fields):
        fh_snp.write("%s\t\n" % "\t".join(fields))



In [0]:
with open("bayenv.txt", "w") as o:
    with open("bayenv_names.txt", "w") as n:
        for name in snp_names:
            P,Q = get_bayenv_snp(name, popids)
            write_bayenv_snp(o, n, name, P, Q)

In [0]:
!cp bayenv.txt {bayenv_dir}

In [0]:
!head bayenv.txt

In [0]:
!head bayenv_names.txt

In [0]:
len(popids)

##Run Bayenv to create variance-covariance matrix

```bash
    cd bayenv && /gdc_home4/cfried/src/bayenv2/bayenv2 -i bayenv.txt -p 30 -k 100000 -r 63479 > matrix.out
```

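* -p number of populations; this must equal `len(popids)`. Note that the command above passes `30` while `setup_bayenv_cmd` below passes 12, so adjust the value to match your data.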
* -k mcmc generations
* -r random seed

##Run Bayenv mcmc

In [0]:
data_ai['county_state'] = data_ai.apply(lambda row: "%s_%s" % (row.County, row.State), axis=1)

In [0]:
bayenv_df_ai = bayenv_df.merge(data_ai, on='county_state')

In [0]:
bayenv_df_ai[0:10]

In [0]:
bayenv_df_ai.shape

In [0]:
def get_bayenv_env(data):
    """Return the first row's value of every column of `data` except the last, keyed by column name."""
    first_values = pd.Series()
    for column in data.columns[:-1]:
        column_values = data[column].values
        first_values[column] = column_values[0]
    return first_values

# Column names containing 'AI_', followed by 'countyid'. 'countyid' must stay
# last: get_bayenv_env() drops the final column of each group.
ai_cols = [x for x in bayenv_df_ai if 'AI_' in x]
ai_cols.append('countyid')
bayenv_df_ai_groups = bayenv_df_ai.ix[:,ai_cols].groupby("countyid")
# One Series of environmental values per population, in `popids` order.
env_ai = []
for popid in popids:
    env_ai.append(get_bayenv_env(bayenv_df_ai_groups.get_group(popid))) 
# Transpose so that rows are environmental variables and columns populations.
env_ai_df = pd.DataFrame(env_ai).T
env_ai_df

In [0]:
env_ai_df = env_ai_df.apply(preprocessing.scale, axis=1)

In [0]:
env_ai_df

In [0]:
env_ai_df.apply(np.mean, axis=1)

In [0]:
with open("%s/envmatrix.txt" % bayenv_dir, "w") as o:
    for row in env_ai_df.iterrows():
        vals = "\t".join([str(x) for x in row[1].values])
        o.write("%s\t\n" % vals)

In [0]:
!tail -n 13 bayenv/matrix.out > bayenv/matrix_last.out

In [0]:
def setup_bayenv_cmd(snpfile, name):
    """Prepare one SNP's bayenv2 run directory and return its shell command.

    The covariance matrix and the environment matrix are copied from
    `work_dir` into `work_dir/<name>`, and a command is built that changes
    into that directory and runs bayenv2 on `snpfile`.

    Parameters
    ----------
    snpfile : str
        File name of the SNP count file inside `work_dir/<name>`; it is also
        used as the output prefix (-o).
    name : str
        Name of the per-SNP sub-directory, which must already exist.

    Returns
    -------
    str
        The shell command line.
    """
    work_dir = "/gdc_home4/cfried/ipython/bayenv"
    bayenv = "/gdc_home4/cfried/src/bayenv2/bayenv2"
    bayenv_matrix = "matrix_last.out"
    # NOTE(review): the seed is defined but is not part of the command below
    # (no -r option is passed) -- confirm whether that is intended.
    bayenv_seed = -47372
    bayenv_pops = 12
    bayenv_runs = 100000
    bayenv_environs = 4
    bayenv_envmatrix = "envmatrix.txt"
    run_dir = os.path.join(work_dir, name)
    bayenv_cmd = "cd %s/%s && %s -i %s -m %s -e %s -p %d -k %d -n %d -t -c -f -o %s" % (
        work_dir,
        name,
        bayenv,
        snpfile,
        bayenv_matrix,
        bayenv_envmatrix,
        bayenv_pops,
        bayenv_runs,
        bayenv_environs,
        snpfile)
    # Both matrices are produced inside the bayenv directory, so copy them
    # from there rather than from whatever the current directory is.
    for shared_file in (bayenv_matrix, bayenv_envmatrix):
        shutil.copy(os.path.join(work_dir, shared_file), run_dir)
    return bayenv_cmd

In [0]:
# Build one run directory and one bayenv2 command per polymorphic SNP.
cmds = []
if not os.path.exists(bayenv_dir):
    os.mkdir(bayenv_dir)

for name in snp_names:
    P,Q = get_bayenv_snp(name,popids)
    if sum(Q) > 0:
        file_dir = os.path.join(bayenv_dir, name)

        # always start from an empty directory for this SNP
        if os.path.exists(file_dir):
            shutil.rmtree(file_dir)
        os.mkdir(file_dir)

        snp_path = os.path.join(file_dir, "%s.txt" % name)
        # `with` closes the file even if writing fails
        with open(snp_path, "w") as o:
            write_bayenv_snp(o, None, name, P, Q)
        cmd = setup_bayenv_cmd(os.path.basename(snp_path), name)
        cmds.append(cmd)

In [0]:
print cmds[0]

In [0]:
rc = Client(profile="gdcsrv2")

In [0]:
dview = rc[:]
lview = rc.load_balanced_view()
len(lview)

In [0]:
len(dview)

In [0]:
def get_hostname():
    """Return the host name of the machine this function runs on (the import is local to the function)."""
    from socket import gethostname
    return gethostname()
dview['get_hostname'] = get_hostname

In [0]:
dview.scatter("cpu", range(len(rc)), flatten=True)
def run_cmd(cmd):
    """Run a shell command pinned to one CPU and return `(cmd, stdout, stderr, elapsed)`.

    All imports are local to the function. `cpu` is not defined here: it is
    read from the namespace the function executes in (the notebook scatters a
    variable named "cpu" to the engines before defining this function).
    `elapsed` is the string form of the stopwatch timer.
    """
    import stopwatch
    from subprocess import Popen, PIPE
    import psutil
    import multiprocessing
    t = stopwatch.Timer()
    p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE)
    # restrict the child process to the CPU assigned to this engine
    proc = psutil.Process(p.pid)
    proc.set_cpu_affinity([cpu])
    print "affinity is %s" % proc.get_cpu_affinity() 
    # wait for the command to finish and collect its output
    stdout, stderr = p.communicate()
    t.stop()
    return cmd, stdout, stderr, str(t)


In [0]:
dview['run_cmd'] = run_cmd

In [0]:
%%px
import psutil
import os
import multiprocessing
p = psutil.Process(os.getpid())
p.set_cpu_affinity([cpu])
#p.set_cpu_affinity(range(multiprocessing.cpu_count()))

In [0]:
#bayenv_jobs = lview.map_async(run_cmd, cmds)


In [0]:
#bayenv_jobs.progress

In [0]:
bf_files = !find bayenv | grep bf

In [0]:
bf_files

In [0]:
# Read the last line of every .bf file, dropping its first tab-separated
# field. Only lines with exactly 12 remaining values are kept.
# NOTE(review): 12 presumably = 4 environmental variables x 3 statistics per
# variable -- confirm against the bayenv2 output format.
bf_data = {}
for b in bf_files:
    with open(b) as bf_handle:
        bf_lines = bf_handle.readlines()
    if not bf_lines:
        continue  # empty result file: nothing to parse
    d = bf_lines[-1].strip().split("\t")[1:]
    if len(d) == 12:
        bf_data[os.path.basename(b).replace(".txt.bf","")] = d

In [0]:
bf = pd.DataFrame(bf_data).T.astype(float)
bf.shape

In [0]:
bf

In [0]:
freq_files = !find bayenv | grep freqs
# Read the first line of every .freqs file and keep it when it holds exactly
# 12 whitespace-separated values.
# NOTE(review): 12 presumably = number of populations -- confirm.
freq_data = {}
for f in freq_files:
    with open(f) as freq_handle:
        d = freq_handle.readline().strip().split()
    if len(d) ==  12:
        freq_data[os.path.basename(f).replace(".txt.freqs","")] = d

In [0]:
freq_df = pd.DataFrame(freq_data).T
freq_df.shape

In [0]:
freq_df.to_csv("bayenv_freqs.txt", header=True, index=True, sep="\t")
bf.to_csv("bayenv_bf.txt", header=True, index=True, sep="\t")

In [0]:
FileLink("bayenv_bf.txt")

In [0]:
FileLink("bayenv_freqs.txt")

In [0]:
plt.scatter(bf.ix[:,1], bf.ix[:,2])
plt.xlabel("Spearman")
plt.ylabel("Pearson")
plt.show()

plt.scatter(bf.ix[:,1], bf.ix[:,0])
plt.xlabel("Spearman")
plt.ylabel("Bayes factor")
plt.show()

##Outliers

In [0]:
def get_outliers(df, key, num_std):
    """Collect, per environmental variable, the values of `df` at least `num_std` SDs from the mean.

    `key` selects the first column to scan: "bf" means column 0, "rho" means
    column 1, and any other value is used as the starting position as given.
    From there every third column is examined. The n-th examined column is
    stored under `ai_cols[n]`.

    Returns a dict mapping that name to the Series of outlying values.
    """
    start = 0 if key == "bf" else (1 if key == "rho" else key)
    outliers = {}
    for ai, i in enumerate(xrange(start, len(df.columns), 3)):
        d = df.ix[:,i]
        spread = num_std*np.std(d)
        center = np.mean(d)
        upper, lower = center + spread, center - spread
        outliers[ai_cols[ai]] = d[(d >= upper) | (d <= lower)]
    return outliers

In [0]:
def plot_outliers(df, key, num_std):
    """Show one 100-bin histogram for every third column of `df`.

    `key` selects the first column: "bf" means column 0 (drawn with a log
    y-axis), "rho" means column 1, and any other value is used as given. The
    x-axis runs from the column minimum to mean + `num_std` SDs, and the
    title names the matching entry of `ai_cols` with mean, SD, min and max.
    """
    start = 0 if key == "bf" else (1 if key == "rho" else key)
    for ai, i in enumerate(xrange(start, len(df.columns), 3)):
        d = df.ix[:,i]
        d_std = np.std(d)
        d_mean = np.mean(d)
        env = ai_cols[ai]
        ax = plt.gca()
        if start == 0:
            ax.set_yscale('log')
        plt.hist(d, bins=100)
        plt.xlim(np.min(d), d_mean+(num_std*d_std))
        title_values = (env, d_mean, d_std, np.min(d), np.max(d))
        plt.title("%s $\mu = %.4f \pm %.4f [%.4f, %.4f])$" % title_values)
        plt.show()

In [0]:
plot_outliers(bf, "bf", 20)

In [0]:
plot_outliers(bf, "rho", 6)

In [0]:
bf_outliers = get_outliers(bf, "bf", 6)    
rho_outliers = get_outliers(bf, "rho", 3)

In [0]:
bf_outliers.keys()

In [0]:
rho_outliers.keys()

In [0]:
%%R
library(VennDiagram)

In [0]:
def draw_venn(outliers, title):
    """Draw a four-set Venn diagram of the outlier indexes and return the PNG file name.

    The first four keys of `outliers` (sorted) define the sets; each set is
    the `.index` of the corresponding value. Set sizes and all pairwise,
    three-way and four-way intersection sizes are passed to R's
    `draw.quad.venn`. The image is written to `venn_<title>.png`, with spaces
    in `title` replaced by underscores.
    """
    keys = sorted(list(outliers.keys()))
    a1, a2, a3, a4 = [set(outliers[keys[pos]].index) for pos in range(4)]

    def shared(*groups):
        # number of elements common to all the given sets
        return len(set.intersection(*groups))

    venn = "venn_%s.png" % title.replace(" ", "_")
    r("library(VennDiagram)")
    r("png('%s')" % venn)
    r('draw.quad.venn')(shared(a1),
                        shared(a2),
                        shared(a3),
                        shared(a4),
                        shared(a1, a2),
                        shared(a1, a3),
                        shared(a1, a4),
                        shared(a2, a3),
                        shared(a2, a4),
                        shared(a3, a4),
                        shared(a1, a2, a3),
                        shared(a1, a2, a4),
                        shared(a1, a3, a4),
                        shared(a2, a3, a4),
                        shared(a1, a2, a3, a4),
                        category=keys)
    r('dev.off()')
    return venn

In [0]:
Image(draw_venn(bf_outliers, "Bayes factor outliers"))

In [0]:
Image(draw_venn(rho_outliers, "Rho outliers"))

In [0]:
combined_outliers = {}
for key in bf_outliers:
    a = bf_outliers[key].index
    b = rho_outliers[key].index
    combined_outliers[key] = pd.Series(index=a.intersection(b))

In [0]:
Image(draw_venn(combined_outliers, "combined"))

In [0]:
# For every environmental variable keep only its largest outlier value,
# stored as {label of that outlier: value}.
boxplot_data = {}
for key, val in bf_outliers.items():
    val = val.sort(inplace=False, ascending=False)
    # after the descending sort the first entry is the largest value
    boxplot_data[key] = {val.index[0]: val[0]}

In [0]:
boxplot_data

In [0]:
# Convert the SNP columns back to "A/a"-style genotype strings, then, for each
# (environmental variable, SNP) pair in `boxplot_data`, plot the variable's
# values grouped by genotype as a box plot and as a violin plot.
bayenv_df_ai_basegt = bayenv_df_ai.apply(convert_to_snpassoc)
for env in boxplot_data:
    for snp in boxplot_data[env]:
        vals = {}
        for gt, group in bayenv_df_ai_basegt.groupby(snp):
            # skip the missing-genotype group; "/" is removed from the label
            if not gt == 'NA':
                vals[gt.replace("/", "")] = group[env]
        vals = pd.DataFrame(vals, dtype=float)
        vals.index.name = env

        sns.boxplot([vals[x].dropna() for x in vals], 
                    names=vals.columns)
        # title: SNP / variable (stored outlier value)
        plt.title("%s/%s (%.4f)" % (snp, vals.index.name, boxplot_data[env][snp]))
        plt.show()

        sns.violinplot([vals[x].dropna() for x in vals], 
                    names=vals.columns)
        plt.title("%s/%s (%.4f)" % (snp, vals.index.name, boxplot_data[env][snp]))
        plt.show()