In [0]:
import scandir
import os
import rpy2
from rpy2.robjects import pandas2ri
pandas2ri.activate()
import rpy2.robjects as ro
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import numpy as np
import dill
import random
import cyvcf
from hdfstorehelper import HDFStoreHelper
import statsmodels.api as sm
import statsmodels.formula.api as smf
import operator
import traceback
%load_ext rpy2.ipython
from rpy2.robjects import pandas2ri as p2r
p2r.activate()
r = ro.r

In [0]:
# Absolute, machine-specific locations of the two analysis runs: one directory
# named "notimputed" and one named "beagle40" (presumably the Beagle-imputed
# genotypes - confirm). Both paths end with a slash; later path arithmetic
# (os.path.dirname calls) is applied to these strings as written.
analysis_dir_notimp = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/notimputed/"
analysis_dir_imp = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle40/"
# One store per analysis directory, backed by the file "isect.hd5" inside it.
# HDFStoreHelper is a project-local class; it is used below like a dict of tables.
hdf_notimp = HDFStoreHelper(os.path.join(analysis_dir_notimp, "isect.hd5"))
hdf_imp = HDFStoreHelper(os.path.join(analysis_dir_imp, "isect.hd5"))

In [0]:
# Display the first rows of the 'z12_swapped' table from the non-imputed store.
hdf_notimp['z12_swapped'].head()

In [0]:
# Index 0 = not imputed, index 1 = imputed. Later cells index this list with
# the same position they use for `hdfs`, so the two lists must stay in step.
analysis_dir = [analysis_dir_notimp, analysis_dir_imp]

In [0]:
# The two stores in the same order as `analysis_dir` (not imputed, imputed).
hdfs = [hdf_notimp, hdf_imp]

In [0]:
# Display the group names reported by the non-imputed store (inspection only).
hdfs[0].get_group_names()

In [0]:
# Coordinates keyed by population label. Each value is [lat, lon]; that order
# is fixed by the column names assigned to gps_df below.
gps = {'QC32':[47.2509807, -79.4060515],
      'QC93': [46.9089631, -70.8061075],
      'NC': [36.449125, -76.024672],
      'NY': [42.897768, -74.094761],
      'VA1': [38.657615, -77.463603],
      'VA2': [38.857470, -77.695003]}
# Transpose so that populations become rows and the two coordinates columns.
gps_df = pd.DataFrame(gps).T
gps_df.columns = ['lat','lon']

# Convert to an R object with the columns reordered to (lon, lat); this object
# is what gets passed to R's extract() in the bioclim cell.
latlon = pandas2ri.py2ri(gps_df[['lon', 'lat']])

In [0]:
%%R
# Attach the raster package in the embedded R session; the Python cells below
# look up `raster` and `extract` from that session.
library(raster)

In [0]:
# Python-callable handles to the R objects named `raster` and `extract`,
# resolved in the embedded R session (requires library(raster) to be attached).
raster = r("raster")
extract = r("extract")

In [0]:
bioclim_dir = "/home/cfriedline/eckertlab/bioclim"
# List the .bil rasters with the standard library instead of capturing the
# output of `ls`: when nothing matches, the shell capture hands back the error
# text of `ls` as if it were a file name and the failure only shows up later
# inside R. Dot-files are skipped to mirror what the `*.bil` glob matched.
bioclim = sorted(os.path.join(bioclim_dir, f)
                 for f in os.listdir(bioclim_dir)
                 if f.endswith(".bil") and not f.startswith("."))
if not bioclim:
    raise IOError("no .bil rasters found in %s" % bioclim_dir)
# Start from the coordinates table and add one column per raster.
bioclim_df = pd.DataFrame(gps_df)
for b in bioclim:
    rast = raster(b)
    # Column label: file name without ".bil" and underscores, upper-cased.
    bio = os.path.basename(b).replace(".bil", "").replace("_", "").upper()
    # Values extracted from the raster at the (lon, lat) points in `latlon`.
    vals = pd.DataFrame(pandas2ri.ri2py(extract(rast, latlon)))
    # extract() returns values in the row order of `latlon`, which was built
    # from gps_df, so the population labels can be re-attached positionally.
    vals.index = bioclim_df.index
    vals.columns = [bio]
    bioclim_df = bioclim_df.join(vals)
bioclim_df = bioclim_df.sort_index()

In [0]:
# Save the bioclim table under the key 'bioclim' in both stores. This runs
# before the 'region' column is added to bioclim_df in the next cell.
for hdf in hdfs:
    hdf['bioclim'] = bioclim_df

In [0]:
# Give every population its own region id, counting from 1 in index order.
# Uses label-based .loc (the .ix indexer has been removed from pandas); the
# first assignment creates the 'region' column.
region_id = 1
for popname in bioclim_df.index:
    bioclim_df.loc[popname, 'region'] = region_id
    region_id+=1

In [0]:
# Write one tab-separated file per bioclim column (every column whose name
# contains 'BIO') with the layout CLST / ENV / REG, one row per population,
# into the parent of the first analysis directory. The paths are collected in
# bioclim_env_files for the generated R scripts.
bioclim_env_files = []
for bio in bioclim_df[[x for x in bioclim_df if 'BIO' in x]]:
    # Work on an explicit copy so that renaming the index and the columns
    # below cannot leak back into bioclim_df through a shared index object.
    bio_temp = bioclim_df[[bio, 'region']].copy()
    bio_temp.index.name = "CLST"
    bio_temp.columns = ["ENV", "REG"]
    bio_out = os.path.join(analysis_dir[0], "../%s.txt" % bio)
    bioclim_env_files.append(bio_out)
    bio_temp.to_csv(bio_out, sep="\t", header=True, index=True)

In [0]:
# The 'z12_swapped' table from each store, in the same order as `hdfs`.
z12_swapped = [x['z12_swapped'] for x in hdfs]

In [0]:
def get_correction(n):
    """Return the finite-sample-size correction factor 2n / (2n - 1).

    The arithmetic is forced to floating point: with integer ``n`` and
    Python 2 division the quotient would be floored to 1, silently turning
    the correction into a no-op.
    """
    return (2.0 * n) / (2.0 * n - 1.0)

def get_allele_freqs(locus, debug):
    """Summarise allele counts, frequencies and heterozygosity for one locus.

    ``locus`` is a pandas Series of genotype codes. Entries equal to ``-1``
    are excluded; ``0`` and ``2`` are counted as the two homozygous classes
    (two alleles each) and ``1`` as the heterozygous class (one allele each).

    Returns a pandas Series with the keys
      P, Q : allele counts for the two alleles
      p, q : P and Q divided by the total number of alleles
      He   : 2*p*q multiplied by get_correction(number of genotyped individuals)
      Ho   : fraction of genotyped individuals that are heterozygous
      Fis  : 1 - Ho/He (NaN when He is 0)
    If no genotype is left after dropping ``-1``, P and Q are 0 and every
    other value is NaN. When ``debug`` is true the result is also printed.
    """
    c = locus[locus != -1].value_counts()
    num_individuals = sum(c)
    if num_individuals == 0:
        # Nothing was genotyped at this locus: report "unknown" instead of
        # failing on a division by zero.
        ret = pd.Series({"p": np.nan,
                         "q": np.nan,
                         "P": 0,
                         "Q": 0,
                         "He": np.nan,
                         "Ho": np.nan,
                         "Fis": np.nan})
    else:
        total_alleles = 2.0*num_individuals
        hom_first = c[0] if 0 in c else 0
        hom_second = c[2] if 2 in c else 0
        PQ = c[1] if 1 in c else 0
        # Homozygotes contribute two copies, heterozygotes one copy to each.
        P = 2*hom_first + PQ
        Q = 2*hom_second + PQ
        p = P/total_alleles
        q = Q/total_alleles
        # The frequencies are floats, so compare with a tolerance rather than
        # with exact equality.
        assert np.isclose(p + q, 1.0)
        He = 2 * p * q * get_correction(num_individuals)
        Ho = PQ*1.0/num_individuals
        # A monomorphic locus has He == 0, which leaves Fis undefined.
        Fis = 1 - (Ho/He) if He > 0 else np.nan
        ret = pd.Series({"p":p,
                         "q":q,
                         "P":P,
                         "Q":Q,
                         "He":He,
                         "Ho":Ho,
                         "Fis":Fis})
    if debug:
        print(ret)
    return ret

In [0]:
# Per-locus statistics over all individuals of each dataset. The last two
# columns are excluded positionally with .iloc (the .ix indexer has been
# removed from pandas); they are presumably sample metadata such as the
# 'population' column used for grouping below - confirm.
allele_freqs = [x.iloc[:, :-2].apply(get_allele_freqs, args=(False,)) for x in z12_swapped]

In [0]:
# Save each dataset's allele-frequency table in the matching store under the
# key 'allele_freqs_swapped' (read back later when the GWAS files are written).
for i, af in enumerate(allele_freqs):
    hdfs[i]['allele_freqs_swapped'] = af
    

In [0]:
# pop_allele_freqs[i][population] -> per-locus statistics computed from the
# individuals of that population only, for dataset i.
pop_allele_freqs = []
for z12 in z12_swapped:
    paf = {}
    pop_allele_freqs.append(paf)
    for pop, data in z12.groupby('population'):
        # Drop the last two columns positionally, exactly as for the pooled
        # frequencies; .iloc replaces the removed .ix indexer.
        data = data.iloc[:, :-2]
        paf[pop] = data.apply(get_allele_freqs, args=(False,))

In [0]:
# Serialise each dataset's per-population frequency dict next to its analysis
# output. The file is opened in binary mode (pickle streams are bytes) inside
# a `with` block so the handle is flushed and closed deterministically.
for i, paf in enumerate(pop_allele_freqs):
    outfile = os.path.join(analysis_dir[i], "pop_allele_freqs.dill")
    with open(outfile, "wb") as dill_out:
        dill.dump(paf, dill_out)

In [0]:
# The 'pimass_gt' table from each store with the literal string "NA" replaced
# by NaN.
pimass_gt = [x['pimass_gt'].replace("NA", np.nan) for x in hdfs]

In [0]:
# Lookup table read from the working directory; its first column is the index
# that incoming names are matched against.
translation_df = pd.read_csv("translation_table.csv", sep="\t", index_col=0)
def get_translated_name(n):
    """Translate a column name through ``translation_df``.

    The name is stripped of surrounding whitespace. If it is present in the
    index of ``translation_df`` the result is "<pop>_<indiv>_<dup>" built from
    that row's ``pop``, ``indiv`` and ``dup`` fields (the latter two formatted
    as integers); otherwise the stripped name is returned unchanged.
    """
    n = n.strip()
    if n in translation_df.index:
        # Label lookup with .loc (the .ix indexer has been removed from pandas).
        row = translation_df.loc[n]
        # 'pop' needs bracket access: attribute access would return the
        # Series.pop method instead of the field.
        return "%s_%d_%d" % (row['pop'], row.indiv, row.dup)
    return n

In [0]:
# Rename the columns of every genotype table through get_translated_name.
# This mutates the frames held in pimass_gt in place.
for p in pimass_gt:
    p.columns = [get_translated_name(x) for x in p.columns]

In [0]:
# Display the first rows of the 'pimass_pheno' table from the non-imputed store.
hdfs[0]['pimass_pheno'].head()

In [0]:
# Three phenotype columns from each store's 'pimass_pheno' table. The spelling
# 'pupual_duration_resid' is the key used throughout this notebook; it must
# match the stored column name, so do not "correct" it here alone.
phenos = [x['pimass_pheno'][['mass_resid', 'pupual_duration_resid', 'total_dev_time_resid']] for x in hdfs]

In [0]:
# Per-SNP effect sizes: alphas[i][phenotype][snp] is the coefficient of an
# ordinary-least-squares fit of the phenotype on that single SNP column for
# dataset i (rows with missing values are dropped by statsmodels).
# NOTE(review): the design matrix is the bare SNP column, so no intercept is
# fitted - confirm that this is intended.
alphas = []
for i, pheno_df in enumerate(phenos):
    alphas.append({})
    # Transposing puts the genotype-table columns on the rows; the first two
    # rows are skipped positionally (presumably the 'minor'/'major' allele
    # columns - confirm). .iloc replaces the removed .ix indexer.
    data = pheno_df.join(pimass_gt[i].T.iloc[2:])
    data.columns = [x.replace(" ", "_") for x in data.columns]
    data = data.astype(float)
    # The first three columns are the phenotypes, everything after is a SNP.
    for pheno in data.columns[0:3]:
        alphas[i][pheno] = {}
        # Progress output, written so it prints the same text on Python 2 and 3.
        print("%s %s" % (i, pheno))
        for snp in data.columns[3:]:
            model = sm.OLS(data[pheno], data[snp], missing='drop')
            fit = model.fit()
            alphas[i][pheno][snp] = fit.params.values[0]

In [0]:
# One DataFrame per dataset built from the nested alphas dict.
alpha_df = [pd.DataFrame(x) for x in alphas]

In [0]:
# Phenotype column names, taken from the first dataset and reused for both.
pheno_names = phenos[0].columns

In [0]:
def get_sig_pheno_name(col):
    """Return the short code used in store keys for a phenotype column name.

    Raises KeyError for any name other than the three known phenotype columns.
    """
    short_codes = dict([
        ('mass_resid', 'mass'),
        ('pupual_duration_resid', 'pd'),
        ('total_dev_time_resid', 'tdt'),
    ])
    return short_codes[col]

In [0]:
def write_freqs_file(df, paf, outdir, pheno, imputed, key, sig_key):
    """Write the per-population allele-frequency table for a set of SNPs.

    df      : SNP-indexed frame with at least 'minor' and 'major' columns;
              index labels are split on "_" into CHR (first part) and POS
              (second part).
    paf     : dict population -> frame whose transpose is SNP-indexed and has
              a 'q' column.
    outdir, pheno, imputed, key, sig_key : used to build the file name
              "freqs.file.<sig_key>.<key>.<pheno>.<imputed>.txt" in outdir.

    The output is tab-separated with the columns SNP, CLST, A1, A2, FRQ, POS,
    CHR and one block of rows per population. Returns (path written, array of
    the distinct CLST values). The caller's frame is not modified.
    """
    df = df.copy()
    outfile = os.path.join(outdir, "freqs.file.%s.%s.%s.%s.txt" % (sig_key, key, pheno, imputed))
    df.index.name = "SNP"
    dfs = []
    for pop in paf:
        df2 = df.join(paf[pop].T)
        df2['CLST'] = pop
        # Derive POS/CHR straight from the index labels. A row-wise
        # DataFrame.apply returns a DataFrame instead of a Series when the
        # frame has no rows, which breaks the column assignment for an empty
        # SNP set; a plain list works for any number of rows.
        snp_parts = [snp.split("_") for snp in df2.index]
        df2['POS'] = [parts[1] for parts in snp_parts]
        df2['CHR'] = [parts[0] for parts in snp_parts]
        df2 = df2[['CLST', 'minor', 'major', 'q', 'POS', 'CHR']]
        df2.columns = ['CLST', 'A1', 'A2', 'FRQ', 'POS', 'CHR']
        dfs.append(df2)
    combined = pd.concat(dfs)
    combined.to_csv(outfile, header=True, index=True, sep="\t")
    return outfile, combined['CLST'].unique()

def write_gwas_data_file(df, outdir, pheno, imputed, sig_key):
    """Write the GWAS effect table and return the path of the file.

    Takes the 'minor', 'major', 'alpha' and 'maf' columns of ``df``, writes
    them tab-separated as A1, A2, EFF and FRQ with the index labelled SNP, to
    "gwas.data.file.<sig_key>.<pheno>.<imputed>.txt" inside ``outdir``.
    """
    filename = "gwas.data.file.%s.%s.%s.txt" % (sig_key, pheno, imputed)
    target = os.path.join(outdir, filename)
    # Select first, then copy, so the relabelling below stays on a private frame.
    table = df[['minor', 'major', 'alpha', 'maf']].copy()
    table.columns = ["A1", "A2", "EFF", "FRQ"]
    table.index.name = "SNP"
    table.to_csv(target, sep="\t", index=True, header=True)
    return target
    
def write_env_var_data_files(paf, pops, outdir, imputed, pheno, sig_key):
    """Write a CLST/ENV/REG table with random ENV values; return [path].

    Populations are the sorted keys of ``paf``; each gets a 1-based position
    as its REG value. Only populations that are also in ``pops`` are written,
    but the position still advances for the skipped ones. ENV is a fresh
    draw from np.random.randn() for every written row.
    """
    target = os.path.join(outdir, "env.var.data.file.%s.%s.%s.txt" % (sig_key, pheno, imputed))
    with open(target, "w") as handle:
        handle.write("CLST\tENV\tREG\n")
        for position, label in enumerate(sorted(paf), 1):
            if label not in pops:
                continue
            handle.write("%s\t%g\t%d\n" % (label, np.random.randn(), position))
    return [target]
  
def write_match_pop_file(df, paf, outdir, imputed, matchpop, pheno, sig_key):
    """Write the allele-frequency table of one population; return the path.

    df       : SNP-indexed frame with at least 'minor' and 'major' columns;
               index labels are split on "_" into CHR (first part) and POS
               (second part).
    paf      : dict population -> frame whose transpose is SNP-indexed and
               has a 'q' column; only ``paf[matchpop]`` is used.
    The file "match.pop.file.<sig_key>.<pheno>.<matchpop>.<imputed>.txt" in
    ``outdir`` is tab-separated with the columns SNP, CLST, A1, A2, FRQ, POS,
    CHR. The caller's frame, including its index name, is left untouched.
    """
    outfile = os.path.join(outdir, "match.pop.file.%s.%s.%s.%s.txt" % (sig_key, pheno, matchpop, imputed))
    df2 = df.join(paf[matchpop].T)
    # Attach a renamed copy of the index instead of setting .name in place:
    # the join result may share its index object with the caller's frame.
    df2.index = df2.index.rename("SNP")
    df2['CLST'] = matchpop
    # Derive POS/CHR from the index labels; a row-wise DataFrame.apply returns
    # a DataFrame rather than a Series when the frame has no rows.
    snp_parts = [snp.split("_") for snp in df2.index]
    df2['POS'] = [parts[1] for parts in snp_parts]
    df2['CHR'] = [parts[0] for parts in snp_parts]
    df2 = df2[['CLST', 'minor', 'major', 'q', 'POS', 'CHR']]
    df2.columns = ['CLST', 'A1', 'A2', 'FRQ', 'POS', 'CHR']
    df2.to_csv(outfile, header=True, index=True, sep="\t")
    return outfile

def get_squat_vars(pheno,
                   gwas_data_file, 
                  gwas_freqs_file, 
                  env_var_files,
                  match_pop_file,
                  full_freqs_file):
    """Build the argument list text for the generated R function call.

    Every file path and ``pheno`` is wrapped in single quotes, the
    environmental-variable files become an R ``list(...)`` of quoted paths,
    and the remaining settings are fixed constants. The result is one
    ``name=value`` pair per line, joined with ",\\n".
    """
    def quoted(value):
        # R string literal in single quotes (no escaping is applied).
        return "'%s'" % value

    env_list = "list(%s)" % ','.join([quoted(env_file) for env_file in env_var_files])
    settings = {
        "gwas.data.file": quoted(gwas_data_file),
        "freqs.file": quoted(gwas_freqs_file),
        "env.var.data.files": env_list,
        "match.pop.file": quoted(match_pop_file),
        "full.dataset.file": quoted(full_freqs_file),
        "path": quoted(pheno),
        "match.categories": "c('FRQ')",
        "match.bins": "list(seq(0,0.5,0.02), c(2), seq(0,1000,100))",
        "cov.SNPs.per.cycle": 5000,
        "cov.cycles": 1,
        "null.phenos.per.cycle": 1000,
        "null.cycles": 1,
        "load.cov.mat": "F",
        "sim.null": "T",
        "check.allele.orientation": "T",
    }
    assignments = []
    for option, value in settings.items():
        assignments.append("%s=%s" % (option, value))
    return ',\n'.join(assignments)

def create_squat_run_file(pheno, outdir, squat_vars, sig_key):
    """Write the R driver script for one phenotype and return its path.

    The script lives in "<outdir>/squat_<sig_key>/" (created on first use,
    together with a "Scripts" symlink to the PolygenicAdaptationCode scripts
    directory). When run, the script removes "<that dir>/<pheno>", sources the
    two R files from the scripts directory, changes into the squat directory
    and calls PolygenicAdaptationFunction with ``squat_vars`` as its arguments.
    """
    squat_dir = "/home/cfriedline/eckertlab/src/PolygenicAdaptationCode/Scripts/"
    out_dir = os.path.join(outdir, "squat_%s" % sig_key)
    if not os.path.exists(out_dir):
        # First script for this significance set: create the directory and
        # link the shared R scripts into it.
        os.mkdir(out_dir)
        os.symlink(squat_dir, os.path.join(out_dir, "Scripts"))
    res_dir = os.path.join(out_dir, pheno)
    squat_file = os.path.join(out_dir, "squat_%s.%s.r" % (sig_key, pheno))
    with open(squat_file, "w") as handle:
        handle.writelines([
            "system('rm -rf %s')\n" % res_dir,
            "setwd('/home/cfriedline/eckertlab/src/PolygenicAdaptationCode')\n",
            "source('%s')\n" % os.path.join(squat_dir, "CreateTraitFile.R"),
            "source('%s')\n" % os.path.join(squat_dir, "functions.R"),
            "setwd('%s')\n" % out_dir,
            "PolygenicAdaptationFunction(%s)\n" % squat_vars,
        ])
    return squat_file

# Paths of every generated R driver script, in the order they were written.
squat_files = []

# For each dataset x phenotype x significance set ("relaxed" and "sig"):
# assemble the full SNP table, write the input files and generate the R
# script that runs the analysis on them.
for i, hdf in enumerate(hdfs):
    outdir = analysis_dir[i]
    for name in pheno_names:
        paf = pop_allele_freqs[i]
        # Short phenotype code used inside the store keys below.
        pheno_hdf = get_sig_pheno_name(name)
        for sig_key, sig in {"relaxed": hdf['relaxed_sig_snps_%s' % pheno_hdf],
                            "sig": hdf['sig_snps_%s' % pheno_hdf]}.items():
            print sig_key, name, pheno_hdf, len(sig)
            # `full` is rebuilt from the store for every significance set.
            # NOTE(review): the frame returned by the store is modified in
            # place below (index and 'alpha' column) - confirm the helper
            # returns a fresh object on each access.
            full = hdf['mcmc_%s_hmean' % pheno_hdf]
            # Index the table by its 'rs' column so SNP ids can be used as labels.
            full.index = full.rs
            full.index.name="%s_%d" % (name, i)
            alpha = alphas[i][name]
            # Effect size per SNP, looked up by the row label.
            full['alpha'] = full.apply(lambda x: alpha[x.name], axis=1)       
            full = full.drop("rs", axis=1)
            full = full.join(pimass_gt[i][['minor', 'major']])
            af = hdfs[i]['allele_freqs_swapped']
            # 'maf' is filled from the 'q' entry of the pooled allele-frequency table.
            full['maf'] = full.apply(lambda x: af[x.name]['q'], axis=1)

            # write files
            # The gwas.* files contain only the rows listed in sig.rs; the
            # match/full files contain every SNP in `full`.
            # NOTE(review): .ix has been removed from current pandas; these
            # are label lookups, i.e. what .loc does.
            gwas_data_file = write_gwas_data_file(full.ix[sig.rs], outdir, name, i, sig_key)
            gwas_freqs_file, gwas_pops = write_freqs_file(full.ix[sig.rs], paf, outdir, name, i, "gwas", sig_key)
            #env_var_files = write_env_var_data_files(paf, gwas_pops, outdir, i, name, sig_key)
            # "QC32" is hard-coded as the population used for the match file.
            match_pop_file = write_match_pop_file(full, paf, outdir, i, "QC32", name, sig_key)
            full_freqs_file, full_pops = write_freqs_file(full, paf, outdir, name, i, "full", sig_key)
            # The bioclim files written earlier are used as the environmental
            # variables (the random-value writer above is disabled).
            squat_vars = get_squat_vars(name, gwas_data_file,
                          gwas_freqs_file, 
                          bioclim_env_files,
                          match_pop_file,
                          full_freqs_file)
            squat_file = create_squat_run_file(name, outdir, squat_vars, sig_key)
            print "wrote %s" % squat_file
            squat_files.append(squat_file)

In [0]:
# Display `full` as left behind by the last iteration of the loop above
# (inspection only; the value depends on that loop having run).
full

In [0]:
# Display the list of generated R driver scripts.
squat_files

In [0]:
%%R
# Move the embedded R session to the home directory before the generated
# scripts are sourced (each script issues its own setwd calls).
setwd("~/")

In [0]:
# Run every generated script in the embedded R session, one after another.
# Each script starts by deleting its own result directory (see the
# "system('rm -rf ...')" line written by create_squat_run_file).
for squat_file in squat_files:
    print squat_file
    r("source('%s')" % squat_file)

In [0]:
# Collect the statistics of every run into one tab-separated table,
# "squat_results.txt", written two directory levels above the trailing slash
# of analysis_dir_imp (i.e. in the parent of the "beagle40" directory).
with open(os.path.join(os.path.dirname(os.path.dirname(analysis_dir_imp)), "squat_results.txt"), "w") as o:  
    o.write("%s\n" % "\t".join(["significance", "phenotype", "imputation", "Qx", "Pr(Qx)", 
                               "Fst", "Pr(Fst)", "LD", "Pr(LD)"]))
    for squat_file in squat_files:
        # Recover the run's identity from the script path:
        #   <analysis dir>/squat_<sig_key>/squat_<sig_key>.<pheno>.r
        squat_dir = os.path.dirname(squat_file)
        # Name of the analysis directory (reported in the "imputation" column).
        datatype = os.path.basename(os.path.dirname(squat_dir))
        pheno = os.path.basename(squat_file).split(".")[1]
        pheno_dir = os.path.join(squat_dir, pheno)
        output_dir = os.path.join(pheno_dir, "Output")
        assert os.path.exists(output_dir)

        # Load every file in the Output directory into the R session; the
        # objects `the.stats` and `p.vals` read below are expected to come
        # from these files.
        # NOTE(review): objects loaded for a previous run stay in the R global
        # environment, so if a run did not save both objects the previous
        # run's values would be written here - confirm every run saves them.
        for obj in os.listdir(output_dir):
            obj = os.path.join(output_dir, obj)
            r("load('%s')" % obj)
            
        # First element of each statistic and of its p-value; the
        # "significance" column holds the squat directory name.
        res = "\t".join([str(x) for x in [os.path.basename(squat_dir),
                                          pheno,
                                          datatype,
                                          r('the.stats$Qx')[0],
                                          r('p.vals$Qx')[0],
                                          r('the.stats$Fst.comp')[0],
                                          r('p.vals$Fst.comp')[0],
                                          r('the.stats$LD.comp')[0],
                                          r('p.vals$LD.comp')[0]]])
        print res
        o.write("%s\n" % res)
    

In [0]:
%%R
# Print `the.stats` as currently held in the R session, i.e. whatever the
# most recent load() in the cell above left there.
the.stats