In [0]:
import sys

In [0]:
sys.path.append("../include_utils/")

In [0]:
#from IPython.parallel import Client
import ipyparallel as ipp
import os, time
import include_utils as u
import pandas as pd
import numpy as np
import scipy as sp
import numbers
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import matplotlib.cm as cm
import matplotlib.colors as mcolors
import cyvcf
import vcf
from sklearn import preprocessing
from subprocess import Popen, PIPE
import seaborn as sns
from IPython.display import FileLink
import urllib2
import dill
import traceback
from pandas import Series, DataFrame
import gzip
import warnings
warnings.filterwarnings('ignore',category=pd.io.pytables.PerformanceWarning)
%config InlineBackend.figure_format = 'retina'
from Bio import SeqIO
import pysam

In [0]:
pwd

In [0]:
home = "/home/cfriedline/ipynb/gypsy_moth/"

In [0]:
def setup_r():
    """Point the process environment at the local R installation used by rpy2.

    Sets ``R_HOME`` and rebuilds ``LD_LIBRARY_PATH`` as
    ``<R_HOME>/lib:<existing LD_LIBRARY_PATH>:/home/cfriedline/lib64``.
    Must run before ``rpy2`` is imported.
    """
    r_home = '/home/cfriedline/g/R3/lib64/R'
    os.environ['R_HOME'] = r_home
    # LD_LIBRARY_PATH may be unset (fresh kernel, cron, ...); indexing
    # os.environ directly would raise KeyError in that case.
    existing = os.environ.get('LD_LIBRARY_PATH', '')
    search_path = ["%s/lib" % r_home]
    if existing:
        # Skip an empty value so no empty path element ("::") is produced.
        search_path.append(existing)
    search_path.append("/home/cfriedline/lib64")
    os.environ['LD_LIBRARY_PATH'] = ":".join(search_path)

In [0]:
setup_r()
import rpy2.robjects as robjects
from rpy2.robjects import pandas2ri
pandas2ri.activate()
r = robjects.r

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline
%reload_ext rpy2.ipython

In [0]:
%%R
R.home()

In [0]:
def save_session():
    """Dump the current interpreter session to ``session.dill`` (relative path) via dill."""
    dill.dump_session("session.dill")
    
def load_session():
    """Restore the interpreter session previously written to ``session.dill`` via dill."""
    # NOTE(review): dill is pickle-based; only load session files you created yourself.
    dill.load_session("session.dill")

In [0]:
bam_dir = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis"
analysis_dir = os.path.join(bam_dir, "samtools1.2")
if not os.path.exists(analysis_dir):
    os.makedirs(analysis_dir)
assert os.path.exists(analysis_dir)

In [0]:
bam_files = !find /home/cfriedline/eckertlab/gypsy_indiv/masked | grep new | grep 'rg.bam$' | grep -v OTIS
bam_files = [os.path.abspath(x) for x in bam_files if '.bam' in x]
len(bam_files)

In [0]:
samtools = "/home/cfriedline/gpfs/src/samtools-1.2/samtools"
bcftools = "/home/cfriedline/gpfs/src/bcftools-1.2/bcftools"
picard = "/home/cfriedline/gpfs/src/broadinstitute-picard-03a1d72/dist/picard.jar"
java = "/home/cfriedline/g/src/jdk1.8.0_60/bin/java"
perl = "/home/cfriedline/gpfs/opt/ActivePerl-5.16/bin/perl"

In [0]:
assembly = "/home/cfriedline/gpfs/assemblies/gypsy/masurca_new/CA/10-gapclose/genome.ctg.fasta"

In [0]:
bam_rmdup_files = !ls {analysis_dir} | grep 'dedup.bam$' | grep -v OTIS
bam_rmdup_files = sorted([os.path.join(analysis_dir, x) for x in bam_rmdup_files])
len(bam_rmdup_files)

In [0]:
assembly_dir = os.path.dirname(assembly)

In [0]:
vcf_file = os.path.join(analysis_dir, "samtools_1.2.vcf.gz")
assert os.path.exists(vcf_file)
vcf_file

In [0]:
vcfutils = "perl /home/cfriedline/g/src/bcftools-1.2/vcfutils.pl"
vcftools = "/home/cfriedline/bin/vcftools"
bcftools = "/home/cfriedline/gpfs/src/bcftools-1.2/bcftools"
tabix = "/home/cfriedline/gpfs/src/samtools-1.2/htslib-1.2.1/tabix"
bgzip = "/home/cfriedline/gpfs/src/samtools-1.2/htslib-1.2.1/bgzip"

In [0]:
analysis_dir = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle41"
analysis_vcf = "imputed_41.vcf.gz"

In [0]:
working_vcf = os.path.join(analysis_dir, analysis_vcf)

In [0]:
!$vcftools --012 \
--gzvcf \
$working_vcf \
--out $working_vcf

In [0]:
analysis_dir

In [0]:
from hdfstorehelper import HDFStoreHelper
hdf = HDFStoreHelper(os.path.join(analysis_dir, "gypsy_samtools12_%s.hd5" % analysis_vcf))

In [0]:
z12_file = os.path.join(analysis_dir, "%s.012" % analysis_vcf)
z12_file

In [0]:
assert os.path.exists(z12_file)

In [0]:
# Parse the tab-separated integer matrix in ``z12_file`` (the vcftools --012
# output) into a 2-D numpy array, one row per input line.
z12_data = []
# The context manager closes the handle even if a line fails to parse;
# the bare open() in a for-loop header never closed it.
with open(z12_file) as z12_handle:
    for i, line in enumerate(z12_handle):
        genotype_row = [int(x) for x in line.strip().split("\t")]
        z12_data.append(np.array(genotype_row))
        if i % 10 == 0:
            # progress indicator: line index, every 10th line
            print(i)
z12_data = np.array(z12_data)

In [0]:
z12_df = pd.DataFrame(z12_data)
z12_df.head()

In [0]:
z12_df = z12_df.drop(0, axis=1)
z12_df.columns = pd.Series(z12_df.columns)-1

In [0]:
z12_df.head()

In [0]:
z12_df.columns = ["L%d" % x for x in z12_df.columns]

In [0]:
z12_df.head()

In [0]:
translation_df = pd.read_csv("translation_table.csv", sep="\t", index_col=0)
indv = os.path.join(analysis_dir, "%s.indv" % os.path.basename(z12_file))

def get_translated_name(n):
    """Translate a raw sample name into ``<pop>_<indiv>_<dup>``.

    ``n`` is stripped of surrounding whitespace and looked up by label in the
    index of the module-level ``translation_df``; the ``pop``, ``indiv`` and
    ``dup`` fields of that row are formatted into the returned string.

    Raises KeyError if the stripped name is not in ``translation_df``'s index.
    """
    # .loc is the label-based accessor; .ix was deprecated and later removed
    # from pandas.
    row = translation_df.loc[n.strip()]
    return "%s_%d_%d" % (row['pop'], row.indiv, row.dup)

In [0]:
# Read the per-sample names (one per line) and translate them; the context
# manager makes sure the .indv handle is closed.
with open(indv) as indv_handle:
    names = [get_translated_name(x) for x in indv_handle]
z12_df.index=names

# Derive both labels straight from the index instead of a row-wise apply over
# the full genotype matrix (which materialises every row as a Series).
# population = text before the first "_", duplicate = last character of the name.
z12_df['population'] = [sample_name.split("_")[0] for sample_name in z12_df.index]
z12_df['duplicate'] = [sample_name[-1] for sample_name in z12_df.index]

In [0]:
z12_df.head()

In [0]:
snp_pos = pd.read_csv(os.path.join(analysis_dir, indv.replace("indv", "pos")), 
                              sep="\t",
                              header=None,
                              names=['contig', 'pos'])
snps = pd.DataFrame([int(x[1:]) for x in z12_df.columns[:-2]])
snps.columns = ['snp_id']
snps.index=snps.snp_id
snp_id_pos = pd.merge(snps, snp_pos, how="inner", left_index=True, right_index=True)
snp_id_pos.index = ["L%d" % x for x in snp_id_pos.index]

In [0]:
snp_pos.shape, snp_id_pos.shape, z12_df.shape

In [0]:
snp_pos.head()

In [0]:
snp_id_pos.head()

In [0]:
hdf.put('snp_pos', snp_pos)
hdf.put('snp_id_pos', snp_id_pos)

In [0]:
snp_id_pos.head()

In [0]:
reader = vcf.VCFReader(filename=os.path.join(analysis_dir, analysis_vcf))
def get_ref_alt_alleles(row):
    """Attach the REF and ALT alleles of the VCF record at ``row.contig``/``row.pos``.

    Uses the module-level tabix-backed ``reader``. The first record returned
    for the queried interval supplies ``row['ref']`` and ``row['alt']``; if no
    record is returned the row is handed back without those fields.

    Returns the (mutated) ``row``.
    """
    # Bound the query to [pos-1, pos), the same form used by the genotype
    # extraction loop later in this notebook. Without an end bound a site that
    # is missing from the VCF would silently pick up the alleles of whatever
    # record comes next on the contig.
    for record in reader.fetch(row.contig, row.pos - 1, row.pos):
        row['ref'] = record.REF
        row['alt'] = record.ALT
        break
    return row
snp_id_pos_ref_alt = snp_id_pos.apply(get_ref_alt_alleles, axis=1)

In [0]:
snp_id_pos_ref_alt.head()

In [0]:
snp_id_pos_ref_alt['ref_len'] = snp_id_pos_ref_alt.apply(lambda x: len(x.ref), axis=1)
snp_id_pos_ref_alt_snps_only = snp_id_pos_ref_alt[snp_id_pos_ref_alt.ref_len==1]

In [0]:
hdf.put('snp_id_pos_ref_alt', snp_id_pos_ref_alt)
hdf.put('snp_id_pos_ref_alt_snps_only', snp_id_pos_ref_alt_snps_only)

In [0]:
snp_id_pos_ref_alt.to_csv(os.path.join(analysis_dir, 
                                       "snp_id_pos_ref_alt.csv"), 
                          header=True, 
                          index=True, 
                          sep="\t")
snp_id_pos_ref_alt_snps_only.to_csv(os.path.join(analysis_dir, 
                                                 "snp_id_pos_ref_alt_snps_only.csv"),
                                    header=True, 
                                    index=True,
                                    sep="\t")

In [0]:
z12_df.head()

In [0]:
def add_snp_names(col):
    """Return the display name for one column of the genotype frame.

    Columns whose name starts with ``"L"`` become ``<name>_<contig>_<pos>``
    using the module-level ``snp_id_pos_ref_alt_snps_only`` table (looked up
    by the column name); every other column keeps its name.
    """
    if col.name.startswith("L"):
        # .loc replaces the removed .ix accessor; both lookups are label-based.
        # NOTE(review): a locus absent from the SNP-only table raises KeyError here.
        contig = snp_id_pos_ref_alt_snps_only.loc[col.name, 'contig']
        pos = snp_id_pos_ref_alt_snps_only.loc[col.name, 'pos']
        return "%s_%s_%d" % (col.name, contig, pos)
    return col.name
colnames = z12_df.apply(add_snp_names)

In [0]:
z12_df.columns = colnames

In [0]:
hdf.put("z12_df", z12_df)

In [0]:
z12_df.shape

In [0]:
def get_percent_missing(col):
    """Return the fraction of entries in ``col`` that equal the missing code -1."""
    missing_calls = col[col == -1]
    return float(len(missing_calls)) / len(col)

In [0]:
percent_missing = z12_df.apply(get_percent_missing, axis=0)

In [0]:
percent_missing.head()

In [0]:
hdf.put('percent_missing', percent_missing)

In [0]:
z12_df_50_perc = z12_df.ix[:,percent_missing <= 0.5]

In [0]:
z12_df_50_perc.shape

In [0]:
hdf.put('z12_df_50_perc', z12_df_50_perc)

In [0]:
def is_monomorphic(col):
    """Return True when the non-missing (``!= -1``) entries of ``col`` take exactly one value."""
    observed = col[col != -1]
    distinct_genotypes = observed.value_counts()
    return len(distinct_genotypes) == 1

In [0]:
monomorphic_loci = z12_df_50_perc.apply(is_monomorphic, axis=0)
monomorphic_loci = monomorphic_loci[monomorphic_loci==True]

In [0]:
len(monomorphic_loci)

In [0]:
z12_df_50_perc_polymorphic = z12_df_50_perc.drop(monomorphic_loci.index, axis=1)

In [0]:
hdf.put('z12_df_50_perc_polymorphic', z12_df_50_perc_polymorphic)

In [0]:
z12_df_50_perc_polymorphic.shape

In [0]:
def get_correction(n):
    """Return the finite-sample-size correction factor ``2n / (2n - 1)``.

    ``n`` is converted to float first so the result is a true ratio even when
    ``n`` is an integer under Python 2 division semantics (where the original
    expression always evaluated to 1).
    """
    n = float(n)
    return (2 * n) / (2 * n - 1)


def get_allele_freqs(locus, debug=False):
    """Compute allele counts/frequencies, heterozygosities and Fis for one locus.

    Parameters
    ----------
    locus : pandas.Series
        Genotypes coded 0 / 1 / 2, with -1 marking a missing call.
    debug : bool, optional
        When true, print the result before returning it.

    Returns
    -------
    pandas.Series
        Keys ``p``, ``q`` (allele frequencies), ``P``, ``Q`` (allele counts),
        ``He`` (2pq times the finite-sample correction), ``Ho`` (share of
        genotype 1 among called individuals) and ``Fis`` (1 - Ho/He).
        With no called genotypes the frequencies and statistics are NaN;
        ``Fis`` is NaN whenever ``He`` is 0.
    """
    nan = float("nan")
    c = locus[locus != -1].value_counts()
    num_individuals = int(c.sum())
    if num_individuals == 0:
        # All calls missing: nothing to estimate (previously a ZeroDivisionError).
        ret = pd.Series({"p": nan, "q": nan, "P": 0, "Q": 0,
                         "He": nan, "Ho": nan, "Fis": nan})
        if debug:
            print(ret)
        return ret

    total_alleles = 2.0 * num_individuals
    P = 0
    Q = 0
    PQ = 0
    if 0 in c:
        P = 2 * c[0]
    if 2 in c:
        Q = 2 * c[2]
    if 1 in c:
        PQ = c[1]
    # each genotype-1 individual contributes one allele to each count
    P += PQ
    Q += PQ
    p = P / total_alleles
    q = Q / total_alleles
    # Sanity check that only 0/1/2 codes were counted. Compared with a
    # tolerance: exact float equality can fail from rounding alone.
    assert abs((p + q) - 1.0) < 1e-9
    He = 2 * p * q * get_correction(num_individuals)
    Ho = PQ * 1.0 / num_individuals
    # He is 0 when only one allele is present; report NaN explicitly instead
    # of evaluating 0/0.
    Fis = 1 - (Ho / He) if He > 0 else nan

    ret = pd.Series({"p":p,
                      "q":q,
                      "P":P,
                      "Q":Q,
                      "He":He,
                      "Ho":Ho,
                      "Fis":Fis})
    if debug:
        print(ret)
    return ret

In [0]:
allele_freqs = z12_df_50_perc_polymorphic.ix[:,:-2].apply(get_allele_freqs, args=(False,))
mafs = allele_freqs.apply(lambda x: min(x["p"], x["q"]))
len(mafs[mafs<0.01])

In [0]:
allele_freqs.ix[:,0:5]

In [0]:
hdf.put('allele_freqs', allele_freqs)

In [0]:
z12_df_50_perc_polymorphic_maf = z12_df_50_perc_polymorphic.drop(mafs[mafs<0.01].index, axis=1)

In [0]:
z12_df_50_perc_polymorphic_maf.shape

In [0]:
z12_df_50_perc_polymorphic_maf.head()

In [0]:
hdf.put('z12_df_50_perc_polymorphic_maf', z12_df_50_perc_polymorphic_maf)

In [0]:
global_fis = allele_freqs[z12_df_50_perc_polymorphic_maf.columns[:-2]].apply(lambda x: x["Fis"])
fis_outliers = global_fis[(global_fis < -0.5) | (global_fis > 0.5)]
z12_df_50_perc_polymorphic_maf_fis = z12_df_50_perc_polymorphic_maf.drop(fis_outliers.index, axis=1)

In [0]:
z12_df_50_perc_polymorphic_maf.shape, z12_df_50_perc_polymorphic_maf_fis.shape

In [0]:
hdf.put('z12_df_50_perc_polymorphic_maf_fis', z12_df_50_perc_polymorphic_maf_fis)

In [0]:
duplicates = z12_df_50_perc_polymorphic_maf_fis[z12_df_50_perc_polymorphic_maf_fis.duplicate=="1"]

In [0]:
duplicates[0:5]

In [0]:
z12_df_50_perc_polymorphic_maf_fis_dedup = z12_df_50_perc_polymorphic_maf_fis.drop(duplicates.index)

In [0]:
z12_df_50_perc_polymorphic_maf_fis_dedup.head()

In [0]:
hdf.put('z12_df_50_perc_polymorphic_maf_fis_dedup', z12_df_50_perc_polymorphic_maf_fis_dedup)

In [0]:
z12_df_50_perc_polymorphic_maf_fis_dedup = hdf.get('z12_df_50_perc_polymorphic_maf_fis_dedup')

In [0]:
working_df = z12_df_50_perc_polymorphic_maf_fis_dedup

In [0]:
allele_freqs = hdf.get("allele_freqs")

In [0]:
working_df.to_csv(os.path.join(analysis_dir,
                               "z12_df_50_perc_polymorphic_maf_fis_dedup.txt"),
                                          header=True,
                                          index=True,
                                          sep="\t")

In [0]:
working_df.shape

In [0]:
plt.hist(allele_freqs.T["Fis"].values, bins=50)
plt.show()

In [0]:
allele_freqs_final = z12_df_50_perc_polymorphic_maf_fis_dedup.ix[:,:-2].apply(get_allele_freqs,
                                                                              args=(False,))

In [0]:
hdf.put('allele_freqs_final', allele_freqs_final)

In [0]:
allele_freqs_final = hdf.get('allele_freqs_final')

In [0]:
plt.hist(allele_freqs_final.T["Fis"].values, bins=50)
plt.show()

plt.hist(allele_freqs.T["Fis"].values, bins=50)
plt.show()

In [0]:
allele_freqs_final.head()

In [0]:
allele_freqs_final.shape

In [0]:
mafs = allele_freqs_final.apply(lambda x: np.min((x['p'], x['q'])))

In [0]:
plt.hist(mafs, bins=20)
plt.show()

In [0]:
working_df = pd.read_csv(os.path.join(analysis_dir, 
                                      "z12_df_50_perc_polymorphic_maf_fis_dedup.txt"),
                                      sep="\t",
                                      index_col=0)
# cols = [int(x) for x in working_df.columns.tolist()[:-2]]
# cols.extend(working_df.columns.tolist()[-2:])
# working_df.columns = cols

In [0]:
working_df.head()

In [0]:
working_df = hdf.get('z12_df_50_perc_polymorphic_maf_fis_dedup')

In [0]:
def swap_alleles(locus):
    """Recode a locus column by exchanging genotype codes 0 and 2 when ``p`` is the smaller frequency.

    Columns whose name does not start with ``"L"`` are returned untouched.
    Frequencies come from the module-level ``allele_freqs``, keyed by column name.
    """
    if not locus.name.startswith("L"):
        return locus
    locus_freqs = allele_freqs[locus.name]
    p_freq = locus_freqs["p"]
    # Leave the coding alone unless p is the minimum of (p, q); ties swap.
    if min(p_freq, locus_freqs["q"]) != p_freq:
        return locus
    return locus.replace({0:2,2:0})

In [0]:
working_swapped = working_df.apply(swap_alleles)

In [0]:
hdf.put("working_swapped", working_swapped)

In [0]:
hierf_trans = {0:11, 1:12, 2:22, -1:'NA'}

In [0]:
def apply_hierf_trans(series):
    """Map each value of ``series`` through the module-level ``hierf_trans`` dict.

    Values that are not keys of ``hierf_trans`` are passed through unchanged.
    Returns a plain list in the original order.
    """
    translated = []
    for value in series:
        translated.append(hierf_trans.get(value, value))
    return translated

In [0]:
hierf_trans_df = working_swapped.apply(apply_hierf_trans)

In [0]:
working_df.head()

In [0]:
working_swapped.head()

In [0]:
hierf_trans_df.head()

In [0]:
hdf.put('hierf_trans_df', hierf_trans_df)

In [0]:
hierf_trans_df = hdf.get('hierf_trans_df')

In [0]:
# Download the phenotype workbook and keep the columns used downstream.
# The import cell only provides `urllib2`, so the previous
# `urllib.request.urlopen` call raised NameError; resolve `urlopen` for
# whichever Python major version is running.
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

url = 'https://docs.google.com/uc?export=download&id=0B4xHxBFoPCoWT0NneHJadUI0OHM'
response = urlopen(url)
pheno = pd.read_excel(response, "Males-forGenomics-final")
# .copy() so the column added below lands on an independent frame rather than
# on a slice of the full sheet.
pheno = pheno[['Population', 'Number', 'Mass', 'Pupual Duration', 'Total Dev Time']].copy()
# sample_pheno = "<Population>_<Number>", built in one pass instead of a
# row-by-row .ix assignment (.ix no longer exists in pandas).
pheno['sample_pheno'] = ["%s_%d" % (population, number)
                         for population, number in zip(pheno['Population'], pheno['Number'])]

In [0]:
hdf.put("pheno", pheno)

In [0]:
pop_id = {}
i = 1
for p in sorted(working_df['population'].unique()):
    pop_id[p] = i
    i+=1
pop_id

In [0]:
def assign_popid(series):
    """Add a ``popid`` field to a row, looked up from the module-level ``pop_id`` dict.

    The lookup key is the row's ``population`` value; a population missing
    from ``pop_id`` raises KeyError. The row is modified and returned.
    """
    series['popid'] = pop_id[series['population']]
    return series

In [0]:
hierf_trans_df = hierf_trans_df.apply(assign_popid, axis=1)

In [0]:
hierf_trans_df.head()

In [0]:
pwd

In [0]:
hierf_trans_df.to_csv(os.path.join(analysis_dir, "hierf_trans_df.txt"), header=True, index=True, sep="\t")

In [0]:
hierf_trans_df = pd.read_csv(os.path.join(analysis_dir, "hierf_trans_df.txt"), header=0, index_col=0, sep="\t")

In [0]:
hdf.put('hierf_trans_df', hierf_trans_df)

In [0]:
hierf_trans_df = hdf.get('hierf_trans_df')

In [0]:
cols = ['popid']
cols.extend(hierf_trans_df.columns[:-3])
#cols.extend(snp_id_pos_ref_alt_snps_only.index)

In [0]:
len(cols)

In [0]:
hierf_trans_df2 = hierf_trans_df[[x for x in cols if x in hierf_trans_df.columns]]

In [0]:
hierf_trans_df2.head()

In [0]:
# DataFrame.sort() was removed from pandas; sort_values() is the replacement
# for ordering rows by a column.
hierf_trans_df2 = hierf_trans_df2.sort_values("popid")

In [0]:
hdf.put('hierf_trans_df2', hierf_trans_df2)

In [0]:
hierf_trans_df2 = hdf.get('hierf_trans_df2')

In [0]:
hierf_trans_df2.popid.unique()

In [0]:
hierf_trans_df2.to_csv(os.path.join(analysis_dir, "hierfstat_samtools.txt"), header=True, index=False, sep="\t")

In [0]:
hierf_trans_df2[[x for x in hierf_trans_df2 if x.startswith("L")]].shape

## Put into R (because it can be slow)

```R
library(hierfstat)
data = read.table("hierfstat_samtools.txt", header=T, sep="\t")
levels = data.frame(data$popid)
loci = data[,2:ncol(data)]
res = varcomp.glob(levels=levels, loci=loci, diploid=T)
saveRDS(res, "hierfstat_samtools_new.rds")
```

In [0]:
%%R
library(gtools)
library(ade4)

In [0]:
%%R
library(hierfstat)
f = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle41/hierfstat_samtools.txt"
data = read.table(f, header=T, sep="\t")
levels = data.frame(data$popid)
loci = data[,2:ncol(data)]

In [0]:
%%R
bs = basic.stats(data)
saveRDS(bs, "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle41/basic_stats.rds")

In [0]:
%%R
res = readRDS("/gpfs_fs/home/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle41/hierfstat_samtools_new.rds")
bs = readRDS("/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle41/basic_stats.rds")

In [0]:
def get_r_series(key):
    """Evaluate the R expression ``key`` and return it as a pandas Series.

    The Series index is taken from evaluating ``names(<key>)`` in R.
    """
    s = pd.Series(get_r(key))
    s.index = get_r("names(%s)" % key)
    return s

def get_r_df(key):
    """Evaluate the R expression ``key`` and return it as a pandas DataFrame.

    Row and column labels are copied from ``rownames(<key>)`` and
    ``colnames(<key>)`` on a best-effort basis: if either cannot be fetched or
    applied, the DataFrame keeps its default labels for that axis.
    """
    df = pd.DataFrame(get_r(key))
    try:
        df.index = get_r("rownames(%s)" % key)
    except Exception:
        # Best effort only. Catch Exception rather than using a bare except so
        # KeyboardInterrupt / SystemExit still propagate.
        pass

    try:
        df.columns = get_r("colnames(%s)" % key)
    except Exception:
        pass

    return df

def get_r(key):
    """Evaluate the string ``key`` with the module-level rpy2 ``r`` object and return the result."""
    return r(key)

In [0]:
perloc = get_r_df("bs$perloc")
Ho = get_r_df("bs$Ho")
Hs = get_r_df("bs$Hs")
Fis = get_r_df("bs$Fis")
overall = get_r_series("bs$overall")
n_ind_samp = get_r_df("bs$n.ind.samp")

In [0]:
hdf.put('perloc', perloc)
hdf.put('Ho', Ho)
hdf.put("Hs", Hs)
hdf.put("Fis", Fis)
hdf.put("overall", overall)
hdf.put("n_ind_samp", n_ind_samp)

In [0]:
loc_df = get_r_df('res$loc')
F_df = get_r_df('res$F')
overall_df = get_r_df('res$overall')

In [0]:
F_df

In [0]:
def compute_fst(series):
    """Return the first component of ``series`` divided by the sum of all components.

    Applied row-wise to ``loc_df`` (the ``res$loc`` table).
    # NOTE(review): presumably the components are hierfstat variance
    # components with the among-population term first - confirm.
    """
    # .iloc[0] is explicitly positional; plain series[0] on a labelled Series
    # relies on a deprecated positional fallback.
    first_component = series.iloc[0]
    total = sum(series)
    return first_component / total

In [0]:
loci_fst = loc_df.apply(compute_fst, axis=1)

In [0]:
hdf.put("loci_fst", loci_fst)

In [0]:
loci_fst = hdf.get('loci_fst')

In [0]:
loci_fst.head()

In [0]:
perloc['Fst'].head()

In [0]:
pop_id

In [0]:
plt.hist(loci_fst, bins=50)
plt.title("n=%d mean=%.4f +/- %.4f [%.4f, %.4f]" % (len(loci_fst), 
                                                    np.mean(loci_fst), 
                                                    np.std(loci_fst),
                                                    np.min(loci_fst), 
                                                    np.max(loci_fst)))
plt.show()

In [0]:
working_df = hdf.get('z12_df_50_perc_polymorphic_maf_fis_dedup')

In [0]:
working_df.head()

In [0]:
working_df.shape

In [0]:
allele_freqs_final = hdf.get('allele_freqs_final')

In [0]:
allele_freqs_final.ix[:,0:5]

In [0]:
pca_maf = hdf.get('working_swapped')

In [0]:
pca_maf.head()

In [0]:
pca_maf.ix[:,:5].apply(lambda locus: np.mean([x for x in locus if x != -1]))

In [0]:
hdf.put('pca_maf', pca_maf)

In [0]:
pca_maf = hdf.get('pca_maf')

In [0]:
pca_maf.to_csv(os.path.join(analysis_dir, "pca_maf.txt"), sep="\t", header=True, index=True)

In [0]:
def center_and_standardize_value(val, u, var):
    """Return ``(val - u) / sqrt(var)``, or 0.0 when ``val`` is the missing code -1."""
    is_missing = (val == -1)
    return 0.0 if is_missing else (val - u) / np.sqrt(var)

In [0]:
def center_and_standardize(locus):
    """Center and scale one genotype column; non-locus columns pass through.

    For columns whose name starts with ``"L"``: the mean is taken over the
    non-missing (``!= -1``) genotypes, the variance term is ``maf*(1-maf)``
    with ``maf`` read from the module-level ``allele_freqs``, and each value is
    transformed by ``center_and_standardize_value`` (missing values become 0.0).
    """
    if locus.name.startswith("L"):
        #locus_id = int(locus.name[1:])
        locus_id = locus.name
        # NOTE(review): frequencies come from `allele_freqs`, not `allele_freqs_final` - confirm intended.
        freqs = allele_freqs[locus_id]
        maf = min(freqs["p"], freqs["q"])
        var = maf*(1-maf)
        # mean genotype over called samples only
        u = np.mean([x for x in locus if x != -1])
        return locus.apply(center_and_standardize_value, args=(u, var))
    return locus

In [0]:
pca_std = pca_maf.apply(center_and_standardize)

In [0]:
pca_std.ix[:,0:5].apply(np.mean)

In [0]:
hdf.put('pca_std', pca_std)

In [0]:
pca_std = hdf.get('pca_std')

In [0]:
pca_std.head()

In [0]:
pca_std_data = pca_std.ix[:,:-2]

In [0]:
hierf_trans_df2.shape

In [0]:
pca_std_data.head()

In [0]:
snp_id_pos_ref_alt_snps_only = hdf.get("snp_id_pos_ref_alt_snps_only")

In [0]:
snp_id_pos_ref_alt_snps_only.head()

In [0]:
snp_cols = snp_id_pos_ref_alt_snps_only.index
pca_std_data_snps = pca_std_data
pca_std_data_snps.shape, len(snp_cols)

In [0]:
hdf.put('pca_std_data_snps', pca_std_data_snps)

In [0]:
pca_std_data_snps = hdf.get('pca_std_data_snps')

In [0]:
prcomp = r('prcomp')
summary = r('summary')

In [0]:
pca_std_data_snps.to_csv(os.path.join(analysis_dir, "pca_std_data_snps.txt"), 
                         header=True, index=True, sep="\t")

In [0]:
working_df.head()

In [0]:
pca_std_data_snps.head()

In [0]:
prcomp_res = prcomp(pca_std_data_snps, scale=False, center=False)

In [0]:
print(summary(prcomp_res))

In [0]:
x = pd.DataFrame(pandas2ri.ri2py(prcomp_res.rx2("x")))
x.index = prcomp_res.rx2("x").names[0]
x.columns = prcomp_res.rx2("x").names[1]
joined = x.join(pca_maf)

In [0]:
%R -i prcomp_res

In [0]:
%%R
file_dir = "/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/notimputed"
missing_df = read.table(paste(file_dir, "/", "missing_df.txt", sep=""), sep="\t", header=T)
sample_missing = read.table(paste(file_dir, "/", "sample_missing.txt", sep=""), sep="\t", header=F, 
                           row.names=1, col.names=c("sample", "missing"))
missing_pca = merge(sample_missing, prcomp_res$x, by=0)
fit1 = lm(missing~PC1, data=missing_pca)
fit2 = lm(missing~PC2, data=missing_pca)
print(summary(fit1))
print(summary(fit2))


In [0]:
joined.head()

In [0]:
hdf.put('joined', joined)
hdf.put('x', x)

In [0]:
norm = mcolors.Normalize(min(pop_id.values()), max(pop_id.values()))

In [0]:
# Scatter PC1 vs PC2, one point per sample, coloured by population id.
legend = {}
for row in joined.iterrows():
    pop = row[1]['population']
    n = norm(pop_id[pop])
    color = cm.jet(n)
    legend[pop] = color
    # Wrap the RGBA tuple in a list so matplotlib reads it as one colour for
    # one point rather than as four scalar values to be colour-mapped.
    plt.scatter(row[1].PC1,
                row[1].PC2,
                s=50,
                c=[color])
fig = plt.gcf()
ax = plt.gca()
cmap = plt.get_cmap()
fig.set_size_inches(10,8)
plt.title("PCA of n=%d samples on %d loci" % (len(joined), len(pca_std_data.columns)))
plt.xlabel("PC1 (0.05729)")
plt.ylabel("PC2 (0.03193)")

# Handles are created in sorted population order already. Patch objects
# define no ordering, so sorted(handles) raises TypeError on Python 3.
handles = [mpatches.Patch(color=legend[pop], label=pop) for pop in sorted(legend)]
plt.legend(handles=handles)
plt.show()

In [0]:
F_df

In [0]:
pwd

In [0]:
%%R
source("tw_calc.R")
test=read.table("twtable", header=F)

In [0]:
TWcalc = r('TWcalc')

In [0]:
robjects.globalenv['pca_std_data_snps'] = pandas2ri.py2ri(pca_std_data_snps)

In [0]:
%%R
saveRDS(pca_std_data_snps, file="/gpfs_fs/home/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle41/pca_std_data_snps.rds")

In [0]:
%%R
pca_std_data_snps = readRDS("/gpfs_fs/home/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/beagle41/pca_std_data_snps.rds")

In [0]:
%%R
pca_std_data_snps[1:10,1:10]

In [0]:
%%R
tw = TWcalc(as.matrix(pca_std_data_snps),12)

In [0]:
tw_p = r("tw[[2]]")
tw_e = r("tw[[1]]")

In [0]:
# Count the leading axes whose Tracy-Widom p-value is <= 0.05: that is the
# index of the first p-value above 0.05. Start from len(tw_p) so that when
# every tested axis is significant the count is "all of them", not 0.
tw_num = len(tw_p)
for i, p in enumerate(tw_p):
    print(p)
    if p > 0.05:
        tw_num = i
        break
print("Tracy-Widom test yields %d axes of pop structure" % tw_num)

    8e-09
    8e-09
    8e-09
    8e-09
    8e-09
    8e-09
    5.328e-06
    0.000177359
    0.000773431
    0.509994383
    Tracy-Widom test yields 9 axes of pop structure

In [0]:
pca_cov = x.ix[:,0:tw_num]

In [0]:
hdf.put('pca_cov', pca_cov)

In [0]:
pca_cov.head()

In [0]:
pheno = hdf.get("pheno")

In [0]:
pheno.head()

In [0]:
pheno['sample_id'] = pheno.apply(lambda x: "%s_0" % x.sample_pheno, axis=1)
pheno.index = pheno['sample_id']
pheno = pheno.drop('sample_id', axis=1)

In [0]:
pheno.head()

In [0]:
pca_maf.head()

In [0]:
snp_cols

In [0]:
pca_std_pheno = pheno.join(pca_cov, how="inner").join(pca_maf.ix[:,:-2], how="inner")

In [0]:
pca_std_pheno.head()

In [0]:
pca_std_pheno.shape

In [0]:
hdf.put('pca_std_pheno', pca_std_pheno)

In [0]:
set(pca_std_data.index) - set(pca_std_pheno.index)

In [0]:
pca_std_pheno[:5]

In [0]:
translation_df[:5]

In [0]:
snp_id_pos_ref_alt_snps_only.head()

In [0]:
good_snps = snp_id_pos_ref_alt_snps_only
good_snp_file = os.path.join(analysis_dir, "goodsnps.vcf")
good_snp_file

In [0]:
# Keep only the loci that survived filtering (the "L..." columns of pca_maf).
# Genotype columns may carry a "_<contig>_<pos>" suffix after renaming, while
# good_snps is indexed by the bare locus id, so compare on the part before
# the first underscore. isin() also tolerates ids absent from good_snps,
# where .loc would raise and the removed .ix produced NaN rows.
kept_locus_ids = set(x.split("_", 1)[0] for x in pca_maf.columns if x.startswith("L"))
good_snps = good_snps[good_snps.index.isin(kept_locus_ids)]
reader = vcf.VCFReader(filename=os.path.join(analysis_dir, analysis_vcf))
# One set lookup per VCF record instead of a boolean scan of the whole
# good_snps frame per record.
good_sites = set(zip(good_snps.contig, good_snps.pos))
wrote = 0
with open(good_snp_file, "w") as o:
    writer = vcf.VCFWriter(o, reader)
    for rec in reader:
        if (rec.CHROM, rec.POS) in good_sites:
            writer.write_record(rec)
            wrote += 1
print("wrote %d recs to %s" % (wrote, good_snp_file))

In [0]:
hdf.put("good_snps", good_snps)

In [0]:
hdf2 = HDFStoreHelper("/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2_no_otis/notimputed/gypsy_samtools12_snps.vcf.gz.hd5")

In [0]:
good_snps_notimputed = hdf2.get('good_snps')

In [0]:
good_snps_notimputed.shape, good_snps.shape

In [0]:
from matplotlib_venn import venn2
venn2([set(good_snps_notimputed.index), set(good_snps.index)], set_labels=["not imputed", "imputed"])

In [0]:
inter_snps = set.intersection(*[set(good_snps_notimputed.index), set(good_snps.index)])

In [0]:
inter_snps_df = good_snps.ix[inter_snps,]

In [0]:
hdf.put('inter_snps_df', inter_snps_df)

In [0]:
good_snp_file_gz = "%s.gz" % good_snp_file
!$bgzip -c $good_snp_file > $good_snp_file_gz
!$tabix $good_snp_file_gz

In [0]:
good_snp_file_gz

In [0]:
reader = vcf.VCFReader(filename=good_snp_file_gz)
finder = vcf.VCFReader(filename=good_snp_file_gz)

def get_correct_name(name):
    """Translate a VCF sample name into ``<pop>_<indiv>_<dup>``.

    ``name`` is looked up by label in the index of the module-level
    ``translation_df``; raises KeyError if it is not present.
    """
    # .loc replaces the removed .ix accessor for this label lookup.
    row = translation_df.loc[name, :]
    return "%s_%d_%d" % (row['pop'], row.indiv, row.dup)

gt_base_data = {}
at = 0
for rec in reader:
    snps = finder.fetch(rec.CHROM, rec.POS-1, rec.POS)
    for snp in snps:
        snp_id = "%s_%d" % (snp.CHROM, snp.POS)
        for sample in snp.samples:
            if not snp_id in gt_base_data:
                gt_base_data[snp_id] = {}
            sample_name = get_correct_name(sample.sample)
            gt_base_data[snp_id][sample_name] = sample.gt_bases
    at += 1
    if at % 1000 == 0:
        print(at)
gt_base_df = pd.DataFrame(gt_base_data)

In [0]:
gt_base_df.head()

In [0]:
for col in gt_base_df.ix[:,0:30]:
    print(col, gt_base_df[col].unique())


In [0]:
hdf.put('gt_base_df', gt_base_df)

In [0]:
gt_base_df=hdf.get('gt_base_df')

In [0]:
gt_base_df.to_csv(os.path.join(analysis_dir, "gt_base_df.csv"), header=True, index=True, sep="\t")

In [0]:
gt_base_df = pd.read_csv(os.path.join(analysis_dir, "gt_base_df.csv"), index_col=0, sep="\t", dtype=str)

In [0]:
gt_base_df.head()

In [0]:
%%R
library(SNPassoc)

In [0]:
pheno.columns

In [0]:
pheno_cols = ['Mass','Pupual Duration','Total Dev Time']
pheno_cols.extend(gt_base_df.columns)

In [0]:
pheno_gt_base = pheno.merge(gt_base_df, left_index=True, right_index=True, how="inner")

In [0]:
pheno_gt_base = pheno_gt_base[pheno_cols]

In [0]:
pheno_gt_base.shape

In [0]:
hdf.put('pheno_gt_base', pheno_gt_base)

In [0]:
pheno_gt_base_pca = pheno_gt_base.merge(pca_cov, left_index=True, right_index=True, how="inner")

In [0]:
pheno_gt_base_pca.columns = pheno_gt_base_pca.apply(lambda x: x.name.replace(" ", "_"))

In [0]:
pheno_gt_base_pca.ix[:,0:3] = preprocessing.scale(pheno_gt_base_pca.ix[:,0:3])

In [0]:
pheno_gt_base_pca.head()

In [0]:
pheno_gt_base_pca.ix[:,0:3].apply(np.mean)

In [0]:
hdf.put('pheno_gt_base_pca', pheno_gt_base_pca)

In [0]:
pheno_gt_base_pca=hdf.get('pheno_gt_base_pca')

In [0]:
def snpassoc_filter(col):
    """Column filter used before writing the SNPassoc input table.

    * Columns whose name does not start with ``"L"`` are returned unchanged.
    * ``"L"`` columns with exactly three distinct non-null values are returned
      with nulls replaced by the string ``"NA"``.
    * Any other ``"L"`` column yields ``None``.
    """
    if not col.name.startswith("L"):
        return col
    n_genotype_classes = len(col.value_counts())
    if n_genotype_classes == 3:
        return col.fillna("NA")
    return None
pheno_gt_base_pca_snpassoc = pheno_gt_base_pca.apply(snpassoc_filter).dropna(how="all", axis=1)
pheno_gt_base_pca_snpassoc

In [0]:
pheno_gt_base_pca_snpassoc.shape

In [0]:
pheno_gt_base_pca_snpassoc.to_csv(os.path.join(analysis_dir, "pheno_gt_base_pca_snpassoc.txt"),
                         header=True,
                         index=True,
                         sep="\t")

In [0]:
def write_snpassoc_files(df, input_file, num_pca_axes, outdir=None):
    """Write one SNPassoc R script per phenotype and return the script paths.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose first three columns are the phenotypes; only the column
        names are used here.
    input_file : str
        Path the generated R script reads with ``read.table``.
    num_pca_axes : int
        Number of trailing PCA covariate columns (``PC1`` .. ``PC<n>``) in
        ``input_file``. The model formula is built from this value; it was
        previously hard-coded to nine axes whatever was passed.
    outdir : str, optional
        Directory for the ``snpassoc_<phenotype>.R`` files. Defaults to the
        module-level ``analysis_dir``.

    Returns
    -------
    list of str
        Paths of the scripts written, in phenotype order.
    """
    if outdir is None:
        outdir = analysis_dir
    # Right-hand side of the model: intercept plus one term per PCA axis.
    covariates = "1" + "".join("+pca_data$PC%d" % axis
                               for axis in range(1, num_pca_axes + 1))
    phenotypes = df.columns[0:3]
    out_files = []
    for p in phenotypes:
        with open(os.path.join(outdir, "snpassoc_%s.R" % p.lower()), "w") as o:
            print("writing %s" % o.name)
            out_files.append(o.name)
            text = '''
library(SNPassoc)

d = read.table('%s', sep="\\t", row.names=1, header=T)

#subtract b/c those are the PCA axes
snp_cols = 4:(ncol(d)-%d)
snp_data = setupSNP(d, colSNPs=snp_cols, sep="/")
pca_cols = (ncol(d)-%d):ncol(d)
pca_data = d[,pca_cols]

wg = WGassociation(%s~%s, data=snp_data, model="co", genotypingRate=5)
saveRDS(wg, "wg_%s_co.rds")
stats = WGstats(wg)
saveRDS(stats, "wgstats_%s.rds")
''' % (input_file,
       num_pca_axes,
       num_pca_axes-1,
       p,
       covariates,
       p.lower(),
       p.lower())

            o.write(text)
    return out_files

snpassoc_files = write_snpassoc_files(pheno_gt_base_pca_snpassoc, 
                          os.path.join(analysis_dir, "pheno_gt_base_pca_snpassoc.txt"),
                         9)
snpassoc_files

In [0]:
cat {analysis_dir}/snpassoc_mass.R

```
library(SNPassoc)

d = read.table('/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2/pheno_gt_base_pca_snpassoc.txt', sep="\t", row.names=1, header=T)

#subtract b/c those are the PCA axes
snp_cols = 4:(ncol(d)-9)
snp_data = setupSNP(d, colSNPs=snp_cols, sep="/")
pca_cols = (ncol(d)-8):ncol(d)
pca_data = d[,pca_cols]

wg = WGassociation(Mass~1+pca_data$PC1+pca_data$PC2+pca_data$PC3+pca_data$PC4+pca_data$PC5+pca_data$PC6+pca_data$PC7+pca_data$PC8+pca_data$PC9, data=snp_data, model="co", genotypingRate=5)
saveRDS(wg, "wg_mass_co.rds")
stats = WGstats(wg)
saveRDS(stats, "wgstats_mass.rds")
```

In [0]:
%load_ext rpy2.ipython

In [0]:
%R -i analysis_dir

In [0]:
%%R
wg_mass_co = readRDS(paste(analysis_dir, "/wg_mass_co.rds", sep=""))
wg_pd_co = readRDS(paste(analysis_dir, "/wg_pupual_duration_co.rds", sep=""))
wg_tdt_co = readRDS(paste(analysis_dir, "/wg_total_dev_time_co.rds", sep=""))

wgstats_mass = readRDS(paste(analysis_dir, "/wgstats_mass.rds", sep=""))
wgstats_pd = readRDS(paste(analysis_dir, "/wgstats_pupual_duration.rds", sep=""))
wgstats_tdt = readRDS(paste(analysis_dir, "/wgstats_total_dev_time.rds", sep=""))

In [0]:
wgstats_mass = r['wgstats_mass']
wgstats_mass_labels = r('labels(wg_mass_co)')

wgstats_pd = r['wgstats_pd']
wgstats_pd_labels = r('labels(wg_pd_co)')

wgstats_tdt = r['wgstats_tdt']
wgstats_tdt_labels = r('labels(wg_tdt_co)')

In [0]:
test = com.convert_robj(wgstats_mass)

In [0]:
for x in test:
    print(pd.DataFrame(test[x]))
    break

In [0]:
com.convert_robj(wgstats_mass_labels)

In [0]:
alpha_vals = {}

# Pair each phenotype's WGstats result with the SNP labels of its OWN
# WGassociation object. The pd and tdt entries previously reused the mass
# labels, leaving wgstats_pd_labels / wgstats_tdt_labels unused.
wgstats = {"mass":[wgstats_mass, wgstats_mass_labels],
           "pd":[wgstats_pd, wgstats_pd_labels],
           "tdt":[wgstats_tdt, wgstats_tdt_labels]}

# list(...) snapshots the items because values are replaced while iterating.
for key, datalist in list(wgstats.items()):
    print("converting %s" % key)
    # NOTE(review): `com` is not imported in the visible part of this notebook
    # - confirm where the R-to-pandas converter comes from.
    wgstats[key] = [com.convert_robj(x) for x in datalist]

In [0]:
def get_alleles(data):
    """Return the sorted list of distinct alleles in the ``"X/Y"`` genotype labels of ``data.index``.

    Sorting makes the result deterministic (a bare ``list(set)`` has arbitrary
    order, which decided the orientation when both homozygote counts tie).
    """
    alleles = set()
    for genotype in data.index:
        alleles.update(genotype.split("/"))
    return sorted(alleles)


def get_allele_freqs_wg(data, AA, Aa, aa):
    """Return ``(freq_A, freq_a)`` from the genotype counts in ``data['n']``.

    ``AA``, ``Aa`` and ``aa`` are the index labels of the three genotype rows.
    """
    # float() keeps the ratios true divisions even for integer counts.
    total = float(np.sum(data['n']) * 2)
    het = data.loc[Aa, "n"]
    A = data.loc[AA, "n"] * 2 + het
    a = data.loc[aa, "n"] * 2 + het
    return A / total, a / total


def get_genotypes(data, alleles):
    """Return the index labels ``(AA, Aa, aa)`` with ``AA`` the more frequent homozygote.

    ``alleles`` must hold exactly two alleles. The heterozygote label is tried
    in both orders and the one present in ``data.index`` is used.
    """
    homos = ["%s/%s" % (x, x) for x in alleles]
    Aa = "%s/%s" % (alleles[0], alleles[1])
    if Aa not in data.index:
        # Swap the two alleles explicitly; reversing the string character by
        # character only works for single-character alleles.
        Aa = "%s/%s" % (alleles[1], alleles[0])
    AA, aa = homos
    if data.loc[AA, "n"] < data.loc[aa, "n"]:
        AA, aa = homos[::-1]  # reverse it so that major is first
    return AA, Aa, aa


def get_genotypic_values(data, alleles):
    """Return ``(additive, dominance, AA, Aa, aa)`` from the ``me`` column of ``data``.

    additive  = (me[AA] - me[aa]) / 2
    dominance = me[Aa] - (me[AA] + me[aa]) / 2
    """
    AA, Aa, aa = get_genotypes(data, alleles)
    G_AA = float(data.loc[AA, 'me'])
    G_aa = float(data.loc[aa, 'me'])
    G_Aa = float(data.loc[Aa, 'me'])
    additive = (G_AA - G_aa) / 2
    dominance = G_Aa - ((G_AA + G_aa) / 2)
    return additive, dominance, AA, Aa, aa


def get_alpha(data):
    """Return ``(alpha, AA, aa, p, q)`` for one three-genotype locus table.

    ``alpha = additive + dominance * (q - p)`` where ``p`` / ``q`` are the
    frequencies of the alleles of ``AA`` / ``aa`` respectively.
    ``data`` is indexed by genotype label and has columns ``n`` and ``me``.
    """
    alleles = get_alleles(data)
    additive, dominance, AA, Aa, aa = get_genotypic_values(data, alleles)
    p, q = get_allele_freqs_wg(data, AA, Aa, aa)
    alpha = additive + (dominance * (q - p))
    return alpha, AA, aa, p, q

In [0]:
# For every phenotype, build a table of per-SNP alpha values from the
# converted WGstats output. lt3 counts loci with fewer than three genotype
# rows; errors collects the tables of loci that raised.
lt3 = {}
errors = {}
for pheno_key in wgstats:
    print("running %s" % pheno_key)
    df = pd.DataFrame(index=["alpha", "p-value", "AA", "aa", "p", "q"])
    alpha_vals[pheno_key] = df
    lt3[pheno_key] = 0
    errors[pheno_key] = []
    d = wgstats[pheno_key][0]
    labels = wgstats[pheno_key][1]
    for i, locus in enumerate(d):
        data = None
        try:
            data = pd.DataFrame(d[locus])
            snp = labels[i]
            genotypes = [g for g in data.index if "/" in g]
            data = data.loc[genotypes,:]
            # first non-null p-value, selected by position
            pvalue = data['p-value'].dropna().iloc[0]
            if len(genotypes) == 3:
                # The allele frequencies get their own names. The old unpack
                # into `p` overwrote the phenotype loop variable, so every
                # later lt3[p] / errors[p] access used a float as the key.
                alpha, AA, aa, p_freq, q_freq = get_alpha(data)
                df[snp] = [alpha, pvalue, AA, aa, p_freq, q_freq]
            elif len(genotypes) < 3:
                lt3[pheno_key] += 1
        except Exception as e: #needed for genotypes that are skipped b/c of genotyping rate
            traceback.print_exc()
            # `data` is still None if the DataFrame construction itself failed.
            errors[pheno_key].append(data.to_string() if data is not None else repr(locus))

In [0]:
alpha_vals['mass']

In [0]:
# One histogram of the per-SNP alpha values for each phenotype
# (.loc replaces the removed .ix accessor for the 'alpha' row lookup).
for trait in ("mass", "pd", "tdt"):
    plt.hist(alpha_vals[trait].loc['alpha',:])
    plt.title(trait)
    plt.show()

## Write Berg/Coop files

In [0]:
pop_allele_freqs = {}
for pop, data in pca_maf.groupby('population'):
    data = data.ix[:,:-2]
    pop_allele_freqs[pop] = data.apply(get_allele_freqs, args=(False,))

In [0]:
missing = pca_maf.apply(get_percent_missing)
missing = DataFrame(missing)
missing.columns = ["missing"]
missing[0:5]

In [0]:
def write_gwas_data_file(df, pheno, outdir):
    """Write the A1/A2/EFF/FRQ columns of ``df`` as a tab-separated file.

    The rows are written in index order (the caller's frame is not modified),
    with a header line and the index as the first column.

    Args:
        df: frame holding at least the columns A1, A2, EFF and FRQ.
        pheno: trait name used as the file-name prefix.
        outdir: directory that receives ``<pheno>_gwas_data_file.txt``.

    Returns:
        str: path of the file that was written (also printed).
    """
    out = os.path.join(outdir, "%s_gwas_data_file.txt" % pheno)
    wanted = ['A1', 'A2', 'EFF', 'FRQ']
    ordered = df.sort_index()
    ordered[wanted].to_csv(out, header=True, index=True, sep="\t")
    print(out)
    return out

def _write_cluster_freqs(out, df, pop_freqs, pops):
    """Write a SNP/CLST/A1/A2/FRQ table for the given populations to ``out``.

    For every population in ``pops`` the transposed frequency table
    ``pop_freqs[pop]`` is inner-joined to ``df`` on the SNP index, and the
    A1, A2 and per-population ``p`` columns are written sorted by SNP.
    """
    with open(out, "w") as o:
        o.write("SNP\tCLST\tA1\tA2\tFRQ\n")
        for pop in pops:
            m = pop_freqs[pop].T.merge(df, how="inner", left_index=True, right_index=True)
            m['population'] = pop
            m.index.name = 'SNP'
            m = m.sort_index()
            o.write(m[['population', 'A1', 'A2', 'p']].to_csv(header=False,
                                                              index=True,
                                                              sep="\t"))
    return out


def write_freqs_file(df, pheno, pop_freqs, outdir):
    """Write ``<pheno>_freqs_file.txt``: per-population frequencies of the SNPs in ``df``.

    Args:
        df: SNP-indexed frame with at least the columns A1 and A2.
        pheno: trait name used as the file-name prefix.
        pop_freqs: dict of population -> frequency table (a row ``p``, SNPs as columns).
        outdir: output directory.

    Returns:
        str: path of the written file (previously None; callers that ignore
        the result are unaffected).
    """
    out = os.path.join(outdir, "%s_freqs_file.txt" % pheno)
    print(out)
    return _write_cluster_freqs(out, df, pop_freqs, list(pop_freqs))


def write_match_pop_file(df, pheno, pop_freqs, pop, outdir):
    """Write ``<pheno>_match_pop_file.txt`` with the frequencies of one population.

    Only the population named ``pop`` is written; when it is not a key of
    ``pop_freqs`` the file contains just the header line.

    Returns:
        str: path of the written file.
    """
    out = os.path.join(outdir, "%s_match_pop_file.txt" % pheno)
    print(out)
    selected = [pop] if pop in pop_freqs else []
    return _write_cluster_freqs(out, df, pop_freqs, selected)


def write_full_dataset_file(df, pheno, pop_freqs, outdir):
    """Write ``<pheno>_full_dataset_file.txt``: per-population frequencies of the SNPs in ``df``.

    Same layout as ``write_freqs_file``; the callers pass the unfiltered SNP
    set here and the candidate subset there.

    Returns:
        str: path of the written file.
    """
    out = os.path.join(outdir, "%s_full_dataset_file.txt" % pheno)
    print(out)
    return _write_cluster_freqs(out, df, pop_freqs, list(pop_freqs))
def write_env_var_data_file(pheno, pop_freqs, outdir):
    """Write ``<pheno>_env_var_data_file.txt`` with one CLST/ENV/REG row per population.

    ENV is a fresh draw from ``np.random.randn()`` for every population and
    REG is the 1-based position of the population in ``pop_freqs``.
    NOTE(review): ENV looks like a random placeholder rather than a measured
    environmental variable — confirm before interpreting results.

    Args:
        pheno: trait name used as the file-name prefix.
        pop_freqs: iterable of population labels (the dict keys are used).
        outdir: output directory.
    """
    out = os.path.join(outdir, "%s_env_var_data_file.txt" % pheno)
    print(out)
    with open(out, "w") as o:
        o.write("CLST\tENV\tREG\n")
        for region, pop in enumerate(pop_freqs, 1):
            o.write("%s\t%f\t%d\n" % (pop, np.random.randn(), region))
                
                
# Write the Berg/Coop input files for every trait in alpha_vals.
outdir = analysis_dir  # loop-invariant, so set once
for p in alpha_vals:
    full = alpha_vals[p].T
    full = full.merge(missing, how="inner", left_index=True, right_index=True)
    # Keep SNPs whose missing fraction lies in [0.1, 0.2]. Other windows tried
    # earlier were [0.2, 0.3] and <= 0.3.
    in_window = (full['missing'] <= 0.2) & (full['missing'] >= 0.1)
    # .copy() so the column assignments below modify an independent frame
    # rather than a view of the filtered selection (chained assignment).
    full = full[in_window].copy()
    full.index.name = "SNP"
    # Reduce each homozygote label to its first character (the allele).
    full['AA'] = full['AA'].apply(lambda x: x[0])
    full['aa'] = full['aa'].apply(lambda x: x[0])
    full = full.rename(columns={'alpha':'EFF',
                                'AA':'A1',
                                'aa':'A2',
                                'p': 'FRQ'})
    # Candidate SNPs: association p-value below 0.05.
    candidates = full[full['p-value']<0.05]
    write_gwas_data_file(candidates, p, outdir)
    write_freqs_file(candidates, p, pop_allele_freqs, outdir)
    write_match_pop_file(full, p, pop_allele_freqs, "QC32", outdir)
    write_full_dataset_file(full, p, pop_allele_freqs, outdir)
    write_env_var_data_file(p, pop_allele_freqs, outdir)

In [0]:
squat_dir = "/home/cfriedline/eckertlab/src/PolygenicAdaptationCode/Scripts/"
def get_squat_vars(pheno):
    """Return the ``name=value`` argument list for PolygenicAdaptationFunction.

    The per-trait input files are referenced relative to the parent
    directory (``../<pheno>_*.txt``); all other settings are fixed.
    Entries are joined with a comma followed by a newline, so the result
    can be pasted between the parentheses of an R call.
    """
    per_trait = [
        ("gwas.data.file", "'../%s_gwas_data_file.txt'"),
        ("freqs.file", "'../%s_freqs_file.txt'"),
        ("env.var.data.files", "list('../%s_env_var_data_file.txt')"),
        ("match.pop.file", "'../%s_match_pop_file.txt'"),
        ("full.dataset.file", "'../%s_full_dataset_file.txt'"),
        ("path", "'%s'"),
    ]
    fixed = [
        ("match.categories", "c('MAF')"),
        ("match.bins", "list(seq(0,0.5,0.02), c(2), seq(0,1000,100))"),
        ("cov.SNPs.per.cycle", 5000),
        ("cov.cycles", 1),
        ("null.phenos.per.cycle", 1000),
        ("null.cycles", 1),
        ("load.cov.mat", "F"),
        ("sim.null", "T"),
        ("check.allele.orientation", "F"),
    ]
    settings = [(name, template % pheno) for name, template in per_trait]
    settings.extend(fixed)
    rendered = ["%s=%s" % pair for pair in settings]
    return ',\n'.join(rendered)

def create_squat_run_file(pheno):
    """Generate ``<analysis_dir>/squat/squat_<pheno>.r`` and return its path.

    The script changes into the PolygenicAdaptationCode checkout, sources
    CreateTraitFile.R and functions.R from ``squat_dir``, changes into the
    squat output directory and calls PolygenicAdaptationFunction with the
    arguments produced by ``get_squat_vars``. The output directory is
    created when it does not exist yet.
    """
    out_dir = os.path.join(analysis_dir, "squat")
    if not os.path.exists(out_dir):
        os.mkdir(out_dir)
    squat_file = os.path.join(out_dir, "squat_%s.r" % pheno)
    script_lines = [
        "setwd('/home/cfriedline/eckertlab/src/PolygenicAdaptationCode')\n",
        "source('%s')\n" % os.path.join(squat_dir, "CreateTraitFile.R"),
        "source('%s')\n" % os.path.join(squat_dir, "functions.R"),
        "setwd('%s')\n" % out_dir,
        "PolygenicAdaptationFunction(%s)\n" % get_squat_vars(pheno),
    ]
    with open(squat_file, "w") as o:
        for line in script_lines:
            o.write(line)
    return squat_file

for pheno in alpha_vals:
    squat_file = create_squat_run_file(pheno)
    print squat_file
    !cat $squat_file
    print ""

In [0]:
def run_squat():
    """Run the generated squat R scripts for the mass, pd and tdt traits.

    For each trait any previous output at ``<analysis_dir>/squat/<trait>`` is
    removed, R's working directory is set to ``analysis_dir`` and
    ``squat/squat_<trait>.r`` is sourced through rpy2.
    """
    import shutil  # local import: only needed to clear old output

    for p in ["mass", "pd", "tdt"]:
        print("running %s" % p)
        output = os.path.join(analysis_dir, "squat/%s" % p)
        # Pure-Python replacement for `rm -rf`: no shell, no path interpolation.
        if os.path.isdir(output) and not os.path.islink(output):
            shutil.rmtree(output)
        elif os.path.lexists(output):
            os.remove(output)
        r('setwd("%s")' % analysis_dir)
        command = 'source("%s/squat/squat_%s.r")' % (analysis_dir, p)
        print(command)
        r(command)

In [0]:
cd $home

In [0]:
run_squat()

In [0]:
rfiles = !find {analysis_dir}/squat | grep Robj | grep Output
bc = {}
for f in rfiles:
    d = f.split("/")
    trait = d[-3]
    if not trait in bc:
        bc[trait] = []
    bc[trait].append(f)
bc

In [0]:
# For every trait: load its saved R objects into the R session, then print
# the R variables the.stats and p.vals, each followed by a separator line.
for pheno in bc:
    print(pheno)
    for obj in bc[pheno]:
        r('load("%s")' % obj)
    for r_name, separator in (("the.stats", "------------"),
                              ("p.vals", "XXXXXXXXXXXXXXXXXXXXXXXXXX")):
        print(r(r_name))
        print(separator)

## Run BEAGLE to impute missing genotypes

In [0]:
# Prepend the two leading columns written to input.bgl: "I" (constant "M")
# followed by "id" (the current index). The guards make the cell safe to
# re-run: DataFrame.insert raises if the column already exists.
id_col = split_df.index
if "id" not in split_df.columns:
    split_df.insert(0, "id", id_col)
if "I" not in split_df.columns:
    split_df.insert(0, "I", "M")

In [0]:
split_df.to_csv(os.path.join(analysis_dir, "input.bgl"), sep="\t", header=True, index=False)

```bash
java -Xmx20g -jar /home/cfriedline/eckertlab/src/beagle_3.3.2/beagle.jar \
unphased=input.bgl \
missing='?' \
out=beagle_output
```

In [0]:
# Load BEAGLE's phased output. pandas decompresses the file itself, so no
# gzip handle is left open.
phased_path = os.path.join(analysis_dir, "beagle_output.input.bgl.phased.gz")
phased = pd.read_csv(phased_path, sep=" ", compression="gzip")
# Index rows by the marker id and drop the two leading BEAGLE columns.
phased.index = phased['id']
phased = phased.drop(["I", "id"], axis=1)

In [0]:
phased_dict = phased.T.to_dict()

In [0]:
# Combine the two haplotype columns of every sample ("<sample>_1" and
# "<sample>_2") into one "a/b" genotype string per SNP.
phased_comb = {}
for snp, sample_data in phased_dict.items():
    genotypes = {}
    for column in sample_data:
        # Strip the two-character haplotype suffix to get the sample name.
        s = column[:-2]
        # Each sample shows up once per haplotype column; build it only once.
        if s not in genotypes:
            genotypes[s] = sample_data["%s_1" % s] + "/" + sample_data["%s_2" % s]
    phased_comb[snp] = genotypes

In [0]:
phased_df = DataFrame(phased_comb)

In [0]:
phased_df[0:5]

In [0]:
z12_df_50_perc_polymorphic_maf_fis = hdf.get('z12_df_50_perc_polymorphic_maf_fis')
duplicates = z12_df_50_perc_polymorphic_maf_fis[z12_df_50_perc_polymorphic_maf_fis.duplicate=="1"]
duplicates

In [0]:
phased_df = phased_df.drop(duplicates.index)

In [0]:
phased_df.shape

In [0]:
def convert_to_z12(locus):
    """Recode one locus of "X/Y" genotype strings as 0/1/2.

    The more frequent allele is called ``A`` and the second most frequent
    ``a``; "A/A" maps to 0, either heterozygote orientation to 1 and "a/a"
    to 2. As a side effect the per-locus summary
    ``[A, a, P, Q, p, q, Fis, He, Ho]`` is stored in the module-level
    ``phased_af`` frame under ``locus.name``.

    Args:
        locus: Series of genotype strings, one entry per individual.

    Returns:
        Series with the same index as ``locus`` holding the 0/1/2 codes.

    Raises:
        IndexError: if fewer than two distinct alleles are observed.
        KeyError: if a genotype is not one of A/A, A/a, a/A, a/a
            (e.g. when a third allele is present).
    """
    counts = locus.value_counts()
    num_individuals = sum(counts)

    # Tally allele copies over all genotype classes.
    c = {}
    for genotype, n in zip(counts.index, counts.values):
        for allele in genotype.split("/"):
            c[allele] = c.get(allele, 0) + n
    c = sorted(c.items(), key=lambda x: x[1], reverse=True)
    A, P = c[0]
    a, Q = c[1]
    total = float(P + Q)
    p = P / total
    q = Q / total

    # Count BOTH heterozygote orientations ("A/a" and "a/A"); the original
    # listed "A/a" twice, which double-counted it and ignored "a/A".
    hets = ["%s/%s" % (A, a), "%s/%s" % (a, A)]
    PQ = sum(counts[het] for het in hets if het in counts.index)

    He = 2 * p * q * get_correction(num_individuals)
    Ho = PQ * 1.0 / num_individuals
    Fis = 1 - (Ho / He)
    af = [A, a, P, Q, p, q, Fis, He, Ho]
    trans = {"%s/%s" % (A, A): 0,
             "%s/%s" % (a, a): 2,
             "%s/%s" % (A, a): 1,
             "%s/%s" % (a, A): 1}
    phased_af[locus.name] = af
    z12 = locus.apply(lambda x: trans[x])
    return z12

# phased_af receives one column of summary statistics per locus; it is
# filled as a side effect of convert_to_z12 while phased_df is recoded.
af_fields = ["A","a","P","Q","p","q","Fis","He","Ho"]
phased_af = DataFrame(index=af_fields, columns=phased_df.columns)
phased_z12 = phased_df.apply(convert_to_z12)
phased_z12[:5]

In [0]:
phased_df[0:5]

In [0]:
phased_af[0:5]

In [0]:
hdf.put('phased_df', phased_df)
hdf.put('phased_af', phased_af)

In [0]:
plt.hist(phased_af.T['q'], bins=100)
plt.title("MAF")
plt.show()

In [0]:
phased_af.T[['Fis','Ho','He','p','q']].astype(float).describe()

In [0]:
phased_maf_drop = phased_af.T[phased_af.T['q']<0.01].index

In [0]:
phased_z12_maf = phased_z12.drop(phased_maf_drop, axis=1)

In [0]:
phased_z12_maf[0:5]

In [0]:
phased_Fis_drop = phased_af.T[(phased_af.T['Fis'] < -0.5) | (phased_af.T['Fis'] > 0.5)].index

In [0]:
phased_z12_maf_fis = phased_z12_maf.drop(phased_Fis_drop.intersection(phased_z12_maf.columns), 
                                         axis=1)

In [0]:
phased_monomorphic = phased_z12_maf_fis.apply(is_monomorphic)

In [0]:
phased_monomorphic[phased_monomorphic==True] #none

In [0]:
hierf_trans

In [0]:
def apply_hierf_trans2(series):
    """Translate every value of ``series`` through the ``hierf_trans`` lookup.

    A value that is not a key of ``hierf_trans`` raises KeyError.
    """
    def recode(value):
        return hierf_trans[value]

    return series.apply(recode)

In [0]:
phased_hierf = phased_z12_maf_fis.apply(apply_hierf_trans2)

In [0]:
# The notebook only imports urllib2 at the top, so `urllib.request` was an
# unbound name here; resolve urlopen for whichever Python is running.
try:
    from urllib.request import urlopen  # Python 3
except ImportError:
    from urllib2 import urlopen  # Python 2

url = 'https://docs.google.com/uc?export=download&id=0B4xHxBFoPCoWT0NneHJadUI0OHM'
response = urlopen(url)
pheno = pd.read_excel(response, "Males-forGenomics-final")
# .copy() so the new column is added to an independent frame rather than to
# a selection of the sheet.
pheno = pheno[['Population', 'Number', 'Mass', 'Pupual Duration', 'Total Dev Time']].copy()
# "<Population>_<Number>" label for every row, built in one pass instead of
# cell-by-cell .ix assignment.
pheno['sample_pheno'] = ["%s_%d" % (population, number)
                         for population, number in zip(pheno['Population'], pheno['Number'])]

In [0]:
# Index the phenotype table by "<sample_pheno>_0"; the index is named
# 'sample_id' and no 'sample_id' column is kept.
sample_ids = pheno['sample_pheno'].apply(lambda name: "%s_0" % name)
sample_ids.name = 'sample_id'
pheno = pheno.set_index(sample_ids)

In [0]:
pheno[0:5]

In [0]:
hdf.put('pheno', pheno)

In [0]:
pop_id

In [0]:
phased_hierf['population'] = phased_hierf.apply(lambda row: row.name.split("_")[0],axis=1)

In [0]:
phased_hierf2 = phased_hierf.apply(assign_popid, axis=1)

In [0]:
# Column order for the hierfstat table: popid first, then every column of
# phased_hierf2 except its last two, sorted.
hierf_cols = ['popid'] + sorted(phased_hierf2.columns[:-2])

In [0]:
phased_hierf3 = phased_hierf2.reindex(columns=hierf_cols)

In [0]:
phased_hierf3[0:5]

In [0]:
phased_hierf2.ix[0:5,["L10","L100047"]]

In [0]:
phased_hierf3.popid.unique()

In [0]:
phased_hierf3.to_csv(os.path.join(analysis_dir, "hierfstat_phased.txt"), header=True, index=False, sep="\t")

```R
library(hierfstat)
data = read.table("hierfstat_phased.txt", header=T, sep="\t")
levels = data.frame(data$popid)
loci = data[,2:ncol(data)]
res = varcomp.glob(levels=levels, loci=loci, diploid=T)
saveRDS(res, "hierfstat_phased.rds")

```

In [0]:
%%R
res = readRDS(paste(analysis_dir, "hierfstat_phased.rds", sep="/"))

In [0]:
res = com.convert_robj(robjects.r('res'))
loc_df = res['loc']
F_df = res['F']
overall_df = res['overall']
F_df

In [0]:
phased_fst = loc_df.apply(compute_fst, axis=1)

In [0]:
hdf.put('phased_fst', phased_fst)

In [0]:
# Histogram of per-locus Fst; the title reports n, mean, SD and range.
# Raw string: "\ ", "\m" and "\p" are not valid Python escapes, so the
# mathtext must not go through escape processing.
plt.hist(phased_fst, bins=50)
plt.title(r"$n=%d \ \mu=%.2f \pm %.2f \ [%.2f, %.2f]$" % (len(phased_fst),
                                                    np.mean(phased_fst),
                                                    np.std(phased_fst),
                                                    np.min(phased_fst),
                                                    np.max(phased_fst)))
plt.xlim(0, 0.3)
plt.show()

In [0]:
# Histogram of per-locus Fis; the title reports mean, SD and range.
# The Fis row is extracted once and reused; the title is a raw string
# because "\m", "\p" and "\ " are not valid Python escapes.
d = phased_af.T.Fis
plt.hist(d, bins=50)
plt.title(r"$\mu=%.4f \pm %.4f \ [%.2f, %.2f]$ " % (np.mean(d),
                                                 np.std(d),
                                                np.min(d),
                                                np.max(d)))
plt.show()

In [0]:
hdf.in_store("phased_af")

In [0]:
phased_z12_maf_fis.shape

In [0]:
def center_and_standardize_phased(locus):
    """Center and scale one locus column; other columns pass through.

    Only columns whose name starts with "L" are transformed. The scale
    term is ``q * (1 - q)`` with ``q`` read from ``phased_af``, and the
    center is the mean of all values different from -1. Each value is then
    handed to ``center_and_standardize_value`` together with both numbers.
    """
    if not locus.name.startswith("L"):
        return locus
    maf = phased_af.ix['q',locus.name]
    var = maf * (1 - maf)
    observed = [x for x in locus if x != -1]
    u = np.mean(observed)
    return locus.apply(center_and_standardize_value, args=(u, var))

In [0]:
phased_z12_maf_fis_std = phased_z12_maf_fis.apply(center_and_standardize_phased)

In [0]:
phased_z12_maf_fis_std[0:5]

In [0]:
prcomp_phased = prcomp(phased_z12_maf_fis_std, scale=False, center=False)

In [0]:
print((summary(prcomp_phased)))

In [0]:
# Pull the "x" component out of the prcomp result, label its rows with the
# sample index, attach the genotype matrix and derive each sample's
# population from the part of its name before the first underscore.
phased_x = com.convert_robj(prcomp_phased.rx2("x"))
phased_x.index = phased_z12_maf_fis_std.index
phased_joined = phased_x.join(phased_z12_maf_fis)
phased_joined['population'] = [sample_name.split("_")[0]
                               for sample_name in phased_joined.index]

In [0]:
# PC1 vs PC2 scatter, one colour per population (hsv colormap over pop_id).
norm = mcolors.Normalize(min(pop_id.values()), max(pop_id.values()))
legend = {}
fig, ax = plt.subplots(figsize=(10, 8))
# One scatter call per population instead of one per sample.
for pop, members in phased_joined.groupby('population'):
    color = cm.hsv(norm(pop_id[pop]))
    legend[pop] = color
    # color= (not c=): a single RGBA tuple passed as c= is ambiguous with a
    # sequence of scalar values.
    ax.scatter(members.PC1,
               members.PC2,
               s=50,
               color=color)
cmap = plt.get_cmap()
ax.set_title("PCA of n=%d samples on %d loci" % (len(phased_joined),
                                                 len(phased_z12_maf_fis.columns)))
# NOTE(review): the variance percentages are hard-coded from one particular
# prcomp run — update them if the input data changes.
ax.set_xlabel("PC1 (4.716%)")
ax.set_ylabel("PC2 (2.645%)")

# Legend entries in population-name order. The Patch objects themselves are
# not sorted: they are not orderable (TypeError on Python 3).
handles = []
for pop in sorted(legend):
    handles.append(mpatches.Patch(color=legend[pop], label=pop))
ax.legend(handles=handles)
plt.show()

In [0]:
cd $home

In [0]:
%%R
source("tw_calc.R")
test=read.table("twtable", header=F)

In [0]:
TWcalc = r('TWcalc')

In [0]:
phased_tw = TWcalc(phased_z12_maf_fis_std.values, 15)
print(phased_tw)

In [0]:
phased_tw_p = com.convert_robj(phased_tw.rx2(2))
phased_tw_e = com.convert_robj(phased_tw.rx2(1))

In [0]:
# Number of leading axes whose Tracy-Widom p-value is <= 0.05: the position
# of the first p-value above 0.05. If none exceeds 0.05, every tested axis
# counts (previously this case reported 0).
phased_tw_num = len(phased_tw_p)
for i, p in enumerate(phased_tw_p):
    print(p)
    if p > 0.05:
        phased_tw_num = i
        break
print("Tracy-Widom test yields %d axes of pop structure" % phased_tw_num)

```
8e-09
8e-09
8e-09
8e-09
8e-09
8e-09
8e-09
7.9e-08
0.000142452
0.007395547
0.134556018
Tracy-Widom test yields 10 axes of pop structure
```

In [0]:
phased_pca_cov = phased_x.ix[:,0:phased_tw_num]

In [0]:
phased_pca_cov

In [0]:
hdf.put('phased_pca_cov', phased_pca_cov)

In [0]:
pheno_pca_phased_z12_maf_fis = pheno.join(phased_pca_cov, how='inner').join(phased_z12_maf_fis, how='inner')

In [0]:
phased_df_filtered = phased_df.ix[phased_z12_maf_fis.index, phased_z12_maf_fis.columns]

In [0]:
phased_df_filtered.shape

In [0]:
pheno_cols = ['Mass','Pupual Duration','Total Dev Time']
pheno_cols.extend(phased_df_filtered.columns)

In [0]:
pheno[0:5]

In [0]:
pheno_phased_df_filtered = pheno.join(phased_df_filtered, how='inner')
pheno_phased_df_filtered = pheno_phased_df_filtered[pheno_cols]
pheno_phased_df_filtered[:5]

In [0]:
pheno_phased_df_filtered.ix[:,0:3] = preprocessing.scale(pheno_phased_df_filtered.ix[:,0:3])

In [0]:
pheno_phased_df_filtered_pca_cov = pheno_phased_df_filtered.join(phased_pca_cov, how='inner')

In [0]:
pheno_phased_df_filtered[:5]

In [0]:
hdf.put('pheno_phased_df_filtered_pca_cov', pheno_phased_df_filtered_pca_cov)

In [0]:
phased_snpassoc = pheno_phased_df_filtered_pca_cov.apply(snpassoc_filter)

In [0]:
phased_snpassoc = phased_snpassoc.dropna(how="all", axis=1)
phased_snpassoc.columns = [x.replace(" ", "_") for x in phased_snpassoc.columns]
phased_snpassoc[:5]

In [0]:
hdf.put('phased_snpassoc', phased_snpassoc)

In [0]:
phased_snpassoc.to_csv(os.path.join(analysis_dir, "phased_snpassoc.txt"),
                     header=True,
                     index=True,
                     sep="\t")

In [0]:
def write_phased_snpassoc_files(df, input_file, num_pca_axes):
    """Write one SNPassoc R script per phenotype and return the script paths.

    The first three columns of ``df`` are taken as phenotype names. Each
    script reads ``input_file``, treats columns 4 .. ncol-``num_pca_axes`` as
    SNPs and the last ``num_pca_axes`` columns as PCA covariates, runs a
    codominant WGassociation and saves the result and its WGstats as RDS.

    Args:
        df: frame whose first three column names are the phenotypes.
        input_file: path of the tab-separated table the R script reads.
        num_pca_axes: number of trailing PC columns (PC1..PCn) to use as
            covariates; the model formula is built from this value instead
            of always listing PC1..PC10.

    Returns:
        list of str: paths of the scripts written to ``analysis_dir``.

    Raises:
        ValueError: if ``num_pca_axes`` is smaller than 1 (the generated
            column ranges would be meaningless).
    """
    if num_pca_axes < 1:
        raise ValueError("num_pca_axes must be >= 1, got %r" % (num_pca_axes,))
    covariates = "+".join("pca_data$PC%d" % axis
                          for axis in range(1, num_pca_axes + 1))
    template = '''
library(SNPassoc)

d = read.table('%s', sep="\\t", row.names=1, header=T)

#subtract b/c those are the PCA axes
snp_cols = 4:(ncol(d)-%d)
snp_data = setupSNP(d, colSNPs=snp_cols, sep="/")
pca_cols = (ncol(d)-%d):ncol(d)
pca_data = d[,pca_cols]

wg = WGassociation(%s~1+%s, data=snp_data, model="co")

saveRDS(wg, "phased_wg_%s_co.rds")
stats = WGstats(wg)
saveRDS(stats, "phased_wgstats_%s.rds")
'''
    pheno = df.columns[0:3]
    out_files = []
    for p in pheno:
        with open(os.path.join(analysis_dir, "snpassoc_%s_phased.R" % p.lower()), "w") as o:
            print("writing %s" % o.name)
            out_files.append(o.name)
            o.write(template % (input_file,
                                num_pca_axes,
                                num_pca_axes - 1,
                                p,
                                covariates,
                                p.lower(),
                                p.lower()))
    return out_files

In [0]:
phased_snpassoc_files = write_phased_snpassoc_files(phased_snpassoc,
                                                    os.path.join(analysis_dir, "phased_snpassoc.txt"),
                                                    10)

In [0]:
!cat {analysis_dir}/snpassoc_mass_phased.R

```
library(SNPassoc)

d = read.table('/home/cfriedline/eckertlab/gypsy_indiv/masked/analysis/samtools1.2/phased_snpassoc.txt', sep="\t", row.names=1, header=T)

#subtract b/c those are the PCA axes
snp_cols = 4:(ncol(d)-10)
snp_data = setupSNP(d, colSNPs=snp_cols, sep="/")
pca_cols = (ncol(d)-9):ncol(d)
pca_data = d[,pca_cols]

wg = WGassociation(Mass~1+pca_data$PC1+pca_data$PC2+pca_data$PC3+pca_data$PC4+pca_data$PC5+pca_data$PC6+pca_data$PC7+pca_data$PC8+pca_data$PC9+pca_data$PC10, data=snp_data, model="co")

saveRDS(wg, "phased_wg_mass_co.rds")
stats = WGstats(wg)
saveRDS(stats, "phased_wgstats_mass.rds")
```