In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
import pandas as pd
import numpy as np
import statsmodels as sm
from statsmodels.sandbox.stats.multicomp import multipletests
import os,sys
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [0]:
# Point rpy2 at a specific R installation; R_HOME is set before rpy2 is imported.
# NOTE(review): absolute, user-specific path — adjust when running elsewhere.
os.environ['R_HOME'] = "/home/cfriedline/R3/lib64/R/"
import rpy2.ipython
import rpy2.robjects as robjects
# Load the rpy2 IPython extension (provides the %%R cell magic used below).
%load_ext rpy2.ipython
# Shorthand used later to evaluate R code given as a string.
r = robjects.r

In [0]:
%%R
library(topGO)
library(qvalue)

In [0]:
ri2py = robjects.conversion.ri2py

In [0]:
count_file = "seqclean/all_ests.fa.clean_output/contig_member.counts"

In [0]:
counts = pd.read_csv(count_file, sep="\t", header=0, index_col=0)

In [0]:
counts[0:5]

In [0]:
def combine(row):
    """Collapse one row of per-library counts into two pooled totals.

    Returns a three-element Series: a ``None`` placeholder, then
    ``P32C + P40C``, then ``P32N + P40N``.
    """
    c_total = row.P32C + row.P40C
    n_total = row.P32N + row.P40N
    return pd.Series([None, c_total, n_total])
# Apply combine() to every row of the count table (axis=1 => one call per row).
combined = counts.apply(combine, axis=1)
# Label the three positional outputs of combine() and name the index.
combined.columns = ["Descr", "C", "N"]
combined.index.name = "UNIQID"

In [0]:
combined[0:5]

In [0]:
# One-row frame, labelled 'UNIQID', holding the column sums of `combined`.
# The row's Descr cell is then overwritten with the literal 'Descr'.
# .loc replaces the deprecated .ix indexer (removed in pandas 1.0); both are
# label-based here and both support setting a row that does not exist yet.
totals = pd.DataFrame(columns=combined.columns)
totals.loc['UNIQID', :] = combined.apply(np.sum)
totals.loc['UNIQID', 'Descr'] = 'Descr'
totals

In [0]:
df = pd.concat([totals, combined])
df[0:5]

In [0]:
# Copy each row's index label into its Descr cell, skipping the first row.
# Done as a single positional .iloc assignment: the chained form
# `df.Descr[1:] = ...` assigns through an intermediate Series and is not
# guaranteed to write back into `df`.
df.iloc[1:, df.columns.get_loc("Descr")] = df.index[1:].values

In [0]:
# Build the replacement index: keep "UNIQID" for the first row, and turn every
# other label into an integer by removing "UN" from it.
new_index = ["UNIQID"] + [int(label.replace("UN", "")) for label in df.index[1:]]
new_index[0:5]

In [0]:
df.index = new_index

In [0]:
df[0:5]

In [0]:
len(df)

In [0]:
def convert_to_int(col):
    """Cast ``col`` to an integer dtype when possible.

    Parameters
    ----------
    col : pandas.Series
        Column to convert (as handed over by ``DataFrame.apply``).

    Returns
    -------
    pandas.Series
        ``col.astype(int)`` on success; otherwise the original ``col`` object,
        unchanged.
    """
    try:
        return col.astype(int)
    except (ValueError, TypeError, OverflowError):
        # Only conversion failures are treated as "not an integer column";
        # anything else (e.g. KeyboardInterrupt) is allowed to propagate.
        return col
# Try the integer cast on every column; columns that cannot be cast are kept as they are.
df = df.apply(convert_to_int)
        
# Tab-separated, index written as the first field, no header line; any float
# values are formatted without decimals.
df.to_csv("ideg6_counts.txt", sep="\t", header=False, index=True, float_format="%.0f")

In [0]:
!head ideg6_counts.txt

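## Use the IDEG6 web tool to identify differentially expressed genes
Submit `ideg6_counts.txt` (written above) to the IDEG6 form at <http://telethon.bio.unipd.it/bioinfo/IDEG6_form/> and save the output as `ideg6_results.txt`, which is loaded below.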

In [0]:
pwd

In [0]:
# Significance threshold used for the Bonferroni comparison of raw p-values.
# NOTE(review): the value equals 0.05 / 1945 — presumably alpha divided by the
# number of tests; confirm, and consider deriving it from the data instead.
bonferroni_alpha = 2.570694e-05

In [0]:
# Load the tab-separated IDEG6 output; the first column becomes the index.
results = pd.read_csv("ideg6_results.txt", sep="\t", header=0, index_col=0)
# Normalise the headers in one pass: drop ".", turn "-" into "_", then trim
# surrounding whitespace (same order as three separate passes).
results.columns = [x.replace(".", "").replace("-", "_").strip() for x in results.columns]
# Drop the extra column at the end. This is a positional slice, so use .iloc
# rather than the deprecated .ix indexer (removed in pandas 1.0).
results = results.iloc[:, :-1]

In [0]:
stat_cols = [u'AC1_2', u'Fisher1_2', u'Chi2x21_2', u'R', u'Chi']

In [0]:
qvalue_cols = [u'Fisher1_2', u'Chi2x21_2', u'R', u'Chi']

In [0]:
def fdr_bh(pvals):
    """Return the ``fdr_bh``-corrected p-values for ``pvals``.

    Calls ``multipletests`` with ``method="fdr_bh"`` and hands back the second
    element of its result.
    """
    outcome = multipletests(pvals, method="fdr_bh")
    return outcome[1]

In [0]:
def q_value(pvals):
    """Compute q-values for ``pvals`` with R's ``qvalue()``.

    The p-values are copied into the R global variable ``p`` and ``qvalue(p)``
    is evaluated in R.

    Parameters
    ----------
    pvals : pandas.Series or sequence of float
        Raw p-values.

    Returns
    -------
    pandas.Series
        The ``qvalues`` element of the R result. When ``pvals`` is a Series,
        its index is kept so the output lines up with the input rows;
        otherwise a default integer index is used.
    """
    p = robjects.FloatVector(pvals)
    robjects.globalenv['p'] = p
    vals = r('qvalue(p)')
    # Take the "qvalues" element. The "pvalues" element merely echoes the
    # input, which would make every *_q column a copy of the raw p-values.
    qvals = list(ri2py(vals.rx('qvalues')[0]))
    # Keep the caller's row labels; a fresh 0..n-1 index would misalign a
    # later index-based join with the source table.
    index = pvals.index if isinstance(pvals, pd.Series) else None
    return pd.Series(qvals, index=index)

# One q-value column per p-value column, suffixed with "_q".
qvalue_results = results[qvalue_cols].apply(q_value)
qvalue_results.columns = ["%s_q" % x for x in qvalue_results.columns]

In [0]:
# Run fdr_bh() on each statistic's column separately (apply works column-wise by default).
fdr_results = results[stat_cols].apply(fdr_bh)
# Suffix the names with "_fdr" so they do not collide with the raw columns.
fdr_results.columns = ["%s_fdr" % x for x in fdr_results.columns]

In [0]:
results_df = results.join(fdr_results).join(qvalue_results)

In [0]:
# Step plot of the sorted values of the Chi column against their rank, i.e. a
# cumulative count of tests as the value grows.
sns.set_context("talk")
X = sorted(results_df.Chi)
plt.step(X, np.arange(len(X)))
# Label the axes so the figure can be read on its own.
plt.xlabel("Chi (sorted)")
plt.ylabel("Cumulative number of tests")
plt.show()

In [0]:
results_df.columns

In [0]:
# Trim surrounding whitespace from the Description column. The vectorised
# .str.strip() leaves missing values in place instead of raising, and bracket
# assignment always targets the column (attribute assignment only does so when
# the column already exists).
results_df["Description"] = results_df["Description"].str.strip()

In [0]:
results_df[['Chi','Chi_fdr','Chi_q']][0:10]

In [0]:
fdr_cols = [u'AC1_2_fdr', u'Fisher1_2_fdr', u'Chi2x21_2_fdr', u'R_fdr', u'Chi_fdr',
           'Fisher1_2_q', 'Chi2x21_2_q', 'R_q', 'Chi_q']

In [0]:
# For every adjusted column record two numbers: how many values it has and how
# many of them are below 0.05. The table is shown transposed.
fdr_res = pd.DataFrame(index=['total','sig'])
for col in fdr_cols:
    d = results_df[col]
    n_total = len(d)
    n_sig = (d < 0.05).sum()
    fdr_res[col] = [n_total, n_sig]
fdr_res.T

In [0]:
# For every raw statistic column record: number of values, number below 0.05,
# and number below the Bonferroni threshold. The table is shown transposed.
stat_res = pd.DataFrame(index=['total','p<0.05', 'p<bonferroni'])
for col in stat_cols:
    d = results_df[col]
    # The threshold constant is named bonferroni_alpha; `bonferroni_p` is not
    # defined anywhere and raised NameError on a fresh kernel.
    stat_res[col] = [len(d), len(d[d<0.05]), len(d[d<bonferroni_alpha])]
stat_res.T

In [0]:
len(results_df[results_df.Chi < 0.05]), len(results_df[results_df.Chi_fdr < 0.05])

In [0]:
cd ~/g/projects/black_spruce/

In [0]:
go_file = "topGO_blast2go_export_20150330_1727.txt"

In [0]:
# Read the headerless, tab-separated export; column 0 is used as the index and
# the data column is named "go".
go = pd.read_csv(go_file, sep="\t", header=None, index_col=0, names=["go"])
# Trim whitespace from the index labels — presumably so they match the labels
# of `counts` in the join that follows; verify against the export file.
go.index = [x.strip() for x in go.index]

In [0]:
go[0:5]

In [0]:
len(go)

In [0]:
counts_go = counts.join(go)

In [0]:
len(counts_go)

In [0]:
# Re-key the results by their (whitespace-trimmed) Description value, so the
# index-based join with the count/GO table below matches on that label.
results_df.index = [x.strip() for x in results_df.Description]

In [0]:
results_df[0:5]

In [0]:
len(results_df[results_df.Chi_fdr<0.05])

In [0]:
full = counts_go.join(results_df)

In [0]:
len(full)

In [0]:
# Keep only the rows that have a GO annotation. A boolean mask replaces the
# deprecated .ix label lookup (removed in pandas 1.0), which would also repeat
# rows whenever an index label occurs more than once.
full_with_go = full[full.go.notnull()]

In [0]:
len(full_with_go)

In [0]:
sig = full[(full.Chi_fdr < 0.05)]

In [0]:
len(sig)

In [0]:
sig.head()

In [0]:
# Keep only the significant rows that have a GO annotation. A boolean mask
# replaces the deprecated .ix label lookup (removed in pandas 1.0), which would
# also repeat rows whenever an index label occurs more than once.
sig_with_go = sig[sig.go.notnull()]

In [0]:
len(sig_with_go)

In [0]:
# Split the significant, GO-annotated rows by which library column is larger;
# rows where Lib1 == Lib2 end up in neither set.
# NOTE(review): presumably Lib1 is the C column and Lib2 the N column, in the
# order they were written to ideg6_counts.txt — confirm against the IDEG6 output.
dge_N = sig_with_go[sig_with_go.Lib2 > sig_with_go.Lib1]
dge_C = sig_with_go[sig_with_go.Lib2 < sig_with_go.Lib1]

In [0]:
def get_num_terms(x):
    """Return how many comma-separated pieces the string ``x`` contains."""
    # n separators always delimit n + 1 pieces (an empty string counts as one).
    return x.count(",") + 1

# Summary statistics of the number of comma-separated GO entries per row,
# first for dge_C, then for dge_N.
print dge_C.go.apply(get_num_terms).describe()
print dge_N.go.apply(get_num_terms).describe()

In [0]:
pwd

In [0]:
# Export the first five columns (selected by position) of each set. .iloc is
# the explicit positional indexer; .ix is deprecated and removed in pandas 1.0.
dge_N.iloc[:, 0:5].to_csv("dge_N.csv")
dge_C.iloc[:, 0:5].to_csv("dge_C.csv")

In [0]:
len(sig_with_go), len(dge_C), len(dge_N)

In [0]:
# Write the index labels of each set to a text file, one label per line.
with open("dge_needle_names.txt", "w") as o:
    o.writelines("%s\n" % name for name in dge_N.index.tolist())

with open("dge_cambium_names.txt", "w") as o:
    o.writelines("%s\n" % name for name in dge_C.index.tolist())

In [0]:
ls -lrt

In [0]:
cat dge_needle_names.txt

## Run topGO with new counts from iAssembler

In [0]:
# Write the index label and the "go" value of every annotated row as two
# tab-separated fields, without a header line. The R cell below reads this
# file with readMappings().
full_with_go['go'].to_csv("go_mappings.txt", sep="\t", header=False, index=True)

In [0]:
len(full_with_go)

In [0]:
!head go_mappings.txt

In [0]:
!wc -l go_mappings.txt

In [0]:
%%R
rm(list=ls())

In [0]:
# Copy the four frames into R's global environment under the same names; the
# %%R cell below reads their row names.
# NOTE(review): relies on rpy2 being able to convert a pandas DataFrame passed
# to robjects.DataFrame — confirm for the installed rpy2 version.
robjects.globalenv['full_with_go'] = robjects.DataFrame(full_with_go)
robjects.globalenv['sig_with_go'] = robjects.DataFrame(sig_with_go)
robjects.globalenv['dge_C'] = robjects.DataFrame(dge_C)
robjects.globalenv['dge_N'] = robjects.DataFrame(dge_N)

In [0]:
%%R
# GO enrichment with topGO. For every ontology in `onts` and every gene set in
# `interesting` (cambium, needle) this cell builds a topGOdata object, runs the
# classic and weight01 Fisher tests, draws the GO graph, and writes three
# tables (all terms, BH <= 0.05, q-value <= 0.05). The workspace is saved at the end.
library(topGO)
# Gene universe: the row names of full_with_go; the two sets of interest are
# the row names of dge_C and dge_N.
gene_names = rownames(full_with_go)
cambium_interesting = rownames(dge_C)
needle_interesting = rownames(dge_N)
# Gene-to-GO mappings read from the file written by the Python side.
gene_id_2go  = readMappings(file="go_mappings.txt")
interesting = list()
interesting$cambium = cambium_interesting
interesting$needle = needle_interesting
# Accumulators filled inside the loops below; `descriptions`, `gentables`,
# `gentables_bh` and `gentables_qval` are read back from Python afterwards.
godata = list()
gentables = list()
gentables_bh = list()
gentables_qval = list()
onts = c("BP","CC", "MF")
sigs = list()
descriptions = list()
for (i in 1:length(onts)) {
    for (j in 1:length(interesting)) {
        interest = interesting[[j]]
        # Two-level factor over the universe: 1 = gene is in the current set, 0 = not.
        gene_list <- factor(as.integer(gene_names %in% interest))
        names(gene_list) <- gene_names
        # Label of the form "<set>-<ontology>"; also used in the output file names.
        description=paste(names(interesting)[j], onts[i], sep="-")
        descriptions = append(descriptions, description)
        GOdata = new("topGOdata",
                     description=description,
                     ontology = onts[i], 
                     allGenes = gene_list, 
                     annot = annFUN.gene2GO, 
                     gene2GO = gene_id_2go,
                     nodeSize=2)
        print(GOdata)
        # NOTE(review): GOdata and classicFisher are appended without list(),
        # unlike the tables below — confirm the resulting list structure if
        # `godata` / `sigs` are ever used.
        godata = append(godata, GOdata)
        classicFisher = runTest(GOdata, algorithm = "classic", statistic = "fisher")
        weight01Fisher = runTest(GOdata, algorithm = "weight01", statistic = "fisher")
        sigs = append(sigs, classicFisher)
        printGraph(GOdata, 
                   classicFisher, 
                   firstSigNodes = 2, 
                   fn.prefix = paste("tGO", "for", description(GOdata)), 
                   #fn.prefix = paste("tGOslim", "for", description(GOdata)), 
                   useInfo = "all")
        
        # Table of all scored terms, with both test results, ordered by classicFisher.
        gt = GenTable(GOdata, 
                      classicFisher=classicFisher, 
                      weight01Fisher=weight01Fisher, 
                      topNodes=length(classicFisher@score), 
                      orderBy="classicFisher", numChar=1000)
        gentables = append(gentables, list(gt))
        
        # Multiple-testing filters below use only the classicFisher column of the table.
        # NOTE(review): that column is converted with as.numeric(); if GenTable
        # renders very small p-values as text such as "< 1e-30" they would
        # become NA here and be dropped by which() — confirm.
        fisher_p = as.numeric(gt[,"classicFisher"])
        
        gt.bh = gt[which(p.adjust(fisher_p,method="BH")<=0.05),]
        
        #print(qvalue(fisher_p))
    
        
        gt.qval = gt[which(qvalue(fisher_p)$qvalues<=0.05),]
        gentables_bh = append(gentables_bh, list(gt.bh))
        gentables_qval = append(gentables_qval, list(gt.qval))
        # Three files per combination: "<description>.txt", "_bh.txt", "_qval.txt".
        write.table(gt, file=paste(description(GOdata), ".txt", sep=""), row.names=F)
        write.table(gt.bh, file=paste(description(GOdata), "_bh.txt", sep=""), row.names=F)
        write.table(gt.qval, file=paste(description(GOdata), "_qval.txt", sep=""), row.names=F)  
    }
}
save.image("topgo.Rdata")

In [0]:
# Pull the R result tables back into Python, keyed by the description string
# of each ontology/gene-set combination. The three R lists are indexed by the
# position of the description in `descriptions`.
gentables = {}
for i, desc in enumerate(r("descriptions")):
    d = {
        "gt": ri2py(r("gentables")[i]),
        "gt_bh": ri2py(r("gentables_bh")[i]),
        "gt_qval": ri2py(r("gentables_qval")[i]),
    }
    gentables[desc[0]] = d

In [0]:
gentables.keys()

In [0]:
# Add two boolean columns to every full table: 'bh' / 'qval' are True for GO
# IDs that also appear in the corresponding filtered table.
for k, tables in gentables.items():
    gt, bh, q = tables['gt'], tables['gt_bh'], tables['gt_qval']
    go_ids = gt['GO.ID']
    gt['bh'] = go_ids.isin(bh['GO.ID'])
    gt['qval'] = go_ids.isin(q['GO.ID'])
    # Keeps a handle on the last table processed.
    testgt = gt

In [0]:
# For each table print its key, then: key, number of terms flagged 'bh',
# number flagged 'qval', and the total number of terms.
for k in gentables:
    print k
    gt = gentables[k]['gt']
    print k, len(gt[gt.bh==True]), len(gt[gt.qval==True]), len(gt) 
    #print gt[gt.bh==True]

In [0]:
fasta_file = "seqclean/all_ests.fa.clean_output/all_unigene_seq.fasta"

In [0]:
from Bio import SeqIO

In [0]:
# Collect the FASTA records whose id is an index label of dge_C / dge_N.
cambium_dge_seqs = []
needle_dge_seqs = []
for rec in SeqIO.parse(fasta_file, "fasta"):
    if rec.id in dge_C.index:
        cambium_dge_seqs.append(rec)

    if rec.id in dge_N.index:
        needle_dge_seqs.append(rec)

# Write each set and print the value returned by SeqIO.write. The `with`
# blocks guarantee the output files are flushed and closed; the handles were
# previously opened inline and never closed explicitly.
with open("cambium_dge.fasta", "w") as handle:
    print(SeqIO.write(cambium_dge_seqs, handle, "fasta"))
with open("needle_dge.fasta", "w") as handle:
    print(SeqIO.write(needle_dge_seqs, handle, "fasta"))

In [0]:
import dill

In [0]:
# Serialise the tables with dill. Pickle data is binary, so the file is opened
# in "wb" mode, and the `with` block closes the handle once the dump finishes.
with open("gentables.dill", "wb") as handle:
    dill.dump(gentables, handle)

In [0]:
pwd