In [0]:
%load_ext autoreload
%autoreload 2

In [0]:
import pandas as pd
import numpy as np
import statsmodels as sm
from statsmodels.sandbox.stats.multicomp import multipletests
import os,sys

In [0]:
# R_HOME is set before rpy2 is imported; presumably rpy2 reads it to locate
# the R installation — TODO confirm.
# NOTE(review): this is an absolute, user-specific path and will need editing
# on any other machine.
os.environ['R_HOME'] = "/home/cfriedline/R3/lib64/R/"
import rpy2.ipython
import rpy2.robjects as robjects
import pandas.rpy.common as com
# (Re)load the rmagic extension; presumably this provides the %%R cell magic
# used further down — verify against the installed IPython/rpy2 versions.
%reload_ext rmagic
# Short alias for rpy2's `robjects.r` object.
r = robjects.r

In [0]:
# Relative path of the tab-separated count table read in the next cell.
count_file = "seqclean/all_ests.fa.clean_output/contig_member.counts"

In [0]:
# Load the count table: tab-delimited, first row is the header and the first
# column becomes the row index.
counts = pd.read_csv(
    count_file,
    sep="\t",
    header=0,
    index_col=0,
)

In [0]:
# Preview the first five rows of the raw count table.
counts.iloc[:5]

In [0]:
def combine(row):
    """Collapse one row of per-library counts into per-group totals.

    Reads the ``P32C``, ``P40C``, ``P32N`` and ``P40N`` attributes of ``row``
    and returns a three-element Series: a ``None`` placeholder, the sum of the
    two ``*C`` values, and the sum of the two ``*N`` values.
    """
    c_total = row.P32C + row.P40C
    n_total = row.P32N + row.P40N
    return pd.Series([None, c_total, n_total])
# Apply `combine` to every row, then label the three resulting columns and
# name the index.
combined_labels = ["Descr", "C", "N"]
combined = counts.apply(combine, axis=1)
combined.index.name = "UNIQID"
combined.columns = combined_labels

In [0]:
# Preview the first five combined rows.
combined.iloc[:5]

In [0]:
# Build a one-row frame, labelled 'UNIQID', holding the column sums of
# `combined`; its 'Descr' cell is then overwritten with the literal 'Descr'.
# `.loc` replaces the deprecated `.ix` indexer (removed in pandas 1.0); both
# accesses here are purely label-based, so the result is the same.
totals = pd.DataFrame(columns=combined.columns)
totals.loc['UNIQID', :] = combined.apply(np.sum)
totals.loc['UNIQID', 'Descr'] = 'Descr'
totals

In [0]:
# Stack the totals row on top of the per-contig rows, then preview the result.
frames = [totals, combined]
df = pd.concat(frames, axis=0)
df.iloc[:5]

In [0]:
# Fill 'Descr' for every row after the first with that row's own index label.
# The assignment goes through a single .iloc call instead of the chained
# `df.Descr[1:] = ...`: the chained form writes to an intermediate Series and
# is not guaranteed to update `df` itself.
df.iloc[1:, df.columns.get_loc("Descr")] = df.index[1:]

In [0]:
# New row labels: keep "UNIQID" for the first row and turn every later label
# into an integer by removing the "UN" substring.
new_index = ["UNIQID"] + [int(label.replace("UN", "")) for label in df.index[1:]]
new_index[:5]

In [0]:
# Replace the row labels with `new_index` ("UNIQID" first, then integer ids).
df.index = new_index

In [0]:
# Preview the first five rows after relabelling.
df.iloc[:5]

In [0]:
def convert_to_int(col):
    """Cast ``col`` to integer dtype, returning it unchanged if the cast fails.

    Parameters
    ----------
    col : object with an ``astype`` method (e.g. a pandas Series)

    Returns
    -------
    The result of ``col.astype(int)`` on success; otherwise ``col`` itself.

    Only cast failures (``ValueError``, ``TypeError``, ``OverflowError``) are
    treated as "leave the column alone". A bare ``except:`` would also swallow
    ``KeyboardInterrupt`` and ``SystemExit``, hiding real problems.
    """
    try:
        return col.astype(int)
    except (ValueError, TypeError, OverflowError):
        return col
# Convert each column to integers where possible, then write the table as
# tab-separated text with the index but no header row; any remaining floats
# are printed without decimals.
df = df.apply(convert_to_int)
df.to_csv(
    "ideg6_counts.txt",
    sep="\t",
    header=False,
    index=True,
    float_format="%.0f",
)

In [0]:
# Show the first lines of ideg6_counts.txt as a quick check of the written file.
!head ideg6_counts.txt

## Use the IDEG6 web tool to identify differentially expressed genes
http://telethon.bio.unipd.it/bioinfo/IDEG6_form/

In [0]:
# Significance threshold compared against the raw p-value columns in the
# 'p<bonferroni' summary further down.
# NOTE(review): the value equals 0.05 / 1945 to the printed precision, so it
# was presumably derived from 1945 tests — confirm against the result count.
bonferroni_p = 2.570694e-05

In [0]:
# Load the IDEG6 output (tab-separated, header row, first column as index).
results = pd.read_csv("ideg6_results.txt", sep="\t", header=0, index_col=0)
# Normalise the column names in one pass: remove ".", turn "-" into "_", then
# trim surrounding whitespace (same order as the original three passes).
results.columns = [
    name.replace(".", "").replace("-", "_").strip() for name in results.columns
]
# Drop the extra column at the end. `.iloc` replaces the deprecated `.ix`
# indexer (removed in pandas 1.0); the slice is positional.
results = results.iloc[:, :-1]

In [0]:
# Columns of `results` that are passed to the FDR correction and to the
# p-value summary table below.
stat_cols = [u'AC1_2', u'Fisher1_2', u'Chi2x21_2', u'R', u'Chi']

In [0]:
def fdr_bh(pvals):
    """Return the corrected p-values from ``multipletests(..., method="fdr_bh")``.

    ``multipletests`` returns a sequence; only its second element (index 1) is
    kept and returned.
    """
    outcome = multipletests(pvals, method="fdr_bh")
    corrected = outcome[1]
    return corrected

In [0]:
# Apply the FDR correction column by column, then tag each resulting column
# with an "_fdr" suffix.
fdr_results = results[stat_cols].apply(fdr_bh)
fdr_results.columns = ["{0}_fdr".format(name) for name in fdr_results.columns]

In [0]:
# Attach the FDR-adjusted columns to the original results (index-aligned
# left join, the default for DataFrame.join).
results_df = results.join(fdr_results, how="left")

In [0]:
# Display the column names of the joined results frame.
results_df.columns

In [0]:
# The FDR-adjusted columns summarised below; each name is an entry of
# `stat_cols` with the "_fdr" suffix.
fdr_cols = [u'AC1_2_fdr', u'Fisher1_2_fdr', u'Chi2x21_2_fdr', u'R_fdr', u'Chi_fdr']

In [0]:
# For each FDR column, record the number of rows and how many adjusted
# p-values fall below 0.05; display with one row per statistic.
fdr_res = pd.DataFrame(index=['total', 'bh_p<0.05'])
for col in fdr_cols:
    d = results_df[col]
    below_cutoff = d < 0.05
    fdr_res[col] = [d.shape[0], int(below_cutoff.sum())]
fdr_res.T

In [0]:
# For each raw p-value column, record the number of rows, how many values are
# below 0.05, and how many are below `bonferroni_p`; display one row per
# statistic.
stat_res = pd.DataFrame(index=['total', 'p<0.05', 'p<bonferroni'])
for col in stat_cols:
    d = results_df[col]
    n_nominal = int((d < 0.05).sum())
    n_bonferroni = int((d < bonferroni_p).sum())
    stat_res[col] = [d.shape[0], n_nominal, n_bonferroni]
stat_res.T

In [0]:
# File name of the tab-separated GO annotation table read in the next cell.
go_file = "all_filtered_plants_30_30_annot_GOs_20140916_1726.txt_topGO.txt"

In [0]:
# Read the GO table (no header row; the data column is named "go"), then
# shorten each index label to the piece after its first underscore.
go = pd.read_csv(
    go_file,
    sep="\t",
    header=None,
    index_col=0,
    names=["go"],
)
go.index = [label.split("_")[1] for label in go.index]

In [0]:
# Preview the first five GO annotation rows.
go.iloc[:5]

In [0]:
# Add the "go" column to the raw counts (index-aligned left join, the default
# for DataFrame.join).
counts_go = counts.join(go, how="left")

In [0]:
# Re-index the results by their whitespace-trimmed Description values.
results_df.index = [label.strip() for label in results_df["Description"]]

In [0]:
# Combine counts + GO annotations with the IDEG6 statistics (index-aligned
# left join, the default for DataFrame.join).
full = counts_go.join(results_df, how="left")

In [0]:
# Keep rows whose AC and Chi FDR-adjusted p-values are both below 0.05.
ac_passes = full["AC1_2_fdr"] < 0.05
chi_passes = full["Chi_fdr"] < 0.05
sig = full[ac_passes & chi_passes]

In [0]:
# Split the significant rows by which library has the larger value:
# dge_N where Lib2 exceeds Lib1, dge_C where Lib1 exceeds Lib2.
dge_N = sig[sig["Lib1"] < sig["Lib2"]]
dge_C = sig[sig["Lib1"] > sig["Lib2"]]

## Run topGO with the new counts from iAssembler

In [0]:
%%R
# Attach the topGO package in the R session.
library(topGO)

In [0]:
# Write the "go" column of the significant rows as index<TAB>value lines with
# no header row.
# NOTE(review): only rows in `sig` are written — confirm that this is the
# intended mapping set for whatever reads go_mappings.txt.
go_for_sig = sig["go"]
go_for_sig.to_csv(
    "go_mappings.txt",
    sep="\t",
    header=False,
    index=True,
)

In [0]:
# Convert `full` to an R data frame and bind it to the name "full" in R's
# global environment.
full_as_r = com.convert_to_r_dataframe(full)
robjects.globalenv['full'] = full_as_r

In [0]:
%%R
# For every ontology (BP, CC, MF) and every gene set in `interesting`, build a
# topGOdata object, run two Fisher tests, draw the graph, and write a full
# result table plus a BH-filtered table to text files.
library(topGO)
#rm(list=ls())
# `full` is the data frame bound into R's global environment from Python.
counts = full
gene_names = rownames(counts)
# Lib1 / Lib2 columns as numeric vectors, named by row name.
# NOTE(review): `c` reuses the name of base R's c(); the c("BP", ...) call
# below should still resolve to the function, but a different name would be
# clearer — confirm before renaming.
c = as.numeric(counts$Lib1)
n = as.numeric(counts$Lib2)
names(c) = rownames(counts)
names(n) = rownames(counts)
# Keep the entries whose value is greater than 1.
cambium_interesting = c[c>1]
needle_interesting = n[n>1]
# Gene-to-GO mappings read from go_mappings.txt.
gene_id_2go  = readMappings(file="go_mappings.txt")
interesting = list()
interesting$cambium = cambium_interesting
interesting$needle = needle_interesting
# Accumulators filled inside the loop.
godata = list()
gentables = list()
gentables_bh = list()
onts = c("BP","CC", "MF")
sigs = list()
for (i in 1:length(onts)) {
    for (j in 1:length(interesting)) {
        interest = interesting[[j]]
        # Two-level factor over all gene names: 1 if the gene is in the
        # current set, 0 otherwise.
        gene_list <- factor(as.integer(gene_names %in% names(interest)))
        names(gene_list) <- gene_names
        # Description is "<set name>-<ontology>"; it is reused below to name
        # the output files.
        GOdata = new("topGOdata",
                     description=paste(names(interesting)[j], onts[i], sep="-"),
                     ontology = onts[i], 
                     allGenes = gene_list, 
                     annot = annFUN.gene2GO, 
                     gene2GO = gene_id_2go,
                     nodeSize=5)
        print(GOdata)
        # NOTE(review): GOdata and classicFisher are appended without list(),
        # unlike gt / gt.bh below — verify the resulting list structure.
        godata = append(godata, GOdata)
        classicFisher = runTest(GOdata, algorithm = "classic", statistic = "fisher")
        weight01Fisher = runTest(GOdata, algorithm = "weight01", statistic = "fisher")
        sigs = append(sigs, classicFisher)
        printGraph(GOdata, 
                   classicFisher, 
                   firstSigNodes = 5, 
                   fn.prefix = paste("tGO", "for", description(GOdata)), 
                   #fn.prefix = paste("tGOslim", "for", description(GOdata)), 
                   useInfo = "all")
        
        # Table with one row per scored node (topNodes = number of
        # classicFisher scores), ordered by the classicFisher column.
        gt = GenTable(GOdata, 
                      classicFisher=classicFisher, 
                      weight01Fisher=weight01Fisher, 
                      topNodes=length(classicFisher@score), 
                      orderBy="classicFisher", numChar=1000)
        gentables = append(gentables, list(gt))
        # Keep the rows whose BH-adjusted classicFisher value is <= 0.05.
        # NOTE(review): confirm that gt[,"classicFisher"] holds values
        # p.adjust can treat as numeric; non-numeric entries would become NA.
        gt.bh = gt[which(p.adjust(gt[,"classicFisher"],method="BH")<=0.05),]
        gentables_bh = append(gentables_bh, list(gt.bh))
        write.table(gt, file=paste(description(GOdata), ".txt", sep=""), row.names=F)
        write.table(gt.bh, file=paste(description(GOdata), "_bh.txt", sep=""), row.names=F)
        }    
}
# Save the whole R workspace to topgo.Rdata.
save.image("topgo.Rdata")