In [0]:
from itertools import izip
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
%matplotlib inline
import seaborn as sns
import dill
from IPython.display import display
from statsmodels.sandbox.stats.multicomp import multipletests

In [0]:
def shell(cmd):
    """Run ``cmd`` through the system shell and return its non-empty stdout lines.

    Parameters
    ----------
    cmd : str
        Command line handed to the shell as-is (``shell=True``), so shell
        features such as globbing and pipes work. Only pass trusted strings.

    Returns
    -------
    list of str
        The lines written to stdout, with empty lines removed. stderr is
        captured and discarded, and the exit status is not checked, so a
        failing command simply yields whatever it wrote to stdout (often
        an empty list).
    """
    from subprocess import Popen, PIPE
    # universal_newlines=True makes communicate() return text instead of bytes
    # on Python 3 as well, so splitting on "\n" works on both major versions.
    p = Popen(cmd, shell=True, stdout=PIPE, stderr=PIPE, universal_newlines=True)
    stdout, _stderr = p.communicate()
    return [line for line in stdout.split("\n") if line != '']

In [0]:
cd ~/g/projects/black_spruce_new

## GO Graphs

Generated as follows:

1. filtered by unigene name according to expression in tissue > 0, including unigenes found in other tissues.
1. Sequence filter (5)
1. Graph-coloring (Node score)
1. Score alpha (0.6)
1. Node Score Filter (20)

In [0]:
# List the graph export files of each tissue through shell globbing.
# shell() returns stdout lines only, so a pattern that matches nothing
# yields an empty list (the `ls` error message goes to the discarded stderr).
all_needle_graphs = shell("ls *graph*20150707_1105*all_needle*.txt")
all_cambium_graphs = shell("ls *graph*20150707_1105*all_cambium*.txt")

#dge_needle_graphs = !ls *graph*20150511_1221*needle*.txt | grep -v 'all'
#dge_cambium_graphs = !ls *graph*20150511_1221*cambium*.txt | grep -v 'all'


In [0]:
all_needle_graphs

In [0]:
# graphs = {"Needle-All": all_needle_graphs,
#           "Cambium-All": all_cambium_graphs,
#          "Needle-DGE":dge_needle_graphs,
#          "Cambium-DGE":dge_cambium_graphs}

In [0]:
# Chart label -> list of graph files. The label has the form "<Tissue>-<Subset>";
# it is split on "-" further down when the charts are generated.
graphs = {"Needle-All": all_needle_graphs,
          "Cambium-All": all_cambium_graphs}

In [0]:
graphs

In [0]:
pwd

In [0]:
shell("ls -lrt *.dill")

In [0]:
# Load the serialized tables. The file is opened in binary mode (pickle data is
# not text) and inside a `with` block so the handle is closed deterministically.
# NOTE(review): dill, like pickle, can execute arbitrary code while loading --
# only load files that come from a trusted source.
with open("gentables.dill", "rb") as gentables_file:
    gentables = dill.load(gentables_file)

In [0]:
gentables.keys()

In [0]:
sns.set_context("notebook", font_scale=1.2)

## Write out unigenes that are present in each tissue

In [0]:
counts_file = "seqclean/all_ests.fa.clean_output/contig_member.counts"
counts_df = pd.read_csv(counts_file, sep="\t", index_col=0)

# Boolean presence flags: a row is flagged when the summed counts of the two
# columns of a tissue (P32*/P40*) are greater than zero.
counts_df = counts_df.assign(C = lambda x: x.P32C + x.P40C > 0)
counts_df = counts_df.assign(N = lambda x: x.P32N + x.P40N > 0)

# One output file per tissue, one index label (unigene name) per line.
for presence_flag, out_name in (("N", "needle_unigenes.txt"),
                                ("C", "cambium_unigenes.txt")):
    present = counts_df[counts_df[presence_flag]].index.tolist()
    with open(out_name, "w") as o:
        o.writelines("%s\n" % elem for elem in present)

## Generate Blast2GO combined graphs for each ontology

* Sequence filter = 5
* Graph coloring = by Node Score
* Score alpha = 0.6
* Node Score Filter = 5
* Node Information = all

In [0]:
# Reset the rc settings in place. Rebinding `mpl.rcParams` to `mpl.rcParamsDefault`
# does not reset anything: modules that already hold a reference to the original
# rcParams object keep using it, and every later `mpl.rcParams[...] = ...` would
# silently edit the *defaults* object instead of the live configuration.
mpl.rcdefaults()

In [0]:
# Figure text settings used for the exported charts.
mpl.rcParams.update({
    'svg.fonttype': 'none',         # keep SVG text as text instead of converting glyphs to paths
    'font.sans-serif': ['Arial'],
    'font.size': 40.0,
})

## Generate the pie charts

They are annotated as follows:

* The top 20 GO terms are added to the terms that are significant, dropping duplicates for cases where significant terms are already in the top 20
* If this number is > 20, then the **not** significant terms are trimmed, starting with the term having the smallest number of associated unigenes, until a total of 20 GO terms remain in the pie chart
* If there are $\le$ 20 terms after adding significant ones, no trimming is performed
* If more than 20 terms still remain after removing non-significant terms from the bottom up, then only the top 20 are kept
* \* indicates that a term is significant at p < 0.05
* \*\* indicates that a term is significant at p < 0.05 after BH multiple test correction
* The numbers in parentheses mean the following:
    * In the title, it is the number of unique unigenes in the chart
    * In each term, it is the number of unigenes annotated to that term

In [0]:
def plot_pie(plot_data, title, merged, top):
    """Draw, save (as SVG) and show a pie chart of GO terms sized by "#Seqs".

    Selection of the slices:

    * exact duplicate rows are dropped;
    * if more than ``top`` rows remain, non-significant rows are removed,
      smallest "#Seqs" first, until ``top`` rows are left or no
      non-significant row remains;
    * if there are still more than ``top`` rows, only the ``top`` rows with
      the largest "#Seqs" are kept.

    Slice labels are "<term> (<#Seqs>)", followed by "*" when the
    ``classicFisher`` p-value is below 0.05 and by a second "*" when the
    ``bh`` column is true. The chart title ends with
    "(<unique sequences in the chart>/<rows with p < 0.05>/<rows with bh true>)";
    the last two counts are taken before any trimming.

    Parameters
    ----------
    plot_data : pandas.DataFrame
        Needs the columns "#Seqs" and "Sequences" (comma-separated ids).
        When ``merged`` is true it also needs "Term_x", "classicFisher" and
        "bh"; otherwise it needs "Term". The frame is not modified.
    title : str
        Description inserted into the chart title; the output file name is
        derived from the final title (" " -> "_", "/" -> "-", plus ".svg").
    merged : bool
        Whether ``plot_data`` carries the enrichment columns. Without them no
        term is treated as significant.
    top : int
        Maximum number of slices.
    """
    # Work on a copy so the helper columns below never leak into the caller's frame.
    plot_data = plot_data.copy()
    seq_counts = plot_data["#Seqs"]

    if merged:
        plot_data['classicFisher'] = plot_data['classicFisher'].astype(float)
        base_labels = ["%s (%d)" % pair for pair in zip(plot_data["Term_x"], seq_counts)]
        sig_flags = list(plot_data['classicFisher'] < 0.05)
        # `== True` (not truthiness) so that a missing value counts as "not significant"
        bh_flags = list(plot_data['bh'] == True)
    else:
        # No enrichment columns available: plain labels, nothing is significant.
        base_labels = ["%s (%d)" % pair for pair in zip(plot_data["Term"], seq_counts)]
        sig_flags = [False] * len(plot_data)
        bh_flags = [False] * len(plot_data)

    plot_data['Label'] = base_labels
    plot_data['siglabel'] = [label + ("*" if is_sig else "") + ("*" if is_bh else "")
                             for label, is_sig, is_bh in zip(base_labels, sig_flags, bh_flags)]
    # Track significance with a real flag instead of searching the label text
    # for "*", which would misfire on a term name that contains an asterisk.
    plot_data['not_sig'] = [not (is_sig or is_bh)
                            for is_sig, is_bh in zip(sig_flags, bh_flags)]

    num_sig = int(sum(sig_flags))
    num_bh = int(sum(bh_flags))

    plot_data = plot_data.drop_duplicates()

    if len(plot_data) > top:
        print("trimming from %d to %d" % (len(plot_data), top))
        smallest_first = plot_data.sort_values("#Seqs")
        surplus = len(plot_data) - top
        # index labels of the `surplus` smallest non-significant rows
        droppable = smallest_first.index[smallest_first['not_sig'].values][:surplus]
        plot_data = plot_data.drop(droppable)

    # Largest slices first; the positional slice is a no-op when <= top rows remain.
    plot_data = plot_data.sort_values("#Seqs", ascending=False).iloc[:top]

    colors = sns.cubehelix_palette(len(plot_data),
                                   start=.5,
                                   rot=-1.5,
                                   dark=.15,
                                   light=1.0,
                                   reverse=True)

    fig, ax = plt.subplots(figsize=(8, 8))
    ax.pie(plot_data["#Seqs"],
           colors=colors,
           labels=list(plot_data['siglabel']))

    # A sequence can be annotated to several terms; count each one once.
    unique_seqs = set()
    for seq_list in plot_data["Sequences"]:
        unique_seqs.update(seq_list.split(","))

    print("%d %d" % (num_sig, num_bh))
    title = "The %d top and significant terms for %s (%d/%d/%d)"  % (top, title, len(unique_seqs),
                                                                    num_sig, num_bh)

    ax.set_title(title)
    svg_name = title.replace(" ", "_").replace("/", "-") + ".svg"
    fig.savefig(svg_name, bbox_inches="tight")
    plt.show()
    # Release the figure explicitly; this function is called in a loop.
    plt.close(fig)
    
    
def process_graph_file_by_level(key, g, top=20):
    """Plot one pie chart per GO level (levels above 1) for a graph file.

    Parameters
    ----------
    key : str
        Chart label; the part before the first "-" (lower-cased) is used as
        the tissue half of the ``gentables`` lookup key.
    g : str
        Path of a tab-separated graph file. The two characters in front of
        the 4-character extension are upper-cased and used as the ontology
        code. The file needs "Level", "#Seqs" and the columns ``plot_pie``
        reads.
    top : int, optional
        Maximum number of slices per chart, forwarded to ``plot_pie``.
        ``plot_pie`` requires this argument; the previous three-argument
        call raised ``TypeError``.

    Nothing is plotted when ``gentables`` has no "<tissue>-<ontology>" entry,
    mirroring ``process_graph_file``.
    """
    ontology = g[-6:-4].upper()
    tissue = key.split("-")[0].lower()
    df = pd.read_csv(g, sep="\t", header=0, index_col=1)
    genkey = "%s-%s" % (tissue, ontology)
    if genkey not in gentables:
        return

    # set_index returns a new frame, so the cached table is left untouched.
    gt = gentables[genkey]['gt'].set_index('GO.ID', drop=False)
    df = df.merge(gt, left_index=True, right_index=True)

    df = df[df.Level > 1].copy()
    if df.empty:
        return
    # plot_pie reads the 'bh' column for merged data. The correction is applied
    # once over all plotted levels, the same way process_graph_file does it.
    # NOTE(review): confirm that a per-ontology (not per-level) correction is intended.
    df['bh'] = correct_pvals(df)

    for level, data in df.groupby("Level"):
        d = data.sort_values("#Seqs", ascending=False)
        plot_pie(d, "%s/%s/Level %d" % (key, ontology, level), True, top)
            
def correct_pvals(df):
    """Return the first element of ``multipletests(..., method="fdr_bh")``.

    The ``classicFisher`` values of ``df`` are converted to ``float`` before
    the call. Per the statsmodels documentation, that first element is the
    array of reject flags (true where the hypothesis is rejected at the
    function's default alpha).
    """
    raw_pvalues = list(map(float, df.classicFisher))
    outcome = multipletests(raw_pvalues, method="fdr_bh")
    return outcome[0]
            
            
def process_graph_file(key, g, top):
    """Plot a single pie chart for a graph file, covering every level except 1.

    Parameters
    ----------
    key : str
        Chart label; the part before the first "-" (lower-cased) is used as
        the tissue half of the ``gentables`` lookup key.
    g : str
        Path of a tab-separated graph file. The two characters in front of
        the 4-character extension are upper-cased and used as the ontology
        code.
    top : int
        Maximum number of slices, forwarded to ``plot_pie``.

    The lookup key "<tissue>-<ontology>" is always printed. Nothing is plotted
    when ``gentables`` has no entry for it.
    """
    ontology = g[-6:-4].upper()
    tissue = key.split("-")[0].lower()
    df = pd.read_csv(g, sep="\t", header=0, index_col=1)
    genkey = "%s-%s" % (tissue, ontology)
    print(genkey)
    if genkey not in gentables:
        return

    # set_index returns a new frame, so the cached table is left untouched
    # (the previous `gt.index = ...` modified the object stored in gentables).
    gt = gentables[genkey]['gt'].set_index('GO.ID', drop=False)
    merged_df = df.merge(gt, left_index=True, right_index=True)

    d = merged_df.sort_values("#Seqs", ascending=False)
    # Explicit copy: adding a column to a filtered slice is a chained
    # assignment and triggers pandas' SettingWithCopyWarning.
    d = d[d.Level != 1].copy()
    d['bh'] = correct_pvals(d)
    plot_pie(d, "%s/%s" % (key, ontology), True, top)
            
            
# Generate one chart per graph file for every non-DGE entry of `graphs`,
# counting the processed files in `plotted`.
plotted = 0
for key, files in graphs.items():
    name_parts = key.split("-")
    # "Needle-All" -> "Needle_all-All"
    key = "%s_all-%s" % (name_parts[0], name_parts[1])
    if "DGE" in key:
        continue
    print("================= Charts for %s =================" % key)
    for g in files:
        # NOTE(review): the notebook text describes top-20 charts, but 30 is
        # passed here -- confirm which value is intended.
        process_graph_file(key, g, 30)
        plotted += 1