# In [43]:
import pandas as pd
from bokeh.plotting import figure, output_file, show
from bokeh.models import ColumnDataSource
from bokeh.models.tools import HoverTool
from bokeh.palettes import Dark2_5 as palette
import itertools
from read_file_table import get_multiindex_ctrl_file_table
import re

# Point Bokeh's output at a standalone HTML file.
output_file("pileup_test.html")

# Load the experiment file table once at module level; generate_pileup_figure
# reads it through the module-level name ``file_table_df``.  The index is
# sorted right after loading, before any ``.loc`` lookups are made against it.
file_table_path = "/Users/lij/PycharmProjects/eclip_analysis_pipeline/file_table.csv"
file_table_df = get_multiindex_ctrl_file_table(file_table_path).sort_index()

def _single_accession(table, experiment_type, label):
    """Return the one ``experiment_accession`` in *table* for *experiment_type*.

    Args:
        table: Rows of the file table already narrowed to one gene / cell line.
        experiment_type: Value to match in the ``experiment_type`` column.
        label: Human-readable name used in the error message.

    Raises:
        AssertionError: If zero or more than one row matches.
    """
    matches = table.loc[
        table["experiment_type"] == experiment_type, "experiment_accession"
    ]
    assert len(matches) == 1, (
        f"Expected exactly one accession for {label}, found {len(matches)}"
    )
    # Positional access on purpose: the filtered Series keeps the index labels
    # of the original rows, so ``matches[0]`` would be a label lookup and can
    # miss the only row that is present.
    return matches.iloc[0]


def generate_pileup_figure(filepath, gene_name, cell):
    """Build a Bokeh line plot of control vs. eCLIP pileups for one gene/cell.

    The accessions to plot are looked up in the module-level ``file_table_df``
    under ``(gene_name, cell)``; the pileup columns whose names contain those
    accessions are drawn against the 1-based row number of the CSV.

    Args:
        filepath: Path to the pileup CSV, one row per position and one column
            per dataset (assumes rows are in nucleotide order — TODO confirm).
        gene_name: First index key into ``file_table_df``; also shown in the
            title and the control legend entry.
        cell: Second index key into ``file_table_df``; shown in the title.

    Returns:
        The configured Bokeh figure (not yet shown).

    Raises:
        AssertionError: If the file table does not hold exactly one ``ctrl``
            and one ``eclip`` accession for the selection, or if the pileup
            table does not hold exactly one control column.
    """
    # Read-only use of the module-level table, so no ``global`` statement.
    selection = file_table_df.loc[gene_name, cell]

    ctrl_accession = _single_accession(selection, "ctrl", "controls")
    data_accession = _single_accession(selection, "eclip", "dataset")

    # reset_index() turns the default row number into an ``index`` column;
    # columns are sorted by name *before* the rename so the plotting order
    # (and therefore colour assignment) is the alphabetical column order.
    pileup_data = (
        pd.read_csv(filepath)
        .reset_index()
        .sort_index(axis=1)
        .rename(columns={'index': 'nucleotide position'})
    )
    pileup_data['nucleotide position'] += 1  # 0-based row number -> 1-based
    pileup_data.columns = [
        column.replace("_sorted_CONTIG_", "_") for column in pileup_data.columns
    ]
    source = ColumnDataSource(pileup_data)

    ctrl_names = [column for column in pileup_data.columns if ctrl_accession in column]
    data_names = [column for column in pileup_data.columns if data_accession in column]

    # Validate before indexing so a missing control column reports this
    # message rather than a bare IndexError.
    assert len(ctrl_names) == 1, (
        f"Expected exactly one control dataset, found {len(ctrl_names)}"
    )
    # The last "_"-separated token of the column name is used as the contig.
    contig = ctrl_names[0].split("_")[-1].upper()

    colors = itertools.cycle(palette)
    # NOTE(review): ``plot_width``/``plot_height`` and the ``legend=`` keyword
    # are the older Bokeh spellings; check the installed Bokeh version before
    # upgrading, newer releases use ``width``/``height`` and ``legend_label``.
    p = figure(plot_width=1500, plot_height=1000)
    p.title.text = f"{gene_name}, {contig} rRNA ({cell})"
    for ctrlset in ctrl_names:
        p.line(
            x="nucleotide position",
            y=ctrlset,
            source=source,
            color=next(colors),
            legend=(ctrl_accession + "_" + gene_name + "_CONTROL"),
        )
    for dataset in data_names:
        p.line(
            x="nucleotide position",
            y=dataset,
            source=source,
            line_width=2,
            color=next(colors),
            legend=dict(value=dataset),
        )
    p.title.text_font_size = "24pt"
    p.xaxis.axis_label = "Nucleotide position"
    p.yaxis.axis_label = "Reads per Million"
    p.axis.axis_label_text_font_size = "14pt"
    p.axis.major_label_text_font_size = "14pt"
    return p


# Alternative input (presumably the averaged pileup table), kept for reference:
# path = "/Users/lij/PycharmProjects/eclip_analysis_pipeline/20201027_pileup_data/pileup_output/average_pileup_output/18s_average_pileup.csv"
path = "/Users/lij/PycharmProjects/eclip_analysis_pipeline/20201027_pileup_data/pileup_output/18s_pileup_table.csv"

# Build the DDX6 / K562 figure from the table above and hand it to Bokeh.
reggie = generate_pileup_figure(filepath=path, gene_name='DDX6', cell="K562")
show(reggie)