In [22]:
import numpy as np
import pandas as pd

from Bio import AlignIO, SeqIO

import bokeh as bk

import panel as pn
import panel.widgets as pnw
pn.extension()



In [60]:
### Read the clustering table and the per-representative chimeric counts

clustering_raw = pd.read_csv("../Data/clustering_info.csv")
rep_counts = pd.read_csv("../Data/chimeric_lib_rep_counts.csv")

### Build the working table in one chain:
###   1. member_count = number of 'AAV' occurrences in Members_char
###   2. order the clusters by that member count
###   3. attach the chimeric counts through the shared 'Representative' column
clustering_data = (
    clustering_raw
    .assign(member_count=clustering_raw["Members_char"].str.count('AAV'))
    .sort_values(by=['member_count'])
    .merge(rep_counts, on=['Representative'])
)

clustering_data.head()

Unnamed: 0,Cluster,Representative,Members_char,member_count,Chimeric.Count,Chimeric.Percentage
0,261,AAV.178444,AAV.178444,1,1,0.0
1,204,AAV.123793,AAV.123793,1,1,0.0
2,195,AAV.115309,AAV.115309,1,1,0.0
3,194,AAV.157653,AAV.157653,1,1,0.0
4,190,AAV.158741,AAV.158741,1,1,0.0


In [58]:
### Initial window of cluster sizes shown in the bar plot
lower_cut = 100
upper_cut = max(clustering_data.member_count)

### Keep the clusters whose size lies strictly between the two cuts
cut_tmp = clustering_data.loc[
    clustering_data["member_count"].gt(lower_cut)
    & clustering_data["member_count"].lt(upper_cut)
]

### Data source shared by both plots and the table:
### 'reps' holds the representative names, 'counts' the size of the matching cluster
source = bk.models.ColumnDataSource(
    data={"reps": cut_tmp["Representative"], "counts": cut_tmp["member_count"]}
)

### Range slider used to move the lower and upper cuts
range_slider = bk.models.RangeSlider(
    title="Range of the cluster sizes",
    start=1,
    end=upper_cut,
    step=1,
    value=(lower_cut, upper_cut),
)

### One bar plot per y-axis scale.
### VV: ideally a single figure would switch its y_axis_type property,
### but that property could not be accessed, hence two separate figures.
figure_options = dict(
    width=1800,
    height=1300,
    title="Cluster sizes",
    sizing_mode="scale_both",
)

p_linear = bk.plotting.figure(
    y_axis_type="linear",
    y_range=[1e-2, 5e3],
    x_range=source.data["reps"].values,
    **figure_options,
)

p_log = bk.plotting.figure(
    y_axis_type="log",
    y_range=[1e-2, 5e3],
    x_range=source.data["reps"].values,
    **figure_options,
)

### Same bars on both figures, with the x-axis labels rotated by 45 degrees
for scale_plot in (p_linear, p_log):
    scale_plot.vbar(
        x='reps',
        top='counts',
        bottom=1e-2,
        width=0.7,
        source=source,
        line_color='white',
        fill_color='lightblue',
    )
    scale_plot.xaxis.major_label_orientation = np.pi/4

### Put the linear and log plots into separate tabs of a single widget
panels = [
    bk.models.widgets.Panel(child=tab_plot, title=tab_title)
    for tab_plot, tab_title in ((p_linear, "Linear scale"), (p_log, "Log scale"))
]

tabs = bk.models.widgets.Tabs(tabs=panels)

### Table view of the very same data source
columns = [
    bk.models.TableColumn(field=column_field, title=column_title)
    for column_field, column_title in (('reps', 'Representative'),
                                       ('counts', 'Cluster size'))
]

representative_table = bk.models.DataTable(source=source, columns=columns, fit_columns=False)



def update_data(attrname, old, new):
    """Re-filter the clusters by size when the range slider moves.

    Registered as a Bokeh ``on_change('value', ...)`` handler.

    Parameters
    ----------
    attrname : str
        Name of the property that changed (unused).
    old : tuple
        Previous slider value (unused).
    new : tuple or None
        New ``(lower, upper)`` slider value. When ``None``, the current
        ``range_slider.value`` is read instead, so the function can still be
        called by hand to force a refresh.

    Notes
    -----
    ``clustering_data``, ``source``, ``p_log`` and ``p_linear`` are looked up
    as notebook globals at call time, so rebinding those names in another cell
    redirects this callback to the new objects.
    """
    ### Use the value Bokeh passes in: it belongs to the slider that fired,
    ### whereas the global name `range_slider` may have been rebound since.
    lower_cut, upper_cut = range_slider.value if new is None else new

    ### Keep the clusters whose size lies strictly between the two cuts
    cut_tmp = clustering_data[(clustering_data.member_count > lower_cut) &
                              (clustering_data.member_count < upper_cut)]

    ### The notebook reported a bug when the filtered data is empty.
    ### Skip the update and keep the last non-empty selection on screen.
    if cut_tmp.empty:
        return

    source.data = dict(reps=cut_tmp.Representative, counts=cut_tmp.member_count)

    ### The categorical x axes must list the representatives being drawn;
    ### take them straight from the filtered frame.
    new_list = cut_tmp.Representative.tolist()
    p_log.x_range.factors = new_list
    p_linear.x_range.factors = new_list

    
    



### Call update_data on every change of the slider value
range_slider.on_change('value', update_data)

### Stack slider, tabbed plots and table in one column and show it as the cell output
dashboard = bk.layouts.column(range_slider, tabs, representative_table, width=800)
pn.pane.Bokeh(dashboard)


In [68]:
### Lower and upper Chimeric.Count values to be included in the bar plot
lower_cut = 100
upper_cut = max(clustering_data["Chimeric.Count"])

### Keep the rows whose Chimeric.Count lies strictly between the two cuts
cut_tmp = clustering_data[(clustering_data["Chimeric.Count"] > lower_cut) &
                          (clustering_data["Chimeric.Count"] < upper_cut)]

### Prepare the ColumnDataSource
### reps is the representative names, counts is the Chimeric.Count of each representative
source = bk.models.ColumnDataSource(data = dict(reps = cut_tmp.Representative,
                                                counts = cut_tmp["Chimeric.Count"]))

### Make a range slider for changing the lower and upper cuts
### (the title now names the quantity that is actually filtered here)
range_slider = bk.models.RangeSlider(start = 1,
                                     end = upper_cut,
                                     value = (lower_cut, upper_cut),
                                     step = 1,
                                     title = "Range of the combined counts")


### Make the log and linear bar plots
### VV: Ideally, we should have changed the y_axis_type property, but I could not access it
### Both figures share one set of options so that title and y range cannot drift apart
### (the log figure previously kept the "Cluster sizes" title and a 5e3 upper bound).
figure_options = dict(width = 1800,
                      height = 1300,
                      y_range = [1e-2, 7e3],
                      title = "Combined counts",
                      sizing_mode = "scale_both")

p_linear = bk.plotting.figure(x_range = source.data["reps"].values,
                              y_axis_type = "linear",
                              **figure_options)

p_log = bk.plotting.figure(x_range = source.data["reps"].values,
                           y_axis_type = "log",
                           **figure_options)

### Draw the same bars on both figures and rotate the x-axis labels
for scale_plot in (p_linear, p_log):
    scale_plot.vbar(x = 'reps',
                    bottom = 1e-2,
                    top = 'counts',
                    width = 0.7,
                    source = source,
                    line_color = 'white',
                    fill_color = 'lightblue')
    scale_plot.xaxis.major_label_orientation = np.pi/4

### Combine the log and linear plots into separate tabs of a single panel
panels = [bk.models.widgets.Panel(child = p_linear, title="Linear scale"),
          bk.models.widgets.Panel(child = p_log, title="Log scale")]

tabs = bk.models.widgets.Tabs(tabs = panels)

### Also show the data in a table format

columns = [
    bk.models.TableColumn(field = 'reps', title = 'Representative'),
    bk.models.TableColumn(field = 'counts', title = 'Combined counts')
    ]

representative_table = bk.models.DataTable(source=source, columns=columns, fit_columns=False)



def update_data(attrname, old, new):
    """Re-filter the rows by ``Chimeric.Count`` when the range slider moves.

    Registered as a Bokeh ``on_change('value', ...)`` handler.

    Parameters
    ----------
    attrname : str
        Name of the property that changed (unused).
    old : tuple
        Previous slider value (unused).
    new : tuple or None
        New ``(lower, upper)`` slider value. When ``None``, the current
        ``range_slider.value`` is read instead, so the function can still be
        called by hand to force a refresh.

    Notes
    -----
    ``clustering_data``, ``source``, ``p_log`` and ``p_linear`` are looked up
    as notebook globals at call time, so rebinding those names in another cell
    redirects this callback to the new objects.
    """
    ### Use the value Bokeh passes in: it belongs to the slider that fired,
    ### whereas the global name `range_slider` may have been rebound since.
    lower_cut, upper_cut = range_slider.value if new is None else new

    ### Keep the rows whose Chimeric.Count lies strictly between the two cuts
    chimeric_count = clustering_data["Chimeric.Count"]
    cut_tmp = clustering_data[(chimeric_count > lower_cut) &
                              (chimeric_count < upper_cut)]

    ### The notebook reported a bug when the filtered data is empty.
    ### Skip the update and keep the last non-empty selection on screen.
    if cut_tmp.empty:
        return

    source.data = dict(reps=cut_tmp.Representative, counts=cut_tmp["Chimeric.Count"])

    ### The categorical x axes must list the representatives being drawn;
    ### take them straight from the filtered frame.
    new_list = cut_tmp.Representative.tolist()
    p_log.x_range.factors = new_list
    p_linear.x_range.factors = new_list

    
    



### Call update_data on every change of the slider value
range_slider.on_change('value', update_data)

### Stack slider, tabbed plots and table in one column and show it as the cell output
dashboard = bk.layouts.column(range_slider, tabs, representative_table, width=800)
pn.pane.Bokeh(dashboard)


In [51]:
### Scratch cell: display df2.
### NOTE(review): df2 is not defined in any visible cell of this notebook -- confirm where it comes from or drop the cell.
df2

Unnamed: 0,lkey,value_2
0,foo,5
1,bar,6
2,baz,7
3,foo,8


In [52]:
### Scratch cell: join df1 and df2 on their shared 'lkey' column
df1.merge(df2, on=["lkey"])

Unnamed: 0,lkey,value_1,value_2
0,foo,1,5
1,foo,1,8
2,foo,5,5
3,foo,5,8
4,bar,2,6
5,baz,3,7


In [42]:
### Scratch cell: display df1.
### NOTE(review): df1 is not defined in any visible cell of this notebook -- confirm where it comes from or drop the cell.
df1

Unnamed: 0,lkey,value
0,foo,1
1,bar,2
2,baz,3
3,foo,5
