# Random notes
 - Tap and box-select tools: hold `Shift` while clicking (or dragging a box over) bars to add them to the current selection, so that several bars can be selected at once.

In [1]:
import numpy as np
import pandas as pd
from bokeh.plotting import figure, show
from bokeh.io import output_notebook
from bokeh.models import ColumnDataSource, LinearColorMapper, tools, CustomJS
from bokeh.palettes import Oranges256
from bokeh.transform import transform, linear_cmap

# Send Bokeh output to the notebook (rather than a standalone HTML file),
# with the load banner hidden.
output_notebook(hide_banner=True)

In [2]:
# Load the merged test data set (path is relative to the notebook's directory).
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk; chunked inference is what triggers the mixed-type
# DtypeWarning on wide, sparsely populated columns such as the ones in this file.
# NOTE(review): if the column types are known, passing an explicit `dtype=`
# mapping would be stricter still — confirm the intended schema.
data = pd.read_csv('../data/test_data_merged_10000.csv', low_memory=False)
data

  exec(code_obj, self.user_global_ns, self.user_ns)


Unnamed: 0,Key,Num_DIAG,DIAG_01,DIAG_02,DIAG_03,DIAG_04,DIAG_05,DIAG_06,DIAG_07,DIAG_08,...,MYOPDATE_15,MYOPDATE_16,MYOPDATE_17,MYOPDATE_18,MYOPDATE_19,MYOPDATE_20,MYOPDATE_21,MYOPDATE_22,MYOPDATE_23,MYOPDATE_24
0,1,0,,,,,,,,,...,,,,,,,,,,
1,2,0,,,,,,,,,...,,,,,,,,,,
2,3,0,,,,,,,,,...,,,,,,,,,,
3,4,0,,,,,,,,,...,,,,,,,,,,
4,5,0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,20,D26962,D49262,D54318,D36068,D12142,D74753,D45795,D66155,...,57096.0,60529.0,55024.0,79591.0,51780.0,43183.0,38030.0,,,
9996,9997,20,D88558,D73974,D72770,D80793,D86633,D18476,D42776,D72743,...,57042.0,75706.0,10929.0,56941.0,13516.0,54186.0,57988.0,57300.0,79044.0,
9997,9998,20,D37129,D62871,D55147,D33643,D41880,D74478,D97347,D30782,...,23239.0,56516.0,16711.0,50298.0,46091.0,68457.0,12058.0,33634.0,87629.0,87833.0
9998,9999,20,D64324,D45842,D77944,D59685,D52313,D12828,D55866,D45379,...,57526.0,36408.0,25217.0,57561.0,99568.0,17156.0,78333.0,85650.0,97086.0,43875.0


In [3]:
# Count the missing values in every column of `data`, then turn the resulting
# Series into a two-column frame: 'index' holds the column name and 'Sum' the
# number of missing entries in that column.
missing_per_column = data.isnull().sum()
counts = missing_per_column.reset_index(name='Sum')
counts

Unnamed: 0,index,Sum
0,Key,0
1,Num_DIAG,0
2,DIAG_01,131
3,DIAG_02,1796
4,DIAG_03,3513
...,...,...
67,MYOPDATE_20,9994
68,MYOPDATE_21,9995
69,MYOPDATE_22,9996
70,MYOPDATE_23,9996


In [4]:
# Data source backing the bar chart: one row of `counts` per bar.
source = ColumnDataSource(data=counts)
TOOLS = "box_select, tap, reset"

# horizontal bar plot
# p = figure(title="Missing by column", y_range=counts['index'], width=960, height=960)
# p.hbar(y='index', right='Sum', source=source)
# p.xaxis.axis_label = "Count of missing"

# vertical bar plot
width = 0.5  # width of bars
p = figure(title="Value bar chart", x_range=counts['index'], tools=TOOLS, width=960, height=960)
p.vbar(x='index', top='Sum', width=width, source=source)
p.xaxis.major_label_orientation = 'vertical'
p.yaxis.axis_label = "Number of missing values"

# 'level_0' is the name given to the index of the data by 'ColumnDataSource'
# (the name 'index' is already taken by a real column of `counts`).
# Start with every bar selected, and keep `selected_indices` a plain list of
# Python ints so it has the same type as what the JS callback writes back later.
selected_indices = [int(i) for i in source.data['level_0']]

source.selected.indices = selected_indices

# Custom JavaScript callback that exports the indices of the selected bars to
# the Jupyter kernel as the Python variable `selected_indices`.
# The value is always sent as a list literal: concatenating the raw JS array
# would yield a bare int for one bar, a tuple for several bars and a Python
# syntax error for an empty selection.
callback = CustomJS(args=dict(source=source),
                    code="""
                         console.log('Running CustomJS callback now.');
                         // The IPython global only exists in the classic Notebook front-end.
                         if (typeof IPython === 'undefined' || !IPython.notebook || !IPython.notebook.kernel) {
                             console.warn('No IPython notebook kernel found; selection not exported.');
                             return;
                         }
                         const indices = Array.from(source.selected.indices);
                         const kernel = IPython.notebook.kernel;
                         kernel.execute("selected_indices = [" + indices.join(", ") + "]");
                         """)

# Run the callback whenever the selection itself changes. This fires for box
# select, tap and reset alike, and only after the indices have been updated.
source.selected.js_on_change('indices', callback)

show(p)

In [5]:
# get data of selection
# `selected_indices` is written by the plot's JS callback and may arrive as a
# bare int (a single bar), a tuple, a list or an array. np.atleast_1d wraps a
# scalar into a 1-element array, so every case ends up as a flat list of ints;
# astype(int) also keeps an empty selection usable as positional indices.
list_selected_indices = np.atleast_1d(selected_indices).astype(int).tolist()
selected_data = counts.iloc[list_selected_indices]
selected_data

Unnamed: 0,index,Sum
0,Key,0
1,Num_DIAG,0
2,DIAG_01,131
3,DIAG_02,1796
4,DIAG_03,3513
...,...,...
67,MYOPDATE_20,9994
68,MYOPDATE_21,9995
69,MYOPDATE_22,9996
70,MYOPDATE_23,9996


In [6]:
# Re-display the raw frame before deriving the missingness patterns from it.
data

Unnamed: 0,Key,Num_DIAG,DIAG_01,DIAG_02,DIAG_03,DIAG_04,DIAG_05,DIAG_06,DIAG_07,DIAG_08,...,MYOPDATE_15,MYOPDATE_16,MYOPDATE_17,MYOPDATE_18,MYOPDATE_19,MYOPDATE_20,MYOPDATE_21,MYOPDATE_22,MYOPDATE_23,MYOPDATE_24
0,1,0,,,,,,,,,...,,,,,,,,,,
1,2,0,,,,,,,,,...,,,,,,,,,,
2,3,0,,,,,,,,,...,,,,,,,,,,
3,4,0,,,,,,,,,...,,,,,,,,,,
4,5,0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,20,D26962,D49262,D54318,D36068,D12142,D74753,D45795,D66155,...,57096.0,60529.0,55024.0,79591.0,51780.0,43183.0,38030.0,,,
9996,9997,20,D88558,D73974,D72770,D80793,D86633,D18476,D42776,D72743,...,57042.0,75706.0,10929.0,56941.0,13516.0,54186.0,57988.0,57300.0,79044.0,
9997,9998,20,D37129,D62871,D55147,D33643,D41880,D74478,D97347,D30782,...,23239.0,56516.0,16711.0,50298.0,46091.0,68457.0,12058.0,33634.0,87629.0,87833.0
9998,9999,20,D64324,D45842,D77944,D59685,D52313,D12828,D55866,D45379,...,57526.0,36408.0,25217.0,57561.0,99568.0,17156.0,78333.0,85650.0,97086.0,43875.0


In [7]:
# Boolean mask of `data`: True wherever a value is missing.
missing_mask = data.isnull()

# One row per distinct missingness pattern (combination of True/False across
# all columns); 'Count' is the number of rows of `data` showing that pattern.
pattern_counts = missing_mask.groupby(list(missing_mask)).size().reset_index(name='Count')

# Weight every flag by its pattern's count: a cell becomes `Count` where the
# column is missing in that pattern and 0 where it is present. The 'Count'
# helper column itself is left out of the result.
flag_columns = pattern_counts.columns != 'Count'
data_missingness = pattern_counts.loc[:, flag_columns].multiply(pattern_counts['Count'], axis='index')

# Expose the pattern number as an ordinary 'index' column so it survives the melt.
data_missingness = data_missingness.reset_index()

# Long format: one row per (pattern, column) pair with the weighted flag in 'value'.
data_missingness_long = pd.melt(data_missingness, id_vars=['index'])

# The heatmap cells use the pattern numbers as *string* factors on a categorical
# x axis. Numeric x values on such an axis are treated as raw positions rather
# than factor names, which shifts every rect off its label, so the pattern id
# is stored as a string here.
data_missingness_long['index'] = data_missingness_long['index'].astype(str)

source2 = ColumnDataSource(data_missingness_long)


In [8]:
# First attempt: light-to-dark orange palette whose lowest entry is made fully
# transparent so that zero-valued cells show the background.
pal = list(reversed(Oranges256))
pal[0] = '#FF000000'
colourmap = LinearColorMapper(palette=pal, low=0, high=data_missingness_long['value'].max())

# y axis: one factor per data column. `data_missingness` also carries the
# 'index' helper column (the pattern number), which is not a variable of the
# melted data and would otherwise show up as an empty extra row.
pattern_variables = [col for col in data_missingness.columns if col != 'index']
# x axis: one string factor per missingness pattern.
pattern_ids = [str(idx) for idx in list(data_missingness.index)]

p2 = figure(title="Missingness pattern", width=960, height=960, y_range=pattern_variables, x_range=pattern_ids)
p2.background_fill_color = '#FFFFFF'
p2.grid.visible = False
# width/height slightly above 1 so neighbouring cells overlap and leave no gaps
p2.rect(y='variable', x='index', source=source2, width=1.05, height=1.05, fill_color=transform('value', colourmap),
        line_color=None)
show(p2)

In [9]:
# Second attempt: clip zero missingness to background colour

# y axis: one factor per data column. The 'index' helper column of
# `data_missingness` (the pattern number) is excluded; it is not a variable of
# the melted data and would otherwise add an empty extra row to the plot.
pattern_variables = [col for col in data_missingness.columns if col != 'index']
# x axis: one string factor per missingness pattern.
pattern_ids = [str(idx) for idx in list(data_missingness.index)]

p2 = figure(title="Missingness pattern", width=960, height=960,
            y_range=pattern_variables,
            x_range=pattern_ids)

p2.background_fill_color = '#cccccc'
p2.grid.visible = False

# Values below `low` (i.e. 0 = nothing missing) get the fully transparent
# `low_color`, so those cells show the grey background; everything from 1 up
# to the maximum is mapped onto the light-to-dark orange palette.
p2.rect(y='variable', x='index', source=source2, width=1.0, height=1.0,
        fill_color=linear_cmap(field_name='value',
                               palette=list(reversed(Oranges256)),
                               low=1,
                               high=data_missingness_long['value'].max(),
                               low_color='#00000000'),
        line_color='#cccccc')

show(p2)