# Visualization Design

By: *Tyler Biggs*

---

**Overview**

This notebook will go over the design of the vizualizations. It should also serve as a reference for future custom visualizations.

In [1]:
%load_ext autoreload
%autoreload 2
from pprint import pprint

In [2]:
import pandas as pd
import numpy as np
import collections

import bokeh as bk
import bokeh.io
import bokeh.models
import bokeh.layouts
import bokeh.plotting
bokeh.io.output_notebook()

# import holoviews as hv
# hv.extension('bokeh')

In [3]:
# Path hack to allow imports from the parent directory.
import sys, os
sys.path.insert(0, os.path.abspath('../../'))

In [23]:
from isadream.isadream.models import utils
from isadream.isadream import io

---

## Dataflow

The data is transfered from the Drupal server as a `.json` file. Those files placed into a directory as the user requests them. That is, all the datasets that a user selects for any given visualziation are placed in a directory. These files are condensed into four dataframes per `.json` file.

In [27]:
# A demo json file is provided.
nmr_json_demo = utils.SIPOS_DEMO
demo_base_path = utils.BASE_PATH
print(nmr_json_demo, '\n', demo_base_path)

/home/tylerbiggs/git/isadream/isadream/demo_data/demo_json/sipos_2006_talanta_nmr_figs.json 
 /home/tylerbiggs/git/isadream/isadream/demo_data/


In [28]:
demo_json = io.read_idream_json(nmr_json_demo)
node = io.parse_json(demo_json)

---

## Viewing the data in each Assay (datafile) per .json

---

### Getting Subsets

In [36]:
assay_chain_map = collections.ChainMap(*[a.as_dict for a in node.assays]).maps
df = pd.DataFrame.from_records(assay_chain_map)
# df.columns = pd.MultiIndex.from_tuples(df.columns)
df

Unnamed: 0,"(Material Property, Percent, ('Al(III)', 1.0))","(Material Property, Quality, ('Fake', 1.0)__('Fake', 2.0))","(Material Property, g/cm^3, ('Fake', 1.0)__('Fake', 2.0))","(Measurement, ppm, ('Al(III)', 1.0))","(Measurement, ppm, ('Fake', 1.0)__('Fake', 2.0))","(Measurement, ppm, ('OH-', 1.0)__('K+', 1.0))","(Measurement, ppm, ('OH-', 1.0)__('Li+', 1.0))","(Measurement, ppm, ('OH-', 1.0)__('Na+', 1.0))","(Measurement Condition, Molar, ('Al(III)', 1.0))","(Measurement Condition, Molar, ('Fake', 1.0)__('Fake', 2.0))","(Measurement Condition, Molar, ('OH-', 1.0)__('K+', 1.0))","(Measurement Condition, Molar, ('OH-', 1.0)__('Li+', 1.0))","(Measurement Condition, Molar, ('OH-', 1.0)__('Na+', 1.0))"
0,"[0.98, 0.98, 0.98, 0.98, 0.98]","[Poor, Poor, Poor, Poor, Poor]","[1.05, 1.05, 1.05, 1.05, 1.05]","[79.9, 79.84, 79.72, 79.66, 79.66]","[79.9, 79.84, 79.72, 79.66, 79.66]","[79.9, 79.84, 79.72, 79.66, 79.66]",,,"[0.005, 0.005, 0.005, 0.005, 0.005]","[0.006, 0.006, 0.006, 0.006, 0.006]","[2.93, 4.92, 6.85, 9.13, 10.71]",,
1,"[0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98]","[Poor, Poor, Poor, Poor, Poor, Poor, Poor, Poor]","[1.05, 1.05, 1.05, 1.05, 1.05, 1.05, 1.05, 1.05]","[79.92, 79.84, 79.66, 79.54, 79.36, 79.13, 78....","[79.92, 79.84, 79.66, 79.54, 79.36, 79.13, 78....",,"[79.92, 79.84, 79.66, 79.54, 79.36, 79.13, 78....",,"[0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.0...","[0.006, 0.006, 0.006, 0.006, 0.006, 0.006, 0.0...",,"[0.66, 1.1, 1.64, 2.14, 2.59, 3.11, 3.59, 4.11]",
2,"[0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.98, 0.9...","[Poor, Poor, Poor, Poor, Poor, Poor, Poor, Poo...","[1.05, 1.05, 1.05, 1.05, 1.05, 1.05, 1.05, 1.0...","[79.54, 79.98, 79.13, 78.45, 77.67, 76.47, 74....","[79.54, 79.98, 79.13, 78.45, 77.67, 76.47, 74....",,,"[79.54, 79.98, 79.13, 78.45, 77.67, 76.47, 74....","[0.005, 0.005, 0.005, 0.005, 0.005, 0.005, 0.0...","[0.006, 0.006, 0.006, 0.006, 0.006, 0.006, 0.0...",,,"[4.98, 0.93, 6.97, 8.96, 10.95, 12.99, 14.92, ..."


In [None]:
for 

In [30]:
for c in df.T:
    print(c)

0
1
2


In [8]:
# f_key = 'ppm'
# val_cols = [k for k in df.columns if f_key in k]
# key_cols = [k for k in df.columns if k not in val_cols]
# pprint(val_cols)
# pprint(key_cols)

In [9]:
# bkdf = df.dropna(axis=1)
# bkdf = bkdf.reindex(val_cols, axis=1).dropna(axis=1)
# bkdf

In [10]:
# cds = {}
# for samp in bkdf:
#     key = '__'.join(str(x) for x in samp)
#     cds[key] = np.concatenate(bkdf[samp].values)
    
# cds

In [21]:
def to_cds(query='ppm', drupal_node=node):
    cds = {}
    
    assay_chain_map = collections.ChainMap(*[a.as_dict for a in node.assays]).maps
    df = pd.DataFrame.from_records(assay_chain_map)
#     df.columns = pd.MultiIndex.from_tuples(df.columns)
#     df = df.dropna(axis=1)
    
    val_cols = [k for k in df.columns if query in k]
    key_cols = [k for k in df.columns if k not in val_cols]
    
    tdf = df.reindex(val_cols, axis=1)
    for samp in tdf:
        display(tdf)
        display(samp)
        key = 'val_dim__' +  '__'.join([str(x) for x in samp])
        for col in tdf:
            vectors = [v for v in tdf[col].values if v]
            vector_length = max([len(v) for v in vectors])
            cds[key] = np.concatenate([v for v in tdf[col].values if v])

    tdf = df.reindex(key_cols, axis=1)      
    for samp in tdf:
#         display(tdf)
#         display(samp)
        key = 'key_dim__' +  '__'.join([str(x) for x in samp])
        
        for col in tdf:
            cds[key] = np.concatenate([v for v in tdf[col].values if v])
        
    
    return cds

In [22]:
node_cds = to_cds()
node_cds

[('OH-', 1.0), ('K+', 1.0)]
[('Al(III)', 1.0)]
[('Fake', 1.0), ('Fake', 2.0)]
[('OH-', 1.0), ('Li+', 1.0)]
[('Al(III)', 1.0)]
[('Fake', 1.0), ('Fake', 2.0)]
[('OH-', 1.0), ('Na+', 1.0)]
[('Al(III)', 1.0)]
[('Fake', 1.0), ('Fake', 2.0)]


Unnamed: 0,"(Measurement, ppm, ('Al(III)', 1.0))","(Measurement, ppm, ('Fake', 1.0)__('Fake', 2.0))","(Measurement, ppm, ('OH-', 1.0)__('K+', 1.0))","(Measurement, ppm, ('OH-', 1.0)__('Li+', 1.0))","(Measurement, ppm, ('OH-', 1.0)__('Na+', 1.0))"
0,"[79.9, 79.84, 79.72, 79.66, 79.66]","[79.9, 79.84, 79.72, 79.66, 79.66]","[79.9, 79.84, 79.72, 79.66, 79.66]",,
1,"[79.92, 79.84, 79.66, 79.54, 79.36, 79.13, 78....","[79.92, 79.84, 79.66, 79.54, 79.36, 79.13, 78....",,"[79.92, 79.84, 79.66, 79.54, 79.36, 79.13, 78....",
2,"[79.54, 79.98, 79.13, 78.45, 77.67, 76.47, 74....","[79.54, 79.98, 79.13, 78.45, 77.67, 76.47, 74....",,,"[79.54, 79.98, 79.13, 78.45, 77.67, 76.47, 74...."


('Measurement', 'ppm', "('Al(III)', 1.0)")

ValueError: all the input arrays must have same number of dimensions

In [258]:
keys = [k for k in node_cds.keys()]
keys

["val_dim__Measurement__ppm__('Al(III)', 1.0)",
 "val_dim__Measurement__ppm__('Fake', 1.0)",
 "key_dim__Material Property__Percent__('Al(III)', 1.0)",
 "key_dim__Material Property__Quality__('Fake', 1.0)",
 "key_dim__Material Property__g/cm^3__('Fake', 1.0)",
 "key_dim__Measurement Condition__Molar__('Al(III)', 1.0)",
 "key_dim__Measurement Condition__Molar__('Fake', 1.0)"]

In [262]:
fig = bk.plotting.figure()
fig.circle(
    y="val_dim__Measurement__ppm__('Al(III)', 1.0)",
    x="key_dim__Measurement Condition__Molar__('Al(III)', 1.0)",
    source=bk.models.ColumnDataSource(to_cds())
)
bk.plotting.show(fig)

In [52]:
node_dict = dict()
for assay in node.assays:
    node_dict = dict(**node_dict, **assay.as_dict)

node_dict

TypeError: type object got multiple values for keyword argument '-9223363289947229531.8746907551759'

In [9]:
molar_df = df.xs(('Measurement_Condition', 'Molar'))
molar_df

Unnamed: 0,0,1,2,3,4
"((Al(III), 1.0),)",0.005,0.005,0.005,0.005,0.005
"((Fake, 2.0), (Fake, 1.0))",0.006,0.006,0.006,0.006,0.006
"((K+, 1.0), (OH-, 1.0))",2.93,4.92,6.85,9.13,10.71


In [10]:
ppm_df = df.xs(('Measurement', 'ppm'))
ppm_df

Unnamed: 0,0,1,2,3,4
"((Al(III), 1.0),)",79.9,79.84,79.72,79.66,79.66
"((Fake, 2.0), (Fake, 1.0))",79.9,79.84,79.72,79.66,79.66
"((K+, 1.0), (OH-, 1.0))",79.9,79.84,79.72,79.66,79.66


**Goal**

Get friendlier formats for `ColumnDataSource`.

In [11]:
def build_array(factor, assay):
    
    assay_df = pd.DataFrame.from_records(assay.column_data_source)
    assay_df.columns = pd.MultiIndex.from_tuples(assay_df.columns)
    assay_df = assay_df.T
    
    factor_df = assay_df.xs(factor)
    factor_df = factor_df.T.melt(var_name='species', value_name=str(factor))
    factor_df = factor_df.set_index('species')
    
    return factor_df

In [12]:
# for assay in node.assays:
#     display(build_array(('Measurement', 'ppm'), assay))

In [13]:
# for assay in node.assays:
#     display(build_array(('Measurement_Condition', 'Molar'), assay))

### Grouby

TODO...

In [14]:
# Groupby examples

---

# Bokeh Model

https://bokeh.pydata.org/en/latest/docs/reference/core/properties.html#container-properties

In [None]:
layout = []

for assay in node.assays:
    xs = build_array(('Measurement_Condition', 'Molar'), assay)
    
    ys = build_array(('Measurement', 'ppm'), assay)
    
    layout.append(hv.Scatter((xs, ys)))
    
hv.Layout(layout)