## Dev Dashboarding ##
Serve this notebook as a dashboard with: `panel serve PHNX_prod_analytics.ipynb --session-token-expiration 600`
* https://docs.google.com/document/d/1PU1OUlR6i1fGRnLsv8iYXmmY_byGu2kccWzCMyY922o/edit

In [1]:
%load_ext autoreload
%autoreload 2

import pandas as pd
import matplotlib.pyplot as plt
import sys, os
import panel as pn
import pickle
import numpy as np

# Make the project packages importable from this notebook.
# repo_dir is the parent of the current working directory and base_dir is the
# grandparent; both are added to the import path so the package imports that
# follow can be resolved.
# os.path.dirname is used instead of slicing on '/' positions so the lookup
# honours the platform's path separator and cannot raise IndexError when the
# working directory is fewer than two levels deep.
cwd = os.getcwd()
repo_dir = os.path.dirname(cwd)
base_dir = os.path.dirname(repo_dir)
for _import_root in (repo_dir, base_dir):
    # Guard against duplicate entries when the cell is re-run.
    if _import_root not in sys.path:
        sys.path.append(_import_root)

from analysis_pipeline.analysis_pipeline import AnalysisPipeline
from analytics_utils.analytics_utils.lims_utils import get_plate_runs, plates_from_exp, plates_from_workflow
from analytics_utils.analytics_utils.s3_interface import download_from_s3, upload_to_s3, s3_imgupload, s3_df2csv, s3_csv2df
from analytics_utils.analytics_utils.table_properties import * 
from analytics_utils.analytics_utils.visualization_utils import plot_boxplot

# Notebook-wide display / layout settings.
pd.set_option('display.max_columns', None)  # never truncate columns when showing a DataFrame
pn.extension(sizing_mode="stretch_width")

# S3 bucket name and key prefix; the prefix is combined with the project
# sub-folder when the analysis pipeline is configured.
s3_model_bucket = 'ml-models-registry'
s3_model_bucket_subdirectory = 'exerevnetes-preprocessing-models/'

# Placeholders for the dashboard sections and top-variant data. Each name is
# bound to its own, independent empty list so that sections which are never
# computed still exist when the dashboard is assembled.
(
    overall_analytics_panel,
    variant_analytics_panel,
    top_variant_reps,
    top_variant_analytics_panel,
    activity_corr_panel,
    activityCV_corr_panel,
    plate_analytics_panel,
    library_analytics_panel,
    unit_analytics_panel,
) = ([] for _ in range(9))

# Get data from table

In [2]:
# --- Run configuration -------------------------------------------------------
project_subfolder = 'VQ-BL21vsC41_LCMSONLY'
model_subfolder = s3_model_bucket_subdirectory + project_subfolder + '/'
exp_workflow = ['WF10192']
table = 'lcms_detections'  # alternative source table: 'combi_analytics_table'
metric_dict = {
    'LcmsC18': [
        'pellet_OD',
        'measured_nonbinary_sum_(r)',
        'measured_nonbinary_score_(r)',
        'measured_conversion_(r)',
    ],
}

# --- Pipeline construction ---------------------------------------------------
analysis_pipeline = AnalysisPipeline(
    s3_subfolder=model_subfolder,
    neg_ctrltype='EV',
    sort_by=None,
    metric_dict=metric_dict,
    get_dashboard_panel=True,
)

# --- Data loading: raw rows -> derived LcmsC18 metrics -> standardized columns
df = analysis_pipeline.load_data(table, exp_workflow=exp_workflow)
df = analysis_pipeline.get_derived_metrics(
    df,
    derived_metrics_to_get=['LcmsC18'],
)
df = analysis_pipeline.standardize_dataset_columns(
    df,
    data_type=table,
    update_analytics_table=False,
)

executing query: SELECT "id", "address", "run", "plate", "dev_or_prod", "exp_workflow_barcode", "exp_workflow_name", "proj_barcode", "proj_name", "ctrl_type", "exp_condition", "enzyme_barcode", "sequence", "hamming", "mutations", "reference_enzyme", "enzyme_concentration", "enzyme_unit", "enzyme_class", "sequence_qc", "sample_position", "sample_type", "substrate_barcode", "substrate_concentration", "substrate_unit", "substrate_smiles", "substrate_mz", "product_smiles", "product_mz", "sub_area", "prod_area", "ptr_lcms", "sub_conc_lcms_actual", "prod_conc_lcms_actual", "sum_conc_lcms_actual", "dilution_factor", "injector_volume", "expected_concentration", "concentration_units", "source_plate", "source_address", "seed_address", "seed_address_alphanumeric", "seed_plate", "main_plate", "rxn_plate", "library_barcode", "library_ref", "library_description", "seed_plate_time", "main_plate_time", "rxn_plate_time", "plate_time", "pellet_OD", "pellet_detected", "pellet_area", "pellet_intensity", "

## Data Cleanup + Update exp_condition labels on data

In [73]:
# Include only 'exp' type samples (rows whose ctrl_type is anything else are
# dropped). The explicit .copy() makes df an independent DataFrame, so the
# in-place label clean-up below is not a write to a slice of the loaded data
# (which is what triggers pandas' SettingWithCopyWarning).
df = df.loc[df.ctrl_type == 'exp'].copy()

# Clean up exp_condition labels: cast to str, upper-case, and remove spaces so
# that differently formatted spellings of one condition collapse to one label.
exp_condition_list = [str(e).upper().replace(' ', '') for e in df.exp_condition.tolist()]
df.loc[:, 'exp_condition'] = exp_condition_list
print(set(df.exp_condition))

# Optional (disabled): prefix each exp_condition label with the substrate
# concentration. The final assignment uses exp_condition_list, which is the
# list the preceding lines actually build.
# sub_conc_list = [f'{str(int(sub_conc))}mM' for sub_conc in df['substrate_concentration_(r)'].tolist()]
# exp_condition_list = [f'{sub_conc}_{exp_condition}' for sub_conc, exp_condition in zip(sub_conc_list, exp_condition_list)]
# exp_condition_list = [str(e).upper().replace(' ','') for e in exp_condition_list]
# df.loc[:, 'exp_condition'] = exp_condition_list
# print(set(df.exp_condition))

{'C41_MINIPREP', 'BL21_INVIVO', 'BL21_MINIPREP', 'C41_INVITRO', 'BL21_INVITRO'}


In [74]:
# Clean up the enzyme_barcode and mutations labels by casting every value to
# str, one column at a time.
for label_col in ('enzyme_barcode', 'mutations'):
    # enz_list is the scratch name for both columns, so after the loop it
    # holds the stringified mutations values.
    enz_list = list(map(str, df[label_col].tolist()))
    df.loc[:, label_col] = enz_list

# Get Analytics from data

In [6]:
# Experiment-level analytics are currently disabled:
# experiment_analytics, experiment_analytics_panel = analysis_pipeline.GET_EXPERIMENT_ANALYTICS(df, display_table=True)

# Variant-level analytics: returns the analytics result together with the
# matching dashboard section.
variant_analytics, variant_analytics_panel = analysis_pipeline.GET_VARIANT_ANALYTICS(
    df,
    display_table=True,
)

# NOTE(review): the following cells read `top_variant_stats` and index
# `top_variant_reps` as a DataFrame, but neither is assigned from this call in
# the visible notebook -- confirm where they are meant to come from.
# get individual boxplots for each variant

******************************
Computing Variant Analytics...
******************************
LcmsC18 ['pellet_OD', 'measured_nonbinary_sum_(r)', 'measured_nonbinary_score_(r)', 'measured_conversion_(r)']
Obtained variant analytics metrics. 

**********************************
Computing Top Variant Analytics...
**********************************
Obtained top variant analytics metrics. 



In [None]:
# (disabled) boxplots split by substrate concentration, with all variants:
# for sub_conc in set(top_variant_reps['substrate_concentration_(r)']):
#     top_variant_reps_subconc = top_variant_reps.loc[top_variant_reps['substrate_concentration_(r)']==sub_conc]
#     boxplot_activity_corr_list = analysis_pipeline.get_activity_correlation_boxplots(top_variant_reps_subconc, metric_list=['measured_nonbinary_score_(r)', 'pellet_OD', 'measured_nonbinary_sum_(r)'], metricname_list=['RacemicProduct', 'PelletOD', 'RacemicSum'], groupby=['mutations', 'exp_condition'])

# Boxplots split by enzyme variant: one call per mutation string listed in
# top_variant_stats, using every row of df that carries that mutation.
# NOTE(review): top_variant_stats is not assigned anywhere in the visible
# notebook -- confirm it is defined before this cell runs.
enz_list = top_variant_stats.mutations.tolist()
for enz in enz_list:
    is_variant = df['mutations'] == enz
    variant_reps = df.loc[is_variant]
    # The name is rebound on every pass, so only the last variant's result
    # remains available after the loop.
    boxplot_activity_corr_list = analysis_pipeline.get_activity_correlation_boxplots(
        variant_reps,
        metric_list=['measured_nonbinary_score_(r)'],
        metricname_list=['RacemicProduct'],
        groupby=['mutations', 'exp_condition'],
    )


In [None]:
# Per-sample plots: PelletOD and RacemicSum on the x axis against
# RacemicProduct on the y axis, grouped by mutation.
plot_activity_corr_list = analysis_pipeline.get_activity_correlation_plots(
    top_variant_reps,
    xmetric_list=['pellet_OD', 'measured_nonbinary_sum_(r)'],
    xmetricname_list=['PelletOD', 'RacemicSum'],
    ymetric='measured_nonbinary_score_(r)',
    ymetricname='RacemicProduct',
    groupby='mutations',
)

# Per-variant plots: the same comparison on the "_cv" columns of the
# top-variant statistics.
plot_activityCV_corr_list = analysis_pipeline.get_activity_CV_correlation_plots(
    top_variant_stats,
    xmetric_list=['pellet_OD_cv', 'measured_nonbinary_sum_(r)_cv'],
    xmetricname_list=['PelletOD-CV', 'RacemicSum-CV'],
    ymetric='measured_nonbinary_score_(r)_cv',
    ymetricname='RacemicProduct-CV',
    groupby='mutations',
)

# Dashboard section for the per-sample plots: the first three plots are
# stacked vertically, any remaining ones share a single row.
stacked_plots = plot_activity_corr_list[:3]
row_plots = plot_activity_corr_list[3:]
activity_corr_panel = pn.Column(
    '# Correlation of PelletOD & SumConc to Activity (by sample)',
    *stacked_plots,
    pn.Row(*row_plots, background='White'),
    background='White',
    width=1500,
    scroll=True,
)

# Dashboard section for the per-variant CV plots, all in one row.
activityCV_corr_panel = pn.Column(
    '# Correlation of CV of PelletOD & SumConc to CV of Activity (by variant)',
    pn.Row(*plot_activityCV_corr_list),
    background='White',
    scroll=True,
)

# Get Analytics Dashboard

In [None]:
# Assemble the dashboard by concatenating the individual sections in display
# order. Each section is listed exactly once: the top-variant section used to
# be added twice, which would render it twice as soon as it is populated.
analytics_panel = (
    overall_analytics_panel
    + variant_analytics_panel
    + top_variant_analytics_panel
    + activity_corr_panel
    + activityCV_corr_panel
    + plate_analytics_panel
    + library_analytics_panel
    + unit_analytics_panel
)

# Wrap the sections in a FastListTemplate and mark it servable so that
# `panel serve` picks it up. The trailing semicolon suppresses the cell output
# in the notebook.
pn.template.FastListTemplate(
    site="Panel",
    title="PHNX Production Analytics",
    main=analytics_panel,
    main_max_width="1760px",
).servable();

# Manual Analysis

In [None]:
# Manual inspection of one library: for every mutation string in the library,
# draw a boxplot of the selected metric grouped by substrate concentration and
# experimental condition, and annotate each box with its sample count.
lib = 'LIB13047'
df_LIB = df[df.library_barcode == lib]
enz_list = list(set(df_LIB.mutations))

col_to_plot = 'measured_nonbinary_score_(r)'
groupby = ['substrate_concentration_(r)', 'exp_condition']
for enz in enz_list:
    df_enz = df_LIB.loc[df_LIB.mutations == enz]
    exp_condition_list = list(set(df_enz.exp_condition))
    print(enz, exp_condition_list)
    mut = df_enz.iloc[0]['mutations']
    n = len(df_enz)
    ax = df_enz.boxplot(column=col_to_plot, by=groupby, rot=90, figsize=(6, 6))

    # Annotate each box with its number of rows, placed at the group median.
    # The groups are taken from the same groupby that DataFrame.boxplot uses
    # to order its boxes (box k sits at x position k, starting at 1), instead
    # of reading the private `_text` attribute of the tick labels and parsing
    # the "(conc, condition)" strings back into values.
    grouped_metric = df_enz.groupby(groupby)[col_to_plot]
    for tick, (_, group_values) in enumerate(grouped_metric, start=1):
        ax.text(
            tick,
            group_values.median(),
            str(len(group_values)),
            horizontalalignment='center',
            color='r',
        )

    plt.title(f'{enz} ({mut}), n={n}')
    plt.show()

# Comparison with historical Phoenix data

### Get old data

In [44]:
# Workflow barcodes of the historical Phoenix runs to load
# (the original note lists None as the alternative value).
exp_workflow_phnx = [
    'WF10125', 'WF10128', 'WF10130', 'WF10132', 'WF10134',
    'WF10136', 'WF10139', 'WF10144', 'WF10149', 'WF10157',
    'WF10158', 'WF10163', 'WF10169', 'WF10173', 'WF10181',
]

# Load the historical rows from the same table as the current experiment and
# standardize their columns. Derived metrics are intentionally not recomputed
# here (the call is left disabled).
df_phnx = analysis_pipeline.load_data(table, exp_workflow=exp_workflow_phnx)
# df_phnx = analysis_pipeline.get_derived_metrics(df_phnx, derived_metrics_to_get=['LcmsC18'])
df_phnx = analysis_pipeline.standardize_dataset_columns(
    df_phnx,
    data_type=table,
    update_analytics_table=False,
)

executing query: SELECT "id", "address", "run", "plate", "dev_or_prod", "exp_workflow_barcode", "exp_workflow_name", "proj_barcode", "proj_name", "ctrl_type", "exp_condition", "enzyme_barcode", "sequence", "hamming", "mutations", "reference_enzyme", "enzyme_concentration", "enzyme_unit", "enzyme_class", "sequence_qc", "sample_position", "sample_type", "substrate_barcode", "substrate_concentration", "substrate_unit", "substrate_smiles", "substrate_mz", "product_smiles", "product_mz", "sub_area", "prod_area", "ptr_lcms", "sub_conc_lcms_actual", "prod_conc_lcms_actual", "sum_conc_lcms_actual", "dilution_factor", "injector_volume", "expected_concentration", "concentration_units", "source_plate", "source_address", "seed_address", "seed_address_alphanumeric", "seed_plate", "main_plate", "rxn_plate", "library_barcode", "library_ref", "library_description", "seed_plate_time", "main_plate_time", "rxn_plate_time", "plate_time", "pellet_OD", "pellet_detected", "pellet_area", "pellet_intensity", "

### Filter for data slice

In [None]:
# Select the rows for one enzyme from both datasets so they can be compared.
# var_mutation = 'A129S' # 'V152A_L403T'
enz_barcode = 'ENZ10045'

# Get variant data from phoenix: positive-control rows of the chosen enzyme,
# excluding workflow WF10181 (unit 16). All row filters are combined into one
# mask and the result is copied once, so the .loc assignments below write to
# an independent DataFrame instead of a freshly re-sliced one (which is what
# triggers pandas' SettingWithCopyWarning).
# Alternative selections kept for reference:
# df_phnx_var = df_phnx[df_phnx.enzyme_barcode==enz_barcode].copy()
# df_phnx_var = df_phnx[df_phnx.mutations==var_mutation].copy()
phnx_mask = (
    (df_phnx.enzyme_barcode == enz_barcode)
    & (df_phnx.ctrl_type == 'pos')
    & (df_phnx.exp_workflow_barcode != 'WF10181')  # remove unit 16
)
df_phnx_var = df_phnx[phnx_mask].copy()

# Every substrate concentration above 300 is overwritten with 308.
# NOTE(review): presumably so these rows fall into the same group as the
# current experiment's concentration in the comparison plots -- confirm.
df_phnx_var.loc[df_phnx_var['substrate_concentration_(r)'] > 300, 'substrate_concentration_(r)'] = 308
# Label all historical rows with a single condition name.
df_phnx_var.loc[:, 'exp_condition'] = 'C41_PHNX'
display(df_phnx_var[['mutations', 'library_barcode', 'exp_condition']])

# Get variant data from the BL21 vs C41 experiment: same enzyme, restricted to
# library LIB13036.
df_var = df[(df.enzyme_barcode == enz_barcode) & (df.library_barcode == 'LIB13036')].copy()
# df_var = df[df.mutations==var_mutation].copy()
display(df_var[['mutations', 'library_barcode', 'exp_condition']])

### Visualize analytics

In [None]:
# Stack the historical Phoenix rows on top of the current experiment's rows
# for the selected enzyme.
data_var = pd.concat([df_phnx_var, df_var])

# Metric to plot and the columns that define one box each.
col_to_plot = ['measured_nonbinary_score_(r)']
groupby = [
    'substrate_concentration_(r)',
    'exp_condition',
    'library_barcode',
    'exp_workflow_barcode',
]
# Coarser grouping without the library barcode:
# groupby = ['substrate_concentration_(r)', 'exp_condition', 'exp_workflow_barcode']

boxplot_list = plot_boxplot(
    data_var,
    col_to_plot,
    groupby=groupby,
    showplot=True,
    show_n=True,
)