# Load required modules

* [Link to SOP](https://docs.google.com/document/d/1Oi2pUYFsxWAtLrsqoprCOqMYejmTCKDhwDJGekK00Og/edit)
* https://docs.google.com/document/d/1PU1OUlR6i1fGRnLsv8iYXmmY_byGu2kccWzCMyY922o/edit

In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys, os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Put the parent of the current working directory on sys.path so the project
# packages imported below can be resolved.
cwd = os.getcwd()
# os.path.dirname uses the platform's path rules; scanning for '/' by hand
# fails with an IndexError when the path contains no forward slash.
repo_dir = os.path.dirname(cwd)
if repo_dir not in sys.path:
    sys.path.append(repo_dir)

from analysis_pipeline.dataset_creation import DatasetCreation 
from analysis_pipeline.analysis_pipeline import AnalysisPipeline
from analytics_utils.lims_tools.lims_utils import get_plate_runs, plates_from_exp, plates_from_workflow
from analytics_utils.database_access.s3_interface import download_from_s3, upload_to_s3, s3_imgupload, s3_df2csv, s3_csv2df
from analytics_utils.database_access.table_properties import * 
# Do not truncate columns when a DataFrame is displayed.
pd.options.display.max_columns = None

# Relative folder paths for datasets, pipeline results and models.
dataset_folder = '../DATASETS/'
results_folder = '../detectionspipeline_results/'
models_folder = '../MODELS/'

# S3 bucket name, plus an optional prefix that later cells prepend to the
# project sub-folder.
s3_bucket = 'ml-analytics-filestore'
s3_subbucket = ''  # alternative: 'exerevnetes-preprocessing-models/'

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Fetch plates associated with an experiment barcode or workflow barcode

In [None]:
# Resolve the list of plates either from an experiment barcode (contains
# 'EXP'), from a workflow barcode (contains 'WF'), or from a manual list when
# no barcode is given.
exp_workflow_barcode = 'WF10125'  # alternatives: None, 'EXP10088'
if exp_workflow_barcode is None:
    # define plate list
    plate_list = ['PLT17136']
elif 'EXP' in exp_workflow_barcode:
    plate_list = plates_from_exp(exp_workflow_barcode, 'hts')
elif 'WF' in exp_workflow_barcode:
    plate_list = plates_from_workflow(exp_workflow_barcode, 'hts')
else:
    # Fail loudly instead of leaving plate_list undefined (or silently reusing
    # a value left over from an earlier run of this cell).
    raise ValueError(
        f"Unrecognized barcode {exp_workflow_barcode!r}: expected it to contain 'EXP' or 'WF'."
    )

print(f"Plate list: {plate_list}")

# 1. Train models

## Create dataset & Train hits prediction models

In [None]:
# Configuration for the training cell. Every value below is passed to
# DatasetCreation(...) or DatasetCreation.TRAIN(...) further down in this cell.

# Model location: S3 sub-folder and model file name.
project_subfolder = 'Phoenix-prod_Unit2'  # alternative: 'Phoenix-prod_Unit2-3-4-5-6-7-retest-8-9-10-11'
s3_subfolder = f'{s3_subbucket}{project_subfolder}/'
model_fname_suffix = '_chiral'  # alternative: ''
model_fname = f'Maldi12Classification-LcmsRegression_{project_subfolder}{model_fname_suffix}'

# Data selection: plate / run / workflow filters and optional CSV overrides.
plate = None
run = None
exp_workflow = ['WF10125']  # alternatives: ['WF10125','WF10128','WF10130','WF10132', 'WF10134', 'WF10136', 'WF10139', 'WF10141', 'WF10144', 'WF10149', 'WF10157'] or ['WF10139', 'WF10144']
lcms_method = 'C18'
maldi_1_csv = None
maldi_2_csv = None
lcms_csv = None
lcms_chiral_csv = None
data_types_to_get = ['maldi_detections_1', 'maldi_detections_2', 'lcms_detections', 'lcms_detections_chiral']

# Models to train and the compounds/columns they are organised around.
model_type_list = ['maldi1sample_classification', 'maldi2sample_classification', 'maldi2lcms_regression']
posctrl_enzyme_barcode = 'ENZ10045'
plot_enantiomer_lcms_metrics = ['CMP60403', 'CMP60404']
split_by_list = ['CMP60354']  # alternative: ['CMP60354', 'CMP60403', 'CMP60404']
combine_maldi_lcms_by = ['source_address', 'source_plate_grp']
col_suffix_db = {'': '_(r)', 'CMP60354': '_(r)', 'CMP60403': '_(-)', 'CMP60404': '_(+)'}

# Hit filter: keys are expression strings, values are (value, comparison
# operator) tuples. Presumably evaluated by DatasetCreation against its `data`
# dict -- TODO confirm. The keys are plain strings: they contain no
# placeholders, so an f-string prefix would be a no-op.
hit_filter_dict = {
    "data['maldi_lcms']['true_score_binary_CMP60354']": (0, '>'),
    # "data['maldi_lcms']['true_score_binary_CMP60403']": (0, '>'),
    # "data['maldi_lcms']['true_score_binary_CMP60404']": (0, '>'),
}
get_2sample_from_1sample_dataset = True

# MALDI labelling thresholds (original note: ENZ10045).
maldi_label_condition = {'pk_prod_stdz_379': {'pk_thres': 8e-2, 'rxn_thres_factor': 1}}
# alternative: {'pk_prod_stdz_379': {'pk_thres': 4.5e-2, 'rxn_thres_factor': 1}}
# Plate grouping inputs passed to DatasetCreation.TRAIN below.
# The empty defaults are overwritten immediately by the literal mappings that
# follow; presumably they are kept so the literals can be commented out to
# have the groupings derived instead -- TODO confirm.
source_plate_groupings = {}
plate_to_grp_idx_mapping = {}
# Group index -> list of plate barcodes in that group.
source_plate_groupings = {0: ['PLT17266', 'PLT17270', 'PLT17268'], 1: ['PLT17399', 'PLT17450', 'PLT17398'], 2: ['PLT17456', 'PLT17454', 'PLT17455'], 3: ['PLT17391', 'PLT17390', 'PLT17389'], 4: ['PLT17273', 'PLT17269', 'PLT17265'], 5: ['PLT17263', 'PLT17264', 'PLT17262'], 6: ['PLT17392', 'PLT17394', 'PLT17393'], 7: ['PLT17459', 'PLT17466', 'PLT17463'], 8: ['PLT17272', 'PLT17267', 'PLT17271'], 9: ['PLT17464', 'PLT17460', 'PLT17457'], 10: ['PLT17465', 'PLT17458', 'PLT17461'], 11: ['PLT17453', 'PLT17452', 'PLT17451'], 12: ['PLT17395', 'PLT17396'], 13: ['PLT17602'], 14: ['PLT17467', 'PLT17468', 'PLT17462'], 15: ['PLT17600'], 16: ['PLT17599'], 17: ['PLT17601'], 18: ['PLT17473'], 19: ['PLT17590'], 20: ['PLT17474', 'PLT17470', 'PLT17471', 'PLT17472'], 21: ['PLT17587'], 22: ['PLT17589'], 23: ['PLT17588'], 24: ['PLT17280'], 25: ['PLT17277'], 26: ['PLT17388'], 27: ['PLT17283'], 28: ['PLT17596'], 29: ['PLT17598'], 30: ['PLT17586'], 31: ['PLT17597'], 32: ['PLT17621'], 33: ['PLT17620'], 34: ['PLT17619'], 35: ['PLT17622'], 36: ['PLT17610'], 37: ['PLT17618'], 38: ['PLT17608'], 39: ['PLT17611'], 40: ['PLT17609']}
# Plate barcode -> group index (same index space as source_plate_groupings).
plate_to_grp_idx_mapping = {'PLT17181': 0, 'PLT17488': 1, 'PLT17490': 2, 'PLT17323': 3, 'PLT17182': 4, 'PLT17179': 5, 'PLT17324': 6, 'PLT17521': 7, 'PLT17180': 8, 'PLT17519': 9, 'PLT17520': 10, 'PLT17489': 11, 'PLT17487': 12, 'PLT17730': 13, 'PLT17522': 14, 'PLT17728': 15, 'PLT17727': 16, 'PLT17729': 17, 'PLT17736': 18, 'PLT17674': 19, 'PLT17523': 20, 'PLT17671': 21, 'PLT17673': 22, 'PLT17672': 23, 'PLT17185': 24, 'PLT17184': 25, 'PLT17183': 26, 'PLT17186': 27, 'PLT17793': 28, 'PLT17795': 29, 'PLT17792': 30, 'PLT17794': 31, 'PLT17854': 32, 'PLT17851': 33, 'PLT17850': 34, 'PLT17855': 35, 'PLT17946': 36, 'PLT17865': 37, 'PLT17945': 38, 'PLT17947': 39, 'PLT17944': 40}
# Instantiate the dataset/model container with the training configuration.
dataset_creation = DatasetCreation(
    s3_subfolder=s3_subfolder,
    model_fname=model_fname,
    neg_ctrltype='EV',
    model_type_list=model_type_list,
    posctrl_enzyme_barcode=posctrl_enzyme_barcode,
    maldi_label_condition=maldi_label_condition,
    hit_filter_dict=hit_filter_dict,
    combine_maldi_lcms_by=combine_maldi_lcms_by,
    plot_enantiomer_lcms_metrics=plot_enantiomer_lcms_metrics,
    col_suffix_db=col_suffix_db,
    get_2sample_from_1sample_dataset=get_2sample_from_1sample_dataset,
)

# Run TRAIN with MALDI and LCMS labels requested, heatmaps disabled and model
# training not skipped.
data = dataset_creation.TRAIN(
    plate=plate,
    run=run,
    exp_workflow=exp_workflow,
    lcms_method=lcms_method,
    data_types_to_get=data_types_to_get,
    maldi_1_csv=maldi_1_csv,
    maldi_2_csv=maldi_2_csv,
    lcms_csv=lcms_csv,
    lcms_chiral_csv=lcms_chiral_csv,
    get_maldi_labels=True,
    get_lcms_labels=True,
    simulate_2sample_dataset=False,
    split_by_list=split_by_list,
    get_source_plate_groupings=True,
    source_plate_groupings=source_plate_groupings,
    plate_to_grp_idx_mapping=plate_to_grp_idx_mapping,
    generate_heatmaps=False,
    skip_model_training=False,
)

# SAVE MODEL: upload the whole DatasetCreation object to the bucket/path it
# carries on its own attributes.
upload_to_s3(
    dataset_creation.s3_bucket,
    dataset_creation.model_path,
    dataset_creation,
)

# 2. Perform Hits Selection & Worklist Generation
## 2a. Load analysis class with trained models

In [None]:
# Rebuild the S3 path of a previously saved DatasetCreation object from the
# same naming scheme used by the training cell, then download it.
project_subfolder = 'Phoenix-prod_Unit2' # 'Phoenix-prod_Unit2-3-4-5-6-7-retest-8-9' # 
s3_subfolder = f'{s3_subbucket}{project_subfolder}/'
model_fname_suffix = '_chiral' # ''
model_fname = f'Maldi12Classification-LcmsRegression_{project_subfolder}{model_fname_suffix}'
# Full object path inside the bucket: sub-folder followed by the model file name.
model_path = f'{s3_subfolder}{model_fname}'
# NOTE(review): if download_from_s3 deserializes with pickle, only load from a
# trusted bucket -- confirm against the helper's implementation.
loaded_dataset_creation = download_from_s3(s3_bucket, model_path)

## 2b. Perform hits selection & worklist generation (from MALDI data)

In [None]:
%load_ext autoreload
%autoreload 2
    
plate=None
run=None
exp_workflow = ['WF10125']
lcms_method='C18'
maldi_1_csv=None
maldi_2_csv=None 
lcms_csv=None
lcms_chiral_csv=None
get_2sample_from_1sample_dataset=True
source_plate_groupings = {0: ['PLT17264', 'PLT17262', 'PLT17263'], 1: ['PLT17268', 'PLT17270', 'PLT17266'], 2: ['PLT17267', 'PLT17272', 'PLT17271'], 3: ['PLT17269', 'PLT17273', 'PLT17265']}
plate_to_grp_idx_mapping = {'PLT17179': 0, 'PLT17181': 1, 'PLT17180': 2, 'PLT17182': 3}
split_by_list_PREDICT = ['CMP60354']
data_types_to_get=['maldi_detections_1', 'maldi_detections_2'] # , 'lcms_detections', 'lcms_detections_chiral']
# select_by=['predicted_binary_score', 'predicted_enantiomeric_excess_(+over-)']
select_by=['predicted_nonbinary_score']
max_num_hits= 600
select_up_to_max=True
generate_worklist=False
generate_heatmaps=False
generate_boxplots=False

loaded_dataset_creation.PREDICT(plate=plate, run=run, exp_workflow=exp_workflow, lcms_method=lcms_method, data_types_to_get=data_types_to_get, 
              maldi_1_csv=maldi_1_csv, maldi_2_csv=maldi_2_csv, lcms_csv=lcms_csv, lcms_chiral_csv=lcms_chiral_csv,
            get_source_plate_groupings=True, source_plate_groupings=source_plate_groupings, plate_to_grp_idx_mapping=plate_to_grp_idx_mapping, 
            split_by_list_PREDICT=split_by_list_PREDICT, generate_heatmaps=generate_heatmaps, generate_boxplots=generate_boxplots)
loaded_dataset_creation.HIT_SELECTION(loaded_dataset_creation.enzyme_analytics_df, select_by=select_by, 
                                      max_num_hits=max_num_hits, select_up_to_max=select_up_to_max, generate_worklist=generate_worklist)

# 3. Get LCMS ONLY dataset and Perform Hits Selection (without models)
## 3a. Create Dataset

In [4]:
# LCMS-only dataset configuration (first part): storage location, data
# selection and the columns used to combine records.

# Gate for the hit-selection cell (3b).
perform_hits_selection = False

# Plain string: there are no placeholders, so an f-string prefix is unnecessary.
s3_subfolder = 'VQ-Retest5/'  # alternatives: 'VQ-Retest4_LCMSONLY', 'Phoenix-prod_Unit2-3-4-5-6-7-8-9-10-11-12-13-14-15-16_LCMSONLY'
run = None  # alternative: ['WF10179']
exp_workflow = ['WF10198']  # alternatives: ['WF10191'], ['WF10125','WF10128','WF10130','WF10132', 'WF10134', 'WF10136', 'WF10139', 'WF10144', 'WF10149', 'WF10157', 'WF10158', 'WF10163', 'WF10169', 'WF10173', 'WF10181'], None
lcms_csv = None
lcms_chiral_csv = None

# Column lists passed as combine_by / combine_maldi_lcms_by below.
combine_by = ['address', 'source_plate_grp', 'substrate_concentration', 'exp_condition']  # alternative: ['address', 'source_plate_grp']
combine_maldi_lcms_by = ['source_address', 'source_plate_grp', 'substrate_concentration', 'exp_condition']  # alternative: ['source_address', 'source_plate_grp']
get_source_plate_groupings = True
# Plate grouping inputs passed to GET_LCMS_DATASETS below.
# The empty defaults are overwritten immediately by the literal mappings that
# follow; presumably they are kept so the literals can be commented out to
# have the groupings derived instead -- TODO confirm.
source_plate_groupings={}
plate_to_grp_idx_mapping={}
# Group index -> list of plate barcodes in that group.
source_plate_groupings = {0: ['PLT17609'], 1: ['PLT17465', 'PLT17458', 'PLT17461'], 2: ['PLT17269', 'PLT17265', 'PLT17273'], 3: ['PLT17587'], 4: ['PLT17472', 'PLT17471', 'PLT17470', 'PLT17474'], 5: ['PLT17454', 'PLT17455', 'PLT17456'], 6: ['PLT17398', 'PLT17399', 'PLT17450'], 7: ['PLT17389', 'PLT17390', 'PLT17391'], 8: ['PLT17451', 'PLT17453', 'PLT17452'], 9: ['PLT17599'], 10: ['PLT17598'], 11: ['PLT17457', 'PLT17464', 'PLT17460'], 12: ['PLT17590'], 13: ['PLT17270', 'PLT17268', 'PLT17266'], 14: ['PLT17601'], 15: ['PLT17468', 'PLT17462', 'PLT17467'], 16: ['PLT17610'], 17: ['PLT17264', 'PLT17263', 'PLT17262'], 18: ['PLT17588'], 19: ['PLT17618'], 20: ['PLT17586'], 21: ['PLT17602'], 22: ['PLT17622'], 23: ['PLT17589'], 24: ['PLT17271', 'PLT17267', 'PLT17272'], 25: ['PLT18073'], 26: ['PLT17466', 'PLT17459', 'PLT17463'], 27: ['PLT17608'], 28: ['PLT17596'], 29: ['PLT17394', 'PLT17392', 'PLT17393'], 30: ['PLT17395', 'PLT17396'], 31: ['PLT17613'], 32: ['PLT17621'], 33: ['PLT17611'], 34: ['PLT17620'], 35: ['PLT17597'], 36: ['PLT17615'], 37: ['PLT17600'], 38: ['PLT17473'], 39: ['PLT18074'], 40: ['PLT17614'], 41: ['PLT17619'], 42: ['PLT17280'], 43: ['PLT17388'], 44: ['PLT17283'], 45: ['PLT17277'], 46: ['PLT18078'], 47: ['PLT18070'], 48: ['PLT18079'], 49: ['PLT17606'], 50: ['PLT17605'], 51: ['PLT17604'], 52: ['PLT18068'], 53: ['PLT18075'], 54: ['PLT18067'], 55: ['PLT17595'], 56: ['PLT18573', 'PLT18575'], 57: ['PLT18568', 'PLT18561']}
# Plate barcode -> group index (same index space as source_plate_groupings).
plate_to_grp_idx_mapping = {'PLT17944': 0, 'PLT17520': 1, 'PLT17182': 2, 'PLT17671': 3, 'PLT17523': 4, 'PLT17490': 5, 'PLT17488': 6, 'PLT17323': 7, 'PLT17489': 8, 'PLT17727': 9, 'PLT17795': 10, 'PLT17519': 11, 'PLT17674': 12, 'PLT17181': 13, 'PLT17729': 14, 'PLT17522': 15, 'PLT17946': 16, 'PLT17179': 17, 'PLT17672': 18, 'PLT17865': 19, 'PLT17792': 20, 'PLT17730': 21, 'PLT17855': 22, 'PLT17673': 23, 'PLT17180': 24, 'PLT18124': 25, 'PLT17521': 26, 'PLT17945': 27, 'PLT17793': 28, 'PLT17324': 29, 'PLT17487': 30, 'PLT18040': 31, 'PLT17854': 32, 'PLT17947': 33, 'PLT17851': 34, 'PLT17794': 35, 'PLT18041': 36, 'PLT17728': 37, 'PLT17736': 38, 'PLT18123': 39, 'PLT18042': 40, 'PLT17850': 41, 'PLT17185': 42, 'PLT17183': 43, 'PLT17186': 44, 'PLT17184': 45, 'PLT18246': 46, 'PLT18242': 47, 'PLT18247': 48, 'PLT18332': 49, 'PLT18331': 50, 'PLT18327': 51, 'PLT18439': 52, 'PLT18436': 53, 'PLT18440': 54, 'PLT18435': 55, 'PLT18680': 56, 'PLT18679': 57}
# LCMS-only dataset configuration (second part): control enzyme, compound
# splits, column naming, labelling and hit-selection settings.

# Positive-control enzyme and the compounds used for splitting / plotting.
posctrl_enzyme_barcode = 'ENZ10045'
split_by_list = ['CMP60354']  # alternative: ['CMP60354', 'CMP60403', 'CMP60404']
plate_col = 'plate_CMP60354'
subconc_init = 100
plot_enantiomer_lcms_metrics = ['CMP60403', 'CMP60404']

# Compound barcode -> column suffix.
col_suffix_db = {
    '': '_(r)',
    'CMP60354': '_(r)',
    'CMP60403': '_(-)',
    'CMP60404': '_(+)',
}

# Detection tables to fetch and how labels are derived from them.
data_types_to_get = ['lcms_detections', 'lcms_detections_chiral']  # alternative: ['lcms_detections']
get_binary_labels_lcms = True
nonbinary_label_col = 'prod_conc_lcms_actual'

# Hit-selection settings (used by the hit-selection cell 3b).
select_by = ['measured_nonbinary_score']
max_num_hits = 600
select_up_to_max = True
generate_worklist = True
generate_heatmaps = False
generate_boxplots = False

# Build a DatasetCreation object for the LCMS-only flow (no model arguments
# are passed here).
lcms_analysis = DatasetCreation(
    s3_subfolder=s3_subfolder,
    neg_ctrltype='EV',
    posctrl_enzyme_barcode=posctrl_enzyme_barcode,
    combine_maldi_lcms_by=combine_maldi_lcms_by,
    plot_enantiomer_lcms_metrics=plot_enantiomer_lcms_metrics,
    col_suffix_db=col_suffix_db,
)
# The same split list is assigned to both attributes.
lcms_analysis.split_by_list = split_by_list
lcms_analysis.split_by_list_PREDICT = split_by_list

# Fetch the LCMS datasets. Note that get_binary_labels_lcms drives both the
# calculate_conversion_enantioselectivity and get_binary_labels_lcms arguments.
data = lcms_analysis.GET_LCMS_DATASETS(
    lcms_run=run,
    exp_workflow=exp_workflow,
    lcms_csv=lcms_csv,
    lcms_chiral_csv=lcms_chiral_csv,
    combine_by=combine_by,
    get_source_plate_groupings=get_source_plate_groupings,
    source_plate_groupings=source_plate_groupings,
    plate_to_grp_idx_mapping=plate_to_grp_idx_mapping,
    split_by_list=split_by_list,
    data_types_to_get=data_types_to_get,
    chiral_merge_method='outer',
    calculate_conversion_enantioselectivity=get_binary_labels_lcms,
    perform_qc=False,
    plate_col=plate_col,
    subconc_init=subconc_init,
    get_binary_labels_lcms=get_binary_labels_lcms,
    nonbinary_label_col=nonbinary_label_col,
)

# get enzyme_analytics_df
# With update_analytics_table=True the recorded cell output shows rows being
# cleared from and saved to Postgres, so this call writes to the database.
df = AnalysisPipeline().standardize_dataset_columns(
    data['lcms_detections_all'],
    data_type='lcms_detections_all',
    update_analytics_table=True,
)



executing query: SELECT "id", "address", "run", "plate", "dev_or_prod", "exp_workflow_barcode", "exp_workflow_name", "proj_barcode", "proj_name", "ctrl_type", "exp_condition", "enzyme_barcode", "sequence", "hamming", "mutations", "reference_enzyme", "enzyme_concentration", "enzyme_unit", "enzyme_class", "sequence_qc", "sample_position", "sample_type", "substrate_barcode", "substrate_concentration", "substrate_unit", "substrate_smiles", "substrate_mz", "product_smiles", "product_mz", "sub_area", "prod_area", "ptr_lcms", "sub_conc_lcms_actual", "prod_conc_lcms_actual", "sum_conc_lcms_actual", "dilution_factor", "injector_volume", "expected_concentration", "concentration_units", "source_plate", "source_address", "seed_address", "seed_address_alphanumeric", "seed_plate", "main_plate", "rxn_plate", "library_barcode", "library_ref", "library_description", "seed_plate_time", "main_plate_time", "rxn_plate_time", "plate_time", "pellet_OD", "pellet_detected", "pellet_area", "pellet_intensity", "

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(loc, val, pi)


source_plate_(r) list: ['PLT18561', 'PLT18573', 'PLT18575', 'PLT18568']
***INFO*** Cleared 368 records from combi_analytics_table.
Saved 368 rows to Postgres (combi_analytics_table).
Updated postgres enzyme_analytics_table for all (+), (-), (r) samples associated with the variants on source plate PLT18561.
***INFO*** Cleared 368 records from combi_analytics_table.
Saved 368 rows to Postgres (combi_analytics_table).
Updated postgres enzyme_analytics_table for all (+), (-), (r) samples associated with the variants on source plate PLT18573.
***INFO*** Cleared 368 records from combi_analytics_table.
Saved 368 rows to Postgres (combi_analytics_table).
Updated postgres enzyme_analytics_table for all (+), (-), (r) samples associated with the variants on source plate PLT18575.
***INFO*** Cleared 368 records from combi_analytics_table.
Saved 368 rows to Postgres (combi_analytics_table).
Updated postgres enzyme_analytics_table for all (+), (-), (r) samples associated with the variants on source 

## 3b. Perform hits selection & worklist generation (from LCMS data)

In [None]:
# Run hit selection on the LCMS-only dataset; skipped unless
# perform_hits_selection was enabled in cell 3a.
if perform_hits_selection:
    # dropna() removes missing plate entries explicitly, and unique() keeps
    # first-appearance order so the plate list is the same on every run
    # (iterating a set of strings is not order-stable across sessions).
    lcms_analysis.plate_list = df['lcms_plate_(r)'].dropna().unique().tolist()
    lcms_analysis.HIT_SELECTION(
        df=df,
        select_by=select_by,
        plate_type_to_select_from='lcms_plate',
        max_num_hits=max_num_hits,
        select_up_to_max=select_up_to_max,
        generate_worklist=generate_worklist,
    )

# 4. Get Dataset Analytics

## 4a. Analytics from Combined dataset (combi_analytics_table)

In [None]:
# Analytics over the combined table for a list of workflows.
project_subfolder = 'Phoenix-prod_Unit2-3-4-5-6-7-8-9-10-11-12-13-14-15-16_LCMSONLY'
s3_subfolder = f'{s3_subbucket}{project_subfolder}/'
exp_workflow = [
    'WF10125', 'WF10128', 'WF10130', 'WF10132', 'WF10134',
    'WF10136', 'WF10139', 'WF10144', 'WF10149', 'WF10157',
    'WF10158', 'WF10163', 'WF10169', 'WF10173', 'WF10181',
]  # alternative: None
table = 'combi_analytics_table'
# Metric columns, keyed by detection method.
metric_dict = {
    'LcmsC18': ['pellet_OD', 'measured_nonbinary_score_(r)', 'measured_conversion_(r)'],
    'LcmsChiral': ['measured_enantiomeric_excess_(+over-)'],
}

# Load the table rows for the selected workflows.
analysis_pipeline = AnalysisPipeline(
    s3_subfolder=s3_subfolder,
    neg_ctrltype='EV',
    sort_by=None,
    metric_dict=metric_dict,
)
df = analysis_pipeline.load_data(table, exp_workflow=exp_workflow)

# Overall and per-variant analytics.
overall_analytics, overall_analytics_panel = analysis_pipeline.GET_OVERALL_ANALYTICS(
    df,
    display_table=True,
    plot_scatterplot=True,
    plot_histogram=True,
)
variant_analytics, variant_analytics_panel = analysis_pipeline.GET_VARIANT_ANALYTICS(
    df,
    display_table=True,
)

# Top variants, thresholded on conversion and enantiomeric excess.
top_variant_stats, top_variant_reps, top_variant_analytics_panel = analysis_pipeline.get_top_variants(
    variant_analytics,
    df,
    thres_dict={
        'measured_conversion_(r)': (0.2, 1, 'median'),
        'measured_enantiomeric_excess_(+over-)': (0.2, 1, 'median'),
    },
    top_n_variants=50,
    sort_by=('measured_conversion_(r)', 'median'),
    get_replicates_data=True,
)

# Per-plate, per-library and per-unit analytics.
plate_analytics, plate_analytics_panel = analysis_pipeline.GET_PLATE_ANALYTICS(
    df,
    display_table=True,
)
library_analytics, library_analytics_panel = analysis_pipeline.GET_LIBRARY_ANALYTICS(
    df,
    display_table=True,
)
unit_analytics, unit_analytics_panel = analysis_pipeline.GET_UNIT_ANALYTICS(
    df,
    display_table=True,
    plot_scatterplot=True,
    plot_histogram=True,
    get_ctrls_from_vals=True,
)

# Correlation plots for the top variants: metric values, then their CVs.
scatterplot_activity_corr_list = analysis_pipeline.get_activity_correlation_plots(
    top_variant_reps,
    xmetric_list=['pellet_OD', 'measured_nonbinary_sum_(r)'],
    xmetricname_list=['PelletOD', 'RacemicSum'],
    ymetric='measured_nonbinary_score_(r)',
    ymetricname='RacemicProduct',
    groupby='enzyme_barcode',
)
scatterplot_activityCV_corr_list = analysis_pipeline.get_activity_CV_correlation_plots(
    top_variant_stats,
    xmetric_list=['pellet_OD_cv', 'measured_nonbinary_sum_(r)_cv'],
    xmetricname_list=['PelletOD-CV', 'RacemicSum-CV'],
    ymetric='measured_nonbinary_score_(r)_cv',
    ymetricname='RacemicProduct-CV',
    groupby='enzyme_barcode',
)

## 4b. Analytics from individual LCMS dataset (lcms_detections)

In [None]:
# Analytics from a single LCMS table, reported separately for each substrate
# concentration.
project_subfolder = 'VQ-BL21vsC41_LCMSONLY'  # alternative: 'VQ-Retest4_LCMSONLY'
s3_subfolder = f'{s3_subbucket}{project_subfolder}/'
exp_workflow = ['WF10192']
table = 'lcms_detections'  # alternative: 'combi_analytics_table'
metric_dict = {'LcmsC18': ['pellet_OD', 'measured_nonbinary_sum_(r)', 'measured_nonbinary_score_(r)', 'measured_conversion_(r)']}

# get data
analysis_pipeline = AnalysisPipeline(s3_subfolder=s3_subfolder, neg_ctrltype='EV', sort_by=None, metric_dict=metric_dict)
df = analysis_pipeline.load_data(table, exp_workflow=exp_workflow)
df = analysis_pipeline.get_derived_metrics(df, derived_metrics_to_get=['LcmsC18'])
# update_analytics_table=False here, unlike the LCMS-only dataset cell (3a).
df = analysis_pipeline.standardize_dataset_columns(df, update_analytics_table=False)
display(df)

# for sub_conc
# Missing concentrations are dropped because int(NaN) raises and a NaN
# comparison selects no rows; sorting makes the report order reproducible.
substrate_concentrations = sorted(df['substrate_concentration_(r)'].dropna().unique().tolist())
for sub_conc in substrate_concentrations:
    # Suffix shared by every table/plot produced for this concentration.
    table_suffix = f'_{int(sub_conc)}'
    df_sub = df.loc[df['substrate_concentration_(r)'] == sub_conc]

    variant_analytics, variant_analytics_panel = analysis_pipeline.GET_VARIANT_ANALYTICS(
        df_sub,
        display_table=True,
        table_suffix=table_suffix,
    )
    top_variant_stats, top_variant_reps, top_variant_analytics_panel = analysis_pipeline.get_top_variants(
        variant_analytics,
        df=df_sub,
        thres_dict={'measured_nonbinary_score_(r)': (0, 1, 'median')},
        top_n_variants=34,
        sort_by=('measured_nonbinary_score_(r)', 'median'),
        get_replicates_data=True,
        display_table=True,
        table_suffix=table_suffix,
    )

    scatterplot_activity_corr_list = analysis_pipeline.get_activity_correlation_plots(
        top_variant_reps,
        xmetric_list=['pellet_OD', 'measured_nonbinary_sum_(r)'],
        xmetricname_list=['PelletOD', 'RacemicSum'],
        ymetric='measured_nonbinary_score_(r)',
        ymetricname='RacemicProduct',
        groupby='enzyme_barcode',
        table_suffix=table_suffix,
    )
    scatterplot_activitycv_corr_list = analysis_pipeline.get_activity_CV_correlation_plots(
        top_variant_stats,
        xmetric_list=['pellet_OD_cv', 'measured_nonbinary_sum_(r)_cv'],
        xmetricname_list=['PelletOD-CV', 'RacemicSum-CV'],
        ymetric='measured_nonbinary_score_(r)_cv',
        ymetricname='RacemicProduct-CV',
        groupby='enzyme_barcode',
        table_suffix=table_suffix,
    )