In [None]:
%load_ext autoreload
%autoreload 2
%matplotlib inline
import sys, os
import pickle
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import pickle

# Make the parent and grandparent of the notebook's working directory importable,
# so the project packages imported below resolve.
# os.path.dirname is used instead of slicing on '/' so this also works with
# non-POSIX path separators and does not raise IndexError for shallow paths.
cwd = os.getcwd()
repo_dir = os.path.dirname(cwd)
base_dir = os.path.dirname(repo_dir)
for extra_path in (repo_dir, base_dir):
    # Guard against duplicate entries when the cell is re-run in the same kernel.
    if extra_path not in sys.path:
        sys.path.append(extra_path)

from analysis_pipeline.sequence_update import DetectionsSequenceUpdate
from analytics_utils.analytics_utils.lims_utils import get_plate_runs, plates_from_exp, plates_from_workflow, track_transfers_in_out
from analytics_utils.analytics_utils.s3_interface import download_from_s3, upload_to_s3, s3_imgupload, s3_df2csv, s3_csv2df
from analytics_utils.analytics_utils.table_properties import maldi_detections_columns, lcms_detections_columns

# Folder prefixes, given relative to the notebook's working directory.
# dataset_folder is the prefix for the CSV files read and written by the cells below.
dataset_folder = '../DATASETS/'
# results_folder and models_folder are not referenced in the cells shown here.
results_folder = '../detectionspipeline_results/'
models_folder = '../MODELS/'
# Presumably the S3 bucket / key prefix for stored models; neither name is
# referenced in the cells shown here - TODO confirm they are still needed.
s3_model_bucket = 'ml-models-registry'
s3_model_bucket_subdirectory = 'exerevnetes-preprocessing-models/'

# Get updated sequence data from 384-well plate

In [None]:
genesis_plate_list = ['PLT15039'] #['PLT15623', 'PLT16348', 'PLT15735', 'PLT16792', 'PLT15583', 'PLT15925']
# NOTE(review): later cells index sequence_update_dict by the genesis plates named in
# genesis_phoenix_mapping (the commented-out list above) - confirm this list before running them.
sequence_update_dict = {}

# Quadrant label keyed by (x % 2, y % 2) of a well address 'X<x>Y<y>'.
quadrant_by_parity = {(1, 1): 'Q1', (0, 1): 'Q2', (1, 0): 'Q3', (0, 0): 'Q4'}

for plate in genesis_plate_list:
    sequence_update = DetectionsSequenceUpdate(table="maldi_detections_1")
    well_to_enzyme, well_to_enzyme_multi, enzyme_dict = sequence_update.get_sequence_update(
        plate, reference_enzyme='ENZ10045', enz_idx_to_use=-1)

    # Keep the final well contents per plate for the downstream cells.
    sequence_update_dict[plate] = well_to_enzyme

    # Write the final well -> enzyme table, one row per well address.
    pd.DataFrame.from_dict(well_to_enzyme, orient='index').to_csv(
        f'{dataset_folder}{plate}_well-to-enzyme.csv')

    # Nothing more to save when no per-index enzyme data came back.
    if len(well_to_enzyme_multi) == 0:
        print(f'{plate} has no wells with multiple enzymes.')
        continue

    print(plate, len(well_to_enzyme_multi[0]))
    for enz_idx in well_to_enzyme_multi:
        wells_for_idx = well_to_enzyme_multi[enz_idx]

        # Annotate every well (in place) with its quadrant and its seed address.
        for address, well_record in wells_for_idx.items():
            y_pos = address.find('Y')
            col = int(address[1:y_pos])
            row = int(address[y_pos + 1:])
            # Seed address halves each coordinate, rounding up: 1,2 -> 1; 3,4 -> 2; 5,6 -> 3
            well_record.update({
                'quadrant': quadrant_by_parity[(col % 2, row % 2)],
                'seed_address': f'X{(col + 1) // 2}Y{(row + 1) // 2}',
            })

        wells_for_idx_df = pd.DataFrame.from_dict(wells_for_idx, orient='index')
        wells_for_idx_df.to_csv(f'{dataset_folder}{plate}_{enz_idx}.csv')

# Formulate sequence update dict for target plates

### Get plates to propagate updated sequences to

In [None]:

# (genesis plate, quadrant) -> plate whose lineage is traced below.
phoenix_plate_by_genesis_quadrant = {
    ('PLT15623', 'Q4'): 'PLT17314',
    ('PLT15623', 'Q3'): 'PLT17313',
    ('PLT16348', 'Q1'): 'PLT17321',
    ('PLT15735', 'Q4'): 'PLT17320',
    ('PLT16792', 'Q1'): 'PLT17479',
    ('PLT15583', 'Q4'): 'PLT17476',
    ('PLT15925', 'Q1'): 'PLT17496',
    ('PLT15925', 'Q3'): 'PLT17498',
    ('PLT15925', 'Q2'): 'PLT17501',
    ('PLT15925', 'Q4'): 'PLT17504',
}

# Every transfer lookup below is keyed on this single well address.
# NOTE(review): presumably chosen because it is populated on every plate in the chain - confirm.
LINEAGE_WELL = 'X12Y8'


def _upstream_plate(plate, plate_type, well=LINEAGE_WELL):
    """Return the first plate listed under `well` in the transfers-in result for `plate`.

    The lookup is restricted with in_strings=[plate_type]. Returns None when
    `well` is absent from the result, so callers never reuse a stale plate.
    """
    transfers_in, _ = track_transfers_in_out(plate, in_strings=[plate_type])
    if well not in transfers_in:
        return None
    return transfers_in[well]['plate'][0]


def _downstream_plates(plate, plate_type, well=LINEAGE_WELL):
    """Return the plates listed under `well` in the transfers-out result for `plate`.

    The lookup is restricted with out_strings=[plate_type]. Returns an empty
    list when `well` is absent from the result.
    """
    _, transfers_out = track_transfers_in_out(plate, out_strings=[plate_type])
    if well not in transfers_out:
        return []
    return list(transfers_out[well]['plate'])


def _trace_main_plate(plate):
    """Walk upstream lcms_intermediate -> rxn -> main from `plate`; return the main plate or None."""
    upstream_steps = (
        ('lcms_intermediate', 'LCMS intermediate plate (in):'),
        ('rxn', 'Rxn plate (in):'),
        ('main', 'Main plate (in):'),
    )
    current = plate
    for plate_type, label in upstream_steps:
        parent = _upstream_plate(current, plate_type)
        if parent is None:
            print(f'WARNING: no {plate_type} plate found upstream of {current} at {LINEAGE_WELL}.')
            return None
        print(label, parent)
        current = parent
    return current


# Built as a fresh dict (rather than overwriting the input mapping while iterating it)
# so the cell is idempotent and the original plate assignments stay available.
genesis_phoenix_mapping = {}
for (genesis_plate, quadrant), plate in phoenix_plate_by_genesis_quadrant.items():
    print(genesis_plate, quadrant, plate)
    lcms_intermediate_plates = []
    lcms_plates = []
    maldi_plates = []

    # Upstream: find the MAIN plate this plate descends from.
    main_plate = _trace_main_plate(plate)

    # Downstream: all RXN plates fed by that MAIN plate.
    if main_plate is None:
        # Previously a missing link silently reused the plate found for the previous
        # mapping entry; now the entry is recorded with empty plate lists instead.
        print(f'WARNING: lineage for {plate} is incomplete; recording empty plate lists.')
        rxn_plates = []
    else:
        rxn_plates = _downstream_plates(main_plate, 'rxn')
        print('Rxn plate (out):', rxn_plates)

    # For each RXN plate: downstream LCMS intermediate plate, then its MALDI and LCMS final plates.
    for rxn_plate in rxn_plates:
        intermediates = _downstream_plates(rxn_plate, 'lcms_intermediate')
        if not intermediates:
            # Without an intermediate plate there is nothing valid to look up below.
            print(f'WARNING: no lcms_intermediate plate found downstream of {rxn_plate}; skipping it.')
            continue
        lcms_intermediate_plate = intermediates[0]
        lcms_intermediate_plates.append(lcms_intermediate_plate)

        # get MALDI plate
        maldi_candidates = _downstream_plates(lcms_intermediate_plate, 'hts')
        if maldi_candidates:
            maldi_plates.append(maldi_candidates[0])

        # get LCMS final plate
        lcms_final_candidates = _downstream_plates(lcms_intermediate_plate, 'lcms_final')
        if lcms_final_candidates:
            lcms_plates.append(lcms_final_candidates[0])

    # UPDATE PLATE GROUPINGS
    genesis_phoenix_mapping[(genesis_plate, quadrant)] = {
        'maldi_plates': maldi_plates,
        'lcms_plates': lcms_plates,
        'lcms_intermediate_plates': lcms_intermediate_plates,
    }

genesis_phoenix_mapping


### Parse quadrant data and reformat sequence updates for output plate type

In [None]:
def _parse_xy(address):
    """Split a well address of the form 'X<x>Y<y>' into the integer pair (x, y)."""
    y_pos = address.find('Y')
    return int(address[1:y_pos]), int(address[y_pos + 1:])


# Quadrant label keyed by (x % 2, y % 2):
#   Q1: x odd,  y odd  - starts in the A1 (X1Y1) top left corner of the plate.
#   Q2: x even, y odd  - starts from the A2 (X2Y1) well and ends in the upper right corner.
#   Q3: x odd,  y even - starts in the B1 (X1Y2) well and ends in the lower left corner.
#   Q4: x even, y even - starts in the B2 (X2Y2) well and ends in the lower right corner.
QUADRANT_BY_PARITY = {(1, 1): 'Q1', (0, 1): 'Q2', (1, 0): 'Q3', (0, 0): 'Q4'}

for (genesis_plate, quadrant) in list(genesis_phoenix_mapping.keys()):

    print(genesis_plate, quadrant)

    # get sequence updates; fail with an actionable message rather than a bare KeyError
    if genesis_plate not in sequence_update_dict:
        raise KeyError(
            f"No sequence data for genesis plate {genesis_plate!r} in sequence_update_dict; "
            "add it to genesis_plate_list and re-run the sequence-update cell first."
        )
    well_to_enzyme = sequence_update_dict[genesis_plate]

    # bucket every well of the genesis plate by the quadrant it belongs to
    enz_dict_byquadrant = {'Q1': {}, 'Q2': {}, 'Q3': {}, 'Q4': {}}
    for address, enz_dict in well_to_enzyme.items():
        x, y = _parse_xy(address)
        enz_dict_byquadrant[QUADRANT_BY_PARITY[(x % 2, y % 2)]][address] = enz_dict

    # get wells / sequences to update (only the quadrant this entry refers to)
    plate_entry = genesis_phoenix_mapping[(genesis_plate, quadrant)]
    quadrant_sequence_update = enz_dict_byquadrant[quadrant]
    plate_entry['well_to_enzyme_quadrant'] = quadrant_sequence_update

    # get LCMS sequence update dict: halve each coordinate, rounding up
    # (1,2 -> 1; 3,4 -> 2; 5,6 -> 3)
    lcms_sequence_update = {}
    for address, enz_dict in quadrant_sequence_update.items():
        x, y = _parse_xy(address)
        lcms_sequence_update[f'X{(x + 1) // 2}Y{(y + 1) // 2}'] = enz_dict
    plate_entry['lcms_sequence_update'] = lcms_sequence_update

    # get MALDI sequence update dict: each LCMS well maps to the 2x2 block
    # X{2x-1, 2x} x Y{2y-1, 2y}, and all four wells share the same enzyme record
    maldi_sequence_update = {}
    for address, enz_dict in lcms_sequence_update.items():
        x, y = _parse_xy(address)
        for x_maldi in (2 * x - 1, 2 * x):
            for y_maldi in (2 * y - 1, 2 * y):
                maldi_sequence_update[f'X{x_maldi}Y{y_maldi}'] = enz_dict
    plate_entry['maldi_sequence_update'] = maldi_sequence_update

### Save

In [None]:
# Serialize genesis_phoenix_mapping to a pickle file in the current working directory.
with open('genesis_phoenix_mapping.pickle', mode='wb') as pickle_file:
    pickle.dump(
        genesis_phoenix_mapping,
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL,
    )

### Load

In [None]:
# Reload the mapping from the pickle file in the current working directory.
# NOTE: pickle.load can execute arbitrary code - only load a file written by this notebook.
with open('genesis_phoenix_mapping.pickle', mode='rb') as pickle_file:
    genesis_phoenix_mapping = pickle.load(pickle_file)

# Spot-check a single MALDI well of one mapping entry.
spot_check_entry = genesis_phoenix_mapping[('PLT15623', 'Q4')]
spot_check_entry['maldi_sequence_update']['X45Y31']

# Update sequences in Postgres

In [None]:
# (table name, CSV suffix) for the two MALDI detections tables, in update order.
maldi_tables = (('maldi_detections_1', '-1'), ('maldi_detections_2', '-2'))

# NOTE(review): the [1:] slice skips the first (genesis_plate, quadrant) entry of the
# mapping - confirm this is intentional and not a leftover from a partial earlier run.
for mapping_key in list(genesis_phoenix_mapping.keys())[1:]:
    plate_entry = genesis_phoenix_mapping[mapping_key]

    ## MALDI ##
    maldi_plates = plate_entry['maldi_plates']
    print('MALDI plates:', maldi_plates)
    maldi_sequence_update = plate_entry['maldi_sequence_update']

    # Apply the same MALDI sequence update to both tables for every MALDI plate.
    for maldi_plate in maldi_plates:
        print(maldi_plate)
        for table_name, csv_suffix in maldi_tables:
            sequence_update = DetectionsSequenceUpdate(table=table_name)
            sequence_update.update_table_with_sequences_in_postgres(
                maldi_plate, maldi_sequence_update, maldi_detections_columns,
                detections_type='maldi', save_csv_to_s3=True, suffix=csv_suffix)

    ## LCMS ##
    lcms_plates = plate_entry['lcms_plates']
    print('LCMS plates:', lcms_plates)
    lcms_sequence_update = plate_entry['lcms_sequence_update']

    # lcms update
    for lcms_plate in lcms_plates:
        sequence_update = DetectionsSequenceUpdate(table='lcms_detections')
        sequence_update.update_table_with_sequences_in_postgres(
            lcms_plate, lcms_sequence_update, lcms_detections_columns,
            detections_type='lcms', save_csv_to_s3=True, suffix='')

In [None]:
# Look up records in maldi_detections_1 for plate PLT17314 and display the result.
maldi_table_1 = DetectionsSequenceUpdate(table='maldi_detections_1')
matches = maldi_table_1.lookup_previous_record({'plate': 'PLT17314'})
display(matches)

# Update sequences directly in CSV

In [None]:
csv_fname = 'maldi_lcms_WF10125-WF10128-WF10130_C18.csv'
df = pd.read_csv(f'{dataset_folder}{csv_fname}', index_col=0)

# Columns overwritten with the values from each well's sequence update.
sequence_cols=['enzyme_barcode', 'sequence', 'mutations', 'hamming', 'reference_enzyme', 'enzyme_class', 'sequence_qc']


def _replicate_group_label(address):
    """Return the comma-separated label of the 2x2 replicate block containing `address`.

    `address` has the form 'X<x>Y<y>'. The block is X{2a-1, 2a} x Y{2b-1, 2b} with
    a = ceil(x/2) and b = ceil(y/2), listed x-major, e.g. 'X1Y1, X1Y2, X2Y1, X2Y2'.
    This is the format the `address` column of the CSV is matched against.
    """
    y_pos = address.find('Y')
    x = int(address[1:y_pos])
    y = int(address[y_pos + 1:])
    x_lcms = (x + 1) // 2  # 1: 1,2; 2: 3,4; 3: 5,6
    y_lcms = (y + 1) // 2
    return ', '.join(
        f'X{x_maldi}Y{y_maldi}'
        for x_maldi in (2 * x_lcms - 1, 2 * x_lcms)
        for y_maldi in (2 * y_lcms - 1, 2 * y_lcms)
    )


for (genesis_plate, quadrant) in list(genesis_phoenix_mapping.keys()):
    plate_entry = genesis_phoenix_mapping[(genesis_plate, quadrant)]

    ## MALDI ##
    maldi_plates = plate_entry['maldi_plates']
    print('MALDI plates:', maldi_plates)
    if not maldi_plates:
        continue

    # get maldi_sequence_update
    maldi_sequence_update = plate_entry['maldi_sequence_update']

    # All four wells of a 2x2 block produce the same group label, so collapse to one
    # update per label up front instead of rewriting the same rows four times per plate.
    # Later addresses override earlier ones, matching the result of applying them in order.
    update_by_group = {
        _replicate_group_label(address): well_update
        for address, well_update in maldi_sequence_update.items()
    }

    # maldi update
    for maldi_plate in maldi_plates:
        on_this_plate = df.plate_CMP60354 == maldi_plate
        for address_str, well_update in update_by_group.items():
            matching_rows = on_this_plate & (df.address == address_str)
            if matching_rows.any():
                df.loc[matching_rows, sequence_cols] = [well_update[c] for c in sequence_cols]

display(df)

In [None]:
# Write the updated frame under a separate '_sequpdate' filename; the input CSV is left as is.
csv_fname = 'maldi_lcms_WF10125-WF10128-WF10130_C18_sequpdate.csv'
updated_csv_path = f'{dataset_folder}{csv_fname}'
df.to_csv(updated_csv_path)

In [None]:
# Re-read the original (pre-update) CSV so it can be compared against df.
csv_fname = 'maldi_lcms_WF10125-WF10128-WF10130_C18.csv'
original_csv_path = f'{dataset_folder}{csv_fname}'
df_old = pd.read_csv(original_csv_path, index_col=0)

In [None]:
# Collect rows whose enzyme barcode differs between the updated frame (df) and the
# original one (df_old). Rows are paired by position, and only rows where both
# barcodes are strings are kept, so non-string (e.g. missing) values are ignored.
paired_rows = zip(
    df.plate_CMP60354,
    df.plate_CMP60403,
    df.plate_CMP60404,
    df.address,
    df.enzyme_barcode,
    df_old.enzyme_barcode,
)
enz_combi = []
for record in paired_rows:
    barcode_new, barcode_old = record[4], record[5]
    if isinstance(barcode_new, str) and isinstance(barcode_old, str) and barcode_new != barcode_old:
        enz_combi.append(record)

# Total number of rows, number of changed rows, then each changed row.
print(len(df.enzyme_barcode))
print(len(enz_combi))
for changed_row in enz_combi:
    print(changed_row)

In [None]:
# Count the changed rows per plate triple (the first three fields of each enz_combi record).
plates = [', '.join(str(plate_id) for plate_id in row[:3]) for row in enz_combi]
unique, counts = np.unique(np.array(plates), return_counts=True)
print(list(zip(unique, counts)))