# Create files mapping FACS well and plate to basic allele metadata

This notebook takes cherry-picking, allele info, and sequence-confirmation files as input and produces a table relating FACS well and plate IDs to allele IDs and symbols.

In [1]:
import pandas as pd
import numpy as np
import os

In [2]:
# Relative folders read from / written to by this notebook.
# DUAL-IPA experiment folders (inputs, outputs)
dualipa_inputs, dualipa_outputs = "../1_inputs", "../3_outputs"
# Allele collection folders (inputs, outputs)
allele_collection_inputs, allele_collection_outputs = (
    "../../../1_allele_collection/1_inputs",
    "../../../1_allele_collection/3_outputs",
)

In [3]:
# Load the consolidated allele collection and reshape it to long format:
# each "mutation_id_*" column is unpivoted into (mut_type, mut_id) rows,
# while all remaining columns are repeated as identifiers.
allele_info_df = pd.read_table(f"{allele_collection_inputs}/VarChamp_Consolidated_allele_collection.tsv")

# Partition the columns into mutation-id columns and everything else
mut_cols = [name for name in allele_info_df.columns if "mutation_id_" in name]
other_cols = [name for name in allele_info_df.columns if name not in mut_cols]

allele_info_df = pd.melt(
    allele_info_df,
    id_vars=other_cols,
    value_vars=mut_cols,
    var_name="mut_type",
    value_name="mut_id",
)

In [4]:
# Gene-level metadata: keep only the ORF id and gene symbol columns,
# then build a lookup keyed by the ORF id rendered as a string.
gene_info_df = pd.read_csv(f"{allele_collection_outputs}/orf_info.csv").loc[:, ["orf_id", "symbol"]]
orf_to_symbol = {
    str(orf_key): gene_symbol
    for orf_key, gene_symbol in zip(gene_info_df["orf_id"], gene_info_df["symbol"])
}

The Vidal lab ran the experiment once and analyzed the data. Some alleles didn't pass the QA/QC thresholds or got no data. Batch 5 is a re-run of all alleles that failed in the first set of batches. Here, we process everything together from scratch. An allele should always be compared to the WT on the same batch and plate; the summary stats can then be compared across batches.

In [5]:
# Load the cherry-picking summaries (initial batches and the batch5 file),
# keeping only the columns needed to describe each destination well.
cp_col = ['orf_id', 'mut_id', 'node_type','dest_pla_id', 'dest_pla', 'dest_well']

pdest_layout_initial_batches_df = pd.read_csv(
    f'{dualipa_inputs}/cherry_picking_files/summary_CP_VUSDUALXP.csv',
    index_col=0,
).loc[:, cp_col]
pdest_layout_b5_df = pd.read_csv(
    f'{dualipa_inputs}/cherry_picking_files/summary_CP_VUSDUALXP_batch5.csv',
    index_col=0,
).loc[:, cp_col]

# Stack both tables, batch5 rows first; the original row labels are kept as-is
pdest_layout_df = pd.concat([pdest_layout_b5_df, pdest_layout_initial_batches_df], axis=0)
pdest_layout_df
### This doesn't have well A01

Unnamed: 0,orf_id,mut_id,node_type,dest_pla_id,dest_pla,dest_well
78,56360.0,0.0,wt,23,VUSDUALXP_23,A12
79,56360.0,0.0,wt,23,VUSDUALXP_23,B12
80,56360.0,0.0,wt,23,VUSDUALXP_23,C12
81,56360.0,0.0,wt,23,VUSDUALXP_23,D12
10,9905.0,0.0,wt,23,VUSDUALXP_23,F02
...,...,...,...,...,...,...
119,,,pdest_empty,20,VUSDUALXP_20,A08
124,,,pdest_empty,21,VUSDUALXP_21,A07
125,,,pdest_empty,21,VUSDUALXP_21,A08
130,,,pdest_empty,22,VUSDUALXP_22,A07


In [6]:
# Legend that maps each sequence-confirmation class code to its description
seq_confirm_code_df = pd.read_csv(
    f"{allele_collection_inputs}/seq_confirmation_results/sequence_confirmation_class_code.tsv",
    sep="\t",
)
# Show the table without truncating columns
with pd.option_context("display.max_columns", None):
    display(seq_confirm_code_df)

Unnamed: 0,code,description
0,1,perfectly validated
1,2,"partially validated, >= 50% coverage"
2,3,wild type
3,4,"partial wild type, >= 50% coverage"
4,5,"target and off-target mutation, >= 50% coverage"
5,6,"off-target mutation, >= 50% coverage"
6,7,"truncated, < 50% coverage"
7,99,no reads


In [7]:
# Per-allele sequence-confirmation results
seq_confirm_res_df = pd.read_csv(
    f"{allele_collection_inputs}/seq_confirmation_results/VarChampSeqConfirmationResult.tsv",
    sep="\t",
)
# Show the table without truncating columns
with pd.option_context("display.max_columns", None):
    display(seq_confirm_res_df)
# List the distinct DUAL-IPA plate names present in the results
seq_confirm_res_df["dualip_plate"].unique()

Unnamed: 0,symbol,orf_id_wt,mutation_id_old,ccsb_mutation_id,nt_change,aa_change,collection,entry_plate_orig,entry_well_orig,entry_plate_conso,entry_well_conso,entry_seq_pool,db_plate,db_well,n2h_plate,n2h_well,dualip_plate,dualip_well,entry_sequenced,entry_sequence_confirmation_class,db_sequenced,db_sequence_confirmation_class,n2h_sequenced,n2h_sequence_confirmation_class,dualip_sequenced,dualip_sequence_confirmation_class
0,GBA,2,6,CCSBVarC000001,160G>C,Val54Leu,RC4,RC4_Mut_GDEh1026,H01,GDEhDisVCh_40054,F12,2.0,RC4_Mut_GDDh1026,H01,,,,,1,1.0,1,2.0,0,,0,
1,GBA,2,73,CCSBVarC000002,670C>T,Leu224Phe,RC4,RC4_Mut_GDEh1026,E01,GDEhDisVCh_40054,C12,2.0,RC4_Mut_GDDh1026,E01,,,,,1,1.0,1,1.0,0,,0,
2,GBA,2,113,CCSBVarC000003,887G>A,Arg296Gln,RC4,RC4_Mut_GDEh1026,F01,GDEhDisVCh_40054,D12,2.0,RC4_Mut_GDDh1026,F01,,,,,1,1.0,1,7.0,0,,0,
3,GBA,2,231,CCSBVarC000004,1448T>C,Leu483Pro,RC4,RC4_Mut_GDEh1026,G01,GDEhDisVCh_40054,E12,2.0,RC4_Mut_GDDh1026,G01,,,,,1,1.0,1,2.0,0,,0,
4,GBA,2,213510,CCSBVarC003869,259C>T,Arg87Trp,CEGS2,CegsMutGDEh1035,B03,GDEhDisVCh_40054,B02,2.0,CegsMutGDDh1035,B03,,,,,1,1.0,1,2.0,0,,0,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8705,CTNNB1,100070227,212487,CCSBVarC007073,860A>G,Asn287Ser,CEGS2,CegsMutGDEh1023,B07,GDEhDisVCh_40034,E09,2.0,,,,,,,1,1.0,0,,0,,0,
8706,TBX19,100070258,212488,CCSBVarC007074,383C>T,Ser128Phe,CEGS2,CegsMutGDEh1043,A07,GDEhDisVCh_40046,B02,2.0,CegsMutGDDh1043,A07,,,,,1,7.0,1,7.0,0,,0,
8707,TBX19,100070258,212489,CCSBVarC007075,257T>G,Met86Arg,CEGS2,CegsMutGDEh1043,G08,GDEhDisVCh_40046,F02,2.0,CegsMutGDDh1043,G08,,,,,1,1.0,1,1.0,0,,0,
8708,GDF2,100070273,212491,CCSBVarC007076,203G>T,Arg68Leu,CEGS2,CegsMutGDEh1023,E04,GDEhDisVCh_40033,B08,1.0,,,,,,,1,5.0,0,,0,,0,


array([nan, 'VUSMutpDEST2_01', 'VUSMutpDEST2_02', 'VUSMutpDEST2_03',
       'VUSMutpDEST2_04', 'VUSMutpDEST2_05', 'VUSMutpDEST2_06',
       'VUSMutpDEST2_07', 'VUSMutpDEST2_08', 'VUSMutpDEST2_09',
       'VUSMutpDEST2_11', 'VUSMutpDEST2_12', 'VUSMutpDEST2_13',
       'VUSMutpDEST2_14', 'VUSMutpDEST2_15', 'VUSMutpDEST2_16'],
      dtype=object)

In [8]:
# Annotate every well of the cherry-picking layout with its DUAL-IPA
# sequence-confirmation result, cross-check the allele annotation against the
# allele collection, then fill in missing values.
#
# Both lookup tables are reduced to their distinct rows before merging: a left
# merge emits one output row per matching lookup row, so repeated lookup rows
# would silently duplicate wells in the exported layout.
seq_conf_cols = ["symbol","orf_id_wt","mutation_id_old","dualip_sequenced","dualip_sequence_confirmation_class","aa_change","nt_change","ccsb_mutation_id"]
seq_conf_lookup_df = (
    seq_confirm_res_df[seq_conf_cols]
    .rename({"orf_id_wt": "orf_id",
             "mutation_id_old": "mut_id"}, axis=1)
    .drop_duplicates()
)
pdest_layout_seq_conf_df = pd.merge(
    pdest_layout_df,
    seq_conf_lookup_df,
    on=["orf_id","mut_id"],
    how="left"
)

## QC with allele collection df for consistency
meta_cols = ['orf_id', 'mut_id', 'aa_change', 'nt_change', 'symbol']
# allele_info_df is in long format (one row per allele and mutation-id column),
# so the same (orf_id, mut_id) pair can occur more than once with identical metadata.
allele_lookup_df = allele_info_df[meta_cols].drop_duplicates()
pdest_layout_seq_conf_df = pdest_layout_seq_conf_df.merge(allele_lookup_df, on=['orf_id', 'mut_id'], how='left', suffixes=("", "_ac"))

# A left merge never drops rows, so any difference means some key still maps to
# several distinct lookup rows and wells were duplicated.
n_extra_rows = len(pdest_layout_seq_conf_df) - len(pdest_layout_df)
if n_extra_rows:
    print(f"WARNING: metadata merges added {n_extra_rows} row(s) to the layout; "
          "some (orf_id, mut_id) keys match several distinct metadata rows")

# Wherever the sequence-confirmation table provides a value, the allele
# collection ("_ac" columns) must provide exactly the same value.
for qc_col in ("symbol", "aa_change", "nt_change"):
    has_seq_conf_value = pdest_layout_seq_conf_df[qc_col].notna()
    disagrees = pdest_layout_seq_conf_df[qc_col] != pdest_layout_seq_conf_df[f"{qc_col}_ac"]
    assert not (has_seq_conf_value & disagrees).any(), \
        f"'{qc_col}' differs between sequence-confirmation results and allele collection"
pdest_layout_seq_conf_df = pdest_layout_seq_conf_df.drop(columns=["symbol_ac","aa_change_ac","nt_change_ac"])

## replace NAs
# Render orf_id as an integer-looking string; missing ids become NaN again
# instead of the literal "<NA>" produced by the string cast.
pdest_layout_seq_conf_df["orf_id"] = pdest_layout_seq_conf_df["orf_id"].astype("Int64").astype(str)
pdest_layout_seq_conf_df["orf_id"] = pdest_layout_seq_conf_df["orf_id"].replace("<NA>", np.nan)
# NOTE(review): every row without a sequence-confirmation match is labelled "WT"
# here, which includes the pdest_empty wells and any variant absent from the
# sequence-confirmation table — confirm this is intended.
pdest_layout_seq_conf_df[["aa_change", "nt_change"]] = pdest_layout_seq_conf_df[["aa_change", "nt_change"]].fillna("WT")
## Replace missing symbols
# Fall back to the gene-level ORF -> symbol lookup (keys are string ORF ids)
pdest_layout_seq_conf_df["symbol"] = pdest_layout_seq_conf_df["symbol"].fillna(pdest_layout_seq_conf_df["orf_id"].map(orf_to_symbol))

In [9]:
# Write the annotated layout to the outputs folder without the row index.
# The folder is created first so the export also works on a fresh checkout.
os.makedirs(dualipa_outputs, exist_ok=True)
pdest_layout_seq_conf_df.to_csv(f'{dualipa_outputs}/dualipa_experimental_layout_seq_conf.csv', index=False)

In [None]:
# NOTE(review): this whole cell is commented out and does nothing when run.
# It looks like an earlier variant of the metadata merge that used only the
# allele collection (no sequence-confirmation columns) — confirm it is obsolete
# and delete the cell.
# # Add metadata
# meta_cols = ['orf_id', 'mut_id', 'aa_change', 'nt_change', 'symbol']
# pdest_layout_df = pdest_layout_df.merge(allele_info_df[meta_cols], on=['orf_id', 'mut_id'], how='left')
# pdest_layout_df[["aa_change", "nt_change"]] = pdest_layout_df[["aa_change", "nt_change"]].fillna("WT")
# pdest_layout_df["orf_id"] = pdest_layout_df["orf_id"].astype("Int64").astype(str)
# pdest_layout_df["orf_id"] = pdest_layout_df["orf_id"].replace("<NA>", np.nan)

# # Replace missing symbols
# pdest_layout_df["symbol"] = pdest_layout_df["symbol"].fillna(pdest_layout_df["orf_id"].map(orf_to_symbol))

In [None]:
# NOTE(review): disabled export of the layout without sequence-confirmation
# columns; nothing is written by this cell — confirm it is obsolete and delete it.
# pdest_layout_df.to_csv(f'{dualipa_outputs}/dualipa_experimental_layout.csv', index=None)