## Random sanity checks

In [69]:
new_df = part_lib_data[(part_lib_data['Part Type'] == 'ZF Binding Sites (SynTF A)') | (part_lib_data['Part Type'] == 'ZF Binding Sites (SynTF B)')]

In [70]:
new_df

Unnamed: 0,Sr. No.,Part Type,Part Name,Part Code (CLASSIC),Sequence
25,25,ZF Binding Sites (SynTF A),n = 2,1,CAAAacgccttcGGCGTAGCCGATGTCGCGctcccgtgTCAGtaaa...
26,26,ZF Binding Sites (SynTF A),n = 8,2,CAAAACGCTTcGGCGTAGCCGATGTCGCGcTCCCGTGTCAGTAAAG...
27,27,ZF Binding Sites (SynTF A),n = 12,3,CAAAacgccttcGGCGTAGCCGATGTCGCGctcccgtgTCAGtaaa...
34,34,ZF Binding Sites (SynTF B),n = 2,1,CAAAacgccttcGGCGTAGCCGATGTCGCGctcccgtgTCAGtaaa...
35,35,ZF Binding Sites (SynTF B),n = 8,2,CAAAACGCTTcGGCGTAGCCGATGTCGCGcTCCCGTGTCAGTAAAG...
36,36,ZF Binding Sites (SynTF B),n = 12,3,CAAAACGCTTcGGCGTAGCCGATGTCGCGcTCCCGTGTCAGTAAAG...


In [None]:
# checking whether binding motif is the same

import pandas as pd

# Pivot the table to align A and B by part code
pivoted = (
    new_df
    .pivot_table(
        index="Part Code (CLASSIC)",
        columns="Part Type",
        values="Sequence",
        aggfunc="first"
    )
    .reset_index()
)

# Rename columns for readability
pivoted.columns = ["Part Code (CLASSIC)", "Seq_A", "Seq_B"]

# Convert to lowercase before comparing
pivoted["is_same"] = (
    pivoted["Seq_A"].str.lower() == pivoted["Seq_B"].str.lower()
)

print(pivoted)

  Part Code (CLASSIC)                                              Seq_A  \
0                   1  CAAAacgccttcGGCGTAGCCGATGTCGCGctcccgtgTCAGtaaa...   
1                   2  CAAAACGCTTcGGCGTAGCCGATGTCGCGcTCCCGTGTCAGTAAAG...   
2                   3  CAAAacgccttcGGCGTAGCCGATGTCGCGctcccgtgTCAGtaaa...   

                                               Seq_B  is_same  
0  CAAAacgccttcGGCGTAGCCGATGTCGCGctcccgtgTCAGtaaa...     True  
1  CAAAACGCTTcGGCGTAGCCGATGTCGCGcTCCCGTGTCAGTAAAG...    False  
2  CAAAACGCTTcGGCGTAGCCGATGTCGCGcTCCCGTGTCAGTAAAG...    False  


# Load Data

In [None]:
# import
import pandas as pd

# loading data
base_lib_exp_data = pd.read_csv('./base_library_data.csv')
part_lib_data = pd.read_excel('./Dual_Input_Library_Part_Sequences.xlsx')

# reframing data
# part_lib_data.columns = part_lib_data.iloc[0] # renaming the columns
# part_lib_data.drop(index=0) # dropping the second index
part_lib_data['Sequence'] = part_lib_data['Sequence'].fillna("") # convert NaN to ""

parts_map = part_lib_data.set_index(["Part Type", "Part Code (CLASSIC)"])["Sequence"].to_dict()

In [6]:
base_lib_exp_data.columns

Index(['Basal Exp (GFP, au)', '4-OHT Induced Exp (GFP, au)',
       'GZV Induced Exp (GFP, au)', 'Dual Induced Exp (GFP, au)',
       'No. of Barcodes (basal)', 'No. of Barcodes (4-OHT)',
       'No. of Barcodes (GZV)', 'No. of Barcodes (Dual)',
       'No. of Binding sites', 'Binding site pattern', 'Core prom',
       'Term spacing reporter', 'Prom. SynTF A', 'AD SynTF A', 'IDR SynTF A',
       'ZF Affinity SynTF A', 'Term spacing SynTF A', 'Prom. SynTF B',
       'AD SynTF B', 'IDR SynTF B', 'ZF Affinity SynTF B',
       'Term spacing SynTF B', 'Orientation'],
      dtype='object')

In [None]:
base_lib_exp_data_essential = base_lib_exp_data[['Basal Exp (GFP, au)', '4-OHT Induced Exp (GFP, au)',
'GZV Induced Exp (GFP, au)', 'Dual Induced Exp (GFP, au)', 'No. of Barcodes (basal)', 'No. of Barcodes (4-OHT)', 'No. of Barcodes (GZV)', 'No. of Barcodes (Dual)', 'No. of Binding sites', 'Binding site pattern', 'Core prom', 'Term spacing reporter', 'Prom. SynTF A', 'AD SynTF A', 'IDR SynTF A', 'ZF Affinity SynTF A', 'Term spacing SynTF A', 'Prom. SynTF B', 'AD SynTF B', 'IDR SynTF B', 'ZF Affinity SynTF B', 'Term spacing SynTF B', 'Orientation']]

# Filtering out rows with NaN values
base_lib_exp_data_essential = base_lib_exp_data_essential.dropna()
base_lib_exp_data_essential.reset_index()

## Mapping functions

In [None]:
def apply_seq_mapping(df, mappings):
    """
    Apply get_seq() to multiple (output, input, lookup_key) triplets.

    Args:
        df (pd.DataFrame): Input dataframe.
        mappings (list[tuple[str, str, str]]):
            Each tuple is (output_col, input_col, lookup_key).

    Returns:
        pd.DataFrame: Updated dataframe with new columns.
    """
    for out_col, in_col, key in mappings:
        df[out_col] = df[in_col].apply(lambda x: get_seq(key, x))
    return df

# Function to safely fetch a sequence
def get_seq(part_type, code):
    # print(f"this is part_type {part_type}")
    # print(f"this is the code {code}")
    return parts_map.get((part_type, str(code))) or parts_map.get(
        (part_type, int(code)), ""
    )

In [None]:
# create
# key_matching

mappings = [
    ('syn_tf_ta_A', 'AD SynTF A', 'Activation Domain (SynTF A/B)'),
    ('syn_tf_IDP_A', 'IDR SynTF A', 'IDR (SynTF A/B)'),
    ('syn_tf_z_aff_A', 'ZF Affinity SynTF A', 'ZF Affinity (SynTF A)'),
    ('syn_tf_ta_B', 'AD SynTF B', 'Activation Domain (SynTF A/B)'),
    ('syn_tf_IDP_B', 'IDR SynTF B', 'IDR (SynTF A/B)'),
    ('syn_tf_z_aff_B', 'ZF Affinity SynTF B', 'ZF Affinity (SynTF B)'),
    ('syn_tf_promoter_A', 'Prom. SynTF A', 'Promoter (SynTF A/B)'),
    ('syn_tf_promoter_B', 'Prom. SynTF B', 'Promoter (SynTF A/B)'),
    ('syn_tf_term_spacing_A', 'Term spacing SynTF A', 'SynTF Term Spacer (SynTF A/B)'),
    ('syn_tf_term_spacing_B', 'Term spacing SynTF B', 'SynTF Term Spacer (SynTF A/B)'),
    ('reporter_pattern', 'Binding site pattern', 'Binding site pattern'), # This is a special case it varies by the pattern  # ('reporter_BM', 'No. of Binding sites', ''), # Doing the assembly later
    ('reporter_core_promoter', 'Core prom', 'Core/Minimal Promoters'),
    ('reporter_core_spacer', 'Term spacing reporter', 'SynTF Term Spacer (SynTF A/B)'),
    ]
base_lib_exp_data_essential = apply_seq_mapping(base_lib_exp_data_essential, mappings)
# base_lib_exp_data_essential['syn_tf_ta'] = base_lib_exp_data_essential['AD SynTF A'].apply(lambda x: get_seq("Activation Domain (SynTF A/B)", x))



In [None]:
construct_map = {
    'A': ['syn_tf_ta_A', 'syn_tf_IDP_A', 'syn_tf_z_aff_A'],
    'B': ['syn_tf_z_aff_B', 'syn_tf_IDP_B', 'syn_tf_ta_B'],
    'R': ['reporter_pattern', 'reporter_core_promoter', 'k1', 'eGFP', 'terminator', ]   # TODO: ZELUN figure out what is 
}

for row in base_lib_exp_data_essential.head(10).itertuples(index=False):
    for key, cols in construct_map.items():
        construct = "".join(str(getattr(row, col)) for col in cols)
        print(f"{key}: {construct}")

#     # construct_B =
#     # construct_C =
#     # print(row.syn_tf_ta_A)

A: gatgagtttcccacgatggtgtttccttctgggcagatcagccaggcctcggccttggccccggcccctccccaagtcctgccccaggctccagcccctgcccctgctccagcgatggtatcagctctggcccaggccccagcccctgtcccagtcctagccccaggccctcctcaggctgtggccccacctgcccccaagcccacccaggctggggaaggaacgctgtcagaggccctgctgcagctgcagtttgatgatgaggacctgggggccttgcttggcaacagcacagacccagctgtgttcacagacctggcatccgtagacaactccgagtttcagcagctgctgaaccagggcatacctgtggccccccacacaactgagcccatgctgatggagtaccctgaggctataactcgcctagtgacaggggcccagaggccccccgacccagctcctgctccactgggggcccctgggctccccaatggcctcctttcaggagatgaggacttctcctccattgcggacatggacttctcagccctgctgagtcagatcagctccATCTGGTTCTGGCAGTGGAGGAGGTGGCTCCGGTGGAGccaggagaacgaccatttcaatgccgcatctgtatgcgtaatttctctCGCAGGCATGGGCTTGATCGGcacacgcgcacccacaccggcgagaagccattccaatgcaggatttgcatgaggaacttttcaGATCATTCAAGTCTCAAACGCcacctcaggactcatacgggttcccagaaacctttccaatgccgcatttgcatgaggaattttagcGTCAGGCATAATCTTACTCGTcacctgcgaacacatactggtgaaaagccttttcagtgtaggatctgtatgagaaatttctctGATCATTCCAATCTTTCCCGCcatcttaagacacatacgggcagccaaaagccgttccaatgcagaatatgtatgcgcaactt

In [None]:
# construct the sequence
# Part A
base_lib_exp_data_essential['seq_synTF_A'] = (base_lib_exp_data_essential['syn_tf_ta_A'].str() + base_lib_exp_data_essential['syn_tf_IDP_A'].str() + )
# Part B
# Reporter
# Combine


In [79]:
base_lib_exp_data_essential

Unnamed: 0,"Basal Exp (GFP, au)","4-OHT Induced Exp (GFP, au)","GZV Induced Exp (GFP, au)","Dual Induced Exp (GFP, au)",No. of Barcodes (basal),No. of Barcodes (4-OHT),No. of Barcodes (GZV),No. of Barcodes (Dual),No. of Binding sites,Binding site pattern,...,syn_tf_ta_B,syn_tf_IDP_B,syn_tf_z_aff_B,syn_tf_promoter_A,syn_tf_promoter_B,syn_tf_term_spacing_A,syn_tf_term_spacing_B,reporter_pattern,reporter_core_promoter,reporter_core_spacer
0,5464.20,5020.20,5687.70,5721.60,3,3,3,3,1,1,...,gatgagtttcccacgatggtgtttccttctgggcagatcagccagg...,ATCTGGTTCTGGCAGTGGAGGAGGTGGCTCCGGTGGAG,caattgccaggagaaCGAccgtttcaatgccggatatgtatgAGGa...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,,gtaggcgtgtacggtgggaggtcTATAtaagcagagctcgtttagt...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...
1,0.00,0.00,23499.00,51684.00,0,0,3,3,3,1,...,gatgagtttcccacgatggtgtttccttctgggcagatcagccagg...,ATCTGGTTCTGGCAGTGGAGGAGGTGGCTCCGGTGGAG,caattgccaggagaaCGAccgtttcaatgccggatatgtatgAGGa...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,,gtaggcgtgtacggtgggaggtcTATAtaagcagagctcgtttagt...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...
2,6315.90,5590.40,10732.00,7072.60,3,3,3,3,2,3,...,gatgagtttcccacgatggtgtttccttctgggcagatcagccagg...,ATCTGGTTCTGGCAGTGGAGGAGGTGGCTCCGGTGGAG,caattgccaggagaaCGAccgtttcaatgccggatatgtatgAGGa...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,,gtaggcgtgtacggtgggaggtcTATAtaagcagagctcgtttagt...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...
3,959.53,959.53,5239.40,7879.10,3,3,3,3,3,3,...,gatgagtttcccacgatggtgtttccttctgggcagatcagccagg...,ATCTGGTTCTGGCAGTGGAGGAGGTGGCTCCGGTGGAG,caattgccaggagaaCGAccgtttcaatgccggatatgtatgAGGa...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,,gtaggcgtgtacggtgggaggtcTATAtaagcagagctcgtttagt...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...
4,7789.80,8092.00,2466.90,2786.50,3,3,3,3,2,4,...,gatgagtttcccacgatggtgtttccttctgggcagatcagccagg...,ATCTGGTTCTGGCAGTGGAGGAGGTGGCTCCGGTGGAG,caattgccaggagaaCGAccgtttcaatgccggatatgtatgAGGa...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,,gtaggcgtgtacggtgggaggtcTATAtaagcagagctcgtttagt...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
83086,2565.50,0.00,0.00,0.00,5,0,0,0,3,4,...,GAAGGCCAGAGTGACGAAAGAGCATTATTGGACCAGCTGCACACTC...,atggcctcaaacgactacactcagcaagccacacaaagttacgggg...,caattgccaggagaaCGAccgtttcaatgccggatatgtatgAGGa...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,ggggttggggttgcgccttttccaaggcagccctgggtttgcgcag...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,,gtaggcgtgtacggtgggaggtcTATAtaagcagagctcgtttagt...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...
83087,2786.50,0.00,330.41,330.41,3,0,3,3,3,3,...,GAAGGCCAGAGTGACGAAAGAGCATTATTGGACCAGCTGCACACTC...,atggcctcaaacgactacactcagcaagccacacaaagttacgggg...,caattgccaggagaaCGAccgtttcaatgccggatatgtatgAGGa...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,ggggttggggttgcgccttttccaaggcagccctgggtttgcgcag...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,,tctagagggTATATAatgggggccaGAAT,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...
83088,0.00,0.00,615.91,959.53,0,0,3,3,1,1,...,gacgcattggacgattttgatctggatatgctgggaagtgacgccc...,atggcctcaaacgactacactcagcaagccacacaaagttacgggg...,caattgccaggagaaCGAccgtttcaatgccggatatgtatgAGGa...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,attgattattgactagatcatcgcgtgaggctccggtgcccgtcag...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,,tctagagggTATATAatgggggccaGAAT,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...
83089,0.00,0.00,330.41,350.04,0,0,3,3,2,1,...,gacgcattggacgattttgatctggatatgctgggaagtgacgccc...,atggcctcaaacgactacactcagcaagccacacaaagttacgggg...,caattgccaggagaaCGAccgtttcaatgccggatatgtatgAGGa...,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...,ggggttggggttgcgccttttccaaggcagccctgggtttgcgcag...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...,,gtaggcgtgtacggtgggaggtcTATAtaagcagagctcgtttagt...,CCATgataggggatctgtgagatgggtgtgtatagatctttggcca...


In [49]:
for seq in part_lib_data.query("`Part Type` == 'ZF Binding Sites (SynTF A)'")["Sequence"]:
    print(seq.lower())

caaaacgccttcggcgtagccgatgtcgcgctcccgtgtcagtaaaggtcggcgtagccgatgtcgcgcaatcggactcc
caaaacgcttcggcgtagccgatgtcgcgctcccgtgtcagtaaaggtcggcgtagccgatgtcgcgcaatcggactcccttcgtacggcgtagccgatgtcgcgcgtatcagtcgcctcggaacggcgtagccgatgtcgcgcattcgtacaaaacgccttcggcgtagccgatgtcgcgctcccgtgtcagtaaaggtcggcgtagccgatgtcgcgcaatcggactcccttcgtacggcgtagccgatgtcgcgcgtatcagtcgcctcggaacggcgtagccgatgtcgcgcattcgtacaaa
caaaacgccttcggcgtagccgatgtcgcgctcccgtgtcagtaaaggtcggcgtagccgatgtcgcgcaatcggactcccttcgtacggcgtagccgatgtcgcgcgtatcagtcgcctcggaacggcgtagccgatgtcgcgcattcgtacaaaacgccttcggcgtagccgatgtcgcgctcccgtgtcagtaaaggtcggcgtagccgatgtcgcgcaatcggactcccttcgtacggcgtagccgatgtcgcgcgtatcagtcgcctcggaacggcgtagccgatgtcgcgcattcgtacaaaacgccttcggcgtagccgatgtcgcgctcccgtgtcagtaaaggtcggcgtagccgatgtcgcgcaatcggactcccttcgtacggcgtagccgatgtcgcgcgtatcagtcgcctcggaacggcgtagccgatgtcgcgcattcgtacaaagctcactctcccttacacggagtggatatagt


In [34]:
part_lib_data.columns = part_lib_data.iloc[0]
part_lib_data.drop(index=0)
part_lib_data


Unnamed: 0,Sr. No.,Part Type,Part Name,Part Code (CLASSIC),Sequence
0,Sr. No.,Part Type,Part Name,Part Code (CLASSIC),Sequence
1,1,Promoter (SynTF A/B),hEF1a v1,1,ggatctgcgatcgctccggtgcccgtcagtgggcagagcgcacatc...
2,2,Promoter (SynTF A/B),hEF1a v2,2,attgattattgactagatcatcgcgtgaggctccggtgcccgtcag...
3,3,Promoter (SynTF A/B),hPGK,3,ggggttggggttgcgccttttccaaggcagccctgggtttgcgcag...
4,4,Promoter (SynTF A/B),CMV,4,Ccagatatactcgttgacattgattattgactagttattaatagta...
5,5,Promoter (SynTF A/B),mPGK,5,gggtaggggaggcgcttttcccaaggcagtctggagcatgcgcttt...
6,6,Promoter (SynTF A/B),RSV,6,AATGTAGTCTTATGCAATACACTTGTAGTCTTGCAACATGGTAACG...
7,7,Activation Domain (SynTF A/B),p65,1,gatgagtttcccacgatggtgtttccttctgggcagatcagccagg...
8,8,Activation Domain (SynTF A/B),VP64,2,gacgcattggacgattttgatctggatatgctgggaagtgacgccc...
9,9,Activation Domain (SynTF A/B),NFZ,3,GAAGGCCAGAGTGACGAAAGAGCATTATTGGACCAGCTGCACACTC...


In [None]:
# construct into A & B & R
# part_A_seq = 

# part_B_seq = None
# part_R_seq = None



# df_exp["syn_tf_promoter_seq"] = df_exp["SynTF Prom."].apply(lambda x: get_seq("Promoter", x))
# df_exp["kozak_seq"] = get_seq("Kozak", 1)
# df_exp["AD_seq"] = df_exp["AD"].apply(lambda x: get_seq("Activation Domain", x))
# df_exp["IDR_seq"] = df_exp["IDR"].apply(lambda x: get_seq("IDR", x))
# df_exp['mRuby_seq'] = get_seq("mRuby", 1)
# df_exp['ert2_seq'] = get_seq("ERT2", 1)
# df_exp["ZF_seq"] = df_exp["ZF Aff"].apply(lambda x: get_seq("ZF Aff", x))
# df_exp["syn_tf_term_seq"] = df_exp["SynTF Term & Sp."].apply(lambda x: get_seq("SynTF Term", x))
# df_exp["spacer_seq"] = df_exp['Spacer'].apply(lambda x: get_seq("SynTF Term Spacer", x))
# df_exp["BM_seq"] = df_exp["No. of BS"].apply(lambda x: get_seq("ZF Binding Sites", x))
# df_exp["reporter_promoter_seq"] = df_exp["pMin"].apply(lambda x: get_seq("Core/Minimal Promoter", x))
# # Reminder there is Kozak here
# df_exp["eGFP_seq"] = get_seq("eGFP", 1)
# df_exp["reporter_terminator_seq"] = get_seq("Reporter Terminator", 1)
# df_exp["reporter_spacer_seq"] = df_exp["Rep. Term. Sp."].apply(lambda x: get_seq("Reporter Spacer", x))
