In [1]:
import pandas as pd
import numpy as np
import os, sys, time
import re

# bioPython
from Bio.Seq import Seq

In [2]:
# load
probe_filename = r'/home/puzheng/MERFISH_Probes/PL70-76_amplifier_pools_bits_pegfish_2lvl.xlsx'
probe_df = pd.read_excel(probe_filename)

In [5]:
np.unique(probe_df['Pool name'], return_counts=True)


(array(['PL70_combined_bit_amps_15', 'PL71_combined_bit_amps_20',
        'PL72_emx1_bit_350', 'PL73_hek3_bit_350', 'PL74_rnf2_bit_350',
        'PL75_combined_bit_amps_20_btree_lvl1',
        'PL76_combined_bit_amps_20_btree_lvl2'], dtype=object),
 array([120, 120,  21,  21,  21, 120, 144]))

In [93]:
#sel_pools = ['PL72_emx1_bit_350', 'PL73_hek3_bit_350', 'PL74_rnf2_bit_350',]
sel_pools = ['PL72_emx1_bit_350']
readout_copy_nums = [2, 4, 6, 8, 10, 13]
T7_promoter = Seq('TAATACGACTCACTATAGGG')
T7_promoter_rc = str(T7_promoter.reverse_complement())
# assemble
readoutNum_2_seqs = {_n:[] for _n in readout_copy_nums}
readout_site_dict = {'name':[], 'seq':[]}
readout_size = 20

for sel_pool in sel_pools:
    sel_probe_df = probe_df.loc[probe_df['Pool name']==sel_pool]
    for seq in list(sel_probe_df['Sequence'].values):
        # Parse
        fwd_seq, rev_primer_rc = re.findall('[ATCG]+', seq)
        fwd_primer = fwd_seq[:len(rev_primer_rc)]
        target_seq = fwd_seq[len(rev_primer_rc):]
        
        readout_seq = re.findall('[atcg]+', seq)[0]
        readout_site = readout_seq[:readout_size]
        
        #print(fwd_primer, rev_primer_rc, target_seq, readout_site)
        # append
        readout_site_dict['name'].append(sel_pool.split('_bit_350')[0])
        readout_site_dict['name'].append(readout_site)
        for _n in readout_copy_nums:
            #_final_seq_list = [fwd_primer] + [readout_site]*min(_n,2) + [target_seq] + [readout_site]*max(_n-2, 0) + [ref_primer_rc]
            _final_seq_list = [fwd_primer] + [readout_site]*_n + [target_seq] + [ref_primer_rc] + [T7_promoter_rc]
            _final_seq = ''.join(_final_seq_list)
            #print(_final_seq)
            readoutNum_2_seqs[_n].append(_final_seq)

In [96]:
# create df
summary_dict = {'Pool name': [], 'Sequence': []}
for _n, _seqs in readoutNum_2_seqs.items():
    summary_dict['Pool name'].extend([f'PL72_emx1_{_n}-readout']*len(_seqs))
    summary_dict['Sequence'].extend(_seqs)
summary_df = pd.DataFrame(summary_dict)

In [97]:
summary_df

Unnamed: 0,Pool name,Sequence
0,PL72_emx1_2-readout,CGCGCGCCTTTGGCGGGAAGTCCTGgaggcggattgagattcggtg...
1,PL72_emx1_2-readout,CGCGCGCCTTTGGCGGGAAGTCCTGcgatggtcgtcctcgtttcgc...
2,PL72_emx1_2-readout,CGCGCGCCTTTGGCGGGAAGTCCTGgtttgcgtgtaatcgactctg...
3,PL72_emx1_2-readout,CGCGCGCCTTTGGCGGGAAGTCCTGgccgtcgtcacgtgcgagtag...
4,PL72_emx1_2-readout,CGCGCGCCTTTGGCGGGAAGTCCTGgatgcctcttcgatagattcg...
...,...,...
121,PL72_emx1_13-readout,CGCGCGCCTTTGGCGGGAAGTCCTGggcactaggataactttaggg...
122,PL72_emx1_13-readout,CGCGCGCCTTTGGCGGGAAGTCCTGgtccatgatacgaggtgatag...
123,PL72_emx1_13-readout,CGCGCGCCTTTGGCGGGAAGTCCTGgcacgtatgtcccgtccattg...
124,PL72_emx1_13-readout,CGCGCGCCTTTGGCGGGAAGTCCTGaagggcgatgtaacggcgcaa...


In [101]:
# save
overwrite = True
save_folder = r'/lab/solexa_weissman/puzheng/MERFISH_Probes'
save_probe_filename = os.path.join(save_folder,
                             f"{time.localtime().tm_year}_{time.localtime().tm_mon}_{time.localtime().tm_mday}_variable_length_PL72_emx1.xlsx")

if not os.path.exists(save_probe_filename) or overwrite:
    print(f"saving probes to file: {save_probe_filename}")
    summary_df.to_excel(save_probe_filename, index=None)

saving probes to file: /lab/solexa_weissman/puzheng/MERFISH_Probes/2023_2_15_variable_length_PL72_emx1.xlsx


In [102]:
# primers
primer_dict = {"Name":['PL72_fwd', 'PL72_T7_rev', 'PL72_T7'], 
               "Sequence":[fwd_primer, str(Seq(ref_primer_rc +T7_promoter_rc).reverse_complement()), str(T7_promoter)], 
               "Scale":['25nm']*3, 
               "Purification":['STD']*3}
primer_df = pd.DataFrame(primer_dict)
primer_df

Unnamed: 0,Name,Sequence,Scale,Purification
0,PL72_fwd,CGCGCGCCTTTGGCGGGAAGTCCTG,25nm,STD
1,PL72_T7_rev,TAATACGACTCACTATAGGGAGTCGCATGCCGTGGCCGGCGACTT,25nm,STD
2,PL72_T7,TAATACGACTCACTATAGGG,25nm,STD


In [103]:
save_primer_filename = os.path.join(save_folder,
                             f"{time.localtime().tm_year}_{time.localtime().tm_mon}_{time.localtime().tm_mday}_Primers_PL72.xlsx")

if not os.path.exists(save_primer_filename) or overwrite:
    print(f"saving primers to file: {save_primer_filename}")
    primer_df.to_excel(save_primer_filename, index=None)

saving primers to file: /lab/solexa_weissman/puzheng/MERFISH_Probes/2023_2_15_Primers_PL72.xlsx


# Check readouts

In [20]:
readout_folder = r'/lab/solexa_weissman/puzheng/References/Readouts'
readout_reference = os.path.join(readout_folder, 'Readout_summary.xlsx')

# load
readout_table = pd.read_excel(readout_reference)

probe_bits_list = []
readout_len = 20

sel_probe_df = probe_df.loc[probe_df['Pool name']=='PL72_emx1_bit_350']

for _seq in sel_probe_df['Sequence']:
    #print(_seq)
    _seq_bits = []
    for _i in range(0, len(_seq)-readout_len+1):
        _target = _seq[_i:_i+readout_len].upper()
        if _target in readout_table['Target'].values:
            #print(_target)
            _seq_bits.append(readout_table.loc[readout_table['Target']==_target, 'Name'].values[0])
            
    probe_bits_list.append(_seq_bits)
    

In [21]:
readout_table['Target'].values

array(['ATCCTCCTTCAATACATCCC', 'ACACTACCACCATTTCCTAT',
       'ACTCCACTACTACTCACTCT', ..., 'AAGCGTAATGACCTGCTCCA',
       'TTATGGTACCTTCGACCTGT', 'GAACCCGACATCAGGGCGAG'], dtype=object)

In [25]:
sel_probe_df['readout'] = [np.unique(_rds)[0] for _rds in probe_bits_list]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sel_probe_df['readout'] = [np.unique(_rds)[0] for _rds in probe_bits_list]


In [29]:
sel_probe_df

Unnamed: 0,Pool name,Sequence,readout
240,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGAACAGTTCTTCTGg...,NDB_634
241,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGTGTCCTTCTTCTGc...,NDB_332
242,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGATGGATTCTTCTGg...,NDB_405
243,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGCAAGCTTCTTCTGg...,NDB_715
244,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGGTACTTTCTTCTGg...,NDB_758
245,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGAATGTTTCTTCTGa...,NDB_1035
246,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGACGTCTTCTTCTGc...,NDB_883
247,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGATTTGTTCTTCTGt...,NDB_536
248,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGTAGGGTTCTTCTGt...,NDB_663
249,PL72_emx1_bit_350,CGCGCGCCTTTGGCGGGAAGTCCTGATGGGAGTTGATTTCTTCTGg...,NDB_1033


In [36]:
sel_probe_df.to_excel(r'/lab/solexa_weissman/puzheng/MERFISH_Probes/PE_TS/Edits/PL72_Emx1_annotated.xlsx', index=None)