# create reference files for adding adaptors and analysis

for library CTP-10 Aire (209 gene)

by Pu Zheng

2021.06.07

In [1]:
%run "..\Startup_py3.py"
sys.path.append(r"..\..\..\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

# other required parameters
from ImageAnalysis3.classes import _allowed_kwds

14140


In [23]:
pool_folder = r'\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-11_brain'
library_folder = os.path.join(pool_folder, r'mouse_genome_1000')

In [24]:
probe_filename = os.path.join(pool_folder, 'final_pool_probes.fasta')
if not os.path.isfile(probe_filename):
    raise IOError(f"input probe file: {probe_filename} doesn't exist.")


In [3]:
# biopython imports
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML

In [4]:
pb_records = []
with open(probe_filename, 'r') as handle:
    for record in SeqIO.parse(handle, "fasta"):
        pb_records.append(record)

In [5]:
pb_records[0].id

'loc_1:3740000-3760000_gene_1_pb_23_pos_2742_strand_-_readouts_[NDB_856_c,NDB_946_c,NDB_935_c]_primers_[W1A03_primer_2,W1A10_primer_9]_library_1000-mouse-genome_500_library_1000-mouse-genome_500'

In [9]:
# extract library names
lib_names = []
lib_splitter = 'library'
for _r in pb_records:
    _lib_name = _r.id.split('_'+lib_splitter+'_')[-1]
    if _lib_name not in lib_names:
        lib_names.append(_lib_name)
print(lib_names)

['1000-mouse-genome_500', '1000-mouse-genome_250_1', '1000-mouse-genome_250_2', '1000-mouse-genome-short_500', '1000-mouse-genome-short_250_1', '1000-mouse-genome-short_250_2']


In [31]:
# select a library
lib_id = 0
# select lib type
lib_type = 'combo'

In [18]:
# extract library probes
lib_records_dict = {_n:[] for _n in lib_names}
for _r in pb_records:
    _lib_name = _r.id.split('_'+lib_splitter+'_')[-1]
    if _lib_name == lib_names[lib_id]:
        lib_records_dict[_lib_name].append(_r)

In [22]:
from ImageAnalysis3 import library_tools
pb_dict = library_tools.quality_check.split_probe_by_gene(lib_records_dict[lib_names[lib_id]])
print(len(pb_dict))

499


In [25]:
readout_usage_file = os.path.join(library_folder, 'readout_usage.pkl')
readout_dict = pickle.load(open(readout_usage_file, 'rb'))

In [32]:
selected_readout_names = [_r.id for _r in readout_dict[_allowed_kwds[lib_type]]]

## save adaptor_sequences.csv for adding adaptors

In [33]:
readout_folder = r'\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\Readouts'
ref_files = [_fl for _fl in os.listdir(readout_folder) if 'designed_readouts' in _fl]

ref_readout_dict = {}
for _fl in ref_files:
    _channel = int(_fl.split('designed_readouts_')[1].split('.fasta')[0])
    _ref_readout_names = []
    with open(os.path.join(readout_folder, _fl), 'r') as _rd_handle:
        for _readout in SeqIO.parse(_rd_handle, "fasta"):
            _ref_readout_names.append(_readout.id)
    ref_readout_dict[_channel] = _ref_readout_names

In [34]:
# sort regions with readout types
readout_by_channel = {_c:[] for _c in ref_readout_dict}
for _rd in readout_dict[_allowed_kwds[lib_type]]:
    for _c in readout_by_channel:
        if _rd.id in ref_readout_dict[_c]:
            readout_by_channel[_c].append(_rd.id)
readout_by_channel = {_k:_v for _k,_v in sorted(readout_by_channel.items(), key=lambda v:-int(v[0])) if len(_v) > 0}

In [35]:
readout_by_channel

{750: ['NDB_784',
  'NDB_826',
  'NDB_865',
  'NDB_817',
  'NDB_652',
  'NDB_718',
  'NDB_847',
  'NDB_643',
  'NDB_760',
  'NDB_790',
  'NDB_778',
  'NDB_844',
  'NDB_961',
  'NDB_661',
  'NDB_901',
  'NDB_868',
  'NDB_1027',
  'NDB_754',
  'NDB_856',
  'NDB_634',
  'NDB_715',
  'NDB_883',
  'NDB_1033',
  'NDB_631',
  'NDB_1075',
  'NDB_1060',
  'NDB_805',
  'NDB_730',
  'NDB_880',
  'NDB_832',
  'NDB_835',
  'NDB_946',
  'NDB_721',
  'NDB_853',
  'NDB_838',
  'NDB_994',
  'NDB_1066',
  'NDB_637',
  'NDB_706',
  'NDB_889',
  'NDB_862',
  'NDB_694',
  'NDB_751',
  'NDB_742',
  'NDB_958',
  'NDB_925',
  'NDB_712',
  'NDB_940',
  'NDB_934',
  'NDB_910'],
 647: ['NDB_755',
  'NDB_713',
  'NDB_725',
  'NDB_710',
  'NDB_971',
  'NDB_743',
  'NDB_683',
  'NDB_815',
  'NDB_965',
  'NDB_917',
  'NDB_728',
  'NDB_974',
  'NDB_746',
  'NDB_875',
  'NDB_773',
  'NDB_812',
  'NDB_716',
  'NDB_860',
  'NDB_680',
  'NDB_737',
  'NDB_698',
  'NDB_863',
  'NDB_878',
  'NDB_758',
  'NDB_653',
  'NDB_90

In [39]:
readout_names = {}
for _r in pb_records:
    _rname = _r.id.split('gene_')[1].split('_')[0]
    _rd_names = _r.id.split('readouts_[')[1].split(']')[0].split(',')
    if _rname not in readout_names:
        readout_names[_rname] = []
    for _rd in _rd_names:
        if _rd not in readout_names[_rname]:
            if '_'+_allowed_kwds[lib_type] not in _rd:
                readout_names[_rname].append(_rd)
            else:
                readout_names[_rname].append(_rd.split('_'+_allowed_kwds[lib_type])[0])

'1000-mouse-genome_500'

In [49]:
import csv

num_tubes = 25

save_folder = os.path.join(pool_folder, 'Summary_tables')
if not os.path.exists(save_folder):
    os.makedirs(save_folder)
    
with open(os.path.join(save_folder, f'{lib_names[lib_id]}_{lib_type}_adaptor_sequences.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    
    _header = ['group', 'hyb', ]
    for _ch in readout_by_channel:
        _header += [f"{_ch}_bits", f"{_ch}_readouts"]
        
    csvwriter.writerow(_header)
    
    for _i in range(max([len(_v) for _k,_v in readout_by_channel.items()])):
        _row = [int((_i)/num_tubes)+1, _i%num_tubes+1,] 
        for _j, (_ch, _names) in enumerate(readout_by_channel.items()):
            if _i >= len(_names):
                _row += ['', '']
            else:
                _row += [f"b{_i*len(readout_by_channel)+_j}", _names[_i]]      
        csvwriter.writerow(_row)

In [129]:
from copy import copy

In [52]:
# find adaptors
reload(library_tools.sequences)
#library_tools.sequences.fasta_reader()

adaptor_folder = r'\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\Adaptors'
adaptor_files = [os.path.join(adaptor_folder, _fl) for _fl in os.listdir(adaptor_folder) 
                 if _fl.split(os.extsep)[-1]=='fasta' and _fl.split(os.extsep)[-2][-9:] == '_adaptors']
adaptors = library_tools.sequences.fasta_reader(adaptor_files, True).load()

loading 3 fasta files
- loading from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\Adaptors\NDB_adaptors.fasta
- loading from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\Adaptors\Stv_adaptors.fasta
- loading from file: \\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\Adaptors\20200121_extend_stv_adaptors.fasta


# find used adaptors and generate IDT batch order

In [56]:
from copy import copy
selected_adaptors = {}

for _ch, _rnames in readout_by_channel.items():
    selected_adaptors[_ch] = []
    for _rname in _rnames:
        _matched_adaptors = [_adt for _adt in adaptors if _rname in _adt.id]
        if len(_matched_adaptors) == 1:
            _matched_adaptor = copy(_matched_adaptors[0])
            _matched_adaptor.id = _matched_adaptor.id+'rc'
            _matched_adaptor.description = ""

            selected_adaptors[_ch].append(_matched_adaptor)
        else:
            print(_rname)
            
readout_usage_folder = os.path.dirname(readout_usage_file)
# generate csv file to order in IDT
import csv

with open(os.path.join(save_folder, f'{lib_names[lib_id]}_{lib_type}_adaptor_idt_order_tubes.csv'), 'w') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', lineterminator='\n',
                       quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # write header
    _header = ['Name', 'Sequence', 'Scale', 'Purification']
    csvwriter.writerow(_header)
    # write sequence
    for _ch, _adaptors in selected_adaptors.items():
        for _adaptor in _adaptors:
            _info = [_adaptor.id, str(_adaptor.seq), '25nm', 'STD']
            csvwriter.writerow(_info)

# Generate Color_Usage.csv

In [59]:
import csv

drift_channel = '488'
dapi_channel = '405'
ref_hyb = 0

chrom_labels = {}

with open(os.path.join(save_folder, f'{lib_names[lib_id]}_{lib_type}_Color_Usage.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    
    # write header
    _header = ['Hyb']
    for _ch in readout_by_channel:
        _header .append(str(_ch))
    _header.append(drift_channel)
    _header.append(dapi_channel)
    print(_header)
    csvwriter.writerow(_header)
    
    # write reference frame
    if len(chrom_labels) > 0:
        _ref_row = ['H0R0']
        for _ch in readout_by_channel:
            if str(_ch) in chrom_labels:
                _ref_row.append(chrom_labels[str(_ch)]+'_chrom')
            else:
                _ref_row.append("")
        _ref_row.append('beads')
        _ref_row.append('DAPI')
        print(_ref_row)
        csvwriter.writerow(_ref_row)
    
    
    for _i in range(max([len(_v) for _k,_v in readout_by_channel.items()])):
        _row = [f"H{int(_i)}C{int(_i)+1}",] 
        for _j, (_ch, _names) in enumerate(readout_by_channel.items()):
            if _i >= len(_names):
                _row += ['']
            else:
                _row += [f"{_allowed_kwds[lib_type]}{_i*len(readout_by_channel)+_j}"]
            
        _row.append("beads")
        # append for ref
        if _i == ref_hyb:
            _row.append('DAPI')
        #
        print(_row)
        csvwriter.writerow(_row)


['Hyb', '750', '647', '488', '405']
['H0C1', 'c0', 'c1', 'beads', 'DAPI']
['H1C2', 'c2', 'c3', 'beads']
['H2C3', 'c4', 'c5', 'beads']
['H3C4', 'c6', 'c7', 'beads']
['H4C5', 'c8', 'c9', 'beads']
['H5C6', 'c10', 'c11', 'beads']
['H6C7', 'c12', 'c13', 'beads']
['H7C8', 'c14', 'c15', 'beads']
['H8C9', 'c16', 'c17', 'beads']
['H9C10', 'c18', 'c19', 'beads']
['H10C11', 'c20', 'c21', 'beads']
['H11C12', 'c22', 'c23', 'beads']
['H12C13', 'c24', 'c25', 'beads']
['H13C14', 'c26', 'c27', 'beads']
['H14C15', 'c28', 'c29', 'beads']
['H15C16', 'c30', 'c31', 'beads']
['H16C17', 'c32', 'c33', 'beads']
['H17C18', 'c34', 'c35', 'beads']
['H18C19', 'c36', 'c37', 'beads']
['H19C20', 'c38', 'c39', 'beads']
['H20C21', 'c40', 'c41', 'beads']
['H21C22', 'c42', 'c43', 'beads']
['H22C23', 'c44', 'c45', 'beads']
['H23C24', 'c46', 'c47', 'beads']
['H24C25', 'c48', 'c49', 'beads']
['H25C26', 'c50', 'c51', 'beads']
['H26C27', 'c52', 'c53', 'beads']
['H27C28', 'c54', 'c55', 'beads']
['H28C29', 'c56', 'c57', 'beads']

## save region_positions for analysis

In [60]:
import csv

with open(os.path.join(save_folder, f'{lib_names[lib_id]}_{lib_type}_Region_Positions.csv'), 'w', newline='') as csvfile:
    csvwriter = csv.writer(csvfile, delimiter=',', 
                           quotechar='|', quoting=csv.QUOTE_MINIMAL)
    # write header
    csvwriter.writerow(['region', 'chr', 'start', 'end', 'midpoint',])
    
    for _i,_info in region_info.items():
        csvwriter.writerow([_i, 
                            _info['chr'], 
                            _info['start'],
                            _info['end'],
                            _info['mid'],
                           ])

NameError: name 'region_info' is not defined

In [61]:
import pandas as pd

In [62]:
pb_records[0].id

'loc_1:3740000-3760000_gene_1_pb_23_pos_2742_strand_-_readouts_[NDB_856_c,NDB_946_c,NDB_935_c]_primers_[W1A03_primer_2,W1A10_primer_9]_library_1000-mouse-genome_500_library_1000-mouse-genome_500'

In [80]:
region_position_df = pd.DataFrame(columns=['id', 'region', 'chr', 'start', 'end', 'strand']).set_index('id')

In [89]:
_record = pb_records[0]
if 'res_' in _record.id 

Unnamed: 0,id,region,chr,start,end,strand


In [92]:
region_position_df.append({'id':'test'}, ignore_index=True)

Unnamed: 0,id,region,chr,start,end,strand
0,test,,,,,


In [86]:
region_position_df

Unnamed: 0,id,region,chr,start,end,strand


In [None]:
for _r in pb_records:
    
    

In [74]:
region_position_df

Unnamed: 0_level_0,region,chr,start,end,strand
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
