<a id='0.1'></a>
## 0.1 load required packages

In [1]:
%run "..\..\..\Startup_py3.py"
sys.path.append(r"..\..\..\..\..\Documents")

import ImageAnalysis3 as ia
%matplotlib notebook

from ImageAnalysis3 import *
print(os.getpid())

# library design specific tools
from ImageAnalysis3.library_tools import LibraryDesigner as ld
from ImageAnalysis3.library_tools import LibraryTools as lt
# biopython imports
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import pandas as pd

30964


<a id='0.2'></a>
## 0.2 folders

In [2]:
## Some folders
# mouse reference genome (GRCm38, Ensembl build) -- the path below points at the mouse genome
reference_folder = r'\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\Genomes\mouse\GRCm38_ensembl'
genome_folder = os.path.join(reference_folder, 'Genome')
# Library directories
pool_folder = r'\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-12_mop_markers'
# folder for sub-pool (plain string: there is nothing to interpolate, so no f-prefix)
library_folder = os.path.join(pool_folder, 'marker_gene_TSS')

<a id='0.3'></a>
## 0.3 load probes

In [3]:
# Read every FASTA record of the final probe file into the `probes` list
final_pb_filename = os.path.join(library_folder, 'blast_full_probes.fasta')

with open(final_pb_filename, 'r') as handle:
    # materialize the parser while the file is still open
    probes = list(SeqIO.parse(handle, "fasta"))

In [4]:
from ImageAnalysis3 import library_tools
# Split the loaded probe records by gene with the project helper
# (presumably returns a mapping keyed by gene -- TODO confirm), then report how many entries it has.
pb_dict = library_tools.quality_check.split_probe_by_gene(probes)
print(len(pb_dict))

28


In [5]:
# load used readouts
readout_usage_file = os.path.join(library_folder, 'readout_usage.pkl')
# NOTE: unpickling can execute arbitrary code -- only load files written by this pipeline.
# Use a context manager so the file handle is closed as soon as loading finishes.
with open(readout_usage_file, 'rb') as _handle:
    readout_dict = pickle.load(_handle)

In [6]:
# select a library (index of the library to work on)
lib_id = 0
# select lib type; this string is used as the key into `_allowed_kwds` when picking readouts
lib_type = 'unique'

In [7]:
# `_allowed_kwds[lib_type]` gives the key under which `readout_dict` stores this library type
from ImageAnalysis3.classes import _allowed_kwds
# collect the `.id` of every readout record stored for the selected library type
selected_readout_names = [_r.id for _r in readout_dict[_allowed_kwds[lib_type]]]

In [8]:
# Build two per-gene lookups from the probe ids:
#   region_2_names : gene -> region string taken from the first probe seen for that gene
#   region_2_coords: gene -> list of probe positions (in-region offset + region start)
def _id_field(_pb_id, _tag):
    """Return the text that follows `_tag` in a probe id, up to the next underscore."""
    return _pb_id.split(_tag)[1].split('_')[0]

region_2_names, region_2_coords = {}, {}
for _pb in probes:
    # region has the form '<chr>:<start>-<end>'
    _reg = _id_field(_pb.id, 'loc_')
    _start, _end = _reg.split(':')[1].split('-')
    # gene info
    _gene = _id_field(_pb.id, 'gene_')
    # shift the in-region offset by the region start to get the real coordinate
    _pb_pos = int(_id_field(_pb.id, 'pos_')) + int(_start)

    # keep the first region seen per gene; accumulate every probe position
    region_2_names.setdefault(_gene, _reg)
    region_2_coords.setdefault(_gene, []).append(_pb_pos)
print(region_2_names)

{'Aqp4': '18:15400982-15420982', 'Bgn': 'X:73473602-73493602', 'Car3': '3:14853512-14873512', 'Ctss': '3:95516786-95536786', 'Cux2': '5:122040102-122060102', 'Flt1': '5:147716011-147736011', 'Gad1': '2:70543072-70563072', 'Igf2': '7:142656816-142676816', 'Lamp5': '2:136042239-136062239', 'Lratd2': '15:60843778-60863778', 'Nxph4': '10:127524559-127544559', 'Otof': '5:30451932-30471932', 'Pdgfra': '5:75142292-75162292', 'Ptpru': '4:131828288-131848288', 'Pvalb': '15:78196400-78216400', 'Rorb': '19:19101196-19121196', 'Rspo1': '4:124976430-124996430', 'Slc17a7': '7:45153949-45173949', 'Slc30a3': '5:31098237-31118237', 'Slc32a1': '2:158600767-158620767', 'Sncg': '14:34364789-34384789', 'Sox10': '15:79155240-79175240', 'Sst': '16:23880958-23900958', 'Sulf2': '2:166145663-166165663', 'Syt6': '3:103565231-103585231', 'Tshz2': '2:169623013-169643013', 'Vip': '10:5629218-5649218', 'Vtn': '11:78489091-78509091'}


## bit_2_readout

In [9]:
import pandas as pd
# folder inside the pool directory that receives the summary tables
save_folder = os.path.join(pool_folder, 'Summary_tables')
# inverse of ia.classes._allowed_kwds (values become keys), used to translate the
# keys of readout_dict back into the names that appear in the output filenames
dtype_dict = {_v:_k for _k,_v in ia.classes._allowed_kwds.items()}

In [10]:
# Write one "bit -> readout name" CSV per readout type stored in readout_dict
for _dtype, _records in readout_dict.items():
    # bits are numbered from 1, following the stored order of the records
    _rows = [(_bit, _rec.id) for _bit, _rec in enumerate(_records, start=1)]
    bit_readout_df = pd.DataFrame(_rows, columns=['Bit','ReadoutName'])

    save_filename = os.path.join(save_folder, f'CTP12-TSS_{dtype_dict[_dtype]}_readouts.csv')
    print(save_filename)
    # the row index carries no information, so it is not written
    bit_readout_df.to_csv(save_filename, index=False)

\\10.245.74.212\Chromatin_NAS_2\Chromatin_Libraries\CTP-12_mop_markers\Summary_tables\CTP12-TSS_unique_readouts.csv


## Generate sequential codebook

In [11]:
# Map each gene name to the region string of the first probe that mentions it
gene_2_region = {}

for _pb in probes:
    # both fields are the text after their tag, up to the next underscore
    _reg = _pb.id.split('loc_')[1].split('_')[0]
    _gene = _pb.id.split('gene_')[1].split('_')[0]

    # first occurrence wins; later probes of the same gene leave the entry untouched
    gene_2_region.setdefault(_gene, _reg)
print(gene_2_region)

{'Aqp4': '18:15400982-15420982', 'Bgn': 'X:73473602-73493602', 'Car3': '3:14853512-14873512', 'Ctss': '3:95516786-95536786', 'Cux2': '5:122040102-122060102', 'Flt1': '5:147716011-147736011', 'Gad1': '2:70543072-70563072', 'Igf2': '7:142656816-142676816', 'Lamp5': '2:136042239-136062239', 'Lratd2': '15:60843778-60863778', 'Nxph4': '10:127524559-127544559', 'Otof': '5:30451932-30471932', 'Pdgfra': '5:75142292-75162292', 'Ptpru': '4:131828288-131848288', 'Pvalb': '15:78196400-78216400', 'Rorb': '19:19101196-19121196', 'Rspo1': '4:124976430-124996430', 'Slc17a7': '7:45153949-45173949', 'Slc30a3': '5:31098237-31118237', 'Slc32a1': '2:158600767-158620767', 'Sncg': '14:34364789-34384789', 'Sox10': '15:79155240-79175240', 'Sst': '16:23880958-23900958', 'Sulf2': '2:166145663-166165663', 'Syt6': '3:103565231-103585231', 'Tshz2': '2:169623013-169643013', 'Vip': '10:5629218-5649218', 'Vtn': '11:78489091-78509091'}


In [12]:
# load encoding: gene name -> list of readout bit labels assigned to that gene
# NOTE: unpickling can execute arbitrary code -- only load files written by this pipeline.
gene_2_readout_file = os.path.join(library_folder, 'gene_2_readout.pkl')
# context manager closes the file handle once the dictionary is loaded
with open(gene_2_readout_file, 'rb') as _handle:
    gene_2_readout_dict = pickle.load(_handle)
print(gene_2_readout_dict)

{'Slc30a3': ['u0', 'u0'], 'Slc17a7': ['u1', 'u1'], 'Slc32a1': ['u2', 'u2'], 'Gad1': ['u3', 'u3'], 'Otof': ['u4', 'u4'], 'Rspo1': ['u5', 'u5'], 'Pvalb': ['u6', 'u6'], 'Sst': ['u7', 'u7'], 'Vip': ['u8', 'u8'], 'Sncg': ['u9', 'u9'], 'Lamp5': ['u10', 'u10'], 'Lratd2': ['u11', 'u11'], 'Tshz2': ['u12', 'u12'], 'Syt6': ['u13', 'u13'], 'Nxph4': ['u14', 'u14'], 'Cux2': ['u15', 'u15'], 'Rorb': ['u16', 'u16'], 'Sulf2': ['u17', 'u17'], 'Ptpru': ['u18', 'u18'], 'Car3': ['u19', 'u19'], 'Aqp4': ['u20', 'u20'], 'Flt1': ['u21', 'u21'], 'Igf2': ['u22', 'u22'], 'Pdgfra': ['u23', 'u23'], 'Sox10': ['u24', 'u24'], 'Ctss': ['u25', 'u25'], 'Vtn': ['u26', 'u26'], 'Bgn': ['u27', 'u27']}


In [13]:
# Create the codebook skeleton: three descriptive columns followed by one column per readout
_gene_ids = list(gene_2_readout_dict.keys())
# readout columns follow the order of the records stored under key 'u'
_readout_columns = [_r.id for _r in readout_dict['u']]
# region string of every gene, in the same order as the ids
_gene_regions = [gene_2_region[str(_g)] for _g in _gene_ids]

sequential_codebook = pd.DataFrame(columns=['name','id', 'chr'] + _readout_columns)
# 'id' holds the gene name, 'name' the region string, 'chr' the part of the region before ':'
sequential_codebook['id'] = _gene_ids
sequential_codebook['name'] = _gene_regions
sequential_codebook['chr'] = [_region.split(':')[0] for _region in _gene_regions]

In [14]:
from tqdm.notebook import tqdm

# number of bits = largest bit index found in any gene's label list, plus one
_largest_bit_per_gene = [np.max([int(_b[1:]) for _b in _bits])
                         for _bits in gene_2_readout_dict.values()]
max_bit = np.max(_largest_bit_per_gene) + 1

# every column after the three descriptive ones ('name', 'id', 'chr') holds one bit
_bit_columns = sequential_codebook.columns[3:]
for _gname, _bits in tqdm(gene_2_readout_dict.items()):
    # 1 where bit label "u<i>" is assigned to this gene, otherwise 0
    binary_code = [1 if f"u{_i}" in _bits else 0 for _i in range(max_bit)]
    # write the code into the row(s) whose 'id' equals the gene name
    sequential_codebook.loc[sequential_codebook['id']==_gname, _bit_columns] = binary_code
    

  0%|          | 0/28 [00:00<?, ?it/s]

In [15]:
# display the filled codebook for a visual check (one row per gene, one column per readout)
sequential_codebook

Unnamed: 0,name,id,chr,Stv_91,Stv_32,Stv_92,Stv_33,Stv_94,Stv_35,Stv_95,...,Stv_106,Stv_45,Stv_107,Stv_46,Stv_109,Stv_48,Stv_118,Stv_50,Stv_119,Stv_53
0,5:31098237-31118237,Slc30a3,5,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,7:45153949-45173949,Slc17a7,7,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2:158600767-158620767,Slc32a1,2,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2:70543072-70563072,Gad1,2,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5:30451932-30471932,Otof,5,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,4:124976430-124996430,Rspo1,4,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
6,15:78196400-78216400,Pvalb,15,0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0
7,16:23880958-23900958,Sst,16,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,10:5629218-5649218,Vip,10,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,14:34364789-34384789,Sncg,14,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
# destination of the sequential codebook CSV: the 'Summary_tables' folder of the pool
save_folder = os.path.join(pool_folder, 'Summary_tables')
save_filename = os.path.join(save_folder, 'CTP12-TSS_sequential-codebook.csv')

In [17]:
# Save the codebook without the row index. `index` is documented as a bool, so pass
# False explicitly (as the readout tables do) instead of relying on None being falsy.
sequential_codebook.to_csv(save_filename, index=False)