# Library design for CTP-05, Exonic RNA


by Pu Zheng

This library design is based on target regions designed by Stephen

## Imports

In [4]:
# Run the personal startup script. Later cells use os, sys, glob, tqdm and
# reload without importing them, so presumably this script provides those
# names — TODO confirm.
%run "E:\Users\puzheng\Documents\Startup_py3.py"
# Add the Documents folder to the import path (presumably where the
# ImageAnalysis3 package lives — verify).
sys.path.append(r"E:\Users\puzheng\Documents")

import ImageAnalysis3
from ImageAnalysis3 import get_img_info, visual_tools, corrections, library_tools

# Short aliases used by the design cells below: ld -> LibraryDesigner, lt -> LibraryTools.
from ImageAnalysis3.library_tools import LibraryDesigner as ld
from ImageAnalysis3.library_tools import LibraryTools as lt

%matplotlib notebook
# Print the process id of this kernel.
print(os.getpid())

## Globally used folder names

In [None]:
# --- Shared locations used by the cells below ---

# Root folder of the library, and the sub-pool folder inside it.
library_folder = r'X:\Libraries\CTP-06'
pool_folder = os.path.join(library_folder, 'chr21_promoters')

# Inside the sub-pool: fasta sequences go in 'sequences', result probes in 'reports'.
sequence_folder, save_folder = (
    os.path.join(pool_folder, sub_dir) for sub_dir in ('sequences', 'reports')
)

# Folder holding the genomic info.
genome_folder = r'E:\Genomes\hg38'

# Design probes

## probe_designer

## Construct count table with all the 17-mers in the genome

Only do this if you don't already have a pre-built 17-mer count table.

However, you can do almost the same thing for your own library during the quality check.

In [None]:
# Whole genome: build a 17-mer count table and save it next to the genome files.
genome_fasta = os.path.join(genome_folder, 'human_genome.fasta')
names, seqs = lt.fastaread(genome_fasta)
ct = ld.countTable(
    word=17,
    save_file=os.path.join(genome_folder, 'hg38_genome_17w.npy'),
    sparse=False,
)
# Feed the sequences one by one, printing each name next to the progress bar.
for iseq, seq in enumerate(tqdm(seqs)):
    print(names[iseq], end=' ')
    ct.consume(seq, verbose=True)
# Finalize the table, then write it to save_file.
ct.complete(verbose=True)
ct.save()

In [None]:
# Repeats: same procedure as for the genome, on the repeat sequences.
repeats_fasta = os.path.join(genome_folder, 'HumanRepeats.fasta')
names, seqs = lt.fastaread(repeats_fasta)
ct = ld.countTable(
    word=17,
    save_file=os.path.join(genome_folder, 'hg38_genome_repeats_17w.npy'),
    sparse=False,
)
# Consume every repeat sequence, printing its name as progress.
for iseq, seq in enumerate(tqdm(seqs)):
    print(names[iseq], end=' ')
    ct.consume(seq, verbose=True)
# Finalize the table, then write it to save_file.
ct.complete(verbose=True)
ct.save()

## Parse sequences

## Run probe_designer

In [None]:
# Design probes for every fasta file found in the sub-pool's sequence folder.
# For each input file a probe report (.pbr) is written into the reports folder.

# Reload so that on-disk edits to the designer module are picked up.
reload(ld)

# folder for genomic info
genome_folder = r'E:\Genomes\hg38'
# Indices: 17-mer tables, named like the files saved by the count-table cells
genome_index = os.path.join(genome_folder, 'hg38_genome_17w.npy')
repeat_index = os.path.join(genome_folder, 'hg38_genome_repeats_17w.npy')
# NOTE: defined here but not passed to the designer in this cell.
transcriptome_index = os.path.join(genome_folder, 'hg38_transcriptome_17w.npy')

# Library directories
library_folder = r'X:\Libraries\CTP-06'
# folder for sub-pool
pool_folder = os.path.join(library_folder, 'chr21_promoters')
# folder for fasta sequences
sequence_folder = os.path.join(pool_folder, 'sequences')
# get input files; sorted so the processing order is reproducible
input_files = sorted(glob.glob(os.path.join(sequence_folder, '*.fasta')))
# folder to save result probes
save_folder = os.path.join(pool_folder, 'reports')

# Set to False to skip inputs whose report file already exists.
overwrite = True

# exist_ok makes this safe whether or not the folder is already there.
os.makedirs(save_folder, exist_ok=True)

if not input_files:
    # Make an empty run visible instead of silently doing nothing.
    print(f"No fasta files found in: {sequence_folder}")

# Loop through all input files
for in_file in input_files:
    # Report file keeps the input's base name and swaps only the extension.
    report_name = os.path.splitext(os.path.basename(in_file))[0] + '.pbr'
    save_file = os.path.join(save_folder, report_name)
    if os.path.exists(save_file) and not overwrite:
        # report already exists and overwriting is disabled
        continue

    print(in_file)
    # The dictionaries are passed through to pb_reports_class unchanged.
    # The input fasta itself is also used as the 'local_genome' map.
    pb_designer = ld.pb_reports_class(
        sequence_dic={'file': in_file, 'use_revc': False, 'use_kmer': True},
        map_dic={'genome': {'file': genome_index, 'use_revc': True, 'use_kmer': True},
                 'rep_genome': {'file': repeat_index, 'use_revc': True, 'use_kmer': True},
                 'local_genome': {'file': in_file, 'force_list': True, 'use_revc': True, 'use_kmer': True}},
        save_file=save_file,
        params_dic={'word_size': 17, 'pb_len': 42, 'buffer_len': 0, 'max_count': 2**16-1,
                    'check_on_go': False, 'auto': False},
        dic_check={('genome', 'local_genome'): 75, 'rep_genome': 0, 'gc': [0.25, 0.85], 'tm': 70})

    # Run the design steps in the designer's own order, then report the result.
    pb_designer.computeOTmaps()
    pb_designer.compute_pb_report()
    pb_designer.perform_check_end()
    pb_designer.plots()
    pb_designer.save_csv()
    print(f"Number of probes kept: {len(pb_designer.pb_reports_keep)}")


# Screen probes

## screen probes against an existing fasta (previous library)

In [51]:
# From this cell on, library_folder is rebound to the chr21_promoters sub-pool folder.
library_folder = r'X:\Libraries\CTP-06\chr21_promoters'
# Sub-folders of the sub-pool: probe reports and input fasta sequences.
report_folder, sequence_folder = (
    os.path.join(library_folder, sub_dir) for sub_dir in ('reports', 'sequences')
)


In [133]:
# Pick up the latest on-disk version of library_tools before using it.
reload(library_tools)
# Fasta file of a previous library (CTP-04, chr21) to screen the new probes against.
previous_library_fasta = r'X:\Libraries\CTP-04\chr21\filtered_blast_centered_probes.fasta'
# Screen the probe reports in report_folder against that fasta file.
pb_dict = library_tools.Screen_probe_against_fasta(
    report_folder,
    previous_library_fasta,
    overwrite=True,
)

- Screen probes against given fasta file:X:\Libraries\CTP-04\chr21\filtered_blast_centered_probes.fasta
-- start reading 86 probe files
-- constructing reference table for fasta file
Mapping no. of seqs: 231435
--- 112 / 112 probes kept for ADAMTS1
--- 32 / 108 probes kept for ADAMTS5
--- 36 / 109 probes kept for ADARB1
--- 21 / 85 probes kept for AGPAT3
--- 62 / 64 probes kept for ANKRD20A11P
--- 25 / 97 probes kept for APP
--- 32 / 101 probes kept for ATP5PF
--- 35 / 109 probes kept for BACE2
--- 82 / 82 probes kept for BACH1
--- 26 / 68 probes kept for BRWD1
--- 52 / 96 probes kept for BTG3
--- 33 / 94 probes kept for C21orf91
--- 96 / 96 probes kept for C2CD2
--- 16 / 75 probes kept for CBR3
--- 28 / 87 probes kept for CCT8
--- 29 / 76 probes kept for CHAF1B
--- 109 / 109 probes kept for COL18A1
--- 32 / 90 probes kept for COL6A1
--- 105 / 105 probes kept for COL6A2
--- 29 / 99 probes kept for CRYZL1
--- 34 / 85 probes kept for CXADR
--- 56 / 56 probes kept for CYP4F29P
--- 45 / 84

In [29]:
from ImageAnalysis3.library_tools import readouts

# Candidate readouts and adaptor sites to be checked.
adaptor_folder = r'E:\Users\puzheng\Documents\Adaptors'
readout_cand_file = r'E:\Users\puzheng\Documents\Adaptors\NDBs_new.fasta'
adaptor_site_file = r'E:\Users\puzheng\Documents\Adaptors\Adaptor_sites.fasta'

# Readout fasta files that already exist in the barcode folder.
barcode_dir = r'W:\Pu\Readouts'
existing_readout_files = [
    os.path.join(barcode_dir, fasta_name)
    for fasta_name in ('Stvs.fasta', 'NDBs.fasta')
]

# Check the candidates against the existing readout files; results are saved by the call.
saved_readouts = readouts.Check_adaptors_against_fasta(
    readout_cand_file,
    adaptor_site_file,
    existing_readout_files,
    save=True,
    save_adaptors=True,
)

- Check raedouts->adaptors against fasta
-- 252 readout loaded
Mapping no. of seqs: 198
Mapping no. of seqs: 1125
Mapping no. of seqs: 198
Mapping no. of seqs: 1125
Mapping no. of seqs: 198
Mapping no. of seqs: 1125
Mapping no. of seqs: 198
Mapping no. of seqs: 1125
Mapping no. of seqs: 198
Mapping no. of seqs: 1125
-- 199 readous are kept.


In [396]:
# Refresh library_tools, then distribute the kept readouts over three channels.
reload(library_tools)
splitted_readouts = library_tools.Split_readouts_into_channels(
    saved_readouts,
    num_channels=3,
    save_name='NDB_new',
)

- Splitting 199 readouts into 3 channels
-- saving 67 readouts into file:\\SMIRNOV\Chromatin_NAS_3\Pu\Readouts\NDB_new_0.fasta
-- saving 66 readouts into file:\\SMIRNOV\Chromatin_NAS_3\Pu\Readouts\NDB_new_1.fasta
-- saving 66 readouts into file:\\SMIRNOV\Chromatin_NAS_3\Pu\Readouts\NDB_new_2.fasta


## Patch Barcodes

In [398]:
# Refresh library_tools before loading primers and readouts.
reload(library_tools)

# Folder of the sub-pool handled by this notebook.
library_folder = r'X:\Libraries\CTP-06\chr21_promoters'

# Primers with ids 2 and 9.
primers = library_tools.load_primers([2, 9])

# Load as many readouts as there are entries in pb_dict.
# NOTE(review): the readouts were saved with save_name='NDB_new' but are loaded
# here as 'NDB_New' — confirm the lookup is case-insensitive.
num_regions = len(pb_dict)
unique_readouts = library_tools.load_readouts(num_regions, 'NDB_New', _num_colors=3, _start_id=0)

# Readouts grouped by type; only the 'u' type is used here.
readout_dict = dict(u=unique_readouts)

- Picked primer: ID: W1A03_primer_2
Name: W1A03_primer_2
Description: W1A03_primer_2
Number of features: 0
Seq('CCCGCAATGGCTGACAACCG', SingleLetterAlphabet())
- Picked primer: ID: W1A10_primer_9
Name: W1A10_primer_9
Description: W1A10_primer_9
Number of features: 0
Seq('TAATACGACTCACTATAGGGATTGCCGCATGGTTTCCG', SingleLetterAlphabet())


In [402]:
# Assemble full probes from the screened targets (pb_dict), readouts and primers.
reload(library_tools)
from ImageAnalysis3.library_tools import _assemble_single_probe, _assemble_single_probename
# NOTE(review): Assemble_probes is called unqualified but is neither imported nor
# defined in any visible cell; presumably it lives in library_tools — confirm.
# NOTE(review): gene_readout_dict is not assigned in any visible cell either;
# it must already exist in the kernel for this call to run — verify.
cand_probes, readout_summary = Assemble_probes(library_folder, pb_dict, gene_readout_dict, readout_dict, primers, 
                                               rc_targets=True, overwrite=True)

- Assemble probes by given target sequences, readouts and primers.
-- included readout types: ['u']
--- assemblying 112 probes in region: ADAMTS1
--- assemblying 32 probes in region: ADAMTS5
--- assemblying 36 probes in region: ADARB1
--- assemblying 21 probes in region: AGPAT3
--- assemblying 62 probes in region: ANKRD20A11P
--- assemblying 25 probes in region: APP
--- assemblying 32 probes in region: ATP5PF
--- assemblying 35 probes in region: BACE2
--- assemblying 82 probes in region: BACH1
--- assemblying 26 probes in region: BRWD1
--- assemblying 52 probes in region: BTG3
--- assemblying 33 probes in region: C21orf91
--- assemblying 96 probes in region: C2CD2
--- assemblying 16 probes in region: CBR3
--- assemblying 28 probes in region: CCT8
--- assemblying 29 probes in region: CHAF1B
--- assemblying 109 probes in region: COL18A1
--- assemblying 32 probes in region: COL6A1
--- assemblying 105 probes in region: COL6A2
--- assemblying 29 probes in region: CRYZL1
--- assemblying 34 p

## 5. Check quality

In [405]:
# biopython for SeqRecord
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC

# blast
from Bio.Blast.Applications import NcbiblastnCommandline
from Bio.Blast import NCBIXML 

In [406]:
# File name of the candidate full-length probes, located in library_folder.
candidate_full_name = 'candidate_probes.fasta'
# Read every fasta record of that file into a list.
with open(os.path.join(library_folder, candidate_full_name), 'r') as handle:
    full_records = list(SeqIO.parse(handle, "fasta"))

print(f"Total probe loaded: {len(full_records)}")

Total probe loaded: 4253


In [407]:
# Run the quality checks on the assembled probes. The calls are kept in this
# order because the last check consumes reg_readout_dic from the one before it.
# NOTE(review): `check` is not imported in any visible cell; presumably it is an
# alias for a quality-check module in ImageAnalysis3.library_tools — confirm.
reload(check)

# Check that the probes use the two selected primers.
primer_check = check._check_primer_usage(full_records, primers[0], primers[1])

# Check region sizes, with min_size=15.
reg_size_dic, len_check = check._check_region_size(full_records, min_size=15)

# Check the region -> readouts assignment.
reg_readout_dic, reg2readout_check = check._check_region_to_readouts(full_records, readout_dict)

# Check the readout -> region assignment, based on reg_readout_dic from above.
readout_reg_dic, readout2reg_check = check._check_readout_to_region(reg_readout_dic, full_records, readout_dict)

-- Checking primer usage, total probes: 4253
gene: ADAMTS1 -> 112
gene: ADAMTS5 -> 32
gene: ADARB1 -> 36
gene: AGPAT3 -> 21
gene: ANKRD20A11P -> 62
gene: APP -> 25
gene: ATP5PF -> 32
gene: BACE2 -> 35
gene: BACH1 -> 82
gene: BRWD1 -> 26
gene: BTG3 -> 52
gene: C21orf91 -> 33
gene: C2CD2 -> 96
gene: CBR3 -> 16
gene: CCT8 -> 28
gene: CHAF1B -> 29
gene: COL18A1 -> 109
gene: COL6A1 -> 32
gene: COL6A2 -> 105
gene: CRYZL1 -> 29
gene: CXADR -> 34
gene: CYP4F29P -> 56
gene: DIP2A -> 45
gene: DONSON -> 41
gene: DSCR9 -> 78
gene: DYRK1A -> 31
gene: ETS2 -> 109
gene: EVA1C -> 20
gene: FAM207A -> 94
gene: HLCS -> 39
gene: HMGN1 -> 96
gene: HSPA13 -> 29
gene: HUNK -> 34
gene: IFNAR1 -> 32
gene: IFNAR2 -> 21
gene: IL10RB -> 29
gene: ITGB2 -> 35
gene: JAM2 -> 30
gene: LCA5L -> 94
gene: LSS -> 35
gene: LTN1 -> 37
gene: MCM3AP -> 20
gene: MIS18A -> 27
gene: MORC3 -> 15
gene: MRPL39 -> 79
gene: MRPS6 -> 34
gene: MX1 -> 30
gene: MX2 -> 79
gene: N6AMT1 -> 28
gene: NCAM2 -> 103
gene: NRIP1 -> 36
gene: PAXBP

In [408]:
# Refresh both modules before running the internal checks.
for module in (check, ld):
    reload(module)

# Build the internal map from the full probes.
int_map = check._construct_internal_map(full_records, library_folder)

# Check the readouts in the probes using that map.
readout_count_dic, readout_count_check = check._check_readout_in_probes(
    readout_reg_dic, reg_size_dic, int_map, readout_dict)

# Check the probes against each other; this returns the kept records and a removed count.
kept_records, removed_count = check._check_between_probes(full_records, int_map)

# Write the kept records to a fasta file in the library folder.
filtered_probe_file = os.path.join(library_folder, 'filtered_full_probes.fasta')
with open(filtered_probe_file, 'w') as output_handle:
    SeqIO.write(kept_records, output_handle, "fasta")

-- saving internal 17-mer map to file:X:\Libraries\CTP-06\chr21_promoters\probe_table_17.npz
--- Sequence:chr21:33321943-33326943_gene_IFNAR1_pb_4_pos_1441_readouts_[NDB_1149_u,NDB_1149_u,NDB_1149_u] got hits:52, dumped
--- Sequence:chr21:33321943-33326943_gene_IFNAR1_pb_5_pos_1501_readouts_[NDB_1149_u,NDB_1149_u,NDB_1149_u] got hits:62, dumped
--- Sequence:chr21:33321943-33326943_gene_IFNAR1_pb_9_pos_1749_readouts_[NDB_1149_u,NDB_1149_u,NDB_1149_u] got hits:87, dumped
--- Sequence:chr21:33321943-33326943_gene_IFNAR1_pb_10_pos_1791_readouts_[NDB_1149_u,NDB_1149_u,NDB_1149_u] got hits:92, dumped
--- Sequence:chr21:33321943-33326943_gene_IFNAR1_pb_11_pos_1833_readouts_[NDB_1149_u,NDB_1149_u,NDB_1149_u] got hits:59, dumped
--- Sequence:chr21:33321943-33326943_gene_IFNAR1_pb_13_pos_1930_readouts_[NDB_1149_u,NDB_1149_u,NDB_1149_u] got hits:59, dumped
--- Sequence:chr21:33321943-33326943_gene_IFNAR1_pb_14_pos_1998_readouts_[NDB_1149_u,NDB_1149_u,NDB_1149_u] got hits:75, dumped
-- total probe

## blast

In [16]:
# Reload the top-level package, then import the quality_check module used by the blast cells.
reload(ImageAnalysis3)
from ImageAnalysis3.library_tools import quality_check

In [None]:
# Blast the probes kept by the internal screening; library_folder is passed as
# the working location (presumably where the blast results are written — confirm).
quality_check.Blast_probes(kept_records, library_folder)

In [411]:
# Screen the probes by their blast results.
# NOTE(review): the second argument (80) looks like a cap on probes kept per
# region, since the captured output trims large regions to 80 — confirm.
# NOTE(review): the result is bound to kept_pbs, while the "extra selection"
# cell reads kept_pb_list — verify which name is intended.
kept_pbs, blast_keep_dic, hard_count_list, soft_count_list = quality_check.Screening_Probes_by_Blast(library_folder, 80)

- Number of probes loaded: 4246
- Number of regions in this library: 86
-- checking probes in region: ADAMTS1
--- number of probes: 112 , kept by blast: 112 , if remove dups: 112
--- remove duplicated probes
-- number of probes kept for this region: 80
-- checking probes in region: ADAMTS5
--- gene=ADAMTS5, id=28 removed by soft count = 13627
--- number of probes: 32 , kept by blast: 31 , if remove dups: 31
-- number of probes kept for this region: 31
-- checking probes in region: ADARB1
--- gene=ADARB1, id=16 removed by soft count = 1018
--- gene=ADARB1, id=24 removed by soft count = 80
--- number of probes: 36 , kept by blast: 34 , if remove dups: 34
-- number of probes kept for this region: 34
-- checking probes in region: AGPAT3
--- gene=AGPAT3, id=8 removed by hard count = 0
--- number of probes: 21 , kept by blast: 20 , if remove dups: 20
-- number of probes kept for this region: 20
-- checking probes in region: ANKRD20A11P
--- gene=ANKRD20A11P, id=1 removed by soft count = 1695


--- gene=CYP4F29P, id=48 removed by soft count = 47
--- gene=CYP4F29P, id=49 removed by hard count = 4
--- gene=CYP4F29P, id=50 removed by soft count = 32
--- gene=CYP4F29P, id=51 removed by soft count = 63
--- gene=CYP4F29P, id=52 removed by soft count = 102
--- gene=CYP4F29P, id=53 removed by hard count = 3
--- gene=CYP4F29P, id=54 removed by hard count = 3
--- gene=CYP4F29P, id=55 removed by soft count = 45
--- number of probes: 56 , kept by blast: 32 , if remove dups: 21
-- number of probes kept for this region: 32
-- checking probes in region: DIP2A
--- gene=DIP2A, id=0 removed by soft count = 95
--- gene=DIP2A, id=8 removed by soft count = 72
--- gene=DIP2A, id=21 removed by soft count = 61
--- gene=DIP2A, id=38 removed by soft count = 35
--- gene=DIP2A, id=44 removed by soft count = 86
--- number of probes: 45 , kept by blast: 40 , if remove dups: 40
-- number of probes kept for this region: 40
-- checking probes in region: DONSON
--- gene=DONSON, id=3 removed by soft count = 37

--- number of probes: 15 , kept by blast: 13 , if remove dups: 13
-- number of probes kept for this region: 13
-- checking probes in region: MRPL39
--- gene=MRPL39, id=77 removed by soft count = 117
--- number of probes: 79 , kept by blast: 78 , if remove dups: 78
--- remove duplicated probes
-- number of probes kept for this region: 78
-- checking probes in region: MRPS6
--- gene=MRPS6, id=0 removed by soft count = 31
--- gene=MRPS6, id=6 removed by soft count = 33
--- gene=MRPS6, id=8 removed by soft count = 36
--- gene=MRPS6, id=25 removed by soft count = 39
--- number of probes: 34 , kept by blast: 30 , if remove dups: 30
-- number of probes kept for this region: 30
-- checking probes in region: MX1
--- gene=MX1, id=0 removed by soft count = 87
--- gene=MX1, id=1 removed by soft count = 34
--- gene=MX1, id=14 removed by soft count = 40
--- gene=MX1, id=17 removed by soft count = 94
--- gene=MX1, id=24 removed by hard count = 5
--- gene=MX1, id=25 removed by soft count = 1049
--- ge

--- number of probes: 40 , kept by blast: 40 , if remove dups: 40
-- number of probes kept for this region: 40
-- checking probes in region: SETD4
--- gene=SETD4, id=7 removed by soft count = 201
--- gene=SETD4, id=9 removed by soft count = 353
--- gene=SETD4, id=10 removed by soft count = 51
--- gene=SETD4, id=15 removed by soft count = 3495
--- gene=SETD4, id=16 removed by soft count = 200
--- gene=SETD4, id=17 removed by soft count = 360
--- gene=SETD4, id=18 removed by soft count = 1624
--- number of probes: 25 , kept by blast: 18 , if remove dups: 18
-- number of probes kept for this region: 18
-- checking probes in region: SIK1
--- gene=SIK1, id=20 removed by hard count = 0
--- number of probes: 53 , kept by blast: 52 , if remove dups: 1
-- number of probes kept for this region: 52
-- checking probes in region: SIM2
--- gene=SIM2, id=82 removed by soft count = 38
--- number of probes: 107 , kept by blast: 106 , if remove dups: 106
--- remove duplicated probes
-- number of probes 

## extra selection

In [413]:
# Re-run the quality checks on the blast-screened probes and save the final set.
# NOTE(review): kept_pb_list is not assigned in any visible cell; the blast
# screening cell binds its result to kept_pbs — confirm which is intended.
# NOTE(review): `check` is not imported in any visible cell either — verify.

# Check primer usage.
primer_check = check._check_primer_usage(kept_pb_list, primers[0], primers[1])
print(primer_check)
# Check region sizes, with min_size=15.
reg_size_dic, len_check = check._check_region_size(kept_pb_list, min_size=15)
print(len_check)
# Check the region -> readouts assignment.
reg_readout_dic, reg2readout_check = check._check_region_to_readouts(kept_pb_list, readout_dict)
print(reg2readout_check)

# Check the readout -> region assignment, based on reg_readout_dic from above.
readout_reg_dic, readout2reg_check = check._check_readout_to_region(reg_readout_dic, kept_pb_list, readout_dict)
print(readout2reg_check)

# Rebuild the internal map for the screened probes.
int_map = check._construct_internal_map(kept_pb_list, library_folder)

readout_count_dic, readout_count_check = check._check_readout_in_probes(readout_reg_dic, reg_size_dic, int_map, readout_dict)
print(readout_count_check)

kept_records, removed_count = check._check_between_probes(kept_pb_list, int_map)

# save kept records
# open(..., 'w') does not create missing directories, so make sure the
# 'final_probes' sub-folder exists before writing into it.
final_folder = os.path.join(library_folder, 'final_probes')
os.makedirs(final_folder, exist_ok=True)
with open(os.path.join(final_folder, 'extra_filtered_full_probes.fasta'), 'w') as output_handle:
    SeqIO.write(kept_records, output_handle, "fasta")

-- Checking primer usage, total probes: 3626
True
gene: ADAMTS1 -> 80
gene: ADAMTS5 -> 31
gene: ADARB1 -> 34
gene: AGPAT3 -> 20
gene: ANKRD20A11P -> 43
gene: APP -> 24
gene: ATP5PF -> 29
gene: BACE2 -> 35
gene: BACH1 -> 67
gene: BRWD1 -> 25
gene: BTG3 -> 50
gene: C21orf91 -> 28
gene: C2CD2 -> 80
gene: CBR3 -> 16
gene: CCT8 -> 22
gene: CHAF1B -> 26
gene: COL18A1 -> 80
gene: COL6A1 -> 32
gene: COL6A2 -> 80
gene: CRYZL1 -> 26
gene: CXADR -> 30
gene: CYP4F29P -> 32
gene: DIP2A -> 40
gene: DONSON -> 38
gene: DSCR9 -> 68
gene: DYRK1A -> 29
gene: ETS2 -> 80
gene: EVA1C -> 18
gene: FAM207A -> 80
gene: HLCS -> 25
gene: HMGN1 -> 80
gene: HSPA13 -> 25
gene: HUNK -> 33
gene: IFNAR1 -> 14
gene: IFNAR2 -> 20
gene: IL10RB -> 29
gene: ITGB2 -> 33
gene: JAM2 -> 29
gene: LCA5L -> 80
gene: LSS -> 32
gene: LTN1 -> 33
gene: MCM3AP -> 20
gene: MIS18A -> 26
gene: MORC3 -> 13
gene: MRPL39 -> 78
gene: MRPS6 -> 30
gene: MX1 -> 21
gene: MX2 -> 75
gene: N6AMT1 -> 26
gene: NCAM2 -> 80
gene: NRIP1 -> 36
gene: PAXBP

-- saving internal 17-mer map to file:X:\Libraries\CTP-06\chr21_promoters\probe_table_17.npz
True
-- total probes removed by internal screening: 0
