**Author**: Seon Kinrot

**Email**: skinrot@g.harvard.edu

**Date**: 3/4/2020

In [None]:
#Dependencies
import numpy as np
import cPickle as pkl
import os
import codeDesigner as cd

# Choose genomic locations with equal spacing on each chromosome

In [None]:
min_chr=30 #minimal number of loci to be chosen on a chromosome
target_res = 3*10**6 #default target distance between chosen loci

# Chosen loci genome-wide: one list per chromosome, appended in the order chr1..chr22, chrX.
# This must be a list; `[],[]` would build a tuple, which has no .append().
chosen_by_chr = []
home_ = '/n/home02/seonkinrot/Genomes/human/hg38' #folder containing fasta files for the entire human genome
shift = 5*10**4 #size of region in each direction required to be mappable

for chr_ in range(23):
    chr_text = 'chr'+str(chr_+1) if chr_<22 else 'chrX'
    _,seq = cd.fastaread(home_+os.sep+chr_text+'.fa')
    seq = seq[0] #only the first sequence returned for the file is used
    chr_len = len(seq)
    # Start from the default spacing, or a tighter one when the chromosome is
    # too short to hold min_chr loci at the default spacing.
    target_dist = np.min([float(chr_len)/min_chr,target_res])
    chr_chosen_ = []
    # Shrink the spacing by 10% per pass until at least min_chr loci survive the filters.
    while len(chr_chosen_)<min_chr:
        chr_chosen__ = np.arange(target_dist,chr_len,target_dist,dtype=int) #uniform probes
        # Keep only loci whose +/- shift window lies strictly inside the chromosome.
        chr_chosen__ = [c_ for c_ in chr_chosen__ if ((c_+shift<chr_len) and (c_>shift))]
        # Drop loci where the centre or either window edge is an 'N'/'n' character.
        chr_chosen_ = [c_ for c_ in chr_chosen__
                       if all(seq[c_+off_] not in ('N','n') for off_ in (0,shift,-shift))]
        target_dist = target_dist*0.9
    chosen_by_chr.append(chr_chosen_)
n_per_chr = [len(ch_) for ch_ in chosen_by_chr] #number of chosen loci on each chromosome
print(n_per_chr)

In [None]:
save_fld = '/n/home02/seonkinrot/TAD_sim/SI14'
# Write one line per chosen locus in the form "chrN:start-end", where start/end
# are the locus position -/+ shift. The with-block closes the file even if a
# write fails part-way through.
with open(save_fld+os.sep+'hg38_chosen_regs.txt','w') as fid:
    for ichr_,chr_ in enumerate(chosen_by_chr):
        chr_str = 'chr'+str(ichr_+1) if ichr_<22 else 'chrX' #23rd entry is chrX
        for reg_ in chr_:
            ln = chr_str+':'+str(int(reg_-shift))+'-'+str(int(reg_+shift))+'\n'
            fid.write(ln)

Load list of loci used in this study

In [None]:
# Read every line of the region list ("chrN:start-end" per line); the
# with-block makes sure the file handle is closed afterwards.
with open(r'hg38_chosen_regs.txt','r') as fid:
    chrs_regs = [ln for ln in fid] #loci chosen for each chromosome in actual study
chr_strs = ['chr'+str(ichr_+1) if ichr_<22 else 'chrX' for ichr_ in range(23)]
chosen_by_chr = []
for chr_ in chr_strs:
    c_regs = [reg for reg in chrs_regs if reg.split(':')[0]==chr_] #loci for given chromosome
    # Mid-point of each locus. Built as a real list (not map) so that len()
    # below also works where map() returns a lazy iterator.
    chr_chosen_ = [np.mean([int(edge_) for edge_ in reg.split(':')[1].split('-')]) for reg in c_regs]
    chosen_by_chr.append(chr_chosen_)
n_per_chr = [len(ch_) for ch_ in chosen_by_chr] #number of loci on each chromosome
print(n_per_chr)

# Generate barcodes per locus

In [None]:
nhb = 50 #number of imaging rounds (separate colors counted individually) to design
buf_ = 100 #passed to code_encoder_v2 as buffer_
split_ = True #if true, the code design is for double the number of rounds in nhb, but can be pooled to halve imaging time

#this is what was used in this study, but code generation can be done directly for 100 bits by setting:
# nhb=100, split_=False

# NOTE(review): split_ is never read below -- the splitting step runs unconditionally; confirm this is intended.
# NOTE(review): `code` is not assigned in any visible cell; presumably it is meant to come from
# code_encoder_v2 or from earlier notebook state -- TODO confirm before running this cell.
hybes,tads_assign = cd.code_encoder_v2(
    nTADs=n_per_chr,
    refine_chr=500000,
    refine_TAD=250000,
    nchr=23,
    nhybes=nhb,
    buffer_=buf_,
)
new_code, new_hybes, new_tads_assign = cd.split_code(code,hybes) #new_code corresponds to the sparser code

# Bundle the per-chromosome locus counts with the original and the split designs.
sv_params = [
    n_per_chr,
    [code,hybes,tads_assign],
    [new_code,new_hybes,new_tads_assign],
]

Barcodes for intron probes were generated analogously, but without the code splitting step

FISH probes for each DNA locus were designed as described in Bintu et al., Science (2018). Code for probe design can be found at https://github.com/BogdanBintu/ChromatinImaging, under LibraryDesign