# Identification of barcodes from Nanopore data

In [2]:
from Bio import Align
from Bio.Seq import Seq
from os import path
from tqdm import tqdm
from itertools import product
import re
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import wimpy as wp

### Import sequences

In [None]:
# Load the reads from the current directory via the project helper.
# NOTE(review): from how the results are used below, the 2nd return value is
# treated as per-read lengths and the 3rd as the read sequences -- confirm
# against `wimpy.fastqall`.
_, ls, seqs = wp.fastqall('./')

# Reference sequence handed to `wp.bowtile` together with the reads.
puro = Seq('cgctccgcatcggcctaaggaaccggcgtggttcctggctacggtgggagtctcacctgaccatcaaggaaagggattgggaagtgctgtcgttcttcca')
new_seq, right_seq, f = wp.bowtile(seqs, puro)

# Keep only reads for which bowtile returned a truthy sequence and whose
# reported length lies strictly between 6000 and 8000.
seqs_reads_correct = [
    read
    for read, read_len in zip(new_seq, ls)
    if read and 6000 < read_len < 8000
]


### Read reference promoter and terminator sequences

In [None]:
# Reference tables: promoters are indexed by their 'Promoter' column, the
# terminator sheet has no header row (its sequences sit in column 1).
promoters_file = pd.read_excel('512-promoters.xlsx', index_col='Promoter')
terminators_file = pd.read_excel('512-terminators.xlsx', header=None)

# Upper-cased Seq objects, one per reference.
promoters = [Seq(raw_seq).upper() for raw_seq in promoters_file['Sequence']]
terminators = [Seq(raw_seq).upper() for raw_seq in terminators_file[1]]

# Library sizes, used later to dimension the tile-count matrices.
num_promoters = len(promoters)
num_terminators = len(terminators)

### Define mRuby and BFP sequences

In [69]:
# Reference windows for mRuby and BFP.
# Each coding sequence is upper-cased and then cut down to bases 49..149
# (0-based, 101 nt); only that window is used as the alignment reference.
# NOTE(review): `to_tiles` is not defined in the visible part of this
# notebook -- it must come from an earlier cell or import.
m_ruby = Seq('GTGAGTAAAGGCGAAGAACTTATCAAGGAAAATATGCGGATGAAAGTGGTTATGGAGGGTAGCGTGAACGGACACCAGTTCAAATGCACGGGAGAGGGCGAGGGGCGACCCTACGAGGGAGTCCAAACAATGAGGATTAAGGTTATAGAAGGTGGTCCGCTGCCATTCGCATTCGATATTTTGGCCACGTCCTTCATGTACGGCTCCCGAACCTTTATCAAATACCCTGCGGATATCCCAGACTTTTTCAAGCAATCCTTTCCGGAAGGGTTCACGTGGGAGCGAGTCACGAGATATGAGGATGGAGGCGTAGTAACAGTAACCCAAGACACATCACTTGAGGACGGTGAGCTTGTCTACAATGTGAAGGTACGCGGCGTCAATTTCCCCTCAAATGGCCCGGTGATGCAAAAGAAAACTAAAGGATGGGAGCCCAACACCGAAATGATGTACCCGGCAGATGGGGGGCTTAGGGGCTATACGGACATCGCATTGAAGGTTGATGGCGGGGGCCATCTCCATTGTAACTTTGTAACTACATATCGGTCAAAAAAGACTGTGGGGAACATTAAAATGCCGGGAGTACACGCTGTTGATCATCGCCTGGAAAGGATAGAGGAAAGCGACAATGAAACGTATGTAGTACAGCGGGAGGTCGCCGTCGCCAAATATAGTAATCTGGGCGGTGGCATGGACGAGCTTTATAAA').upper()[49:150]
tiles_mrb = to_tiles(m_ruby)

bfp = Seq('GTCAGTAAAGGGGAAgagcttataaaggaaaatatgcacatgaagctctacatggagggcactgtagataaccaccatttcaaatgtacctctgaaggggagggcaagccatacgaaggtactcaaaccatgcgaataaaagtagttgaaggcgggcctcttccctttgcattcgacattctcgcaacctcttttctgtacggcagtaagactttcataaaccacactcaaggcattccagacttcttcaagcaatcattccccgaaggattcacctgggagcgagttactacttatgaggacggaggagtccttactgcaacccaagacacctcactgcaagatgggtgcctgatttacaatgtaaagatcagaggggtgaatttcacaagcaatgggccagttatgcaaaaaaagacccttggatgggaggccttcaccgagacactgtacccagccgatggtggactggagggcaggaatgacatggccctcaagctcgtcggaggcagtcacttgattgccaacgccaaaaccacttaccgctctaagaaacctgctaaaaacctgaagatgcccggcgtctattatgtggactatcgacttgagagaattaaggaggcaaacaacgagacttatgtcgaacagcatgaagttgccgtggctaggtactgtgatttgcccagcaaattgggtcataaacttaac').upper()[49:150]
tiles_bfp = to_tiles(bfp)

### Align reads to mRuby and BFP

In [None]:
# Local pairwise aligner scored with the NUC.4.4 matrix and a gap-open
# penalty of -8.
aligner = Align.PairwiseAligner()
aligner.mode = 'local'
aligner.substitution_matrix = Align.substitution_matrices.load("NUC.4.4")
aligner.open_gap_score = -8

# Reference lengths, used to scale the alignment scores.
l_mrb = len(m_ruby)
l_bfp = len(bfp)

# Per-read results, all in the same order as `seqs_reads_correct`.
# NOTE(review): despite its name, `perc_align` holds raw score / reference
# length, not a 0-1 fraction.
perc_align = []
start_idx_mrb = []
start_idx_bfp = []

for read in tqdm(seqs_reads_correct, desc='aligning to mRuby and BFP'):
    # Only the first alignment returned by the aligner is considered.
    hit_mrb = aligner.align(read, m_ruby)[0]
    hit_bfp = aligner.align(read, bfp)[0]

    scaled_scores = (hit_mrb.score / l_mrb, hit_bfp.score / l_bfp)
    perc_align.append(scaled_scores)

    # `aligned[0][0][0]` is the start, on the read, of the first aligned block.
    read_blocks_mrb = hit_mrb.aligned[0]
    start_idx_mrb.append(read_blocks_mrb[0][0])
    read_blocks_bfp = hit_bfp.aligned[0]
    start_idx_bfp.append(read_blocks_bfp[0][0])

### Fragment sequences into regions

In [71]:
# Cut every read into the regions used downstream, relative to where the
# mRuby and BFP references start on that read:
#   'p'     - everything upstream of the mRuby start
#   't'     - from the mRuby start up to the BFP start
#   'k'     - at most 150 nt immediately upstream of the mRuby start
#   't_bfp' - 1200 nt beginning at the BFP start
seqs_regions = [
    {
        'p': read[:mrb_start],
        't': read[mrb_start:bfp_start],
        # clamp at 0 so reads with fewer than 150 nt upstream start at the read's beginning
        'k': read[max(mrb_start - 150, 0):mrb_start],
        't_bfp': read[bfp_start:bfp_start + 1200],
    }
    for read, mrb_start, bfp_start
    in zip(seqs_reads_correct, start_idx_mrb, start_idx_bfp)
]


### Promoter and terminator assignments

In [None]:
# Score every read against every reference promoter/terminator by counting
# how many of the reference's tiles occur verbatim in the matching read region.
tile_len = 12
lens_p = np.array([len(p) for p in promoters])
lens_t = np.array([len(t) for t in terminators])

# One row per read, one column per reference.
p_tiles_mrb = np.zeros((len(seqs_reads_correct), num_promoters))
t_tiles_mrb = np.zeros((len(seqs_reads_correct), num_terminators))

# Use the `tile_len` variable (it was previously hard-coded to 12 here, so
# changing `tile_len` above had no effect).
tiles_p = [to_tiles(p, tile_len=tile_len) for p in promoters]
tiles_t = [to_tiles(t, tile_len=tile_len) for t in terminators]

for i, seq_regions in tqdm(enumerate(seqs_regions),
                           desc='tiling promoters and terminators',
                           total=len(seqs_regions)):
    # Promoters are searched in the region upstream of mRuby, terminators in
    # the region between the mRuby and BFP starts.
    region_p = seq_regions['p']
    region_t = seq_regions['t']

    for j, tiles in enumerate(tiles_p):
        p_tiles_mrb[i, j] = sum(tile in region_p for tile in tiles)

    for j, tiles in enumerate(tiles_t):
        t_tiles_mrb[i, j] = sum(tile in region_t for tile in tiles)

# Normalise column-wise by the reference lengths.
# NOTE(review): the divisor is the reference length, not its number of
# tiles -- confirm this is the intended normalisation.
p_tiles_mrb /= lens_p
t_tiles_mrb /= lens_t

In [73]:
# Minimum normalised tile score for a reference to count as detected in a read.
threshold = 0.04

# special cases for promoters and terminators that have overlapping regions:
# references 0 and 1 are replaced by their signed score difference, so at
# most one of the two can end up positive for a given read.
# NOTE: columns 0 and 1 are rewritten in place, so re-running this cell
# without recomputing the tile scores applies the transformation again.
diff_p = p_tiles_mrb[:, 0] - p_tiles_mrb[:, 1]
p_tiles_mrb[:, 0] = diff_p
p_tiles_mrb[:, 1] = -diff_p

diff_t = t_tiles_mrb[:, 0] - t_tiles_mrb[:, 1]
t_tiles_mrb[:, 0] = diff_t
t_tiles_mrb[:, 1] = -diff_t

# Co-detection ("confusion") matrices: entry [i, j] is the number of reads in
# which references i and j both exceed the threshold; the diagonal therefore
# holds the per-reference detection counts.  With a 0/1 hit matrix H
# (reads x references) this is H^T @ H, which replaces a Python-level
# double loop over all reference pairs with one vectorised product.
p_hits = (p_tiles_mrb > threshold).astype(float)
p_conf_mrb = p_hits.T @ p_hits

t_hits = (t_tiles_mrb > threshold).astype(float)
t_conf_mrb = t_hits.T @ t_hits

In [None]:
# Annotated heat maps of the promoter and terminator co-detection matrices.
# `plt.subplots()` creates its own figure, so no preceding `plt.figure()` is
# needed -- that call only left an extra, empty figure behind for each plot.
for conf_matrix in (p_conf_mrb, t_conf_mrb):
    fig, ax = plt.subplots()
    ax.matshow(conf_matrix)
    # Write each count into its cell; text takes (x, y) = (column, row).
    for (i, j), z in np.ndenumerate(conf_matrix):
        ax.text(j, i, '{:0.0f}'.format(z), ha='center', va='center')
    plt.show()

In [75]:
# Boolean (reads x references) masks: True where the normalised tile score
# exceeds 0.03.
# NOTE(review): this cutoff differs from the `threshold = 0.04` used for the
# confusion matrices -- confirm that is intentional.
promoter_assignment = np.greater(p_tiles_mrb, 0.03)
terminator_assignment = np.greater(t_tiles_mrb, 0.03)

In [76]:
# For every read with exactly one assigned terminator, pair that terminator's
# index with the read's 't_bfp' region; reads with zero or several assigned
# terminators are dropped.
assigned_t_bfp = [
    (np.flatnonzero(row_mask)[0], read_regions['t_bfp'])
    for row_mask, read_regions in zip(terminator_assignment, seqs_regions)
    if row_mask.sum() == 1
]

### Barcode finder - alignment

In [77]:
# Query used to locate the barcode.
bc1_construct = Seq('TTATTATTATTATTATTA')

# Scoring over the alphabet 'ATGC': every pair scores +5 except pairs that
# mix A with a non-A base, which score -5.  In effect A only matches A, while
# T, G and C all match one another.
# NOTE(review): presumably the T's of the query stand for "any base but A"
# (a BBA repeat) -- confirm.
bba_scores = np.full((4, 4), 5)
bba_scores[0, 1:] = -5
bba_scores[1:, 0] = -5
sub_matrix_bba = Align.substitution_matrices.Array(
    data=bba_scores,
    alphabet='ATGC')

# NOTE: this rebinds `aligner`, replacing the NUC.4.4 aligner defined earlier
# in the notebook.
aligner = Align.PairwiseAligner()
aligner.mode = 'local'
aligner.substitution_matrix = sub_matrix_bba
aligner.open_gap_score = -8


In [None]:
# Barcodes recovered by aligning `bc1_construct` against each read's
# 't_bfp' region, collected separately for terminator index 7 (stored as
# "bba") and terminator index 1 (stored as "10n").
bc_bba_align = []
bc_10n_align = []

for ass, seq in tqdm(assigned_t_bfp, desc='searching for BBA and 10N barcode using alignment'):
    best_hit = aligner.align(seq, bc1_construct)[0]
    # first row of the alignment, i.e. the side of `seq` (the first argument)
    bc = best_hit[0]

    # Discard hits scoring 85 or less.
    if best_hit.score <= 85:
        continue

    if ass == 7:
        bc_bba_align.append(bc)
    elif ass == 1:
        bc_10n_align.append(bc)

# Length distribution of the recovered barcodes.
len_bba_align = list(map(len, bc_bba_align))
len_10n_align = list(map(len, bc_10n_align))

plt.figure(dpi=150)
plt.hist(len_bba_align, bins=20, label='bba')
plt.hist(len_10n_align, bins=20, label='10n')
plt.legend()
plt.show()


In [83]:
# Write the alignment-derived barcodes, one per line.
# NOTE(review): `fidelity` is not defined in the visible part of this
# notebook -- it must be set before this cell runs.
with open(f'nanopore_reads_{fidelity}_bba_align.csv', 'w') as handle:
    handle.writelines(str(bc) + '\n' for bc in bc_bba_align)

with open(f'nanopore_reads_{fidelity}_10n_align.csv', 'w') as handle:
    handle.writelines(str(bc) + '\n' for bc in bc_10n_align)

### Barcode finder - upstream and downstream scar

In [None]:
# Barcodes recovered from the sequence between an upstream and a downstream
# scar motif, collected separately for terminator index 7 ("bba") and
# terminator index 1 ("10n").
bc_bba_scar = []
bc_10n_scar = []

for ass, seq in tqdm(assigned_t_bfp, desc='searching for BBA and 10N barcode using scar'):
    read_text = str(seq)

    # Upstream scar: 'AAACG' hits, plus 'GAAAC' hits shifted by +1 so both
    # motifs index the same position (the start of the shared 'AAAC').
    upscar = [m.start() for m in re.finditer('AAACG', read_text)]
    upscar.extend(m.start() + 1 for m in re.finditer('GAAAC', read_text))

    # Downstream scar: 'CAGTT' hits, plus 'AGTTC' hits shifted by -1 so both
    # motifs index the same position (where 'CAGTT' would start).
    downscar = [m.start() - 1 for m in re.finditer('AGTTC', read_text)]
    downscar.extend(m.start() for m in re.finditer('CAGTT', read_text))

    if not upscar or not downscar:
        # either upscar or downscar is not found: record an empty barcode
        bc = ''
    else:
        # First (upstream, downstream) pair, in product order, whose starts
        # are 11-29 nt apart; the barcode is what lies between the 5-nt
        # upstream motif and the downstream motif (6-24 nt).  Falls back to
        # an empty Seq when no pair qualifies.
        bc = next(
            (seq[up + 5:down]
             for up, down in product(upscar, downscar)
             if 10 < down - up < 30),
            Seq(''),
        )

    if ass == 7:
        bc_bba_scar.append(bc)
    elif ass == 1:
        bc_10n_scar.append(bc)

# Length distribution of the recovered barcodes.
len_bba_scar = list(map(len, bc_bba_scar))
len_10n_scar = list(map(len, bc_10n_scar))

plt.figure(dpi=150)
plt.hist(len_bba_scar, bins=20, label='bba')
plt.hist(len_10n_scar, bins=20, label='10n')
plt.legend()
plt.show()

In [85]:
# Write the scar-derived barcodes, one per line (reads without a barcode
# produce an empty line).
# NOTE(review): `fidelity` is not defined in the visible part of this
# notebook -- it must be set before this cell runs.
with open(f'nanopore_reads_{fidelity}_bba_scar.csv', 'w') as handle:
    handle.writelines(str(bc) + '\n' for bc in bc_bba_scar)

with open(f'nanopore_reads_{fidelity}_10n_scar.csv', 'w') as handle:
    handle.writelines(str(bc) + '\n' for bc in bc_10n_scar)
