## Simulate sequences from a lineage tracing experiment 

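Simulate $N$ sequences resulting from Cas9 cutting, to be used for testing alignment strategies. Each sequence undergoes a random number of cutting rounds, drawn from a Poisson distribution with mean $T$ and capped at 5.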

In [599]:
import pandas as pd
import numpy as np
from Bio import SeqIO

# Mean of the Poisson draw that sets how many cutting rounds each sequence
# goes through in the simulation loop (the draw is capped at 5 there).
T = 3
# Number of sequences to simulate.
N = 100000

Read in the reference sequence to be mutated

In [600]:
# Absolute path to the FASTA file holding the reference sequence.
ref_file = "/home/mattjones/PCT48_seq.fasta"
ref_seq = ""
with open(ref_file, "r") as handle:
    # Only the first record is used; next() stops after it instead of
    # parsing and holding every record in the file.
    ref_seq = str(next(SeqIO.parse(handle, "fasta")).seq)

# NOTE(review): not referenced anywhere in the visible code - confirm whether
# it is still needed.
extension = "TAAACCCGCTGATCAGCCTCGACTGTGCCTTCTAGTTGCCAGCCATCTGTTGTTTGCCCCTCCCCCGTGCCTTCCTTGACCCTGGAAGGTGCCACTCCCACTGTCCTTTCCTAATAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAA"

# These are the locations for the cut sites in the reference sequence.
# Key -1 is the position the CIGAR builder uses as the end of the leading
# match run; keys 0-2 are the three cut sites.
cut_site_locs = {-1: 94, 0: 112, 1: 166, 2: 220}
# Total length that a CIGAR's non-insertion operations must add up to.
# NOTE(review): hard-coded; presumably the length of ref_seq - confirm.
REF_SEQ_LEN = 296

In [605]:
class TargetSiteSequence:
    """
    One simulated target-site sequence that accumulates Cas9 edits.

    Attributes:
        seq: current nucleotide string. Starts as the given sequence with its
            run of 14 "N" characters replaced by a random barcode, and is
            edited in place by cut_site() and add_mutations().
        id: identifier passed to the constructor.
        is_cut: boolean array with one flag per cut site (0-2); True once the
            site has been cut or resected.
        cut_sites: dict keyed -1, 0, 1, 2 holding a position per site; entries
            for sites that get cut are overwritten with the position used.
        allele_cigar: per-site edit label: "" (no edit), "<n>D", "<n>I" or
            "Resected".
        needle: flag stored at construction; not read inside this class.
    """

    def __init__(self, seq, ident, needle=True):
        """
        Store the sequence, fill in its barcode and reset the edit state.

        Args:
            seq: template sequence; any run of 14 "N" is replaced by a barcode.
            ident: identifier for this sequence.
            needle: stored as-is on the instance.
        """

        self.seq = seq
        self.add_intBC()
        self.id = ident

        # Boolean values indicating whether or not the cut sites have been cut yet
        self.is_cut = np.array([False, False, False])
        self.cut_sites = {-1: 94, 0: 112, 1: 166, 2: 220}
        self.allele_cigar = [""] * 3

        self.needle = needle

    def cut_site(self):
        """
        Apply one round of Cas9 cutting to this sequence.

        Draws how many Cas9 proteins arrive (draw_ncas9) and which uncut
        sites they hit (draw_sites). One site gets a random indel; two or
        three sites cut in the same round remove the segment between the
        outermost two. Does nothing if every site is already cut or if no
        Cas9 arrives.
        """

        # all cut sites have been cut!
        if all(self.is_cut):
            return

        n_cas9 = draw_ncas9()

        # no cutting if no cas9 come to the cut site
        if n_cas9 == 0:
            return

        sites = self.draw_sites(n_cas9)

        if len(sites) == 1:
            self._cut_single(int(sites[0]))
        else:
            # With three sites the outermost pair is always (0, 2), so the
            # two- and three-site cases share one code path.
            self._resect_between(int(min(sites)), int(max(sites)))

    def _cut_single(self, s):
        """Introduce a random insertion or deletion at site s."""

        nts = ["A", "C", "T", "G"]

        # choose location in site
        # NOTE(review): loc comes from the module-level reference coordinates
        # (cut_site_locs), while self.seq may already have been shortened or
        # lengthened by an earlier edit - confirm this drift is acceptable.
        loc = pick_site(cut_site_locs[s])

        # choose length of indel
        indel_length = draw_indel_length()

        # biased coin: deletion with probability 0.75, insertion otherwise
        is_del = (0.75 > np.random.uniform())

        if is_del:
            self.allele_cigar[s] = str(indel_length) + "D"
            self.seq = self.seq[:loc] + self.seq[loc + indel_length:]
        else:
            self.allele_cigar[s] = str(indel_length) + "I"
            inserted_nt = "".join(np.random.choice(nts, indel_length))
            self.seq = self.seq[:loc] + inserted_nt + self.seq[loc:]

        self.cut_sites[s] = loc
        self.is_cut[s] = True

    def _resect_between(self, s1, s2):
        """Delete the segment between sites s1 < s2 that were cut together."""

        # if there are two sites being cut simultaneously, we need to remove
        # the segment between cut sites
        loc1, loc2 = pick_site(cut_site_locs[s1]), pick_site(cut_site_locs[s2])

        # The whole deletion is recorded on the left site; the right one is
        # only marked as lost.
        self.allele_cigar[s1] = str(loc2 - loc1) + "D"
        self.allele_cigar[s2] = "Resected"

        if s1 == 0 and s2 == 2:
            # The middle site lies inside the removed segment.
            self.allele_cigar[1] = "Resected"
            self.is_cut[1] = True

        self.seq = self.seq[:loc1] + self.seq[loc2:]
        self.cut_sites[s1], self.cut_sites[s2] = loc1, loc2
        self.is_cut[s1], self.is_cut[s2] = True, True

    def add_intBC(self):
        """Replace each run of 14 "N" in the sequence with one random 14-nt barcode."""

        alphabet = ["A", "C", "G", "T"]
        bc = "".join(np.random.choice(alphabet, 14))

        self.seq = self.seq.replace("N" * 14, bc)

    def draw_sites(self, n_cas9):
        """
        Pick which uncut sites are hit this round.

        Args:
            n_cas9: number of Cas9 proteins that arrived.

        Returns:
            Array of distinct site indices drawn uniformly, without
            replacement, from the sites not yet cut. Its length is n_cas9 or
            the number of uncut sites, whichever is smaller; it is empty when
            no site is left.
        """

        n_sites = len(self.is_cut)
        unavail_sites = np.flatnonzero(self.is_cut)
        n_avail = n_sites - len(unavail_sites)

        # Nothing left to cut: avoid dividing by zero below.
        if n_avail == 0:
            return np.array([], dtype=int)

        probs = np.ones(n_sites) / n_avail
        probs[unavail_sites] = 0

        sites = np.random.choice(n_sites, size=min(n_cas9, n_avail), p=probs, replace=False)

        return sites

    def add_mutations(self, l=5):
        """
        Scatter random single-nucleotide substitutions over the sequence.

        Args:
            l: mean of the Poisson distribution the number of substitutions
                is drawn from.
        """

        nt = ["A", "C", "G", "T"]
        # Use the caller-supplied rate; it was previously ignored in favour
        # of a hard-coded 5.
        n_muts = np.random.poisson(l)

        for _ in range(n_muts):
            loc = np.random.choice(len(self.seq))
            c_nt = self.seq[loc]
            n_nt = np.random.choice(nt)

            # Redraw until the base actually changes.
            while c_nt == n_nt:
                n_nt = np.random.choice(nt)

            self.seq = self.seq[:loc] + n_nt + self.seq[loc + 1:]
    
def draw_indel_length(N=0.5, P=0.1):
    """
    Draw an indel length from a negative binomial, clamped to [1, 30].

    N and P are passed straight to np.random.negative_binomial.
    """

    length = np.random.negative_binomial(N, P)

    if length <= 1:
        return 1
    if length >= 30:
        return 30
    return length
        
def draw_ncas9(l = 0.4):
    """
    Draw how many Cas9 proteins arrive: Poisson with mean l, capped at 3.
    """

    count = np.random.poisson(lam = l)
    return 3 if count >= 3 else count

def pick_site(loc):
    """
    Return loc shifted by a random offset.

    The offset is 10 minus a Poisson(10) draw, clamped to the range [-14, 4].
    """

    shift = 10 - np.random.poisson(10)

    if shift >= 4:
        shift = 4
    elif shift <= -14:
        shift = -14

    return loc + shift

def create_cigar_string(cigar_list, cut_sites):
    """
    Build a CIGAR-style list of operations from per-site edit labels.

    Also reads the module-level cut_site_locs (for the leading match run)
    and REF_SEQ_LEN (for the trailing match run).

    Args:
        cigar_list: indexable with 0, 1, 2. Each entry is "" (no edit),
            "Resected", or an operation string whose last character is the
            op code and whose leading part parses as an integer length.
        cut_sites: mapping read at keys -1, 0, 1 and 2, giving a position
            for each site.

    Returns:
        A tuple (joined, ops): ops is the list of operation strings and
        joined is the same list concatenated into one string.
    """
    
    
    # Leading match run, taken from the module-level table rather than from
    # the cut_sites argument.
    cs = [str(cut_site_locs[-1]) + "M"]
    for i in range(3):
        
        # A resected site contributes no operation of its own.
        if cigar_list[i] == "Resected":
            continue
        
        if cigar_list[i] == "":
            
            # No edit here: only the stretch since the previous position.
            match_len = cut_sites[i] - cut_sites[i-1]
            
            if i != 0 and cigar_list[i-1] != "Resected":
                if cs[len(cs) - 1][-1] == "D":
                    # Take the length of the deletion just emitted out of
                    # this match run.
                    match_len -= int(cs[len(cs)-1][:-1])

            if cs[len(cs) - 1][-1] == "M":

                # Fold into the preceding match run instead of emitting two
                # M operations in a row.
                match_len = match_len + int(cs[len(cs) -1][:-1])
                cs[len(cs) - 1] = str(match_len) + "M"


            else:
                cs += [str(match_len) + "M"]
            
            
        else:
            # Edited site: a match run up to the site, then the site's own
            # operation string.
            match_len = cut_sites[i] - cut_sites[i-1]
            
            if cs[len(cs) - 1][-1] != "M":
                if i != 0 and cigar_list[i-1] != "Resected":
                    if cs[len(cs) - 1][-1] == "D":
                        match_len -= int(cs[len(cs)-1][:-1])
                    cs += [str(match_len) + "M"] + [cigar_list[i]]
                else:
                    # Same append as the arm above, without the deletion
                    # adjustment.
                    cs += [str(match_len) + "M"] + [cigar_list[i]]
                
                
            # NOTE(review): this is a second independent `if`, evaluated after
            # the block above may have appended. In that case the last element
            # is cigar_list[i], so this only fires when the previous operation
            # was a match - assuming a site's label never ends in "M".
            if cs[len(cs) - 1][-1] == "M":
                match_len = match_len + int(cs[len(cs) -1][:-1])
                cs[len(cs) - 1] = str(match_len) + "M"
                cs += [cigar_list[i]]
    
    # Trailing match run from the last site position to REF_SEQ_LEN.
    final_match_len = REF_SEQ_LEN - cut_sites[2]
    
    if cigar_list[2] != "Resected":
        if cs[len(cs) - 1][-1] == "D":
            final_match_len -= int(cs[len(cs) - 1][:-1])
            
    
    if cs[len(cs) - 1][-1] == "M":
        # Merge with a preceding match run, as in the loop.
        final_match_len = final_match_len + int(cs[len(cs) -1][:-1])
        cs[len(cs) - 1] = str(final_match_len) + "M"
        
    else:
        cs += [str(final_match_len) + "M"]
    
        
    
    return "".join(cs), cs

            
def test_cigar_caller(cigar):
    """
    Check that a list of CIGAR operations adds up to the reference length.

    Every operation whose code (last character) is not "I" contributes its
    integer length; the total is compared with the module-level REF_SEQ_LEN.

    Returns:
        True if the total equals REF_SEQ_LEN, otherwise False.
    """
    consumed = sum(int(op[:-1]) for op in cigar if op[-1] != "I")

    return consumed == REF_SEQ_LEN
        

### Now simulate sequences

In [606]:
# Simulate N sequences and write each one to a tab-separated table
# (ID, sequence, CIGAR) and to a FASTA file. The with-block closes both
# handles, including when an error interrupts the loop; the FASTA handle was
# previously never closed.
with open("simulated_target_sites.needle.txt", "w") as out_file, \
        open("simulated_target_sites.needle.fa", "w") as fa_out:

    out_file.write("ID\tSequence\tCigar\n")

    for n in range(N):

        seq = TargetSiteSequence(ref_seq, "seq_" + str(n))
        # Number of cutting rounds: Poisson with mean T, capped at 5.
        for t in range(min(5, np.random.poisson(T))):
            seq.cut_site()

        cs, cs_list = create_cigar_string(seq.allele_cigar, seq.cut_sites)

        # Sanity check on the generated CIGAR, done before point mutations
        # are added to the sequence.
        assert test_cigar_caller(cs_list), \
            "CIGAR lengths for " + seq.id + " do not sum to REF_SEQ_LEN"

        seq.add_mutations()

        out_file.write(seq.id + "\t" + seq.seq + '\t' + cs + "\n")
        fa_out.write(">" + seq.id + "\n" + seq.seq + '\n')
    
        