In [50]:
import math
from Bio import SeqIO
from Bio import pairwise2
import matplotlib
import matplotlib.pyplot as plt
import pandas as pd

from codon_table import dna_codons
from colors import rasmol

lpla_seq = "AGCACCCTGAGACTGCTGATCAGCGACAGCTACGACCCCTGGTTCAACCTGGCCGTGGAGGAGTGCATCTTCAGACAGATGCCCGCCACCCAGAGAGTGCTGTTCCTGGTGAGAAACGCCGACACCGTGGTGATCGGCAGAGCCCAGAACCCCTGGAAGGAGTGCAACACCAGAAGAATGGAGGAGGACAACGTGAGACTGGCCAGAAGAAGCAGCGGCGGCGGCGCCGTGTTCCACGACCTGGGCAACACCTGCTTCACCTTCATGGCCGGCAAGCCCGAGTACGACAAGACCATCAGCACCAGCATCGTGCTGAACGCCCTGAACGCCCTGGGCGTGAGCGCCGAGGCCAGCGGCAGAAACGACCTGGTGGTGAAGACCGTGGAGGGCGACAGAAAGGTGAGCGGCAGCGCCTACAGAGAGACCAAGGACAGAGGCTTCCACCACGGCACCCTGCTGCTGAACGCCGACCTGAGCAGACTGGCCAACTACCTGAACCCCGACAAGAAGAAGCTGGCCGCCAAGGGCATCACCAGCGTGAGAAGCAGAGTGACCAACCTGACCGAGCTGCTGCCCGGCATCACCCACGAGCAGGTGTGCGAGGCCATCACCGAGGCCTTCTTCGCCCACTACGGCGAGAGAGTGGAGGCCGAGATCATCAGCCCCAACAAGACCCCCGACCTGCCCAACTTCGCCGAGACCTTCGCCAGACAGAGCAGCTGGGAGTGGAACTTCGGCCAGGCCCCCGCCTTCAGCCACCTGCTGGACGAGAGATTCACCTGGGGCGGCGTGGAGCTGCACTTCGACGTGGAGAAGGGCCACATCACCAGAGCCCAGGTGTTCACCGACAGCCTGAACCCCGCCCCCCTGGAGGCCCTGGCCGGCAGACTGCAGGGCTGCCTGTACAGAGCCGACATGCTGCAGCAGGAGTGCGAGGCCCTGCTGGTGGACTTCCCCGAGCAGGAGAAGGAGCTGAGAGAGCTGAGCGCCTGGATGGCCGGCGCCGTGAGATAAgaattctgcagatatccatcacactgg"

def translate(seq, codon_table):
    """Translate a DNA sequence into a protein string, stopping at the first stop codon.

    The sequence is read in frame from its first base, three bases at a time;
    a trailing partial codon is ignored. Codons are upper-cased before lookup.

    Args:
        seq: DNA sequence (any case).
        codon_table: Mapping from upper-case codon to amino-acid letter. Stop
            codons may be encoded either as "*" or as "STOP"; both end the
            translation.

    Returns:
        The translated protein as a string, excluding the stop marker.

    Note:
        Codons missing from ``codon_table`` (for example ones containing "N"
        or "-") are skipped without a placeholder, so residues after such a
        codon are numbered one position earlier than in the source frame.
    """
    # The codon table may spell a stop as "*" or as the word "STOP"; treating
    # only "*" as a stop would append the literal text "STOP" to the protein
    # and keep translating past the end of the open reading frame.
    stop_markers = ("*", "STOP")

    residues = []
    in_frame_length = (len(seq) // 3) * 3
    for start in range(0, in_frame_length, 3):
        codon = seq[start : start + 3].upper()
        if codon not in codon_table:
            continue
        residue = codon_table[codon]
        if residue in stop_markers:
            break
        residues.append(residue)

    return "".join(residues)


def find_mutations(wt_pseq, mut_pseq):
    """List the point differences between a wild-type and a mutant protein.

    Both sequences are upper-cased and compared position by position; the
    comparison stops at the end of the shorter sequence.

    Args:
        wt_pseq: Wild-type protein sequence.
        mut_pseq: Mutant protein sequence.

    Returns:
        A list of strings such as "Y139V" (wild-type residue, 1-based
        position, mutant residue), in order of position.
    """
    residue_pairs = zip(wt_pseq.upper(), mut_pseq.upper())
    return [
        f"{wt_res}{position}{mut_res}"
        for position, (wt_res, mut_res) in enumerate(residue_pairs, start=1)
        if wt_res != mut_res
    ]


def reverse_complement(seq):
    """Return the upper-case reverse complement of a DNA sequence.

    Only A, T, G and C (in either case) are supported; any other character
    raises KeyError.
    """
    pairing = {"A": "T", "T": "A", "G": "C", "C": "G"}
    flipped = []
    # Walk the sequence back to front, complementing one base at a time.
    for nucleotide in reversed(seq):
        flipped.append(pairing[nucleotide.upper()])
    return "".join(flipped)


# Scoring passed to pairwise2.align.globalms in the cells below: the score
# awarded per matching base, and the score applied per mismatching base
# (0 means a mismatch simply earns nothing). The mismatch-count estimate in
# the primer loop divides by (match_score - mismatch_penalty), so the two
# values must differ.
match_score = 2
mismatch_penalty = 0

['L161Q']


In [78]:
df = pd.read_csv("primers.csv")

In [81]:
from pandas import isna
from tqdm.notebook import tqdm


# Gap scores handed to pairwise2.align.globalms, and the largest number of
# mismatches a primer may have against the template before it is rejected.
GAP_OPEN_PENALTY = -10
GAP_EXTEND_PENALTY = -1
MAX_MISMATCHES = 3


def _align_primer(template, primer):
    """Globally align ``primer`` to ``template`` and estimate its mismatches.

    End gaps are not penalised, so a short primer can sit anywhere along the
    template. The mismatch estimate is the score lost relative to a perfect
    match of every primer base, divided by the per-base loss of one mismatch
    and rounded up.

    Returns:
        (alignment, mismatches) where ``alignment`` is the first alignment
        returned by pairwise2.
    """
    candidates = pairwise2.align.globalms(
        template,
        primer,
        match_score,
        mismatch_penalty,
        GAP_OPEN_PENALTY,
        GAP_EXTEND_PENALTY,
        penalize_end_gaps=False,
    )
    best = candidates[0]
    lost_score = len(primer) * match_score - best.score
    return best, math.ceil(lost_score / (match_score - mismatch_penalty))


df = pd.read_csv("primers.csv")

lpla_seq = lpla_seq.upper()

# The wild-type protein is the same for every primer, so translate it once
# rather than on every loop iteration.
original_pseq = translate(lpla_seq, dna_codons)

# One entry per row of df: "" when the row has no primer sequence, otherwise
# the concatenated mutation labels the primer introduces.
muts = []
for primer in tqdm(df["Sequence"]):
    if pd.isna(primer):
        muts.append("")
        continue

    primer = primer.upper()
    alignment, mismatches = _align_primer(lpla_seq, primer)

    if mismatches > MAX_MISMATCHES:
        # A poor forward match usually means the primer is written for the
        # opposite strand; retry with its reverse complement.
        primer = reverse_complement(primer)
        alignment, mismatches = _align_primer(lpla_seq, primer)
        if mismatches > MAX_MISMATCHES:
            raise ValueError(
                "Primer does not match sequence! There are more than three mismatches in either orientation."
            )

    template_seq = alignment.seqA
    primer_seq = alignment.seqB

    # Overlay the primer on the template: wherever the aligned primer has a
    # base it replaces the template base, elsewhere the template is kept.
    merged_seq = "".join(
        template_base if primer_base == "-" else primer_base
        for template_base, primer_base in zip(template_seq, primer_seq)
    )

    mutant_pseq = translate(merged_seq, dna_codons)
    mutation = find_mutations(original_pseq, mutant_pseq)

    muts.append("".join(mutation))

  0%|          | 0/234 [00:00<?, ?it/s]

In [66]:
# Ad-hoc check: align a single hard-coded primer against the upper-cased
# template using the same scoring arguments as the batch primer loop
# (gap open -10, gap extend -1, end gaps not penalised).
alignments = pairwise2.align.globalms(
    lpla_seq.upper(),
    "GCGCCTGGATGGCCAAAGCCGTGAGATAAGAATTC",
    match_score,
    mismatch_penalty,
    -10,
    -1,
    penalize_end_gaps=False,
)

In [74]:
df

Unnamed: 0,Target_Mutation,Orientation,Sequence,Pred_Mutation,Mutation
0,Y139V,F,GAGCGGCAGCGCCGTGAGAGAGACCAAGGAC,Y139V,Y139V
1,,R,CTTGGTCTCTCTCACGGCGCTGCCGCTCACC,Y139V,Y139V
2,Y139T,F,GAGCGGCAGCGCCACCAGAGAGACCAAGGAC,Y139T,Y139T
3,,R,CTTGGTCTCTCTGGTGGCGCTGCCGCTCACC,Y139T,Y139T
4,Y139S,F,GAGCGGCAGCGCCAGCAGAGAGACCAAGGAC,Y139S,Y139S
...,...,...,...,...,...
229,,R,TCTGGTGTTGCAGGCCTTCCAGGGGTTCTGG,E54A,E54A
230,E54H,F,CCCCTGGAAGCACTGCAACACCAGAAGAATG,E54H,E54H
231,,R,CTGGTGTTGCAGTGCTTCCAGGGGTTCTGGG,E54H,E54H
232,A248P,F,AACTTCGGCCAGCCCCCCGCCTTCAGC,A248P,A248P


In [83]:
# Forward-fill the target mutation: rows with a missing Target_Mutation
# inherit the most recent non-missing value above them ("" if there is none).
real_targets = []
prev = ""
for target in df['Target_Mutation']:
    if not pd.isna(target):
        prev = target
    real_targets.append(prev)
    

In [84]:
df['Target_Mutation'] = real_targets

In [92]:
df.to_csv('primers_checked.csv', index=False)

In [91]:
df

Unnamed: 0,Target_Mutation,Orientation,Sequence,Pred_Mutation,Primer_Correct
0,Y139V,F,GAGCGGCAGCGCCGTGAGAGAGACCAAGGAC,Y139V,True
1,Y139V,R,CTTGGTCTCTCTCACGGCGCTGCCGCTCACC,Y139V,True
2,Y139T,F,GAGCGGCAGCGCCACCAGAGAGACCAAGGAC,Y139T,True
3,Y139T,R,CTTGGTCTCTCTGGTGGCGCTGCCGCTCACC,Y139T,True
4,Y139S,F,GAGCGGCAGCGCCAGCAGAGAGACCAAGGAC,Y139S,True
...,...,...,...,...,...
229,E54A,R,TCTGGTGTTGCAGGCCTTCCAGGGGTTCTGG,E54A,True
230,E54H,F,CCCCTGGAAGCACTGCAACACCAGAAGAATG,E54H,True
231,E54H,R,CTGGTGTTGCAGTGCTTCCAGGGGTTCTGGG,E54H,True
232,A248P,F,AACTTCGGCCAGCCCCCCGCCTTCAGC,A248P,True


In [90]:
# Store the mutation predicted from each primer next to its intended target.
df["Pred_Mutation"] = muts
# A row with an empty prediction is left blank ("") rather than marked
# False; every other row records whether prediction and target agree.
df["Primer_Correct"] = [
    "" if pred == "" else pred == target
    for pred, target in zip(df["Pred_Mutation"], df["Target_Mutation"])
]

In [71]:
df['Pred_Mutation'] = muts

In [65]:
len("GCGCCTGGATGGCCAAAGCCGTGAGATAAGAATTC")

35

In [68]:
alignments[0].score

64.0

In [64]:
print(alignments[0].seqA)
print(alignments[0].seqB)

AGCACCCTGAGACTGCTGATCAGCGACAGCTACGACCCCTGGTTCAACCTGGCCGTGGAGGAGTGCATCTTCAGACAGATGCCCGCCACCCAGAGAGTGCTGTTCCTGGTGAGAAACGCCGACACCGTGGTGATCGGCAGAGCCCAGAACCCCTGGAAGGAGTGCAACACCAGAAGAATGGAGGAGGACAACGTGAGACTGGCCAGAAGAAGCAGCGGCGGCGGCGCCGTGTTCCACGACCTGGGCAACACCTGCTTCACCTTCATGGCCGGCAAGCCCGAGTACGACAAGACCATCAGCACCAGCATCGTGCTGAACGCCCTGAACGCCCTGGGCGTGAGCGCCGAGGCCAGCGGCAGAAACGACCTGGTGGTGAAGACCGTGGAGGGCGACAGAAAGGTGAGCGGCAGCGCCTACAGAGAGACCAAGGACAGAGGCTTCCACCACGGCACCCTGCTGCTGAACGCCGACCTGAGCAGACTGGCCAACTACCTGAACCCCGACAAGAAGAAGCTGGCCGCCAAGGGCATCACCAGCGTGAGAAGCAGAGTGACCAACCTGACCGAGCTGCTGCCCGGCATCACCCACGAGCAGGTGTGCGAGGCCATCACCGAGGCCTTCTTCGCCCACTACGGCGAGAGAGTGGAGGCCGAGATCATCAGCCCCAACAAGACCCCCGACCTGCCCAACTTCGCCGAGACCTTCGCCAGACAGAGCAGCTGGGAGTGGAACTTCGGCCAGGCCCCCGCCTTCAGCCACCTGCTGGACGAGAGATTCACCTGGGGCGGCGTGGAGCTGCACTTCGACGTGGAGAAGGGCCACATCACCAGAGCCCAGGTGTTCACCGACAGCCTGAACCCCGCCCCCCTGGAGGCCCTGGCCGGCAGACTGCAGGGCTGCCTGTACAGAGCCGACATGCTGCAGCAGGAGTGCGAGGCCCTGCTGGTGGACTTCCCCGAGCAGGAGAAGGAGCTGAGAGAGCTGAGCGCCTGGATGGCCG

In [58]:
reverse_complement(primer)

'GCGCCTGGATGGCCAAAGCCGTGAGATAAGAATTC'

In [54]:
df.head(50)

Unnamed: 0,Target_Mutation,Orientation,Sequence
0,Y139V,F,GAGCGGCAGCGCCGTGAGAGAGACCAAGGAC
1,,R,CTTGGTCTCTCTCACGGCGCTGCCGCTCACC
2,Y139T,F,GAGCGGCAGCGCCACCAGAGAGACCAAGGAC
3,,R,CTTGGTCTCTCTGGTGGCGCTGCCGCTCACC
4,Y139S,F,GAGCGGCAGCGCCAGCAGAGAGACCAAGGAC
5,,R,CTTGGTCTCTCTGCTGGCGCTGCCGCTCACC
6,A48D,F,
7,,R,
8,E265V,F,GGCGGCGTGGTGCTGCACTTCGACGTGGAG
9,,R,GTCGAAGTGCAGCACCACGCCGCCCCAGGTG


In [45]:
alignment

Alignment(seqA='AGCACCCTGAGACTGCTGATCAGCGACAGCTACGACCCCTGGTTCAACCTGGCCGTGGAGGAGTGCATCTTCAGACAGATGCCCGCCACCCAGAGAGTGCTGTTCCTGGTGAGAAACGCCGACACCGTGGTGATCGGCAGAGCCCAGAACCCCTGGAAGGAGTGCAACACCAGAAGAATGGAGGAGGACAACGTGAGACTGGCCAGAAGAAGCAGCGGCGGCGGCGCCGTGTTCCACGACCTGGGCAACACCTGCTTCACCTTCATGGCCGGCAAGCCCGAGTACGACAAGACCATCAGCACCAGCATCGTGCTGAACGCCCTGAACGCCCTGGGCGTGAGCGCCGAGGCCAGCGGCAGAAACGACCTGGTGGTGAAGACCGTGGAGGGCGACAGAAAGGTGAGCGGCAGCGCCTACAGAGAGACCAAGGACAGAGGCTTCCACCACGGCACCCTGCTGCTGAACGCCGACCTGAGCAGACTGGCCAACTACCTGAACCCCGACAAGAAGAAGCTGGCCGCCAAGGGCATCACCAGCGTGAGAAGCAGAGTGACCAACCTGACCGAGCTGCTGCCCGGCATCACCCACGAGCAGGTGTGCGAGGCCATCACCGAGGCCTTCTTCGCCCACTACGGCGAGAGAGTGGAGGCCGAGATCATCAGCCCCAACAAGACCCCCGACCTGCCCAACTTCGCCGAGACCTTCGCCAGACAGAGCAGCTGGGAGTGGAACTTCGGCCAGGCCCCCGCCTTCAGCCACCTGCTGGACGAGAGATTCACCTGGGGCGGCGTGGAGCTGCACTTCGACGTGGAGAAGGGCCACATCACCAGAGCCCAGGTGTTCACCGACAGCCTGAACCCCGCCCCCCTGGAGGCCCTGGCCGGCAGACTGCAGGGCTGCCTGTACAGAGCCGACATGCTGCAGCAGGAGTGCGAGGCCCTGCTGGTGGACTTCCCCGAGCAGGAGAAGGAGCTGAGAGAGCTG

In [19]:
translate('hhhhhh', dna_codons)

hhh
hhh


In [11]:
dna_codons

{'TTT': 'F',
 'CTT': 'L',
 'ATT': 'I',
 'GTT': 'V',
 'TTC': 'F',
 'CTC': 'L',
 'ATC': 'I',
 'GTC': 'V',
 'TTA': 'L',
 'CTA': 'L',
 'ATA': 'I',
 'GTA': 'V',
 'TTG': 'L',
 'CTG': 'L',
 'ATG': 'M',
 'GTG': 'V',
 'TCT': 'S',
 'CCT': 'P',
 'ACT': 'T',
 'GCT': 'A',
 'TCC': 'S',
 'CCC': 'P',
 'ACC': 'T',
 'GCC': 'A',
 'TCA': 'S',
 'CCA': 'P',
 'ACA': 'T',
 'GCA': 'A',
 'TCG': 'S',
 'CCG': 'P',
 'ACG': 'T',
 'GCG': 'A',
 'TAT': 'Y',
 'CAT': 'H',
 'AAT': 'N',
 'GAT': 'D',
 'TAC': 'Y',
 'CAC': 'H',
 'AAC': 'N',
 'GAC': 'D',
 'TAA': 'STOP',
 'CAA': 'Q',
 'AAA': 'K',
 'GAA': 'E',
 'TAG': 'STOP',
 'CAG': 'Q',
 'AAG': 'K',
 'GAG': 'E',
 'TGT': 'C',
 'CGT': 'R',
 'AGT': 'S',
 'GGT': 'G',
 'TGC': 'C',
 'CGC': 'R',
 'AGC': 'S',
 'GGC': 'G',
 'TGA': 'STOP',
 'CGA': 'R',
 'AGA': 'R',
 'GGA': 'G',
 'TGG': 'W',
 'CGG': 'R',
 'AGG': 'R',
 'GGG': 'G'}

In [1]:


# NOTE(review): `template` and `subject` are not assigned in this cell, and
# `pairwise2` is not imported here either; as recorded in the cell output,
# running it failed with a NameError on `template`.
alignments = pairwise2.align.globalms(
    template, subject, 2, 0, -10, -1, penalize_end_gaps=False
)

NameError: name 'template' is not defined

In [6]:
from pysanger import *

import os
from Bio import SeqIO

from tqdm.notebook import tqdm

def tokenize_fp_make_id(inp):
    """Build a read identifier and strand from a sequencing-trace filename.

    The filename is split on both '-' and '_'. Token 0 is taken as the user,
    token 2 as the construct and token 3 as the number. The read direction is
    decided by which primer name appears among the tokens: 'Mcherry' means
    forward (checked first), 'WPRE' means reverse.

    Args:
        inp: Filename of the trace.

    Returns:
        (identifier, strand) where identifier has the form
        "{user}_{construct}_{num}_{primer}_{direction}" and strand is 1 for
        forward reads or -1 for reverse reads.

    Raises:
        IndexError: If the filename yields fewer than four tokens.
        Exception: If neither primer name is present among the tokens.
    """
    parts = inp.replace('-', '_').split('_')
    user, construct, num = parts[0], parts[2], parts[3]

    if 'Mcherry' in parts:
        primer, direction, strand = 'MCHERRY', 'FORWARD', 1
    elif 'WPRE' in parts:
        primer, direction, strand = 'WPRE', 'REVERSE', -1
    else:
        raise Exception('Neither forward nor reverse! Invalid filename.')

    return f"{user}_{construct}_{num}_{primer}_{direction}", strand

# Read in the template plasmid. If the FASTA file holds several records the
# last one is used.
with open("data/bfp-expression-control-mcherry-flag-hlpla-pcdna3-w37v.fasta") as file:
    for record in SeqIO.parse(file, "fasta"):
        template = record.seq

fp = 'PC_812288-3088_48_PC-442--Mcherry-Seq-Forward_H11.ab1'
# Bound to `read_id` rather than `id` so the builtin id() is not shadowed at
# notebook scope. With the old name, running a dependent cell before this one
# silently passed the builtin function along as the identifier instead of
# failing.
read_id, strand = tokenize_fp_make_id(fp)
abidata = abi_to_dict(f"peter_data/{fp}")

# Window of the template to compare against, chosen by read direction.
# NOTE(review): the coordinates are hard-coded for this plasmid — confirm
# before reusing with another template.
if strand == 1:
    template_limits = (4660, 5250)
else:
    template_limits = (5000, 5740)
fig, stats = visualize(
    read_id,
    abidata,
    template=template,
    strand=strand,
    fig=None,
    region="read",
    translation_limits=(4702, None),
    template_limits=template_limits,
)
fig.savefig(f'{read_id}.png', bbox_inches="tight")

# NOTE(review): `stat_list` is not created in any visible cell; it must exist
# in the kernel before this line runs.
stat_list.append(stats)

2


ValueError: Something has gone wrong :(

In [2]:
pd.concat([stats, stats])

Unnamed: 0,id,alignment_score,nt_insertions,nt_deletions,nt_mismatches,silent_mutations,missense_mutations,gap_mutations
0,PC_574_2_MCHERRY_FORWARD,1102.0,,,4792(C->G),31,,
0,PC_574_2_MCHERRY_FORWARD,1102.0,,,4792(C->G),31,,


In [3]:
stats

Unnamed: 0,id,alignment_score,nt_insertions,nt_deletions,nt_mismatches,silent_mutations,missense_mutations,gap_mutations
0,<built-in function id>,1102.0,,,4792(C->G),31,,
