## Load the Codon Table

The CODON_TABLE dictionary is constructed directly from the lines of the file, where each line is split and the first element is used as the key and the second element (if present) is used as the value.

In [1]:
with open('data/RNA_codon_table.txt', 'r') as reference_file:
    # Each non-blank line is "<codon> [<amino acid>]". Lines without a second
    # field (presumably the stop codons) map to '' so that translating them
    # contributes nothing to the peptide.
    CODON_TABLE = {}
    for line in reference_file:
        fields = line.split()
        if not fields:
            continue  # tolerate blank lines, e.g. a trailing empty line
        CODON_TABLE[fields[0]] = fields[1] if len(fields) > 1 else ''

len(CODON_TABLE)

64

## Protein Translation Problem: Translate an RNA string into an amino acid string.

**Input**: An RNA string Pattern and the array GeneticCode.  
**Output**: The translation of Pattern into an amino acid string Peptide.

> Notes:
> 
> - The "Stop" codon should not be translated, as shown in the sample below.
> - For your convenience, we provide a downloadable RNA codon table indicating which codons encode which amino acids.

In [2]:
def translate(nucleotides: str, codon_table=None) -> str:
    """
    Translate an RNA string into a polypeptide, codon by codon.

    Args:
        nucleotides: The RNA string. It is read in non-overlapping codons of
            three bases starting at index 0; up to two trailing bases that do
            not form a full codon are ignored.
        codon_table: Optional mapping from codon to amino-acid letter (the
            "GeneticCode" input of the problem). Defaults to the notebook-level
            CODON_TABLE.

    Returns:
        A string representing the polypeptide translated from the nucleotide sequence.

    Raises:
        KeyError: If a codon is not present in the codon table.

    Notes:
        Codons mapped to '' in the table add nothing to the result, and
        translation does not halt at them.
    """
    if codon_table is None:
        codon_table = CODON_TABLE
    # Join once instead of growing a string with += inside the loop.
    return "".join(
        codon_table[nucleotides[start: start + 3]]
        for start in range(0, len(nucleotides) - 2, 3)
    )

In [3]:
# Sample dataset: an RNA string and the peptide it is expected to translate to.
sample_input = 'AUGGCCAUGGCGCCCAGAACUGAGAUCAAUAGUACCCGUAUUAACGGGUGA'
sample_output = 'MAMAPRTEINSTRING'

# The input has 17 codons but the expected peptide has 16 letters: the final
# codon (UGA) must contribute nothing.
assert translate(sample_input) == sample_output

In [4]:
# Read the dataset file; its entire content is handed to translate() as one string.
input_filename = 'dataset_96_4'
with open(f'data/{input_filename}.txt', 'r') as input_file:
    test_input = input_file.read()

result = translate(test_input)

# 'dataset_96_4' -> 'submission_96_4': drop the first '_'-separated field and
# prefix the rest with 'submission_'.
output_filename = 'submission_' + '_'.join(input_filename.split('_')[1:])
with open(f'data/{output_filename}.txt', 'w') as output_file:
    output_file.write(result)

In [72]:
# Print the translation of each candidate RNA string so the peptides can be compared.
for s in ['CCUCGUACAGAAAUCAAC', 'CCGAGGACCGAAAUCAAC', 'CCAAGUACAGAGAUUAAC', 'CCAAGAACAGAUAUCAAU']:
    print(translate(s))

PRTEIN
PRTEIN
PSTEIN
PRTDIN


## Peptide Encoding Problem: Find substrings of a genome encoding a given amino acid sequence.

Input: A DNA string Text, an amino acid string Peptide, and the array GeneticCode.  
Output: All substrings of Text encoding Peptide (if any such substrings exist).  
Code Challenge: Solve the Peptide Encoding Problem. Click here for the RNA codon table corresponding to the array GeneticCode.  

Note: The solution may contain repeated strings if the same string occurs more than once as a substring of Text and encodes Peptide.  

In [5]:
# DNA base -> complementary RNA base (A-U, T-A, G-C, C-G).
nucleotide_map = dict(zip("ATGC", "UACG"))

# RNA base -> complementary DNA base (U-A, A-T, C-G, G-C).
reverse_nucleotide_map = dict(zip("UACG", "ATGC"))

def transcribe(nucleotides, reverse=False):
    """
    Turn a DNA string into an RNA string.

    Args:
        nucleotides (str): DNA sequence.
        reverse (bool): When False, every "T" is replaced by "U". When True,
            the sequence is read back to front and each base is mapped through
            nucleotide_map, giving the RNA reverse complement.

    Returns:
        str: Transcribed sequence.
    """
    if reverse:
        return "".join(nucleotide_map[base] for base in reversed(nucleotides))

    return nucleotides.replace("T", "U")


def reverse_transcribe(nucleotides, reverse=False):
    """
    Turn an RNA string back into a DNA string.

    Args:
        nucleotides (str): RNA sequence.
        reverse (bool): When False, every "U" is replaced by "T". When True,
            the sequence is read back to front and each base is mapped through
            reverse_nucleotide_map, giving the DNA reverse complement.

    Returns:
        str: Reverse transcribed sequence.
    """
    if reverse:
        return "".join(reverse_nucleotide_map[base] for base in reversed(nucleotides))

    return nucleotides.replace("U", "T")


def peptide_encoding(nucleotides, expected_peptide):
    """
    Finds all DNA segments that encode a given peptide on either strand.

    Every window of 3 * len(expected_peptide) bases is translated, both in the
    transcribed DNA and in its reverse-complement RNA. Matching windows are
    reported as substrings of the original DNA string.

    Args:
        nucleotides (str): DNA sequence.
        expected_peptide (str): Peptide to be encoded.

    Returns:
        list: DNA substrings encoding the peptide. Forward-strand matches come
        first, then reverse-strand matches, each group ordered by position in
        `nucleotides`. A substring appears once per occurrence.
    """
    window = len(expected_peptide) * 3
    rna = transcribe(nucleotides)
    complementary_rna = transcribe(nucleotides, reverse=True)
    # Both strands have len(nucleotides) bases, so they share the last start.
    # Negative when the text is shorter than one window: both ranges are empty.
    last_start = len(rna) - window
    result = []

    # Forward strand. The +1 includes the final window, which ends exactly at
    # the last base.
    for start in range(last_start + 1):
        rna_segment = rna[start: start + window]
        if translate(rna_segment) == expected_peptide:
            result.append(reverse_transcribe(rna_segment))

    # Reverse strand. Index i here corresponds to position last_start - i in
    # the original DNA, so walking downwards from last_start (inclusive)
    # reports matches in ascending DNA position, starting at position 0.
    for start in range(last_start, -1, -1):
        rna_segment = complementary_rna[start: start + window]
        if translate(rna_segment) == expected_peptide:
            result.append(reverse_transcribe(rna_segment, reverse=True))

    return result

In [6]:
# Sample dataset: DNA text, target peptide and the expected encoding substrings.
sample_dna = 'ATGGCCATGGCCCCCAGAACTGAGATCAATAGTACCCGTATTAACGGGTGA'
sample_peptide = 'MA'
sample_output = [
    'ATGGCC',
    'GGCCAT',
    'ATGGCC',
]

# Compare sorted lists: order is irrelevant, but repeated substrings must
# appear the same number of times.
assert sorted(sample_output) == sorted(peptide_encoding(sample_dna, sample_peptide))

In [7]:
# Read the dataset file: line 1 is used as the DNA text, line 2 as the peptide.
input_filename = 'dataset_96_7'
with open(f'data/{input_filename}.txt', 'r') as input_file:
    test_input = input_file.readlines()

dna = test_input[0].strip()
peptide = test_input[1].strip()
result = peptide_encoding(dna, peptide )

# 'dataset_96_7' -> 'submission_96_7'; one matching substring per output line.
output_filename = 'submission_' + '_'.join(input_filename.split('_')[1:])
with open(f'data/{output_filename}.txt', 'w') as output_file:
    output_file.write('\n'.join(result))

Amino acids typically have non-integer masses (e.g., glycine has a total mass of approximately 57.02 Da); for simplicity, however, we will work with the integer mass table given below. ![](http://bioinformaticsalgorithms.com/images/Antibiotics/integer_mass_table.png)

In [8]:
# Integer mass of each amino acid, keyed by its one-letter code.
# The 20 letters carry only 18 distinct masses (I/L and K/Q coincide).
# Insertion order matters: list(amino_acid_masses.keys()) is later used as the
# expansion alphabet, so reordering entries changes the order of results.
amino_acid_masses = {
    "G": 57,
    "A": 71,
    "S": 87,
    "P": 97,
    "V": 99,
    "T": 101,
    "C": 103,
    "I": 113,  # same mass as L
    "L": 113,  # same mass as I
    "N": 114,
    "D": 115,
    "K": 128,  # same mass as Q
    "Q": 128,  # same mass as K
    "E": 129,
    "M": 131,
    "H": 137,
    "F": 147,
    "R": 156,
    "Y": 163,
    "W": 186,
}


## The Cyclopeptide Sequencing Problem

For now, we will assume for simplicity that the mass spectrometer breaks the copies of a cyclic peptide at every possible two bonds, so that the resulting experimental spectrum contains the masses of all possible linear fragments of the peptide, which are called subpeptides.  

**Exercise Break**: How many subpeptides does a cyclic peptide of length n have?

In [9]:
from math import factorial

def nb_subpeptides(cyclopeptide_length, k=2):
    """
    Calculates the number of subpeptides of a cyclopeptide.

    The value returned is k * C(n, k), where n is the cyclopeptide length. For
    the default k=2 this is n * (n - 1).

    Args:
        cyclopeptide_length (int): Length of the cyclopeptide.
        k (int): Number of elements chosen in the binomial coefficient.
            Default is 2. NOTE(review): presumably the number of bonds broken
            per copy of the peptide - confirm the intended meaning for k != 2.

    Returns:
        int: Number of subpeptides. 0 when cyclopeptide_length < k.

    Raises:
        ValueError: If cyclopeptide_length or k is negative.
    """
    n = cyclopeptide_length
    if n < 0 or k < 0:
        raise ValueError("cyclopeptide_length and k must be non-negative")

    # Build C(n, k) multiplicatively with integers only. After step i the
    # running value is C(n - k + i, i), so each floor division is exact. This
    # avoids both huge factorials and float rounding.
    binomial_coeff = 1
    for i in range(1, k + 1):
        binomial_coeff = binomial_coeff * (n - k + i) // i

    return k * binomial_coeff

In [10]:
# Sample check: a cyclic peptide of length 31315; the expected value equals 31315 * 31314.
sample_input = 31315
sample_output = 980597910

assert nb_subpeptides(sample_input) == sample_output

In [11]:
# Number of subpeptides of a cyclic peptide of length 24213 (displayed as the cell output).
nb_subpeptides(24213)

586245156

## Generating Theoretical Spectrum Problem: Generate the theoretical spectrum of a cyclic peptide.

The theoretical spectrum of a cyclic peptide Peptide, denoted Cyclospectrum(Peptide), is the collection of all of the masses of its subpeptides, in addition to the mass 0 and the mass of the entire peptide, with masses ordered from smallest to largest.

Input: An amino acid string Peptide.  
Output: Cyclospectrum(Peptide).  
Code Challenge: Solve the Generating Theoretical Spectrum Problem.  

In [50]:
from collections import deque

def cyclospectrum(peptide, masses=amino_acid_masses):
    """
    Generates the theoretical spectrum of a cyclic peptide.

    Args:
        peptide (str): Amino-acid string, one letter per residue.
        masses (dict): Mapping from amino-acid letter to its mass. Defaults to
            amino_acid_masses.

    Returns:
        list: Sorted masses: 0, the mass of every cyclic subpeptide of length
        1 to len(peptide) - 1 from every start position, and the mass of the
        whole peptide. An empty peptide yields [0].

    Raises:
        KeyError: If the peptide contains a letter missing from `masses`.
    """
    if not peptide:
        # Mass 0 and the mass of the whole peptide are the same entry here.
        return [0]

    # Look residues up in the table that was passed in, under a name that does
    # not shadow the `masses` parameter.
    residue_masses = [int(masses[aa]) for aa in peptide]

    spectrum = [0]
    for start in range(len(residue_masses)):
        spectrum.extend(accumulate_masses(residue_masses, start))
    spectrum.append(sum(residue_masses))
    return sorted(spectrum)

def accumulate_masses(masses, index):
    """
    Yield running totals of masses read cyclically from a start position.

    Args:
        masses (list): List of integer masses representing the peptide.
        index (int): Position of the first mass to include.

    Yields:
        int: The sum of the first 1, 2, ..., len(masses) - 1 masses starting at
        `index` and wrapping around the end of the list. The total over all
        masses is never produced.
    """
    count = len(masses)
    running_total = 0
    offset = 0
    while offset < count - 1:
        running_total += masses[(index + offset) % count]
        yield running_total
        offset += 1

In [51]:
# Sample check: the cyclospectrum of 'LEQN', compared as a space-separated string.
sample_input = 'LEQN'
sample_output = '0 113 114 128 129 227 242 242 257 355 356 370 371 484'
# peptide_mass= [amino_acid_masses[aa] for aa in sample_input]
sample_result = cyclospectrum(sample_input)
assert " ".join([str(v) for v in sample_result]) == sample_output

In [53]:
# Print the cyclospectrum of the challenge peptide as space-separated masses.
challenge_input = 'MEDWFIHNHRTM'
# peptide_mass= [amino_acid_masses[aa] for aa in challenge_input]
result = cyclospectrum(challenge_input)
print(" ".join([str(v) for v in result]))

0 101 113 114 115 129 131 131 137 137 147 156 186 232 244 250 251 251 257 260 260 262 293 301 333 363 364 375 388 388 391 394 397 407 430 446 448 492 501 506 508 511 519 525 544 561 561 577 583 607 639 645 648 648 656 657 690 692 697 698 708 758 763 770 776 785 793 804 812 821 827 834 839 889 899 900 905 907 940 941 949 949 952 958 990 1014 1020 1036 1036 1053 1072 1078 1086 1089 1091 1096 1105 1149 1151 1167 1190 1200 1203 1206 1209 1209 1222 1233 1234 1264 1296 1304 1335 1337 1337 1340 1346 1346 1347 1353 1365 1411 1441 1450 1460 1460 1466 1466 1468 1482 1483 1484 1496 1597


In [79]:
# For each candidate peptide, print whether its cyclospectrum equals the target spectrum.
# Note: the loop rebinds the notebook-level names `s` and `result`.
for s in ['TMLA', 'TMIA', 'TLAM', 'MTAI', 'TAIM', 'MAIT']:
    result = [0, 71, 101, 113, 131, 184, 202, 214, 232, 285, 303, 315, 345, 416]
    print(cyclospectrum(s) == result)

False
False
True
False
False
True


## Counting Peptides with Given Mass Problem: Compute the number of peptides of given mass.

Input: An integer m.  
Output: The number of linear peptides having integer mass m.

In [15]:
from collections import defaultdict

def nb_linear_peptides(parent_mass, masses=None):
    """
    Count the number of linear peptides with a given parent mass.

    A peptide is an ordered sequence of residue masses, so different orderings
    of the same masses count separately. Amino acids that share a mass are
    counted once (the masses are de-duplicated).

    Args:
        parent_mass (int): The parent mass.
        masses (iterable of int, optional): Residue masses to build peptides
            from. Defaults to the values of amino_acid_masses.

    Returns:
        int: The number of linear peptides with the given parent mass. A parent
        mass of 0 counts the empty peptide (returns 1); a negative parent mass
        returns 0.

    Raises:
        ValueError: If any residue mass is not positive.
    """
    if masses is None:
        masses = amino_acid_masses.values()
    distinct_masses = sorted(set(masses))
    if distinct_masses and distinct_masses[0] <= 0:
        raise ValueError("residue masses must be positive")
    if parent_mass < 0:
        return 0

    # counts[m] = number of peptides of total mass m. A peptide of mass m is a
    # peptide of mass m - x followed by one residue of mass x, hence
    # counts[m] = sum(counts[m - x]) over every residue mass x <= m.
    counts = [0] * (parent_mass + 1)
    counts[0] = 1  # the empty peptide
    for total in range(1, parent_mass + 1):
        ways = 0
        for residue_mass in distinct_masses:
            if residue_mass > total:
                break  # masses are sorted: nothing further fits
            ways += counts[total - residue_mass]
        counts[total] = ways

    return counts[parent_mass]

In [17]:
# Sample dataset for nb_linear_peptides: a parent mass and the expected peptide count.
sample_input = 1024
sample_output = 14712706211

In [22]:
# sample_result = nb_linear_peptides(sample_input)
# assert sample_result == sample_output
# sample_result

## Exercise Break: How many subpeptides does a linear peptide of given length n have? (Include the empty peptide and the entire peptide.)

Input: An integer n.  
Output: The number of subpeptides of a linear peptide of length n.

In [33]:
def linear_subpeptide_count(x):
    """
    Count the subpeptides of a linear peptide of length x.

    The count includes the empty peptide and the entire peptide. There are
    x - k + 1 contiguous subpeptides of each length k = 1..x, which sums to
    x * (x + 1) / 2, plus one for the empty peptide.

    Args:
        x (int): Length of the peptide.

    Returns:
        int: The number of linear subpeptides, x * (x + 1) // 2 + 1. A peptide
        of length 0 has exactly one subpeptide, the empty peptide.

    Raises:
        ValueError: If x is negative.
    """
    if x < 0:
        raise ValueError("peptide length must be non-negative")

    # x * (x + 1) is always even, so the floor division is exact.
    return x * (x + 1) // 2 + 1

In [35]:
# Subpeptide count for a linear peptide of length 4 (displayed as the cell output).
linear_subpeptide_count(4)

11

In [34]:
# Subpeptide count for a linear peptide of length 26363 (displayed as the cell output).
linear_subpeptide_count(26363)

347517067

## Code Challenge: Implement LinearSpectrum.

Input: An amino acid string Peptide.  
Output: The linear spectrum of Peptide.

```
LinearSpectrum(Peptide, Alphabet, AminoAcidMass)
    PrefixMass(0) ← 0
    for i ← 1 to |Peptide|
        for every symbol s in Alphabet
            if s = i-th amino acid in Peptide
                PrefixMass(i) ← PrefixMass(i − 1) + AminoAcidMass[s]
    LinearSpectrum ← a list consisting of the single integer 0
    for i ← 0 to |Peptide| − 1
        for j ← i + 1 to |Peptide|
            add PrefixMass(j) − PrefixMass(i) to LinearSpectrum
    return sorted list LinearSpectrum
```

In [67]:
def linear_spectrum(peptide, masses=None):
    """
    Calculate the linear spectrum of a peptide.

    Args:
        peptide (str): The peptide sequence, one letter per residue.
        masses (dict, optional): Mapping from amino-acid letter to its mass
            (the AminoAcidMass input of the pseudocode). Defaults to
            amino_acid_masses.

    Returns:
        list: Sorted masses: 0 followed by the mass of every contiguous,
        non-empty subpeptide, including the whole peptide. An empty peptide
        yields [0].

    Raises:
        KeyError: If the peptide contains a letter missing from the mass table.
    """
    if masses is None:
        masses = amino_acid_masses

    # prefix_mass[i] is the total mass of the first i residues.
    prefix_mass = [0]
    for residue in peptide:
        prefix_mass.append(prefix_mass[-1] + masses[residue])

    # The mass of peptide[i:j] is prefix_mass[j] - prefix_mass[i].
    spectrum = [0]
    for i in range(len(peptide)):
        for j in range(i + 1, len(peptide) + 1):
            spectrum.append(prefix_mass[j] - prefix_mass[i])

    spectrum.sort()
    return spectrum

In [69]:
# Sample check: the linear spectrum of 'NQEL', compared as a list of integers.
sample_input = "NQEL"
sample_output = [0, 113, 114, 128, 129, 242, 242, 257, 370, 371, 484]
sample_result = linear_spectrum(sample_input)
assert sample_result == sample_output

In [71]:
# Print the linear spectrum of the challenge peptide as space-separated masses.
challenge_input = 'HWYWGATLHPMEAERFVDVHPVYHDGPFKNPRCYVYIGPHPGLYMQVRWT'
print(' '.join([str(i) for i in linear_spectrum(challenge_input)]))

0 57 57 57 57 71 71 97 97 97 97 97 97 99 99 99 99 99 101 101 103 113 113 113 114 115 115 128 128 128 129 129 131 131 137 137 137 137 137 147 147 154 154 154 156 156 156 163 163 163 163 163 170 170 172 172 186 186 186 196 200 200 211 214 214 214 227 228 229 234 234 234 234 236 242 243 244 246 250 252 253 255 259 259 260 262 262 262 266 267 267 269 275 276 276 285 285 287 291 291 294 300 301 303 309 313 314 323 329 331 331 333 333 333 333 339 342 342 347 349 349 351 351 356 356 357 358 359 361 365 365 367 372 375 383 388 388 389 399 402 404 404 406 406 407 415 415 416 422 422 422 425 428 429 430 430 432 432 432 441 443 445 448 448 450 460 460 464 470 472 477 478 479 485 486 486 486 494 495 496 496 501 501 503 514 514 517 519 519 521 521 528 528 529 531 535 535 538 542 543 544 547 547 553 557 558 558 561 565 567 567 569 569 571 576 578 579 583 592 592 595 595 597 598 602 607 611 616 616 618 632 633 633 634 640 641 642 646 646 650 658 663 664 664 665 666 668 668 670 671 672 677 678 681 684

## Code Challenge: Implement CyclopeptideSequencing.

```
CyclopeptideSequencing(Spectrum)
    CandidatePeptides ← a set containing only the empty peptide
    FinalPeptides ← empty list of strings
    while CandidatePeptides is nonempty
        CandidatePeptides ← Expand(CandidatePeptides)
        for each peptide Peptide in CandidatePeptides
            if Mass(Peptide) = ParentMass(Spectrum)
                if Cyclospectrum(Peptide) = Spectrum and Peptide is not in FinalPeptides
                    append Peptide to FinalPeptides
                remove Peptide from CandidatePeptides
            else if Peptide is not consistent with Spectrum
                remove Peptide from CandidatePeptides
    return FinalPeptides
```

In [60]:
# Distinct residue masses; the order of list(set(...)) is not specified.
# NOTE(review): no function visible in this notebook reads this global - the
# functions that use `masses` take it as a parameter. Confirm before removing.
masses = list(set(amino_acid_masses.values()))
# Amino-acid letters in table order; expand() appends each of these to every candidate.
aa = list(amino_acid_masses.keys())

def cyclopeptide_sequencing(spectrum):
    """
    Find all cyclopeptides whose theoretical spectrum equals a given spectrum.

    Branch and bound: candidates are grown one amino acid at a time. A
    candidate that reaches the parent mass is checked against the spectrum and
    then dropped; a lighter candidate is kept only while it is consistent with
    the spectrum.

    Args:
        spectrum (list): The target spectrum as a sorted list of integer
            masses. It is compared with `==` against cyclospectrum(), which
            returns a sorted list.

    Returns:
        list: Peptide strings matching the spectrum, in the order they were
        found. Letters with equal masses give separate entries.

    Raises:
        ValueError: If the spectrum is empty (raised by max() inside
            parent_mass_from_spectrum).
    """
    final_peptides = []
    parent_mass = parent_mass_from_spectrum(spectrum)
    candidates = ['']

    while candidates:
        survivors = []
        for peptide in expand(candidates):
            if mass(peptide) == parent_mass:
                if cyclospectrum(peptide) == spectrum:
                    final_peptides.append(peptide)
                # Match or not, a full-mass peptide is not expanded again:
                # any extension would be heavier than the parent mass.
            elif is_consistent(peptide, spectrum):
                survivors.append(peptide)
        candidates = survivors

    return final_peptides


def expand(peptides):
    """
    Grow every peptide by one residue, in every possible way.

    Args:
        peptides (list): List of peptides.

    Returns:
        list: For each input peptide, in order, one new peptide per letter of
        the notebook-level alphabet `aa`, in alphabet order.
    """
    return [peptide + amino_acid for peptide in peptides for amino_acid in aa]


def mass(peptide):
    """
    Total mass of a peptide.

    Args:
        peptide (str): The peptide sequence.

    Returns:
        int: Sum of the amino_acid_masses entries of its residues (0 for an
        empty peptide).
    """
    total = 0
    for residue in peptide:
        total += amino_acid_masses[residue]
    return total


def parent_mass_from_spectrum(spectrum):
    """
    Read the parent mass off a spectrum.

    Args:
        spectrum (list): The spectrum.

    Returns:
        int: The largest mass in the spectrum.
    """
    heaviest = max(spectrum)
    return heaviest

def is_consistent(peptide, spectrum):
    """
    Check if a linear peptide is consistent with a spectrum.

    The peptide is consistent when every mass of its linear spectrum occurs in
    `spectrum` at least as many times as it occurs in the linear spectrum.

    Args:
        peptide (str): The peptide sequence.
        spectrum (list): The spectrum.

    Returns:
        bool: True if the peptide is consistent with the spectrum, False otherwise.
    """
    # Count the occurrences of each spectrum mass once, so each lookup below
    # is a dictionary access rather than a scan of the list.
    remaining = {}
    for observed in spectrum:
        remaining[observed] = remaining.get(observed, 0) + 1

    for fragment_mass in linear_spectrum(peptide):
        if remaining.get(fragment_mass, 0) == 0:
            return False
        # Use up one occurrence so a repeated fragment mass needs a repeated
        # spectrum entry.
        remaining[fragment_mass] -= 1
    return True

def display_cyclopeptide_seq(seq, masses=amino_acid_masses):
    """
    Render peptides as dash-separated mass strings, dropping duplicates.

    Args:
        seq (list): The peptides to render.
        masses (dict): Dictionary mapping amino acids to their masses.

    Returns:
        str: One "m1-m2-...-mk" entry per distinct mass sequence, separated by
        single spaces and kept in order of first appearance.
    """
    mass_strings = (
        "-".join(str(masses[residue]) for residue in peptide)
        for peptide in seq
    )
    # dict.fromkeys keeps only the first occurrence of each key, in order.
    return " ".join(dict.fromkeys(mass_strings))

In [65]:
# Sample check: a spectrum and the expected peptides written as dash-separated masses.
sample_input = '0 113 128 186 241 299 314 427'
sample_output = '186-128-113 186-113-128 128-186-113 128-113-186 113-186-128 113-128-186'

sample_spectrum = [int(mass) for mass in sample_input.split()]
sample_result = display_cyclopeptide_seq(cyclopeptide_sequencing(sample_spectrum))

# Compare as sorted lists of entries: the order of the peptides is not significant.
assert sorted(sample_result.split(' ')) == sorted(sample_output.split(' '))

In [66]:
# Sequence the challenge spectrum and print the matching peptides as mass strings.
challenge_input = '0 71 71 97 103 103 113 113 128 142 147 174 184 200 210 216 231 245 255 260 275 287 302 313 313 357 358 358 373 378 384 388 426 449 455 460 461 485 486 491 497 520 558 562 568 573 588 588 589 633 633 644 659 671 686 691 701 715 730 736 746 762 772 799 804 818 833 833 843 843 849 875 875 946'
challenge_spectrum = [int(mass) for mass in challenge_input.split()]
challenge_result = display_cyclopeptide_seq(cyclopeptide_sequencing(challenge_spectrum))
print(challenge_result)

71-71-103-128-147-113-97-103-113 71-71-113-103-97-113-147-128-103 71-103-128-147-113-97-103-113-71 71-113-103-97-113-147-128-103-71 97-103-113-71-71-103-128-147-113 97-113-147-128-103-71-71-113-103 103-71-71-113-103-97-113-147-128 103-97-113-147-128-103-71-71-113 103-113-71-71-103-128-147-113-97 103-128-147-113-97-103-113-71-71 113-71-71-103-128-147-113-97-103 113-97-103-113-71-71-103-128-147 113-103-97-113-147-128-103-71-71 113-147-128-103-71-71-113-103-97 128-103-71-71-113-103-97-113-147 128-147-113-97-103-113-71-71-103 147-113-97-103-113-71-71-103-128 147-128-103-71-71-113-103-97-113


In [84]:
# For each candidate linear peptide, print whether it is consistent with the spectrum.
# Note: the loop rebinds the notebook-level names `s` and `spectrum`.
for s in ['QCV', 'TCE', 'CTV', 'AQV', 'TCQ', 'VAQ']:
    spectrum = [0, 71, 99, 101, 103, 128, 129, 199, 200, 204, 227, 230, 231, 298, 303, 328, 330, 332, 333]
    print(is_consistent(s, spectrum))

False
False
True
True
True
False
