<a href="https://colab.research.google.com/github/byunsy/bioinformatics-algorithms-py/blob/main/BA_4I.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Convolution Cyclopeptide Sequencing Problem

### Functions

In [1]:
def Expand(pep, freq_aminos):
    """Extend every peptide by each candidate amino-acid mass.

    Peptides are strings of masses joined with "-". The empty string is the
    empty peptide and is extended without a leading separator.

    Args:
        pep: iterable of peptide strings to extend.
        freq_aminos: iterable of amino-acid masses given as strings.

    Returns:
        A list holding one extended peptide per (peptide, mass) pair, grouped
        by the peptide being extended.
    """
    return [
        prefix + amino if prefix == "" else prefix + "-" + amino
        for prefix in pep
        for amino in freq_aminos
    ]

In [2]:
def Mass(peptides):
    """Return the total mass of a peptide written as "-"-joined integer masses.

    Args:
        peptides: a single peptide string such as "57-71-99".

    Returns:
        The sum of the integer masses in the string.
    """
    return sum(int(residue) for residue in peptides.split("-"))

In [6]:
def LinearSpectrum(peptide):
    """Compute the theoretical spectrum of a linear peptide.

    Args:
        peptide: peptide string of integer masses joined with "-".

    Returns:
        A sorted list containing 0 followed by the mass of every contiguous
        sub-peptide.
    """
    residues = peptide.split("-")
    size = len(residues)

    # running[k] is the mass of the first k residues.
    running = [0]
    for residue in residues:
        running.append(running[-1] + int(residue))

    masses = [0]
    for start in range(size):
        for end in range(start + 1, size + 1):
            masses.append(running[end] - running[start])

    masses.sort()
    return masses

In [3]:
from collections import Counter

def LinearScore(peptide, spectrum):
    """Score a peptide's linear spectrum against an experimental spectrum.

    The score is the number of masses shared between the two spectra,
    counted with multiplicity: each mass contributes the smaller of its
    two occurrence counts.

    Args:
        peptide: peptide string of integer masses joined with "-".
        spectrum: iterable of experimental masses.

    Returns:
        The size of the multiset intersection of the two spectra.
    """
    # Counter intersection keeps min(count_a, count_b) for every key, which
    # is the quantity the score sums. It also avoids a linear scan of
    # `spectrum` for every distinct theoretical mass.
    shared = Counter(LinearSpectrum(peptide)) & Counter(spectrum)
    return sum(shared.values())

In [4]:
def CyclicSpectrum(peptide):
    """Compute the theoretical spectrum of a cyclic peptide.

    Args:
        peptide: peptide string of integer masses joined with "-".

    Returns:
        A sorted list containing 0, the mass of every contiguous sub-peptide
        of the linear reading, and the mass of every sub-peptide that wraps
        around the end of the string.
    """
    residues = peptide.split("-")
    size = len(residues)

    # running[k] is the mass of the first k residues.
    running = [0]
    for residue in residues:
        running.append(running[-1] + int(residue))
    total = running[-1]

    masses = [0]
    for start in range(size):
        for end in range(start + 1, size + 1):
            segment = running[end] - running[start]
            masses.append(segment)
            # A segment touching neither end of the string has a
            # wrap-around complement made of the remaining residues.
            if start > 0 and end < size:
                masses.append(total - segment)

    masses.sort()
    return masses

In [5]:
from collections import Counter

def CyclopeptideScore(peptide, spectrum):
    """Score a peptide's cyclic spectrum against an experimental spectrum.

    The score is the number of masses shared between the two spectra,
    counted with multiplicity: each mass contributes the smaller of its
    two occurrence counts.

    Args:
        peptide: peptide string of integer masses joined with "-".
        spectrum: iterable of experimental masses.

    Returns:
        The size of the multiset intersection of the two spectra.
    """
    # Counter intersection keeps min(count_a, count_b) for every key, which
    # is the quantity the score sums. It also avoids a linear scan of
    # `spectrum` for every distinct theoretical mass.
    shared = Counter(CyclicSpectrum(peptide)) & Counter(spectrum)
    return sum(shared.values())

In [7]:
def Trim(leaderboard, spectrum, n):
    """Keep the n best-scoring peptides of the leaderboard, plus ties.

    Peptides are ranked by LinearScore against `spectrum`, highest first;
    equal scores are ordered by the peptide string, descending.

    Args:
        leaderboard: list of peptide strings.
        spectrum: experimental spectrum passed to LinearScore.
        n: number of top entries to keep before extending for ties.

    Returns:
        The ranked peptides up to position n, extended while the following
        entries score no lower than their predecessor.
    """
    ranked = sorted(
        ((LinearScore(pep, spectrum), pep) for pep in leaderboard),
        reverse=True,
    )

    # Walk past position n for as long as the score does not drop.
    keep = n
    while keep < len(ranked) and ranked[keep][0] >= ranked[keep - 1][0]:
        keep += 1

    return [pep for _, pep in ranked[:keep]]

In [8]:
from collections import Counter

def SpectralConvolution(spectrum):
    """Return the convolution of a spectrum, most frequent differences first.

    The convolution is the multiset of all positive differences between
    pairs of masses in the spectrum.

    Args:
        spectrum: iterable of masses; it does not need to be sorted.

    Returns:
        A list of every positive pairwise difference, ordered by decreasing
        multiplicity. Differences with equal multiplicity keep the order in
        which they were first produced.
    """
    ordered = sorted(spectrum)

    # The largest mass can never be the subtrahend of a positive difference,
    # so it is skipped as the "column" value.
    differences = [
        high - low
        for low in ordered[:-1]
        for high in ordered
        if high > low
    ]

    # Precompute each value's rank once. Looking the rank up with
    # list.index inside the sort key would be a linear search per element.
    rank = {
        value: position
        for position, (value, _) in enumerate(Counter(differences).most_common())
    }

    return sorted(differences, key=rank.__getitem__)


In [9]:
def MostFreqElementConv(convolution, m):
    """Pick the m most frequent convolution masses between 57 and 200, plus ties.

    Args:
        convolution: iterable of numeric mass differences.
        m: number of top masses to keep before extending for ties.

    Returns:
        A list of masses as strings, most frequent first, extended past
        position m while the following counts do not drop.
    """
    tally = Counter(str(value) for value in convolution if 57 <= value <= 200)
    ranked = tally.most_common()

    # Walk past position m for as long as the count does not drop.
    keep = m
    while keep < len(ranked) and ranked[keep][1] >= ranked[keep - 1][1]:
        keep += 1

    return [mass for mass, _ in ranked[:keep]]


In [10]:
def ConvCyclopeptideSequencing(m, n, spectrum):
    """Find a high-scoring cyclic peptide for a spectrum via its convolution.

    The candidate amino-acid masses are the m most frequent convolution
    values (with ties). A leaderboard of peptides is grown one mass at a
    time and trimmed to the n best linear scores (with ties) each round.

    Args:
        m: number of convolution masses to use as the alphabet.
        n: leaderboard size used when trimming.
        spectrum: sequence of experimental masses; it need not be sorted.

    Returns:
        The best peptide found, as a string of masses joined with "-".
        Returns the placeholder "0" when the spectrum is empty or when no
        peptide of parent mass scores strictly higher than the placeholder.
    """
    # len() rather than truthiness so array-like spectra keep working.
    if len(spectrum) == 0:
        return "0"

    freq_amino = MostFreqElementConv(SpectralConvolution(spectrum), m)

    # The parent mass is the largest mass; do not rely on input ordering.
    parent_mass = max(spectrum)

    leader_pep = "0"
    leader_score = CyclopeptideScore(leader_pep, spectrum)
    leaderboard = [""]

    while len(leaderboard) != 0:
        survivors = []
        for pep in Expand(leaderboard, freq_amino):
            mass = Mass(pep)
            if mass > parent_mass:
                # Too heavy to ever match the parent mass: drop it.
                continue
            if mass == parent_mass:
                score = CyclopeptideScore(pep, spectrum)
                if score > leader_score:
                    leader_pep, leader_score = pep, score
            survivors.append(pep)

        leaderboard = Trim(survivors, spectrum, n)

    return leader_pep

### Test Cases

In [15]:
# Create a function for test suite
def TestSuite(function, cases):
    """Run `function` on each test case and print a pass/fail report.

    Args:
        function: callable invoked as function(m, n, spectrum).
        cases: iterable with a length, of (m, n, spectrum, answer) tuples.
    """
    banner = "*"*50
    print(banner)
    print("TEST SUITE\n")

    passed = 0
    for number, (m, n, spectrum, answer) in enumerate(cases, start=1):
        result = function(m, n, spectrum)
        if result == answer:
            passed += 1
            template = "- Test Case {} Passed. Expected: {}, Actual: {}"
        else:
            template = "- Test Case {} Failed. Expected: {}, Actual: {}"
        print(template.format(number, answer, result))

    print("\n{} out of {} passed.".format(passed, len(cases)), end=" ")
    print("END OF TEST SUITE.")
    print(banner)

In [16]:
# Each test case is a tuple of (m, n, spectrum, expected peptide).
case1 = (
    20,
    60,
    [
        57, 57, 71, 99, 129, 137, 170, 186, 194, 208, 228,
        265, 285, 299, 307, 323, 356, 364, 394, 422, 493,
    ],
    "99-71-137-57-72-57",
)

cases = [case1]

# Run every case through the sequencer and print the report.
TestSuite(ConvCyclopeptideSequencing, cases)

**************************************************
TEST SUITE

- Test Case 1 Passed. Expected: 99-71-137-57-72-57, Actual: 99-71-137-57-72-57

1 out of 1 passed. END OF TEST SUITE.
**************************************************


### Stepik Coding Exercise

In [17]:
# Space-separated masses of the exercise dataset.
string = "0 97 99 101 101 113 113 115 128 129 137 147 147 156 186 200 202 230 236 241 242 244 253 260 262 265 269 301 301 331 333 337 343 364 366 375 378 389 397 400 430 430 438 444 448 448 465 477 490 494 504 513 534 543 545 561 566 567 578 586 591 595 605 631 633 641 679 680 690 690 692 695 699 701 706 708 730 734 778 791 805 805 808 808 814 827 827 831 835 837 848 877 892 921 932 934 938 942 942 955 961 961 964 964 978 991 1035 1039 1061 1063 1068 1070 1074 1077 1079 1079 1089 1090 1128 1136 1138 1164 1174 1178 1183 1191 1202 1203 1208 1224 1226 1235 1256 1265 1275 1279 1292 1304 1321 1321 1325 1331 1339 1339 1369 1372 1380 1391 1394 1403 1405 1426 1432 1436 1438 1468 1468 1500 1504 1507 1509 1516 1525 1527 1528 1533 1539 1567 1569 1583 1613 1622 1622 1632 1640 1641 1654 1656 1656 1668 1668 1670 1672 1769"

# Re-join the masses with ", " so the output can be pasted as a list literal.
ret = ", ".join(string.split(" "))
print(ret)

0, 97, 99, 101, 101, 113, 113, 115, 128, 129, 137, 147, 147, 156, 186, 200, 202, 230, 236, 241, 242, 244, 253, 260, 262, 265, 269, 301, 301, 331, 333, 337, 343, 364, 366, 375, 378, 389, 397, 400, 430, 430, 438, 444, 448, 448, 465, 477, 490, 494, 504, 513, 534, 543, 545, 561, 566, 567, 578, 586, 591, 595, 605, 631, 633, 641, 679, 680, 690, 690, 692, 695, 699, 701, 706, 708, 730, 734, 778, 791, 805, 805, 808, 808, 814, 827, 827, 831, 835, 837, 848, 877, 892, 921, 932, 934, 938, 942, 942, 955, 961, 961, 964, 964, 978, 991, 1035, 1039, 1061, 1063, 1068, 1070, 1074, 1077, 1079, 1079, 1089, 1090, 1128, 1136, 1138, 1164, 1174, 1178, 1183, 1191, 1202, 1203, 1208, 1224, 1226, 1235, 1256, 1265, 1275, 1279, 1292, 1304, 1321, 1321, 1325, 1331, 1339, 1339, 1369, 1372, 1380, 1391, 1394, 1403, 1405, 1426, 1432, 1436, 1438, 1468, 1468, 1500, 1504, 1507, 1509, 1516, 1525, 1527, 1528, 1533, 1539, 1567, 1569, 1583, 1613, 1622, 1622, 1632, 1640, 1641, 1654, 1656, 1656, 1668, 1668, 1670, 1672, 1769


In [18]:
# Experimental spectrum for the exercise, as a list of integer masses in
# ascending order.
spectrum = [0, 97, 99, 101, 101, 113, 113, 115, 128, 129, 137, 147, 147, 156, 186, 200, 202, 230, 236, 241, 242, 244, 253, 260, 262, 265, 269, 301, 301, 331, 333, 337, 343, 364, 366, 375, 378, 389, 397, 400, 430, 430, 438, 444, 448, 448, 465, 477, 490, 494, 504, 513, 534, 543, 545, 561, 566, 567, 578, 586, 591, 595, 605, 631, 633, 641, 679, 680, 690, 690, 692, 695, 699, 701, 706, 708, 730, 734, 778, 791, 805, 805, 808, 808, 814, 827, 827, 831, 835, 837, 848, 877, 892, 921, 932, 934, 938, 942, 942, 955, 961, 961, 964, 964, 978, 991, 1035, 1039, 1061, 1063, 1068, 1070, 1074, 1077, 1079, 1079, 1089, 1090, 1128, 1136, 1138, 1164, 1174, 1178, 1183, 1191, 1202, 1203, 1208, 1224, 1226, 1235, 1256, 1265, 1275, 1279, 1292, 1304, 1321, 1321, 1325, 1331, 1339, 1339, 1369, 1372, 1380, 1391, 1394, 1403, 1405, 1426, 1432, 1436, 1438, 1468, 1468, 1500, 1504, 1507, 1509, 1516, 1525, 1527, 1528, 1533, 1539, 1567, 1569, 1583, 1613, 1622, 1622, 1632, 1640, 1641, 1654, 1656, 1656, 1668, 1668, 1670, 1672, 1769]

# Sequence the spectrum with m=18 and n=346 and print the resulting peptide.
print(ConvCyclopeptideSequencing(18, 346, spectrum))

99-137-128-113-156-97-147-186-115-147-113-129-101-101
