### Branch-and-Bound Algorithm for Cyclopeptide Sequencing

In [1]:
aminoacidMass = {'G':57, 'A':71, 'S':87, 'P':97, 'V':99, 'T':101, 'C':103, 'L':113, 'N':114, 'D':115, 'K':128, 'E':129, 'M':131, 'H':137, 'F':147, 'R':156, 'Y':163, 'W':186}
def linearSpectrum(peptide):
    """Compute the theoretical linear spectrum of a peptide.

    Input: peptide, a string of one-letter amino acid codes (keys of aminoacidMass).
    Output: a sorted list holding 0 plus the mass of every contiguous
            sub-peptide; repeated masses are kept.
    """
    # cumulative[k] is the combined mass of the first k residues.
    cumulative = [0]
    for residue in peptide:
        cumulative.append(cumulative[-1] + aminoacidMass[residue])
    size = len(peptide)
    # The fragment peptide[start:end] weighs cumulative[end] - cumulative[start].
    masses = [cumulative[end] - cumulative[start]
              for start in range(size)
              for end in range(start + 1, size + 1)]
    masses.append(0)  # mass of the empty fragment
    masses.sort()
    return masses

In [21]:
# Sanity check on a short peptide with a repeated residue (C=103, P=97):
# the result should list every contiguous fragment mass, duplicates included.
peptide = 'CPC'
linearSpectrum(peptide)

[0, 97, 103, 103, 200, 200, 303]

In [3]:
aminoacidMass = {'G':57, 'A':71, 'S':87, 'P':97, 'V':99, 'T':101, 'C':103, 'L':113, 'N':114, 'D':115, 'K':128, 'E':129, 'M':131, 'H':137, 'F':147, 'R':156, 'Y':163, 'W':186}
def cyclicSpectrum(peptide):
    """Compute the theoretical cyclic spectrum of a peptide.

    Input: peptide, a string of one-letter amino acid codes (keys of aminoacidMass).
    Output: a sorted list holding 0, the mass of every contiguous sub-peptide,
            and the mass of every fragment that wraps around the end of the
            string; repeated masses are kept.
    """
    size = len(peptide)
    # cumulative[k] is the combined mass of the first k residues.
    cumulative = [0]
    for residue in peptide:
        cumulative.append(cumulative[-1] + aminoacidMass[residue])
    total = cumulative[-1]
    masses = [0]
    for start in range(size):
        for end in range(start + 1, size + 1):
            fragment = cumulative[end] - cumulative[start]
            masses.append(fragment)
            # A strictly interior fragment has a complement that wraps around
            # the ring; fragments touching either end would only repeat a
            # mass that is already listed.
            if start != 0 and end != size:
                masses.append(total - fragment)
    masses.sort()
    return masses

In [4]:
peptide = 'NKEL'
# A single-argument print(...) call produces the same text on Python 2 and
# Python 3; the bare print statement is a syntax error on Python 3.
print(cyclicSpectrum(peptide))

[0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]


In [5]:
aminoacidMass = {'G':57, 'A':71, 'S':87, 'P':97, 'V':99, 'T':101, 'C':103, 'L':113, 'N':114, 'D':115, 'K':128, 'E':129, 'M':131, 'H':137, 'F':147, 'R':156, 'Y':163, 'W':186}

def expand(peptides, aa=None):
    """Extend every peptide by one residue in all possible ways.

    peptides: iterable of peptide strings.
    aa: optional iterable of one-letter residues to append. When omitted,
        the full 18-letter alphabet is used, so the original one-argument
        call keeps working; passing it makes this compatible with the
        two-argument expand(peptides, aa) used by the sequencing code.
    Returns a list with one entry per (peptide, residue) pair, grouped by
    peptide in input order.
    """
    if aa is None:
        aa = ['G', 'A', 'S', 'P', 'V', 'T', 'C', 'L', 'N', 'D', 'K', 'E', 'M', 'H', 'F', 'R', 'Y', 'W']
    return [peptide + residue for peptide in peptides for residue in aa]

In [6]:
def expand(peptides, aa):
    """Append every residue in aa to every peptide in peptides.

    Returns a new list of the extended strings, grouped by peptide in input
    order and, within a peptide, in the order of aa.
    """
    return [prefix + residue for prefix in peptides for residue in aa]

In [7]:
# A single-argument print(...) call produces the same text on Python 2 and
# Python 3; the bare print statement is a syntax error on Python 3.
print(expand(['K', 'L', 'W'], ['K', 'L', 'W']))

['KK', 'KL', 'KW', 'LK', 'LL', 'LW', 'WK', 'WL', 'WW']


In [8]:
aminoacidMass = {'G':57, 'A':71, 'S':87, 'P':97, 'V':99, 'T':101, 'C':103, 'L':113, 'N':114, 'D':115, 'K':128, 'E':129, 'M':131, 'H':137, 'F':147, 'R':156, 'Y':163, 'W':186}
def mass(peptide):
    """Return the total integer mass of peptide (0 for an empty peptide).

    Each residue is looked up in aminoacidMass, so an unknown letter raises KeyError.
    """
    return sum(aminoacidMass[residue] for residue in peptide)

In [9]:
# Expected result: V (99) + K (128) + F (147) = 374.
mass('VKF')

374

In [10]:
from collections import Counter
def isSubset(list1, list2):
    """Return True when list1 is a multiset subset of list2.

    Every value must occur in list2 at least as many times as it occurs in
    list1; an empty list1 is a subset of anything.
    """
    needed = Counter(list1)
    available = Counter(list2)
    # A Counter reports 0 for values it has never seen.
    return all(count <= available[value] for value, count in needed.items())

In [11]:
# list1 holds 5 twice but list2 only once, so the multiplicity-aware check
# should reject it.
list1 = [1,5,5,7]
list2 = [1,3,4,5,6,7]
isSubset(list1, list2)

False

In [12]:
# Every value of list3 occurs in list4 at least as often (including the
# repeated 242), so this should be accepted.
list3 = [0, 113, 114, 128, 129, 242, 242, 257, 370, 371, 484]
list4 = [0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]
isSubset(list3, list4)

True

In [56]:
# Consistency check as used during sequencing: is every fragment mass of the
# linear peptide 'ETC' present in the given spectrum?
isSubset(linearSpectrum('ETC'), [0, 71, 99, 101, 103, 128, 129, 199, 200, 204, 227, 230, 231, 298, 303, 328, 330, 332, 333])

True

In [39]:
aminoacidMass = {'G':57, 'A':71, 'S':87, 'P':97, 'V':99, 'T':101, 'C':103, 'L':113, 'N':114, 'D':115, 'K':128, 'E':129, 'M':131, 'H':137, 'F':147, 'R':156, 'Y':163, 'W':186}
def cycloPeptideSequencing(spectrum):
    """Branch-and-bound search for every cyclic peptide matching an ideal spectrum.

    spectrum: collection of integer masses in any order; it is copied into a
        sorted list so it can be compared with cyclicSpectrum() output.
    Returns: list of peptide strings whose cyclic spectrum equals the sorted
        spectrum (empty list when nothing matches or the spectrum is empty).
    Side effect: prints the candidates before and after every trimming round
        as a debugging trace.
    """
    spectrum = sorted(spectrum)
    if not spectrum:
        return []
    # The largest peak is the mass of the whole peptide; compute it once
    # instead of calling max() for every candidate.
    parentMass = spectrum[-1]
    # Only residues whose own mass is a peak can be part of the peptide.
    aa = [k for k, v in aminoacidMass.items() if v in spectrum]
    print('aminoacid %s' % (aa,))
    # A single residue can already be the whole peptide; these candidates
    # used to be expanded before they were ever tested.
    hits = [residue for residue in aa if cyclicSpectrum(residue) == spectrum]
    peptides = aa
    while peptides:
        candidates = expand(peptides, aa)
        print('peptides %s' % (candidates,))
        # Collect the survivors in a new list rather than calling remove()
        # on the list that is being scanned.
        peptides = []
        for peptide in candidates:
            if mass(peptide) >= parentMass:
                # Full-size (or too heavy) candidates are never extended.
                if cyclicSpectrum(peptide) == spectrum:
                    hits.append(peptide)
            elif isSubset(linearSpectrum(peptide), spectrum):
                peptides.append(peptide)
        print('trimmed peptides %s' % (peptides,))
    return hits

In [49]:
# Sequence a 14-peak ideal spectrum whose largest peak (the parent mass) is 416.
cycloPeptideSequencing([0, 71, 101, 113, 131, 184, 202, 214, 232, 285, 303, 315, 345, 416])

['AMTL', 'ALTM', 'MALT', 'MTLA', 'LAMT', 'LTMA', 'TMAL', 'TLAM']

In [40]:
# Small example: the only single-residue masses present are 113 (L), 128 (K)
# and 186 (W), and the parent mass is 427.
spectrum = [0, 113, 128, 186, 241, 299, 314, 427]
cycloPeptideSequencing(spectrum)

aminoacid ['K', 'L', 'W']
peptides ['KK', 'KL', 'KW', 'LK', 'LL', 'LW', 'WK', 'WL', 'WW']
trimmed peptides ['KL', 'KW', 'LK', 'LW', 'WK', 'WL']
peptides ['KLK', 'KLL', 'KLW', 'KWK', 'KWL', 'KWW', 'LKK', 'LKL', 'LKW', 'LWK', 'LWL', 'LWW', 'WKK', 'WKL', 'WKW', 'WLK', 'WLL', 'WLW']
trimmed peptides []


['KLW', 'KWL', 'LKW', 'LWK', 'WKL', 'WLK']

In [41]:
# Example with repeated peaks (97, 198, 299 and 400 each occur twice), so the
# multiplicity-aware consistency check matters here.
spectrum2 = [0, 97, 97, 99, 101, 103, 196, 198, 198, 200, 202, 295, 297, 299, 299, 301, 394, 396, 398, 400, 400, 497]
cycloPeptideSequencing(spectrum2)

aminoacid ['C', 'P', 'T', 'V']
peptides ['CC', 'CP', 'CT', 'CV', 'PC', 'PP', 'PT', 'PV', 'TC', 'TP', 'TT', 'TV', 'VC', 'VP', 'VT', 'VV']
trimmed peptides ['CP', 'CV', 'PC', 'PT', 'PV', 'TP', 'TV', 'VC', 'VP', 'VT']
peptides ['CPC', 'CPP', 'CPT', 'CPV', 'CVC', 'CVP', 'CVT', 'CVV', 'PCC', 'PCP', 'PCT', 'PCV', 'PTC', 'PTP', 'PTT', 'PTV', 'PVC', 'PVP', 'PVT', 'PVV', 'TPC', 'TPP', 'TPT', 'TPV', 'TVC', 'TVP', 'TVT', 'TVV', 'VCC', 'VCP', 'VCT', 'VCV', 'VPC', 'VPP', 'VPT', 'VPV', 'VTC', 'VTP', 'VTT', 'VTV']
trimmed peptides ['CPT', 'CPV', 'CVP', 'PCV', 'PTP', 'PTV', 'PVC', 'PVT', 'TPC', 'TPV', 'TVP', 'VCP', 'VPC', 'VPT', 'VTP']
peptides ['CPTC', 'CPTP', 'CPTT', 'CPTV', 'CPVC', 'CPVP', 'CPVT', 'CPVV', 'CVPC', 'CVPP', 'CVPT', 'CVPV', 'PCVC', 'PCVP', 'PCVT', 'PCVV', 'PTPC', 'PTPP', 'PTPT', 'PTPV', 'PTVC', 'PTVP', 'PTVT', 'PTVV', 'PVCC', 'PVCP', 'PVCT', 'PVCV', 'PVTC', 'PVTP', 'PVTT', 'PVTV', 'TPCC', 'TPCP', 'TPCT', 'TPCV', 'TPVC', 'TPVP', 'TPVT', 'TPVV', 'TVPC', 'TVPP', 'TVPT', 'TVPV', 'VCPC', 'V

['CPTPV',
 'CVPTP',
 'PCVPT',
 'PTPCV',
 'PTPVC',
 'PVCPT',
 'TPCVP',
 'TPVCP',
 'VCPTP',
 'VPTPC']

In [42]:
# Larger example with parent mass 1093. NOTE: this rebinds the notebook-level
# name `spectrum` used by other cells.
spectrum = [0, 71, 97, 99, 103, 113, 113, 114, 115, 131, 137, 196, 200, 202, 208, 214, 226, 227, 228, 240, 245, 299, 311, 311, 316, 327, 337, 339, 340, 341, 358, 408, 414, 424, 429, 436, 440, 442, 453, 455, 471, 507, 527, 537, 539, 542, 551, 554, 556, 566, 586, 622, 638, 640, 651, 653, 657, 664, 669, 679, 685, 735, 752, 753, 754, 756, 766, 777, 782, 782, 794, 848, 853, 865, 866, 867, 879, 885, 891, 893, 897, 956, 962, 978, 979, 980, 980, 990, 994, 996, 1022, 1093]
cycloPeptideSequencing(spectrum)

aminoacid ['A', 'C', 'D', 'H', 'M', 'L', 'N', 'P', 'V']
peptides ['AA', 'AC', 'AD', 'AH', 'AM', 'AL', 'AN', 'AP', 'AV', 'CA', 'CC', 'CD', 'CH', 'CM', 'CL', 'CN', 'CP', 'CV', 'DA', 'DC', 'DD', 'DH', 'DM', 'DL', 'DN', 'DP', 'DV', 'HA', 'HC', 'HD', 'HH', 'HM', 'HL', 'HN', 'HP', 'HV', 'MA', 'MC', 'MD', 'MH', 'MM', 'ML', 'MN', 'MP', 'MV', 'LA', 'LC', 'LD', 'LH', 'LM', 'LL', 'LN', 'LP', 'LV', 'NA', 'NC', 'ND', 'NH', 'NM', 'NL', 'NN', 'NP', 'NV', 'PA', 'PC', 'PD', 'PH', 'PM', 'PL', 'PN', 'PP', 'PV', 'VA', 'VC', 'VD', 'VH', 'VM', 'VL', 'VN', 'VP', 'VV']
trimmed peptides ['AH', 'AM', 'CH', 'CP', 'CV', 'DL', 'DV', 'HA', 'HC', 'MA', 'MN', 'MP', 'LD', 'LL', 'LN', 'NM', 'NL', 'PC', 'PM', 'PV', 'VC', 'VD', 'VP']
peptides ['AHA', 'AHC', 'AHD', 'AHH', 'AHM', 'AHL', 'AHN', 'AHP', 'AHV', 'AMA', 'AMC', 'AMD', 'AMH', 'AMM', 'AML', 'AMN', 'AMP', 'AMV', 'CHA', 'CHC', 'CHD', 'CHH', 'CHM', 'CHL', 'CHN', 'CHP', 'CHV', 'CPA', 'CPC', 'CPD', 'CPH', 'CPM', 'CPL', 'CPN', 'CPP', 'CPV', 'CVA', 'CVC', 'CVD', 'CVH', 'C

['AHCPVDLLNM',
 'AMNLLDVPCH',
 'CHAMNLLDVP',
 'CPVDLLNMAH',
 'DLLNMAHCPV',
 'DVPCHAMNLL',
 'HAMNLLDVPC',
 'HCPVDLLNMA',
 'MAHCPVDLLN',
 'MNLLDVPCHA',
 'LDVPCHAMNL',
 'LLDVPCHAMN',
 'LLNMAHCPVD',
 'LNMAHCPVDL',
 'NMAHCPVDLL',
 'NLLDVPCHAM',
 'PCHAMNLLDV',
 'PVDLLNMAHC',
 'VDLLNMAHCP',
 'VPCHAMNLLD']

In [44]:
# Re-run of the small K/L/W example (parent mass 427); its six hits are the
# input of the formatting cells that follow.
spectrum = [0, 113, 128, 186, 241, 299, 314, 427]
cycloPeptideSequencing(spectrum)

aminoacid ['K', 'L', 'W']
peptides ['KK', 'KL', 'KW', 'LK', 'LL', 'LW', 'WK', 'WL', 'WW']
trimmed peptides ['KL', 'KW', 'LK', 'LW', 'WK', 'WL']
peptides ['KLK', 'KLL', 'KLW', 'KWK', 'KWL', 'KWW', 'LKK', 'LKL', 'LKW', 'LWK', 'LWL', 'LWW', 'WKK', 'WKL', 'WKW', 'WLK', 'WLL', 'WLW']
trimmed peptides []


['KLW', 'KWL', 'LKW', 'LWK', 'WKL', 'WLK']

In [19]:
# Masses of the residues of 'LKW', in order. A list comprehension yields a
# real list on both Python 2 and Python 3, whereas map() is lazy on Python 3
# and would display only a map object.
[aminoacidMass[residue] for residue in 'LKW']

[113, 128, 186]

In [28]:
# Render one peptide as its residue masses joined by dashes.
'-'.join(str(aminoacidMass[residue]) for residue in 'LKW')

'113-128-186'

In [43]:
y = ['CPTPV','CVPTP','PCVPT','PTPCV','PTPVC','PVCPT','TPCVP','TPVCP','VCPTP','VPTPC']
# Each peptide becomes its dash-separated residue masses; peptides are
# separated by single spaces.
' '.join('-'.join(str(aminoacidMass[residue]) for residue in peptide) for peptide in y)

'103-97-101-97-99 103-99-97-101-97 97-103-99-97-101 97-101-97-103-99 97-101-97-99-103 97-99-103-97-101 101-97-103-99-97 101-97-99-103-97 99-103-97-101-97 99-97-101-97-103'

In [45]:
z = ['KLW', 'KWL', 'LKW', 'LWK', 'WKL', 'WLK']
# Each peptide becomes its dash-separated residue masses; peptides are
# separated by single spaces.
' '.join('-'.join(str(aminoacidMass[residue]) for residue in peptide) for peptide in z)

'128-113-186 128-186-113 113-128-186 113-186-128 186-128-113 186-113-128'

In [12]:
#Reading the data
# Defaults keep both names defined even when the file holds no data line.
numbers, spectrum = [], []
# The with-block closes the file; the bare open() handle was never closed.
with open('input/cycloseq_data.txt', 'r') as f:
    for line in f: #line is a string
        if not line.strip():
            # A blank (for example trailing) line must not replace the
            # spectrum that was already read with an empty list.
            continue
        print('line %s' % line)
        numbers = line.split() # split the string on white-space and return a list of numbers as strings
        # Build a real list: on Python 3 map() returns a one-shot iterator,
        # which cannot be printed or scanned repeatedly.
        spectrum = [int(number) for number in numbers] #convert numbers to integers
print('numbers %s' % (numbers,))
print('spectrum %s' % (spectrum,))

line 0 71 97 99 103 113 113 114 115 131 137 196 200 202 208 214 226 227 228 240 245 299 311 311 316 327 337 339 340 341 358 408 414 424 429 436 440 442 453 455 471 507 527 537 539 542 551 554 556 566 586 622 638 640 651 653 657 664 669 679 685 735 752 753 754 756 766 777 782 782 794 848 853 865 866 867 879 885 891 893 897 956 962 978 979 980 980 990 994 996 1022 1093

numbers ['0', '71', '97', '99', '103', '113', '113', '114', '115', '131', '137', '196', '200', '202', '208', '214', '226', '227', '228', '240', '245', '299', '311', '311', '316', '327', '337', '339', '340', '341', '358', '408', '414', '424', '429', '436', '440', '442', '453', '455', '471', '507', '527', '537', '539', '542', '551', '554', '556', '566', '586', '622', '638', '640', '651', '653', '657', '664', '669', '679', '685', '735', '752', '753', '754', '756', '766', '777', '782', '782', '794', '848', '853', '865', '866', '867', '879', '885', '891', '893', '897', '956', '962', '978', '979', '980', '980', '990', '994', '9

### Final Branch-and-Bound Algorithm for Cyclopeptide Sequencing

In [48]:
#spectrum = [0, 113, 128, 186, 241, 299, 314, 427]
aminoacid = ['G', 'A', 'S', 'P', 'V', 'T', 'C', 'L', 'N', 'D', 'K', 'E', 'M', 'H', 'F', 'R', 'Y', 'W']
aminoacidMass = {'G':57, 'A':71, 'S':87, 'P':97, 'V':99, 'T':101, 'C':103, 'L':113, 'N':114, 'D':115, 'K':128, 'E':129, 'M':131, 'H':137, 'F':147, 'R':156, 'Y':163, 'W':186}
def linearSpectrum(peptide):
    """Compute the theoretical linear spectrum of a peptide.

    Input: peptide, a string of one-letter amino acid codes (keys of aminoacidMass).
    Output: a sorted list holding 0 plus the mass of every contiguous
            sub-peptide; repeated masses are kept.
    """
    # runningMass[k] is the combined mass of the first k residues.
    runningMass = [0]
    for residue in peptide:
        runningMass.append(runningMass[-1] + aminoacidMass[residue])
    last = len(runningMass) - 1
    fragments = [0]  # mass of the empty fragment
    for start in range(last):
        # Subtracting two prefix sums gives the mass of peptide[start:end].
        fragments.extend(runningMass[end] - runningMass[start]
                         for end in range(start + 1, last + 1))
    fragments.sort()
    return fragments

def cyclicSpectrum(peptide):
    """Compute the theoretical cyclic spectrum of a peptide.

    Input: peptide, a string of one-letter amino acid codes (keys of aminoacidMass).
    Output: a sorted list holding 0, the mass of every contiguous sub-peptide,
            and the mass of every fragment that wraps around the end of the
            string; repeated masses are kept.
    """
    size = len(peptide)
    # cumulative[k] is the combined mass of the first k residues.
    cumulative = [0]
    for residue in peptide:
        cumulative.append(cumulative[-1] + aminoacidMass[residue])
    total = cumulative[-1]
    masses = [0]
    for start in range(size):
        for end in range(start + 1, size + 1):
            fragment = cumulative[end] - cumulative[start]
            masses.append(fragment)
            # A strictly interior fragment has a complement that wraps around
            # the ring; fragments touching either end would only repeat a
            # mass that is already listed.
            if start != 0 and end != size:
                masses.append(total - fragment)
    masses.sort()
    return masses

def expand(peptides, aa):
    """Branch step: append each residue of aa (the amino acids whose mass
    occurs in the spectrum) to each peptide.

    Returns a new list of the extended strings, grouped by peptide in input
    order and, within a peptide, in the order of aa.
    """
    return [prefix + residue for prefix in peptides for residue in aa]

def mass(peptide):
    """Return the total mass of peptide, summing each residue's entry in
    the aminoacidMass dictionary (0 for an empty peptide)."""
    return sum(aminoacidMass[residue] for residue in peptide)

from collections import Counter
def isSubset(list1, list2):
    """Return True when list1 is a multiset subset of list2, i.e. every value
    occurs in list2 at least as often as in list1. Used to check whether
    linearSpectrum(peptide) is consistent with the spectrum."""
    needed = Counter(list1)
    available = Counter(list2)
    # A Counter reports 0 for values it has never seen.
    return all(count <= available[value] for value, count in needed.items())

def cycloPeptideSequencing(spectrum):
    """Branch-and-bound search for every cyclic peptide matching an ideal spectrum.

    spectrum: collection of integer masses in any order; it is copied into a
        sorted list so it can be compared with cyclicSpectrum() output.
    Returns: list of peptide strings whose cyclic spectrum equals the sorted
        spectrum (empty list when nothing matches or the spectrum is empty).
    """
    spectrum = sorted(spectrum)
    if not spectrum:
        return []
    # The largest peak is the mass of the whole peptide; compute it once
    # instead of calling max() for every candidate.
    parentMass = spectrum[-1]
    # Only residues whose own mass is a peak can be part of the peptide.
    aa = [k for k, v in aminoacidMass.items() if v in spectrum]
    # A single residue can already be the whole peptide; these candidates
    # used to be expanded before they were ever tested.
    hits = [residue for residue in aa if cyclicSpectrum(residue) == spectrum]
    peptides = aa
    while peptides:
        # Collect the survivors in a new list rather than calling remove()
        # on the list that is being scanned.
        survivors = []
        for peptide in expand(peptides, aa):
            if mass(peptide) >= parentMass:
                # Full-size (or too heavy) candidates are never extended.
                if cyclicSpectrum(peptide) == spectrum:
                    hits.append(peptide)
            elif isSubset(linearSpectrum(peptide), spectrum):
                survivors.append(peptide)
        peptides = survivors
    return hits

#Read the data file and convert to list of numbers as integers
spectrum = []  # stays defined (and empty) when the file holds no data line
# The with-block closes the file; the bare open() handle was never closed.
with open('input/rosalind_ba4e.txt', 'r') as f:
    for line in f: #line is a string
        numbers = line.split() # split the string on white-space and return a list of numbers as strings
        if numbers:
            # Blank lines are skipped so a trailing newline cannot replace the
            # spectrum with an empty list. A real list is built because
            # map() is a one-shot iterator on Python 3.
            spectrum = [int(number) for number in numbers] #convert numbers to integers

#Calling the function
ans = cycloPeptideSequencing(spectrum)

#Format the output to right format for submission
# Each peptide becomes its dash-separated residue masses; peptides are
# separated by single spaces. print(...) with one argument behaves the same
# on Python 2 and Python 3.
print(' '.join('-'.join(str(aminoacidMass[residue]) for residue in peptide) for peptide in ans))

129-115-99-99-87-115-128-163-156 129-156-163-128-115-87-99-99-115 115-129-156-163-128-115-87-99-99 115-128-163-156-129-115-99-99-87 115-87-99-99-115-129-156-163-128 115-99-99-87-115-128-163-156-129 128-115-87-99-99-115-129-156-163 128-163-156-129-115-99-99-87-115 87-115-128-163-156-129-115-99-99 87-99-99-115-129-156-163-128-115 156-129-115-99-99-87-115-128-163 156-163-128-115-87-99-99-115-129 99-115-129-156-163-128-115-87-99 99-87-115-128-163-156-129-115-99 99-99-115-129-156-163-128-115-87 99-99-87-115-128-163-156-129-115 163-128-115-87-99-99-115-129-156 163-156-129-115-99-99-87-115-128
