# Sekvenciranje peptida

In [1]:
# Integer mass table for the amino acids, keyed by one-letter code.
# The empty string stands for the "empty" peptide and has mass 0.
# Insertion order matters: candidate peptides are generated by walking
# the keys in this order.
amino_acid_masses = {
    '': 0,
    'G': 57, 'A': 71, 'S': 87, 'P': 97, 'V': 99,
    'T': 101, 'C': 103, 'I': 113, 'L': 113, 'N': 114,
    'D': 115, 'K': 128, 'Q': 128, 'E': 129, 'M': 131,
    'H': 137, 'F': 147, 'R': 156, 'Y': 163, 'W': 186,
}

In [2]:
class PeptideSequencing:
    """Helper routines for the peptide sequencing problem.

    The instance holds a mapping from one-letter amino-acid codes to
    integer masses. The mapping may contain the empty string (mass 0);
    that key is ignored when peptides are extended.

    All spectra handled by this class are SORTED lists of masses.
    """

    def __init__(self, amino_acid_masses):
        self.amino_acid_masses = amino_acid_masses

    def _prefix_masses(self, peptide):
        """Return cumulative masses: entry k is the mass of peptide[:k]."""
        prefix_mass = [0]
        for aa in peptide:
            prefix_mass.append(prefix_mass[-1] + self.amino_acid_masses[aa])
        return prefix_mass

    def _peptide_mass(self, peptide):
        """Return the total mass of the peptide."""
        return sum(self.amino_acid_masses[aa] for aa in peptide)

    def linear_spectrum(self, peptide):
        """Generate the theoretical linear spectrum of a peptide.

        Returns the sorted list containing 0 and the mass of every
        contiguous fragment peptide[i:j].
        """
        prefix_mass = self._prefix_masses(peptide)
        n = len(peptide)

        spectrum = [0]
        for i in range(n):
            for j in range(i + 1, n + 1):
                spectrum.append(prefix_mass[j] - prefix_mass[i])

        spectrum.sort()
        return spectrum

    def cyclic_spectrum(self, peptide):
        """Generate the theoretical cyclic spectrum (cyclospectrum).

        Returns the sorted list containing 0, the mass of every
        contiguous fragment and, for fragments that touch neither end,
        the mass of the complementary wrap-around fragment.
        """
        prefix_mass = self._prefix_masses(peptide)
        peptide_mass = prefix_mass[-1]
        n = len(peptide)

        spectrum = [0]
        for i in range(n):
            for j in range(i + 1, n + 1):
                fragment_mass = prefix_mass[j] - prefix_mass[i]
                spectrum.append(fragment_mass)
                # An interior fragment has a complement that wraps around
                # the ends of the string; fragments touching an end have
                # a complement that is already enumerated as a fragment.
                if i > 0 and j < n:
                    spectrum.append(peptide_mass - fragment_mass)

        spectrum.sort()
        return spectrum

    def mass(self, peptide_spectrum):
        """Return the last value of a sorted spectrum (the total mass)."""
        return peptide_spectrum[-1]

    def expand(self, peptides):
        """Extend every peptide by each available amino acid.

        Returns a list in which, for each input peptide in iteration
        order, every non-empty key of the mass table is appended in turn.
        """
        # Computed once instead of once per peptide.
        available = [aa for aa in self.amino_acid_masses if aa != '']
        return [peptide + aa for peptide in peptides for aa in available]

    def consistent(self, peptide_spectrum, target_spectrum):
        """Check that peptide_spectrum is a sub-multiset of target_spectrum.

        Both arguments must be SORTED. An empty target is consistent only
        with an empty peptide spectrum.
        """
        m = len(target_spectrum)
        j = 0

        for value in peptide_spectrum:
            # Skip target masses smaller than the one we need to match.
            while j < m and target_spectrum[j] < value:
                j += 1
            if j == m or target_spectrum[j] != value:
                return False
            # Consume the matched target mass so it is not reused.
            j += 1

        return True

    def score(self, peptide_spectrum, target_spectrum):
        """Return the size of the multiset intersection of two spectra.

        Both arguments must be SORTED. Returns 0 when either is empty.
        """
        n = len(peptide_spectrum)
        m = len(target_spectrum)

        total_score = 0
        i = 0
        j = 0

        while i < n and j < m:
            if peptide_spectrum[i] == target_spectrum[j]:
                total_score += 1
                i += 1
                j += 1
            elif peptide_spectrum[i] < target_spectrum[j]:
                i += 1
            else:
                j += 1

        return total_score

    def cyclopeptide_sequencing(self, spectrum):
        """Find the peptides whose cyclospectrum equals the given spectrum.

        spectrum must be a sorted list; its last value is taken as the
        parent mass. Returns the list of matching peptides in the order
        they are generated (an empty list for an empty spectrum).
        """
        if len(spectrum) == 0:
            return []

        parent_mass = spectrum[-1]
        peptides = ['']
        output = []

        while peptides:
            survivors = []

            for peptide in self.expand(peptides):
                if self._peptide_mass(peptide) == parent_mass:
                    # Full-mass candidates are reported on an exact match
                    # and never extended further.
                    if self.cyclic_spectrum(peptide) == spectrum:
                        output.append(peptide)
                elif self.consistent(self.linear_spectrum(peptide), spectrum):
                    survivors.append(peptide)

            peptides = survivors

        return output

    def trim(self, leaderboard, spectrum, N):
        """Keep the N best-scoring peptides, including ties with the N-th.

        Peptides are scored by their linear spectrum against spectrum and
        returned in descending (score, peptide) order. Every peptide
        whose score is greater than or equal to the score of the N-th
        entry is kept. Returns an empty list when N <= 0.
        """
        if N <= 0:
            return []

        score_pairs = sorted(
            (
                (self.score(self.linear_spectrum(peptide), spectrum), peptide)
                for peptide in leaderboard
            ),
            reverse=True,
        )

        if len(score_pairs) <= N:
            return [peptide for _, peptide in score_pairs]

        # Compare scores with the score of the N-th entry, not with the
        # whole (score, peptide) tuple.
        cutoff = score_pairs[N - 1][0]
        return [peptide for s, peptide in score_pairs if s >= cutoff]

    def leaderboard_sequencing(self, spectrum, N):
        """Find a peptide whose cyclospectrum best matches a noisy spectrum.

        The target may contain wrong, missing or spurious values.
        spectrum must be a sorted list; its last value is taken as the
        parent mass. Returns the first peptide of parent mass that
        reaches the highest score, or None if there is none.
        """
        if len(spectrum) == 0:
            return None

        parent_mass = spectrum[-1]
        leaderboard = ['']
        leader_peptide = None
        leader_score = float('-inf')

        while leaderboard:
            candidates = []

            for peptide in self.expand(leaderboard):
                peptide_mass = self._peptide_mass(peptide)
                if peptide_mass > parent_mass:
                    # Too heavy: can never reach the parent mass again.
                    continue

                if peptide_mass == parent_mass:
                    current_score = self.score(
                        self.cyclic_spectrum(peptide), spectrum
                    )
                    if current_score > leader_score:
                        leader_peptide = peptide
                        leader_score = current_score

                candidates.append(peptide)

            leaderboard = self.trim(candidates, spectrum, N)

        return leader_peptide
                    

In [3]:
# Examples: build a sequencer over the amino-acid mass table and pick
# a sample peptide used by the spectrum demos below.
ps = PeptideSequencing(amino_acid_masses)
peptide = 'NQEL'

In [4]:
# Print the theoretical linear spectrum of the sample peptide.
print('Linear spectrum')
print(ps.linear_spectrum(peptide))

Linear spectrum
[0, 113, 114, 128, 129, 242, 242, 257, 370, 371, 484]


In [5]:
# Print the theoretical cyclic spectrum of the sample peptide.
print('Cyclic spectrum')
print(ps.cyclic_spectrum(peptide))

Cyclic spectrum
[0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]


In [6]:
# Recover every peptide whose cyclospectrum equals the target spectrum
# (the values match the cyclic spectrum printed for 'NQEL').
spectrum = [0, 113, 114, 128, 129, 227, 242, 242, 257, 355, 356, 370, 371, 484]
ps.cyclopeptide_sequencing(spectrum)
# Note: amino acids I and L have the same mass

['INKE',
 'INQE',
 'IEKN',
 'IEQN',
 'LNKE',
 'LNQE',
 'LEKN',
 'LEQN',
 'NIEK',
 'NIEQ',
 'NLEK',
 'NLEQ',
 'NKEI',
 'NKEL',
 'NQEI',
 'NQEL',
 'KNIE',
 'KNLE',
 'KEIN',
 'KELN',
 'QNIE',
 'QNLE',
 'QEIN',
 'QELN',
 'EINK',
 'EINQ',
 'ELNK',
 'ELNQ',
 'EKNI',
 'EKNL',
 'EQNI',
 'EQNL']

In [7]:
# Leaderboard sequencing against a noisy spectrum, keeping N candidates
# (plus ties) per round.
# NOTE(review): the commented row below looks like a reference spectrum
# aligned column-by-column with the noisy one, with `!!` marking a
# differing column — confirm with the author.
#              !!
#          [0, 50, 110, 113, 114, 128, 129, 129, 129, 129,      227, 355, 356, 370, 371, 484]
spectrum = [0, 70, 110,      114, 128,           129, 129, 227, 355, 356, 370, 371, 484]
N = 10
ps.leaderboard_sequencing(spectrum, N)

# Note: every cyclic permutation of the target peptide
# is a correct result

'QNLE'