In [103]:
# Clear the interactive namespace; IPython asks for confirmation before deleting.
%reset

Once deleted, variables cannot be recovered. Proceed (y/[n])? y


In [129]:

def aa_masses(filename):
    """Load a mass table from a whitespace-delimited text file.

    Every non-blank line must hold at least two fields: a key followed by an
    integer mass (for example ``G 57``). Fields may be separated by any run
    of whitespace (spaces or tabs); blank lines are skipped.

    Args:
        filename: Path of the mass-table file to read.

    Returns:
        dict mapping the first field of each line to the second field
        converted to int. If a key appears more than once, the last
        occurrence wins.

    Raises:
        IndexError: if a non-blank line has fewer than two fields.
        ValueError: if the second field is not an integer literal.
    """
    mass_table = {}
    with open(filename, "r") as myfile:
        for line in myfile:
            fields = line.split()
            if not fields:
                # Blank or whitespace-only line (e.g. a trailing newline).
                continue
            mass_table[fields[0]] = int(fields[1])
    return mass_table


def score_peptides(experimental_spectrum, theoretical_spectrum):
    """Count the masses two spectra have in common, respecting multiplicity.

    A mass that occurs ``a`` times in the experimental spectrum and ``b``
    times in the theoretical spectrum contributes ``min(a, b)`` to the score.

    Neither argument is modified. (An earlier version removed matched
    entries from ``theoretical_spectrum`` in place, which silently corrupted
    any list the caller wanted to reuse.)

    Args:
        experimental_spectrum: Iterable of hashable masses.
        theoretical_spectrum: Iterable of hashable masses.

    Returns:
        int: size of the multiset intersection of the two spectra.
    """
    # Local import keeps this cell self-contained in the notebook.
    from collections import Counter

    # Counter "&" keeps, per mass, the minimum of the two counts.
    shared = Counter(experimental_spectrum) & Counter(theoretical_spectrum)
    return sum(shared.values())


def spectrum(peptides):
    """Return the sorted list of fragment totals, together with one 0 entry.

    Args:
        peptides: Iterable of fragments, each an iterable of numbers.

    Returns:
        list: ``sum(fragment)`` for every fragment plus a single ``0``,
        in ascending order.
    """
    totals = [sum(fragment) for fragment in peptides]
    totals.append(0)
    totals.sort()
    return totals


def linear_subpeptides(pep_seq):
    """List every contiguous slice of ``pep_seq``, shortest slices first.

    Slices of length 1 .. len-1 are produced in order of increasing length
    and, within a length, increasing start position. The full sequence
    itself (the same object, not a copy) is appended last, so an empty
    input yields a list containing just that empty input.

    Args:
        pep_seq: A sliceable sequence (e.g. a list or a string).

    Returns:
        list of slices of ``pep_seq``.
    """
    total = len(pep_seq)
    pieces = [
        pep_seq[begin:begin + width]
        for width in range(1, total)
        for begin in range(total - width + 1)
    ]
    pieces.append(pep_seq)
    return pieces


def cyclic_subpeptides(pep_seq):
    """List every slice of ``pep_seq`` when it is read as a circular sequence.

    For each length 1 .. len-1 and each of the len start positions, the slice
    is taken from the sequence concatenated with itself, so slices may wrap
    around the end. The full sequence itself (the same object, not a copy)
    is appended last.

    Args:
        pep_seq: A sequence supporting ``+`` and slicing (e.g. list, string).

    Returns:
        list of slices, ordered by length and then by start position.
    """
    total = len(pep_seq)
    # Doubling the sequence lets a plain slice express wrap-around.
    doubled = pep_seq + pep_seq
    pieces = [
        doubled[begin:begin + width]
        for width in range(1, total)
        for begin in range(total)
    ]
    pieces.append(pep_seq)
    return pieces


def aa2masses(peptide, masses):
    """Translate each element of ``peptide`` through the ``masses`` lookup.

    Args:
        peptide: Iterable of keys (e.g. a string of residue letters).
        masses: Mapping from key to mass.

    Returns:
        list of ``masses[key]`` for every key, in input order.

    Raises:
        KeyError: if an element of ``peptide`` is not in ``masses``.
    """
    return [masses[residue] for residue in peptide]


def trim(peptide_leaderboard, exp_spectrum, trim_length):
    """Keep the top-scoring ``trim_length`` peptides, plus any ties at the cutoff.

    Each peptide is scored by comparing ``exp_spectrum`` against the spectrum
    of the peptide's *linear* subpeptides.

    Args:
        peptide_leaderboard: List of candidate peptides.
        exp_spectrum: Experimental spectrum to score against.
        trim_length: Number of top positions to keep before ties are added.

    Returns:
        The original ``peptide_leaderboard`` object, unchanged, when it holds
        no more than ``trim_length`` peptides. Otherwise a new list of the
        peptides whose score is at least the ``trim_length``-th best score,
        ordered by descending (score, peptide).
    """
    # Candidates are scored on their linear (not cyclic) spectrum.
    scores = [
        score_peptides(exp_spectrum, spectrum(linear_subpeptides(candidate)))
        for candidate in peptide_leaderboard
    ]

    if len(scores) <= trim_length:
        return peptide_leaderboard

    # Score sitting at the last position we are required to keep.
    cutoff = sorted(scores, reverse=True)[trim_length - 1]
    num_keep = sum(1 for value in scores if value >= cutoff)

    # Pairs sort by score first; equal scores fall back to comparing peptides.
    ranked = sorted(zip(scores, peptide_leaderboard), reverse=True)
    return [candidate for _, candidate in ranked[:num_keep]]


In [124]:
def cyclic_leaderboard_sequence(exp_spec, trim_length, mass_set):
    """Search for the cyclic peptide whose spectrum best matches ``exp_spec``.

    Candidates are grown one mass at a time. After each round, candidates
    heavier than the parent mass (the largest value in ``exp_spec``) are
    dropped and the rest are cut down with ``trim``. A candidate whose total
    equals the parent mass is scored on its cyclic spectrum and becomes the
    leader only if it scores strictly higher than the current leader, so the
    first candidate to reach a given score is the one kept.

    Args:
        exp_spec: Experimental spectrum (sequence of masses).
        trim_length: Leaderboard size passed to ``trim`` each round.
        mass_set: Re-iterable collection of masses used to extend candidates.
            The search only terminates if every mass is positive, because
            candidates are discarded solely when their total exceeds the
            parent mass.

    Returns:
        list: the masses of the best-scoring peptide, or ``[]`` if no
        candidate beat the empty peptide or ``exp_spec`` is empty.
    """
    if len(exp_spec) == 0:
        # No spectrum means no parent mass; nothing to search for.
        return []

    parent_mass = max(exp_spec)
    peptides = [[]]
    leader_peptide = []
    # Cache the leader's score; it only changes when the leader changes, so
    # there is no need to rebuild its spectrum for every candidate.
    leader_score = score_peptides(
        exp_spec, spectrum(cyclic_subpeptides(leader_peptide))
    )

    while peptides:
        filtered_peptides = []

        # Extend every surviving peptide by every available mass.
        for pep in peptides:
            for mass in mass_set:
                new_pep = pep + [mass]
                total = sum(new_pep)

                if total > parent_mass:
                    # Too heavy to ever match the parent mass.
                    continue

                if total == parent_mass:
                    new_score = score_peptides(
                        exp_spec, spectrum(cyclic_subpeptides(new_pep))
                    )
                    if new_score > leader_score:
                        leader_peptide = new_pep
                        leader_score = new_score

                filtered_peptides.append(new_pep)

        # Keep only the top N peptides (and ties) for the next round.
        peptides = trim(filtered_peptides, exp_spec, trim_length)

    return leader_peptide

In [132]:
# sample dataset

# Build the sorted list of distinct masses found in the mass table.
masses = aa_masses("integer-mass-table.txt")
mass18 = sorted(set(masses.values()))

example_spec = [0, 71, 113, 129, 147, 200, 218, 260, 313, 331, 347, 389, 460]
N = 5  # leaderboard size passed to the search

result = cyclic_leaderboard_sequence(example_spec, N, mass18)
# Report the winning peptide as dash-separated masses.
print("-".join(map(str, result)))

147-71-129-113


In [135]:
# Build the sorted list of distinct masses found in the mass table.
masses = aa_masses("integer-mass-table.txt")
mass18 = sorted(set(masses.values()))

# "with" guarantees the file is closed even if reading raises.
with open("dataset_102_8.txt", "r") as f:
    content = f.readlines()

N = 200
# The spectrum is read from the second line of the dataset file; the first
# line is not used here. split() with no argument tolerates repeated or
# trailing spaces and the line's newline.
example_spec = [int(i) for i in content[1].split()]

result = cyclic_leaderboard_sequence(example_spec, N, mass18)

# Write the winning peptide as dash-separated masses.
with open("leaderboard-solution.txt", "w") as f:
    f.write("-".join([str(i) for i in result]))