In [1]:
from collections import defaultdict

import pandas as pd

In [2]:
file_path = "data/mq_variants_intensity_cleaned.csv"
df = pd.read_csv(file_path)

protein_column_index = 6
peptide_column_index = 0

protein_to_peptides = defaultdict(set)

# Store uniquely-mapped proteins and their corresponding peptides
for _, row in df.iterrows():
    protein_entry = row.iloc[protein_column_index]
    peptide = row.iloc[peptide_column_index]

    if pd.notna(protein_entry) and pd.notna(peptide):
        proteins = protein_entry.split(";")
        if len(proteins) == 1:
            protein = proteins[0]
            if "|" in protein:
                parts = protein.split("|")
                if len(parts) > 2:
                    protein_key = parts[1]
                    protein_to_peptides[protein_key].add(peptide)

print(len(protein_to_peptides))

1574


In [3]:
for protein, peptides in list(protein_to_peptides.items())[:2]:
    print(f"{protein}: {list(peptides)[:5]}")

P28482: ['.N+72.02YLLSLPHK.', '.LFPNADSK+183.03ALDLLDK.', '.PNADSKALDLLDKMLTFNPHK.', '.HYLDQLNHILGILGSPSQEDLN-57.041C+57.021IINLK.', '.A+245.116RNYLLSLPHK.']
P35232: ['.AAELIANSLATAGDGLIELRKLEAAEDIAYQLSR.', '.LEAAEDIAY+183.034QLSR.', '.KLEAAEDIAY+183.021QLSR.', '.IELR-39.022KLEAAEDIAYQLSR.', '.KLEAAEDIAY+183.027QLSR.']


In [4]:
for protein, peptides in protein_to_peptides.items():
    print(f"Protein: {protein}, Number of Peptides: {len(peptides)}")

Protein: P28482, Number of Peptides: 42
Protein: P35232, Number of Peptides: 14
Protein: P41240, Number of Peptides: 118
Protein: P14174, Number of Peptides: 33
Protein: P19338, Number of Peptides: 35
Protein: Q8TD19, Number of Peptides: 237
Protein: Q00535, Number of Peptides: 11
Protein: P21796, Number of Peptides: 30
Protein: Q9H479, Number of Peptides: 56
Protein: Q9H773, Number of Peptides: 54
Protein: P10809, Number of Peptides: 60
Protein: P06576, Number of Peptides: 9
Protein: Q9Y478, Number of Peptides: 27
Protein: Q16832, Number of Peptides: 31
Protein: P38646, Number of Peptides: 31
Protein: P11021, Number of Peptides: 69
Protein: P04264, Number of Peptides: 132
Protein: O96013, Number of Peptides: 31
Protein: P06241, Number of Peptides: 11
Protein: P41743, Number of Peptides: 24
Protein: O43353, Number of Peptides: 16
Protein: P13639, Number of Peptides: 70
Protein: Q9Y4K4, Number of Peptides: 50
Protein: P05141, Number of Peptides: 10
Protein: P17252, Number of Peptides: 2

In [5]:
df_output = pd.DataFrame(
    [
        {
            "Protein": protein,
            "Peptides": ";".join(peptides),
            "Peptide_Count": len(peptides),
        }
        for protein, peptides in protein_to_peptides.items()
    ]
)
df_output.sort_values(by="Peptide_Count", ascending=False, inplace=True)
df_output.to_csv("data/protein_to_peptides.csv", index=False)

In [6]:
df_filtered = df_output[df_output["Peptide_Count"] >= 10]
df_filtered.to_csv("data/protein_with_10+_peptides.csv", index=False)