In [2]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import pandas as pd

# List of sequences as SeqRecord objects
sequences = [
    SeqRecord(Seq("MKWVTFISLLFLFSSAYS"), id="Protein1", description="Description of Protein1"),
    SeqRecord(Seq("LLLKAGAGAGAWWDAGVVF"), id="Protein2", description="Description of Protein2"),
    SeqRecord(Seq("VVVVKKKRRRNNNDDD"), id="Protein3", description="Description of Protein3"),
]

# Write to FASTA file
with open("proteins.fasta", "w") as output_file:
    SeqIO.write(sequences, output_file, "fasta")


In [3]:
import csv

# Load the CSV file
file_path = "proteins.csv"  # Update with the path to your file
output_fasta = "proteins.fasta"

# Read CSV and write to FASTA
with open(file_path, "r") as csv_file, open(output_fasta, "w") as fasta_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        protein_id = row['Id']  # Column name for protein ID
        sequence = row['Sequence']    # Column name for sequence
        fasta_file.write(f">{protein_id}\n")  # Write header
        fasta_file.write(f"{sequence}\n")    # Write sequence


Validating Fasta Sequence


In [3]:
from Bio import SeqIO

def validate_fasta(file_path):
    try:
        # Try parsing the FASTA file
        for record in SeqIO.parse(file_path, "fasta"):
            if not record.seq:  # Check for empty sequences
                print(f"Warning: Sequence for {record.id} is empty.")
            else:
                print(f"Sequence {record.id} is valid.")
    except Exception as e:
        print(f"Error: {e}")

# Example usage
file_path = "ENDO_MASTER_LCMS_PLASMA.fasta"
validate_fasta(file_path)


Sequence tr|A0A481SHK9|A0A481SHK9_HUMAN is valid.
Sequence tr|B3VL17|B3VL17_HUMAN is valid.
Sequence tr|Q9H1I6|Q9H1I6_HUMAN is valid.
Sequence tr|E9PFT6|E9PFT6_HUMAN is valid.
Sequence tr|B2RBS8|B2RBS8_HUMAN is valid.
Sequence tr|A0A024R1Z6|Release is valid.
Sequence tr|G3V1N2|G3V1N2_HUMAN is valid.
Sequence tr|D9ZGF2|D9ZGF2_HUMAN is valid.
Sequence tr|A0A075B6Z2|A0A075B6Z2_HUMAN is valid.
Sequence tr|Q9HAR8|Q9HAR8_HUMAN is valid.
Sequence sp|P0DOX5|IGG1_HUMAN is valid.
Sequence tr|B0YJC4|B0YJC4_HUMAN is valid.
Sequence sp|P0C0S8|H2A1_HUMAN is valid.
Sequence tr|A0A385HVZ2|A0A385HVZ2_HUMAN is valid.
Sequence tr|A0A024QZJ4|A0A024QZJ4_HUMAN is valid.
Sequence tr|B4DI63|B4DI63_HUMAN is valid.
Sequence tr|H0YCU9|H0YCU9_HUMAN is valid.
Sequence tr|H6VRF8|H6VRF8_HUMAN is valid.
Sequence tr|B4DPP6|B4DPP6_HUMAN is valid.
Sequence tr|D2JYH4|D2JYH4_HUMAN is valid.
Sequence tr|C8C504|C8C504_HUMAN is valid.
Sequence tr|A0A481SHK9|A0A481SHK9_HUMAN is valid.
Sequence tr|B3VL17|B3VL17_HUMAN is valid.

Trypsin digestion | Output a CSV file

In [17]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "6503_trypsin.fasta"  # Replace with your FASTA file name
output_csv = "digested_peptides_6503_trypsin.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(2)

# Perform digestion with length constraints (7-40 residues)
min_length = 7
max_length = 40
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Write peptides to a CSV file
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Peptide Sequence"])
    for peptide in peptides:
        writer.writerow([peptide])

print(f"Digestion complete. {len(peptides)} peptides written to {output_csv}")



Digestion complete. 10199 peptides written to digested_peptides_6503_trypsin.csv


Lys-C Digestion

In [13]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "6503_lysC.fasta"  # Replace with your FASTA file name
output_csv = "digested_peptides_6503_lysC.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme to Lys-C
digestion.setEnzyme("Lys-C")

# Set the number of missed cleavages
digestion.setMissedCleavages(2)

# Perform digestion with length constraints (7-40 residues)
min_length = 7
max_length = 40
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Write peptides to a CSV file
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Peptide Sequence"])
    for peptide in peptides:
        writer.writerow([peptide])

print(f"Lys-C digestion complete. {len(peptides)} peptides written to {output_csv}")


Lys-C digestion complete. 1093 peptides written to digested_peptides_6503_lysC.csv


Digested peptides to a structured csv


In [30]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "insulin.fasta"  # Replace with your FASTA file name
output_csv = "digested_peptides_insulin_3.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(0)

# Perform digestion with length constraints (7-40 residues)
min_length = 0
max_length = 400
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Write peptides to a CSV file
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["id", "sequence"])
    for idx, peptide in enumerate(peptides, 1):
        writer.writerow([idx, peptide])

print(f"Digestion complete. {len(peptides)} peptides written to {output_csv}")


Digestion complete. 2 peptides written to digested_peptides_insulin_3.csv


Generate Molecular Formulas

In [32]:
import pandas as pd
import pyopenms as oms

# Load the CSV file
input_file = "md_bsa.csv"  # Replace with your file name
output_file = "mol_bsa.csv"
data = pd.read_csv(input_file)

# Function to compute molecular formula
def get_molecular_formula(sequence):
    try:
        
        seq_obj = oms.AASequence.fromString(sequence)
        return str(seq_obj.getFormula())
    except Exception as e:
        return f"Error: {e}"

# Apply the function to the Sequence column
data["MolecularFormula"] = data["sequence"].apply(get_molecular_formula)

# Save the results to a new CSV
data.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")


Results saved to mol_bsa.csv


Added Oxidated Methionine modification


In [33]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "insulin.fasta"  # Replace with your FASTA file name
output_csv = "md_insulin.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(0)

# Perform digestion with length constraints (7-40 residues)
min_length = 0
max_length = 400
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Define the oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Apply the variable modifications to the digested peptides
modified_peptides = []
for peptide in result:
    oms.ModifiedPeptideGenerator.applyVariableModifications(variable_modifications, peptide, 1, modified_peptides, False)

# Combine original and modified peptides
all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

# Write peptides to a CSV file
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["id", "sequence"])
    for idx, peptide in enumerate(all_peptides, 1):
        writer.writerow([idx, peptide])

print(f"Digestion complete. {len(all_peptides)} peptides written to {output_csv}")


Digestion complete. 2 peptides written to md_insulin.csv


Modified column Names | outputs Csv with Name and Molecular Formulae

In [40]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "6504_lysC.fasta"  # Replace with your FASTA file name
output_csv = "6504_lysC_formulas.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(1)

# Perform digestion with length constraints (7-40 residues)
min_length = 7
max_length = 40
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Define the oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Apply the variable modifications to the digested peptides
modified_peptides = []
for peptide in result:
    oms.ModifiedPeptideGenerator.applyVariableModifications(variable_modifications, peptide, 1, modified_peptides, False)

# Combine original and modified peptides
all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

# Function to compute molecular formula
def get_molecular_formula(sequence):
    try:
        # Convert the sequence to an AASequence object
        seq_obj = oms.AASequence.fromString(sequence)
        # Get the molecular formula
        return str(seq_obj.getFormula())
    except Exception as e:
        return f"Error: {e}"

# Write peptides to a CSV file with Name and Molecular Formula columns
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Name", "Molecular Formula"])
    for peptide in all_peptides:
        name = f"{fasta_file.split('.')[0]}_{peptide}"
        molecular_formula = get_molecular_formula(peptide)
        writer.writerow([name, molecular_formula])

print(f"Digestion complete. {len(all_peptides)} peptides written to {output_csv}")


Digestion complete. 2338 peptides written to 6504_lysC_formulas.csv


Removed peptides with unknown amino acids X

In [1]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "ENDO_MASTER_LCMS_PLASMA.fasta"  # Replace with your FASTA file name
output_csv = "peptides_with_formulas_ENDO_MASTER_LCMS_PLASMA.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(1)

# Perform digestion with length constraints (7-40 residues)
min_length = 6
max_length = 40
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Define the oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Apply the variable modifications to the digested peptides
modified_peptides = []
for peptide in result:
    oms.ModifiedPeptideGenerator.applyVariableModifications(variable_modifications, peptide, 1, modified_peptides, False)

# Combine original and modified peptides
all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

# Remove sequences with 'X' amino acid
filtered_peptides = [peptide for peptide in all_peptides if 'X' not in peptide]

# Function to compute molecular formula
def get_molecular_formula(sequence):
    try:
        # Convert the sequence to an AASequence object
        seq_obj = oms.AASequence.fromString(sequence)
        # Get the molecular formula
        return str(seq_obj.getFormula())
    except Exception as e:
        return f"Error: {e}"

# Write peptides to a CSV file with Name and Molecular Formula columns
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Name", "Molecular Formula"])
    for peptide in filtered_peptides:
        name = f"{fasta_file.split('.')[0]}_{peptide}"
        molecular_formula = get_molecular_formula(peptide)
        writer.writerow([name, molecular_formula])

print(f"Digestion complete. {len(filtered_peptides)} peptides written to {output_csv}")


Digestion complete. 32592 peptides written to peptides_with_formulas_ENDO_MASTER_LCMS_PLASMA.csv


Read files seperately and output to one file

In [46]:
import pyopenms as oms
import csv

# List of FASTA files to process
fasta_files = ["6503_lysC.fasta", "6503_trypsin.fasta", "6504_lysC.fasta", "6504_trypsin.fasta", "6507_lysC.fasta", "6507_trypsin.fasta"]
output_csv = "peptides_with_formulas_seperate.csv"

# Function to read the protein sequence from a FASTA file
def read_protein_sequence(fasta_file):
    with open(fasta_file, "r") as f:
        lines = f.readlines()
        # Concatenate all non-header lines
        return "".join(line.strip() for line in lines if not line.startswith(">"))

# Function to compute molecular formula for a sequence
def get_molecular_formula(sequence):
    try:
        # Convert the sequence to an AASequence object
        seq_obj = oms.AASequence.fromString(sequence)
        # Get the molecular formula
        return str(seq_obj.getFormula())
    except Exception as e:
        return f"Error: {e}"

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()
digestion.setMissedCleavages(1)  # Allow one missed cleavage

# Define digestion parameters
min_length = 6
max_length = 40

# Define oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Open the CSV file for writing
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    # Write header
    writer.writerow(["Name", "Molecular Formula"])

    # Process each FASTA file
    for fasta_file in fasta_files:
        # Read protein sequence
        protein_sequence = read_protein_sequence(fasta_file)
        protein = oms.AASequence.fromString(protein_sequence)

        # Perform digestion
        result = []
        digestion.digest(protein, result, min_length, max_length)

        # Prepare peptides
        peptides = [peptide.toString() for peptide in result]

        # Apply variable modifications
        modified_peptides = []
        for peptide in result:
            oms.ModifiedPeptideGenerator.applyVariableModifications(
                variable_modifications, peptide, 1, modified_peptides, False
            )

        # Combine original and modified peptides
        all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

        # Filter peptides (remove those containing 'X')
        filtered_peptides = [peptide for peptide in all_peptides if 'X' not in peptide]

        # Write peptides and formulas to the CSV
        for peptide in filtered_peptides:
            name = f"{fasta_file.split('.')[0]}_{peptide}"
            molecular_formula = get_molecular_formula(peptide)
            writer.writerow([name, molecular_formula])

print(f"Digestion complete. Results written to {output_csv}.")


Digestion complete. Results written to peptides_with_formulas_seperate.csv.


Getting Protein Name of the peptide from fasta header to the name

In [5]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "ENDO_MASTER_LCMS_PLASMA.fasta"  # Replace with your FASTA file name
output_csv = "Peptides_2MC_ENDO_MASTER_LCMS_PLASMA.csv"

# Parse the FASTA file to extract protein names and sequences
def parse_fasta(file_path):
    proteins = []
    with open(file_path, "r") as f:
        lines = f.readlines()
        protein_name = ""
        protein_sequence = ""
        for line in lines:
            line = line.strip()
            if line.startswith(">"):
                if protein_name and protein_sequence:
                    proteins.append((protein_name, protein_sequence))
                protein_name = line.lstrip(">")
                protein_sequence = ""
            else:
                protein_sequence += line
        if protein_name and protein_sequence:
            proteins.append((protein_name, protein_sequence))
    return proteins

# Load all proteins from the FASTA file
proteins = parse_fasta(fasta_file)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()
digestion.setMissedCleavages(2)  # Set the number of missed cleavages
min_length = 6
max_length = 40

# Define the oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Prepare to write to the CSV
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Name", "Molecular Formula"])

    # Process each protein
    for protein_name, protein_sequence in proteins:
        protein = oms.AASequence.fromString(protein_sequence)
        result = []
        digestion.digest(protein, result, min_length, max_length)

        # Get peptides and apply modifications
        peptides = [peptide.toString() for peptide in result]
        modified_peptides = []
        for peptide in result:
            oms.ModifiedPeptideGenerator.applyVariableModifications(variable_modifications, peptide, 1, modified_peptides, False)

        # Combine original and modified peptides
        all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

        # Remove sequences with 'X' amino acid
        filtered_peptides = [peptide for peptide in all_peptides if 'X' not in peptide]

        # Compute molecular formulas and write to CSV
        for peptide in filtered_peptides:
            name = f"{protein_name}@{peptide}"  # Include the protein name
            try:
                seq_obj = oms.AASequence.fromString(peptide)
                molecular_formula = str(seq_obj.getFormula())
            except Exception as e:
                molecular_formula = f"Error: {e}"
            writer.writerow([name, molecular_formula])

print(f"Digestion complete. Peptides written to {output_csv}")


Digestion complete. Peptides written to Peptides_2MC_ENDO_MASTER_LCMS_PLASMA.csv


Code for outputing data in the template format

In [6]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "mcherry.fasta"  # Replace with your FASTA file name
output_csv = "mcherry.csv"

# Parse the FASTA file to extract protein names and sequences
def parse_fasta(file_path):
    proteins = []
    with open(file_path, "r") as f:
        lines = f.readlines()
        protein_name = ""
        protein_sequence = ""
        for line in lines:
            line = line.strip()
            if line.startswith(">"):
                if protein_name and protein_sequence:
                    proteins.append((protein_name, protein_sequence))
                protein_name = line.lstrip(">")
                protein_sequence = ""
            else:
                protein_sequence += line
        if protein_name and protein_sequence:
            proteins.append((protein_name, protein_sequence))
    return proteins

# Load all proteins from the FASTA file
proteins = parse_fasta(fasta_file)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()
digestion.setMissedCleavages(1)  # Set the number of missed cleavages
min_length = 6
max_length = 40

# Define the oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Prepare to write to the CSV
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    # Define output headers
    headers = ["RT", "Formula", "Name", "CCS [M+H]+", "CCS [M+Na]+", "CCS [M-H]-",
               "KEGG", "CAS", "PubChem", "ChemSpider", "HMDB", "BioCyc",
               "Metlin", "LipidMaps", "UserID", "InChI"]
    writer.writerow(headers)

    # Process each protein
    for protein_name, protein_sequence in proteins:
        protein = oms.AASequence.fromString(protein_sequence)
        result = []
        digestion.digest(protein, result, min_length, max_length)

        # Get peptides and apply modifications
        peptides = [peptide.toString() for peptide in result]
        modified_peptides = []
        for peptide in result:
            oms.ModifiedPeptideGenerator.applyVariableModifications(variable_modifications, peptide, 1, modified_peptides, False)

        # Combine original and modified peptides
        all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

        # Remove sequences with 'X' amino acid
        filtered_peptides = [peptide for peptide in all_peptides if 'X' not in peptide]

        # Compute molecular formulas and write to CSV
        for peptide in filtered_peptides:
            name = f"{protein_name}@{peptide}"  # Include the protein name
            try:
                seq_obj = oms.AASequence.fromString(peptide)
                molecular_formula = str(seq_obj.getFormula())
                # Placeholder values for other fields
                retention_time = ""
                ccs_mh_plus = ""
                ccs_mna_plus = ""
                ccs_mh_minus = ""
                kegg = ""
                cas = ""
                pubchem = ""
                chemspider = ""
                hmdb = ""
                biocyc = ""
                metlin = ""
                lipidmaps = ""
                user_id = ""
                inchi = ""
            except Exception as e:
                molecular_formula = f"Error: {e}"

            # Write the row
            writer.writerow([retention_time, molecular_formula, name, ccs_mh_plus,
                             ccs_mna_plus, ccs_mh_minus, kegg, cas, pubchem, chemspider,
                             hmdb, biocyc, metlin, lipidmaps, user_id, inchi])

print(f"Digestion complete. Peptides written to {output_csv}")


Digestion complete. Peptides written to mcherry.csv


In [6]:
import pandas as pd

# Path to your CSV file
file_path = "human_pro_peptides_detailed.csv"  # Replace with your actual CSV file path

# Load the CSV file
df = pd.read_csv(file_path)

# Remove duplicates based on the entire row
df_cleaned = df.drop_duplicates()

# Save the cleaned data to a new CSV file
output_path = "cleaned_file.csv"  # Specify the output file name
df_cleaned.to_csv(output_path, index=False)

print(f"Duplicates removed. Cleaned file saved to {output_path}")


Duplicates removed. Cleaned file saved to cleaned_file.csv


Calculate number of unique proteins in FASTA

In [4]:
def count_unique_proteins_in_fasta(file_path):
    """
    Counts the number of unique proteins in a given FASTA file.

    :param file_path: Path to the FASTA file
    :return: Number of unique proteins
    """
    unique_proteins = set()  # Set to store unique protein identifiers
    with open(file_path, "r") as file:
        for line in file:
            if line.startswith(">"):
                protein_id = line.strip()  # Get the header line (remove whitespace)
                unique_proteins.add(protein_id)  # Add to set (duplicates automatically ignored)
    return len(unique_proteins)

# Example usage
fasta_file = "ENDO_MASTER_LCMS_PLASMA.fasta"  # Replace with your FASTA file name
num_unique_proteins = count_unique_proteins_in_fasta(fasta_file)
print(f"The number of unique proteins in the FASTA file is: {num_unique_proteins}")


The number of unique proteins in the FASTA file is: 186


Code for Splitting the CSV 


In [1]:
import pyopenms as oms
import pandas as pd
import math

# Specify the local FASTA file
fasta_file = "mouse_nw.fasta"  # Replace with your FASTA file name
output_excel_prefix = "new_mouse_peptides"
max_rows_per_file = 500000  # Set to a safe limit below Excel's max

# Parse the FASTA file to extract protein names and sequences
def parse_fasta(file_path):
    proteins = []
    with open(file_path, "r") as f:
        lines = f.readlines()
        protein_name = ""
        protein_sequence = ""
        for line in lines:
            line = line.strip()
            if line.startswith(">"):
                if protein_name and protein_sequence:
                    proteins.append((protein_name, protein_sequence))
                protein_name = line.lstrip(">")
                protein_sequence = ""
            else:
                protein_sequence += line
        if protein_name and protein_sequence:
            proteins.append((protein_name, protein_sequence))
    return proteins

# Load all proteins from the FASTA file
proteins = parse_fasta(fasta_file)

digestion = oms.ProteaseDigestion()
digestion.setMissedCleavages(1)
min_length = 6
max_length = 40

variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

all_peptides = []

for protein_name, protein_sequence in proteins:
    protein = oms.AASequence.fromString(protein_sequence)
    result = []
    digestion.digest(protein, result, min_length, max_length)
    peptides = [peptide.toString() for peptide in result]
    modified_peptides = []
    for peptide in result:
        oms.ModifiedPeptideGenerator.applyVariableModifications(variable_modifications, peptide, 1, modified_peptides, False)
    
    all_peptides.extend([(protein_name, peptide) for peptide in peptides if 'X' not in peptide])
    all_peptides.extend([(protein_name, p.toString()) for p in modified_peptides if 'X' not in p.toString()])

# Split data into multiple files
num_files = math.ceil(len(all_peptides) / max_rows_per_file)
for i in range(num_files):
    start_idx = i * max_rows_per_file
    end_idx = min((i + 1) * max_rows_per_file, len(all_peptides))
    batch = all_peptides[start_idx:end_idx]
    
    data = []
    for protein_name, peptide in batch:
        try:
            seq_obj = oms.AASequence.fromString(peptide)
            molecular_formula = str(seq_obj.getFormula())
        except Exception as e:
            molecular_formula = f"Error: {e}"
        
        data.append(["", molecular_formula, f"{protein_name}@{peptide}", "", "", "", "", "", "", "", "", "", "", "", "", ""])
    
    df = pd.DataFrame(data, columns=["RT", "Formula", "Name", "CCS [M+H]+", "CCS [M+Na]+", "CCS [M-H]-",
                                      "KEGG", "CAS", "PubChem", "ChemSpider", "HMDB", "BioCyc",
                                      "Metlin", "LipidMaps", "UserID", "InChI"])
    
    output_file = f"{output_excel_prefix}_part{i+1}.xlsx"
    df.to_excel(output_file, index=False, engine='openpyxl')
    print(f"Written {output_file}")

print("Digestion complete. Peptides written to multiple Excel files.")

Written new_mouse_peptides_part1.xlsx
Written new_mouse_peptides_part2.xlsx
Written new_mouse_peptides_part3.xlsx
Written new_mouse_peptides_part4.xlsx
Digestion complete. Peptides written to multiple Excel files.


Code for fetch FASTA Sequences from uniprot

In [4]:
import pandas as pd
import re
import requests
import pyopenms as oms
import csv

# ----------- Step 1: Read LC-MS/MS CSV and Clean Headers -----------
input_csv = "1.csv"  # Replace with your actual file path
df = pd.read_csv(input_csv, sep="\t")  # Change sep="," if needed
df.columns = df.columns.str.strip()  # Clean column names

print("Detected columns:", df.columns.tolist())

# ----------- Step 2: Try to Automatically Detect the Protein Group Column -----------
protein_col = None
for col in df.columns:
    if "protein" in col.lower() and "group" in col.lower():
        protein_col = col
        break

if not protein_col:
    raise KeyError("❌ Could not find a 'Protein Group Name'-like column in your file.")

print(f"✅ Using column: {protein_col} to extract UniProt IDs")

# ----------- Step 3: Extract UniProt IDs from the Detected Column -----------
def extract_uniprot_id(entry):
    match = re.search(r"\|([A-Z0-9]+)\|", str(entry))
    return match.group(1) if match else None

df['UniProt_ID'] = df[protein_col].apply(extract_uniprot_id)
unique_ids = df['UniProt_ID'].dropna().unique()

# ----------- Step 4: Fetch FASTA Sequences from UniProt -----------
fasta_file = "master_proteins.fasta"
base_url = "https://rest.uniprot.org/uniprotkb/{}.fasta"

with open(fasta_file, "w") as fasta_out:
    for uid in unique_ids:
        response = requests.get(base_url.format(uid))
        if response.status_code == 200:
            fasta_out.write(response.text)
            print(f"✅ Fetched: {uid}")
        else:
            print(f"❌ Failed to fetch {uid} (status {response.status_code})")

# ----------- Step 5: Parse FASTA File -----------
def parse_fasta(file_path):
    proteins = []
    with open(file_path, "r") as f:
        lines = f.readlines()
        name = ""
        seq = ""
        for line in lines:
            line = line.strip()
            if line.startswith(">"):
                if name and seq:
                    proteins.append((name, seq))
                name = line[1:]
                seq = ""
            else:
                seq += line
        if name and seq:
            proteins.append((name, seq))
    return proteins

proteins = parse_fasta(fasta_file)

# ----------- Step 6: Set Up Digestion Parameters -----------
digestion = oms.ProteaseDigestion()
digestion.setEnzyme("Trypsin")
digestion.setMissedCleavages(1)
min_len = 6
max_len = 40

modifications = oms.ModifiedPeptideGenerator.getModifications([b"Oxidation (M)"])

# ----------- Step 7: In Silico Digestion + Export to MetaboScape Format -----------
output_csv = "metaboscape_peptides.csv"
with open(output_csv, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow([
        "RT", "Formula", "Name", "Mass", "CCS [M+H]+", "CCS [M+Na]+", "CCS [M-H]-",
        "KEGG", "CAS", "PubChem", "ChemSpider", "HMDB", "BioCyc",
        "Metlin", "LipidMaps", "UserID", "InChI"
    ])

    for protein_name, protein_seq in proteins:
        try:
            aa_seq = oms.AASequence.fromString(protein_seq)
        except Exception as e:
            print(f"⚠️ Error parsing {protein_name}: {e}")
            continue

        peptides = []
        digestion.digest(aa_seq, peptides, min_len, max_len)

        modified_peptides = []
        for p in peptides:
            oms.ModifiedPeptideGenerator.applyVariableModifications(modifications, p, 1, modified_peptides, False)

        all_peptides = set([p.toString() for p in peptides] + [p.toString() for p in modified_peptides])
        all_peptides = [pep for pep in all_peptides if "X" not in pep]

        for pep_str in all_peptides:
            try:
                seq_obj = oms.AASequence.fromString(pep_str)
                formula = seq_obj.getFormula().toString()
                mass = seq_obj.getMonoWeight()
            except Exception as e:
                formula = f"Error: {e}"
                mass = ""

            name = f"{protein_name}@{pep_str}"
            writer.writerow(["", formula, name, mass, "", "", "", "", "", "", "", "", "", "", "", ""])

print(f"\n🎉 Digestion complete. Peptides saved to: {output_csv}")


Detected columns: ['MS2 Id,Peptide Sequence,XCorr Score,Precursor MZ,PPM Error,Rt,Corrected Ook0,Protein Group Name,Charge,Delta CN Score,Confidence Score,Calculated MH,Matched Ions,Predicted Ook0,TIMScore,Is Unique']
✅ Using column: MS2 Id,Peptide Sequence,XCorr Score,Precursor MZ,PPM Error,Rt,Corrected Ook0,Protein Group Name,Charge,Delta CN Score,Confidence Score,Calculated MH,Matched Ions,Predicted Ook0,TIMScore,Is Unique to extract UniProt IDs
✅ Fetched: Q80WW9
✅ Fetched: Q3TW96
✅ Fetched: O35226
✅ Fetched: P0C7N9
✅ Fetched: Q3UJB9
✅ Fetched: Q3UWL8
✅ Fetched: Q6NV83
✅ Fetched: Q9R1T4
✅ Fetched: P97807
✅ Fetched: P26039
✅ Fetched: P08882
✅ Fetched: Q62009
✅ Fetched: O70423
✅ Fetched: Q5NCR9
✅ Fetched: G5E8K5
✅ Fetched: Q61140
✅ Fetched: Q8VH51
✅ Fetched: Q9CRB9
✅ Fetched: Q9D8N0
✅ Fetched: Q13608
✅ Fetched: Q9JJY4
✅ Fetched: Q9JIF7
✅ Fetched: Q8C1B7
✅ Fetched: Q7TSG2
✅ Fetched: Q9EPU4
✅ Fetched: E9Q1P8
✅ Fetched: P28667
✅ Fetched: Q7M753
✅ Fetched: Q9CR00
✅ Fetched: Q8VDJ3
✅ Fetch

In [None]:
import pandas as pd
import re
import requests
import pyopenms as oms
import csv
import math

# ----------- Step 1: Read LC-MS/MS CSV and Clean Headers -----------
input_csv = "1.csv"
df = pd.read_csv(input_csv, sep="\t")
df.columns = df.columns.str.strip()

print("Detected columns:", df.columns.tolist())

# ----------- Step 2: Detect the Protein Group Column -----------
protein_col = None
for col in df.columns:
    if "protein" in col.lower() and "group" in col.lower():
        protein_col = col
        break

if not protein_col:
    raise KeyError(" Could not find a 'Protein Group Name'-like column in your file.")
print(f" Using column: {protein_col} to extract UniProt IDs")

# ----------- Step 3: Extract UniProt IDs -----------
def extract_uniprot_id(entry):
    match = re.search(r"\|([A-Z0-9]+)\|", str(entry))
    return match.group(1) if match else None

df['UniProt_ID'] = df[protein_col].apply(extract_uniprot_id)
unique_ids = df['UniProt_ID'].dropna().unique()

# ----------- Step 4: Fetch FASTA Sequences from UniProt -----------
fasta_file = "master_proteins.fasta"
base_url = "https://rest.uniprot.org/uniprotkb/{}.fasta"

with open(fasta_file, "w") as fasta_out:
    for uid in unique_ids:
        response = requests.get(base_url.format(uid))
        if response.status_code == 200:
            fasta_out.write(response.text)
            print(f" Fetched: {uid}")
        else:
            print(f" Failed to fetch {uid} (status {response.status_code})")

# ----------- Step 5: Parse FASTA File -----------
def parse_fasta(file_path):
    proteins = []
    with open(file_path, "r") as f:
        lines = f.readlines()
        name, seq = "", ""
        for line in lines:
            line = line.strip()
            if line.startswith(">"):
                if name and seq:
                    proteins.append((name, seq))
                name = line[1:]
                seq = ""
            else:
                seq += line
        if name and seq:
            proteins.append((name, seq))
    return proteins

proteins = parse_fasta(fasta_file)

# ----------- Step 6: Setup Digestion Parameters -----------
digestion = oms.ProteaseDigestion()
digestion.setEnzyme("Trypsin")
digestion.setMissedCleavages(1)
min_len = 6
max_len = 40

modifications = oms.ModifiedPeptideGenerator.getModifications([b"Oxidation (M)"])

# ----------- Step 7: Digest and Buffer All Output Rows -----------
all_rows = []

for protein_name, protein_seq in proteins:
    try:
        aa_seq = oms.AASequence.fromString(protein_seq)
    except Exception as e:
        print(f" Error parsing {protein_name}: {e}")
        continue

    peptides = []
    digestion.digest(aa_seq, peptides, min_len, max_len)

    modified_peptides = []
    for p in peptides:
        oms.ModifiedPeptideGenerator.applyVariableModifications(modifications, p, 1, modified_peptides, False)

    all_peptides = set([p.toString() for p in peptides] + [p.toString() for p in modified_peptides])
    all_peptides = [pep for pep in all_peptides if "X" not in pep]

    for pep_str in all_peptides:
        try:
            seq_obj = oms.AASequence.fromString(pep_str)
            formula = seq_obj.getFormula().toString()
            mass = seq_obj.getMonoWeight()
        except Exception as e:
            formula = f"Error: {e}"
            mass = ""

        name = f"{protein_name}@{pep_str}"
        row = ["", formula, name, mass, "", "", "", "", "", "", "", "", "", "", "", ""]
        all_rows.append(row)

# ----------- Step 8: Write into Chunks of 10,000 for MetaboScape -----------
chunk_size = 50000
total_parts = math.ceil(len(all_rows) / chunk_size)
output_prefix = "metaboscape_peptides_part"

print(f" Writing {len(all_rows)} peptides into {total_parts} MetaboScape-compatible files...")

for i in range(total_parts):
    chunk = all_rows[i * chunk_size:(i + 1) * chunk_size]
    filename = f"{output_prefix}{i + 1}.csv"
    with open(filename, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "RT", "Formula", "Name", "Mass", "CCS [M+H]+", "CCS [M+Na]+", "CCS [M-H]-",
            "KEGG", "CAS", "PubChem", "ChemSpider", "HMDB", "BioCyc",
            "Metlin", "LipidMaps", "UserID", "InChI"
        ])
        writer.writerows(chunk)
    print(f" {filename} saved with {len(chunk)} rows")

print("\n Digestion complete. All peptide chunks saved.")


Detected columns: ['MS2 Id,Peptide Sequence,XCorr Score,Precursor MZ,PPM Error,Rt,Corrected Ook0,Protein Group Name,Charge,Delta CN Score,Confidence Score,Calculated MH,Matched Ions,Predicted Ook0,TIMScore,Is Unique']
✅ Using column: MS2 Id,Peptide Sequence,XCorr Score,Precursor MZ,PPM Error,Rt,Corrected Ook0,Protein Group Name,Charge,Delta CN Score,Confidence Score,Calculated MH,Matched Ions,Predicted Ook0,TIMScore,Is Unique to extract UniProt IDs
✅ Fetched: Q80WW9
✅ Fetched: Q3TW96
✅ Fetched: O35226
✅ Fetched: P0C7N9
✅ Fetched: Q3UJB9
✅ Fetched: Q3UWL8
✅ Fetched: Q6NV83
✅ Fetched: Q9R1T4
✅ Fetched: P97807
✅ Fetched: P26039
✅ Fetched: P08882
✅ Fetched: Q62009
✅ Fetched: O70423
✅ Fetched: Q5NCR9
✅ Fetched: G5E8K5
✅ Fetched: Q61140
✅ Fetched: Q8VH51
✅ Fetched: Q9CRB9
✅ Fetched: Q9D8N0
✅ Fetched: Q13608
✅ Fetched: Q9JJY4
✅ Fetched: Q9JIF7
✅ Fetched: Q8C1B7
✅ Fetched: Q7TSG2
✅ Fetched: Q9EPU4
✅ Fetched: E9Q1P8
✅ Fetched: P28667
✅ Fetched: Q7M753
✅ Fetched: Q9CR00
✅ Fetched: Q8VDJ3
✅ Fetch

In [1]:
import pandas as pd
import pyopenms as oms
import csv
import math

# ----------- Step 1: Parse Local FASTA File (Whole Human Proteome) -----------
def parse_fasta(file_path):
    proteins = []
    with open(file_path, "r") as f:
        lines = f.readlines()
        name, seq = "", ""
        for line in lines:
            line = line.strip()
            if line.startswith(">"):
                if name and seq:
                    proteins.append((name, seq))
                name = line[1:]
                seq = ""
            else:
                seq += line
        if name and seq:
            proteins.append((name, seq))
    return proteins

# Path to your downloaded human proteome FASTA
fasta_file = "mcherry.fasta"
proteins = parse_fasta(fasta_file)

# ----------- Step 2: Setup Digestion Parameters -----------
digestion = oms.ProteaseDigestion()
digestion.setEnzyme("Trypsin")
digestion.setMissedCleavages(2)
min_len = 6
max_len = 40

modifications = oms.ModifiedPeptideGenerator.getModifications([b"Oxidation (M)"])

# ----------- Step 3: Digest Proteins and Prepare Peptides -----------
all_rows = []

for protein_name, protein_seq in proteins:
    try:
        aa_seq = oms.AASequence.fromString(protein_seq)
    except Exception as e:
        print(f"⚠️ Error parsing {protein_name}: {e}")
        continue

    peptides = []
    digestion.digest(aa_seq, peptides, min_len, max_len)

    modified_peptides = []
    for p in peptides:
        oms.ModifiedPeptideGenerator.applyVariableModifications(modifications, p, 1, modified_peptides, False)

    all_peptides = set([p.toString() for p in peptides] + [p.toString() for p in modified_peptides])
    all_peptides = [pep for pep in all_peptides if "X" not in pep]

    for pep_str in all_peptides:
        try:
            seq_obj = oms.AASequence.fromString(pep_str)
            formula = seq_obj.getFormula().toString()
            mass = seq_obj.getMonoWeight()
        except Exception as e:
            formula = f"Error: {e}"
            mass = ""

        name = f"{protein_name}@{pep_str}"
        row = ["", formula, name, mass, "", "", "", "", "", "", "", "", "", "", "", ""]
        all_rows.append(row)

# ----------- Step 4: Write into Chunks of 50,000 for MetaboScape -----------
chunk_size = 50000
total_parts = math.ceil(len(all_rows) / chunk_size)
output_prefix = "human_proteome_metaboscape_peptides_part"

print(f"📝 Writing {len(all_rows)} peptides into {total_parts} MetaboScape-compatible files...")

for i in range(total_parts):
    chunk = all_rows[i * chunk_size:(i + 1) * chunk_size]
    filename = f"{output_prefix}{i + 1}.csv"
    with open(filename, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow([
            "RT", "Formula", "Name", "Mass", "CCS [M+H]+", "CCS [M+Na]+", "CCS [M-H]-",
            "KEGG", "CAS", "PubChem", "ChemSpider", "HMDB", "BioCyc",
            "Metlin", "LipidMaps", "UserID", "InChI"
        ])
        writer.writerows(chunk)
    print(f"✅ {filename} saved with {len(chunk)} rows")

print("\n🎉 Digestion complete. All peptide chunks saved.")


📝 Writing 117 peptides into 1 MetaboScape-compatible files...
✅ human_proteome_metaboscape_peptides_part1.csv saved with 117 rows

🎉 Digestion complete. All peptide chunks saved.
