Code for generating a peptide list from a FASTA file | Input: FASTA format


In [None]:
import pyopenms as oms
import csv

# Specify the local FASTA file
# Input: path of the FASTA file that this cell parses and digests.
fasta_file = "mcherry.fasta"  # Replace with your FASTA file name
# Output: path of the CSV file that receives one row per peptide.
# NOTE(review): the "76" inside this file name looks like a stray edit — confirm the intended name.
output_csv = "mcherr76y.csv"

# Parse the FASTA file to extract protein names and sequences
def parse_fasta(file_path):
    """Read a FASTA file and return its records as ``(name, sequence)`` tuples.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A list of ``(name, sequence)`` tuples in file order. ``name`` is the
        header line with every leading ``>`` removed; ``sequence`` is the
        concatenation of the whitespace-stripped lines that follow it.
        Records whose name or sequence is empty are left out.
    """
    records = []
    header = ""
    chunks = []
    with open(file_path, "r") as handle:
        for raw_line in handle:
            text = raw_line.strip()
            if not text.startswith(">"):
                # Sequence line: collect it for the record currently open.
                chunks.append(text)
                continue
            # A new header closes the previous record (if it is complete).
            sequence = "".join(chunks)
            if header and sequence:
                records.append((header, sequence))
            header = text.lstrip(">")
            chunks = []
    # The last record has no following header to close it.
    sequence = "".join(chunks)
    if header and sequence:
        records.append((header, sequence))
    return records

# Load all proteins from the FASTA file
proteins = parse_fasta(fasta_file)

# Initialize the protease digestion class.
# NOTE(review): no enzyme is set here, so the pyOpenMS default applies
# (presumably Trypsin) — confirm, or call digestion.setEnzyme(...) explicitly.
digestion = oms.ProteaseDigestion()
digestion.setMissedCleavages(1)  # Set the number of missed cleavages
min_length = 6   # shortest peptide length passed to digest()
max_length = 40  # longest peptide length passed to digest()

# Define the oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Output columns. Only "Formula" and "Name" are filled in; every other
# column is written as an empty placeholder.
headers = ["RT", "Formula", "Name", "CCS [M+H]+", "CCS [M+Na]+", "CCS [M-H]-",
           "KEGG", "CAS", "PubChem", "ChemSpider", "HMDB", "BioCyc",
           "Metlin", "LipidMaps", "UserID", "InChI"]
formula_column = headers.index("Formula")
name_column = headers.index("Name")

# Prepare to write to the CSV
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(headers)

    # Process each protein
    for protein_name, protein_sequence in proteins:
        protein = oms.AASequence.fromString(protein_sequence)
        result = []
        digestion.digest(protein, result, min_length, max_length)

        # Unmodified peptides, plus variants with at most one variable
        # modification per peptide (third argument).
        peptides = [peptide.toString() for peptide in result]
        modified_peptides = []
        for peptide in result:
            oms.ModifiedPeptideGenerator.applyVariableModifications(
                variable_modifications, peptide, 1, modified_peptides, False)

        # Combine original and modified peptides, dropping exact duplicates
        # while keeping first-seen order (a repeated "protein@peptide" row
        # carries no extra information).
        all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]
        all_peptides = list(dict.fromkeys(all_peptides))

        # Remove sequences with 'X' amino acid
        filtered_peptides = [peptide for peptide in all_peptides if 'X' not in peptide]

        # Compute molecular formulas and write to CSV
        for peptide in filtered_peptides:
            name = f"{protein_name}@{peptide}"  # Include the protein name
            try:
                seq_obj = oms.AASequence.fromString(peptide)
                molecular_formula = seq_obj.getFormula().toString()
            except Exception as e:
                # Keep the row so the failure is visible in the output file.
                molecular_formula = f"Error: {e}"

            # Build the row from the header width so that every column exists
            # even when the formula computation failed on the first peptide.
            row = [""] * len(headers)
            row[formula_column] = molecular_formula
            row[name_column] = name
            writer.writerow(row)

print(f"Digestion complete. Peptides written to {output_csv}")


Digestion complete. Peptides written to mcherr77y.csv


Code for fetching protein sequences from UniProt | Input: Proteoscape output (CSV)

In [None]:
import pandas as pd
import re
import requests
import pyopenms as oms
import csv

# ----------- Step 1: Load the LC-MS/MS table and tidy its header -----------
input_csv = "1.csv"  # Replace with your actual file path
df = pd.read_csv(input_csv, sep="\t")  # Read as tab-separated; switch to sep="," for comma-separated files
df.columns = df.columns.str.strip()  # Remove stray whitespace around column names

print("Detected columns:", df.columns.tolist())

# ----------- Step 2: Pick the first column naming both "protein" and "group" -----------
protein_col = next(
    (
        candidate
        for candidate in df.columns
        if "protein" in candidate.lower() and "group" in candidate.lower()
    ),
    None,
)

# Stop early when no suitable column exists: the ID extraction needs one.
if not protein_col:
    raise KeyError("❌ Could not find a 'Protein Group Name'-like column in your file.")

print(f"✅ Using column: {protein_col} to extract UniProt IDs")

# ----------- Step 3: Extract UniProt IDs from the Detected Column -----------
def extract_uniprot_id(entry):
    """Return the first UniProt accession enclosed in pipes within ``entry``.

    The value is converted with ``str()`` first, so non-string cells (for
    example NaN) are handled and simply yield ``None``.

    Args:
        entry: A cell value such as ``"sp|P12345|ALBU_HUMAN"``.

    Returns:
        The text between the first matching pair of ``|`` characters:
        upper-case letters and digits, optionally followed by an isoform
        suffix (``-<digits>``, e.g. ``"P12345-2"``). ``None`` when nothing
        matches. Only the first accession of a multi-protein entry is returned.
    """
    # The optional "-<digits>" group keeps isoform accessions, which the
    # plain [A-Z0-9]+ pattern would fail to match at all.
    match = re.search(r"\|([A-Z0-9]+(?:-[0-9]+)?)\|", str(entry))
    return match.group(1) if match else None

df['UniProt_ID'] = df[protein_col].apply(extract_uniprot_id)
unique_ids = df['UniProt_ID'].dropna().unique()  # rows without an ID are skipped

# ----------- Step 4: Fetch FASTA Sequences from UniProt -----------
fasta_file = "master_proteins.fasta"
base_url = "https://rest.uniprot.org/uniprotkb/{}.fasta"
request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever

# One session reuses the HTTP connection across all accessions.
with open(fasta_file, "w") as fasta_out, requests.Session() as session:
    for uid in unique_ids:
        try:
            response = session.get(base_url.format(uid), timeout=request_timeout)
        except requests.RequestException as exc:
            # Report and move on, just like a non-200 answer, instead of
            # aborting the whole download on one network hiccup.
            print(f"❌ Failed to fetch {uid} ({exc})")
            continue

        if response.status_code == 200:
            record = response.text
            # Make sure each record ends with a newline so the next header
            # can never be glued onto the previous sequence line.
            if record and not record.endswith("\n"):
                record += "\n"
            fasta_out.write(record)
            print(f"✅ Fetched: {uid}")
        else:
            print(f"❌ Failed to fetch {uid} (status {response.status_code})")

# ----------- Step 5: Parse FASTA File -----------
def parse_fasta(file_path):
    """Parse a FASTA file into a list of ``(name, sequence)`` tuples.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        ``(name, sequence)`` tuples in file order, where ``name`` is the
        header line without its first character (the ``>``) and ``sequence``
        joins the stripped lines up to the next header. Records with an empty
        name or an empty sequence are omitted.
    """
    with open(file_path, "r") as handle:
        stripped_lines = [raw.strip() for raw in handle]

    proteins = []
    name, parts = "", []
    for text in stripped_lines:
        if text.startswith(">"):
            # Header line: store the record collected so far, then start anew.
            seq = "".join(parts)
            if name and seq:
                proteins.append((name, seq))
            name, parts = text[1:], []
        else:
            parts.append(text)

    # Store the trailing record, which no header follows.
    seq = "".join(parts)
    if name and seq:
        proteins.append((name, seq))
    return proteins

proteins = parse_fasta(fasta_file)

# ----------- Step 6: Set Up Digestion Parameters -----------
digestion = oms.ProteaseDigestion()
digestion.setEnzyme("Trypsin")
digestion.setMissedCleavages(1)
min_len = 6   # shortest peptide length passed to digest()
max_len = 40  # longest peptide length passed to digest()

modifications = oms.ModifiedPeptideGenerator.getModifications([b"Oxidation (M)"])

# ----------- Step 7: In Silico Digestion + Export to MetaboScape Format -----------
output_csv = "metaboscape_peptides.csv"
headers = [
    "RT", "Formula", "Name", "Mass", "CCS [M+H]+", "CCS [M+Na]+", "CCS [M-H]-",
    "KEGG", "CAS", "PubChem", "ChemSpider", "HMDB", "BioCyc",
    "Metlin", "LipidMaps", "UserID", "InChI"
]
# Only RT (blank), Formula, Name and Mass are written explicitly; the rest of
# each row is padded to the header width so field counts always agree.
blank_tail = [""] * (len(headers) - 4)

with open(output_csv, "w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(headers)

    for protein_name, protein_seq in proteins:
        try:
            aa_seq = oms.AASequence.fromString(protein_seq)
        except Exception as e:
            print(f"⚠️ Error parsing {protein_name}: {e}")
            continue

        peptides = []
        digestion.digest(aa_seq, peptides, min_len, max_len)

        # Variants carrying at most one variable modification (third argument).
        modified_peptides = []
        for p in peptides:
            oms.ModifiedPeptideGenerator.applyVariableModifications(modifications, p, 1, modified_peptides, False)

        # De-duplicate, then sort: iterating a bare set of strings would give
        # a different row order on every run.
        all_peptides = sorted({p.toString() for p in peptides} | {p.toString() for p in modified_peptides})
        all_peptides = [pep for pep in all_peptides if "X" not in pep]

        for pep_str in all_peptides:
            try:
                seq_obj = oms.AASequence.fromString(pep_str)
                formula = seq_obj.getFormula().toString()
                mass = seq_obj.getMonoWeight()
            except Exception as e:
                # Keep the row so the failure is visible in the output file.
                formula = f"Error: {e}"
                mass = ""

            name = f"{protein_name}@{pep_str}"
            writer.writerow(["", formula, name, mass] + blank_tail)

print(f"\n🎉 Digestion complete. Peptides saved to: {output_csv}")

Split the peptide list into chunks of 50,000 rows

In [None]:
import pandas as pd
import re
import requests
import pyopenms as oms
import csv
import math

# ----------- Step 1: Read the LC-MS/MS table (tab-separated) and trim its column names -----------
input_csv = "1.csv"
df = pd.read_csv(input_csv, sep="\t")
df.columns = df.columns.str.strip()

print("Detected columns:", df.columns.tolist())

# ----------- Step 2: Find the protein-group column -----------
# Every column whose lower-cased name contains both keywords qualifies;
# the first one in table order is used.
matching_columns = [
    column
    for column in df.columns
    if all(keyword in column.lower() for keyword in ("protein", "group"))
]
protein_col = matching_columns[0] if matching_columns else None

if not protein_col:
    raise KeyError(" Could not find a 'Protein Group Name'-like column in your file.")
print(f" Using column: {protein_col} to extract UniProt IDs")

# ----------- Step 3: Extract UniProt IDs -----------
def extract_uniprot_id(entry):
    """Extract the first pipe-delimited UniProt accession from ``entry``.

    ``entry`` is passed through ``str()`` before matching, so non-string
    values such as NaN do not raise; they just produce ``None``.

    Args:
        entry: A cell value, e.g. ``"sp|P12345|ALBU_HUMAN"``.

    Returns:
        The accession between the first matching pair of ``|`` characters
        (upper-case letters and digits, with an optional ``-<digits>`` isoform
        suffix such as ``"P12345-2"``), or ``None`` when there is no match.
        For entries listing several proteins only the first one is returned.
    """
    # Without the optional "-<digits>" part, isoform accessions would not
    # match and the row would be dropped silently.
    match = re.search(r"\|([A-Z0-9]+(?:-[0-9]+)?)\|", str(entry))
    return match.group(1) if match else None

df['UniProt_ID'] = df[protein_col].apply(extract_uniprot_id)
unique_ids = df['UniProt_ID'].dropna().unique()  # rows without an ID are skipped

# ----------- Step 4: Fetch FASTA Sequences from UniProt -----------
fasta_file = "master_proteins.fasta"
base_url = "https://rest.uniprot.org/uniprotkb/{}.fasta"
request_timeout = 30  # seconds; without a timeout a stalled connection blocks forever

# One session reuses the HTTP connection across all accessions.
with open(fasta_file, "w") as fasta_out, requests.Session() as session:
    for uid in unique_ids:
        try:
            response = session.get(base_url.format(uid), timeout=request_timeout)
        except requests.RequestException as exc:
            # Report and continue, mirroring the non-200 branch, rather than
            # aborting the whole download on a single network error.
            print(f" Failed to fetch {uid} ({exc})")
            continue

        if response.status_code == 200:
            record = response.text
            # Guarantee a trailing newline so the next header cannot be
            # appended to the previous sequence line.
            if record and not record.endswith("\n"):
                record += "\n"
            fasta_out.write(record)
            print(f" Fetched: {uid}")
        else:
            print(f" Failed to fetch {uid} (status {response.status_code})")

# ----------- Step 5: Parse FASTA File -----------
def parse_fasta(file_path):
    """Load ``(name, sequence)`` pairs from a FASTA file.

    Args:
        file_path: Path of the FASTA file to read.

    Returns:
        A list of ``(name, sequence)`` tuples in file order. ``name`` is the
        header line minus its leading ``>``; ``sequence`` is every following
        line, stripped and concatenated, up to the next header. Entries with
        an empty name or an empty sequence are not returned.
    """
    with open(file_path, "r") as fasta_in:
        raw_lines = list(fasta_in)

    entries = []
    header, residues = "", ""
    # The extra ">" acts as a sentinel header that flushes the final record.
    for raw in raw_lines + [">"]:
        cleaned = raw.strip()
        if cleaned[:1] != ">":
            residues = residues + cleaned
            continue
        if header and residues:
            entries.append((header, residues))
        header, residues = cleaned[1:], ""
    return entries

proteins = parse_fasta(fasta_file)

# ----------- Step 6: Setup Digestion Parameters -----------
digestion = oms.ProteaseDigestion()
digestion.setEnzyme("Trypsin")
digestion.setMissedCleavages(1)
min_len = 6   # shortest peptide length passed to digest()
max_len = 40  # longest peptide length passed to digest()

modifications = oms.ModifiedPeptideGenerator.getModifications([b"Oxidation (M)"])

# Output columns, shared by every chunk file.
headers = [
    "RT", "Formula", "Name", "Mass", "CCS [M+H]+", "CCS [M+Na]+", "CCS [M-H]-",
    "KEGG", "CAS", "PubChem", "ChemSpider", "HMDB", "BioCyc",
    "Metlin", "LipidMaps", "UserID", "InChI"
]
# Only RT (blank), Formula, Name and Mass are written explicitly; the rest of
# each row is padded to the header width so field counts always agree.
blank_tail = [""] * (len(headers) - 4)

# ----------- Step 7: Digest and Buffer All Output Rows -----------
all_rows = []

for protein_name, protein_seq in proteins:
    try:
        aa_seq = oms.AASequence.fromString(protein_seq)
    except Exception as e:
        print(f" Error parsing {protein_name}: {e}")
        continue

    peptides = []
    digestion.digest(aa_seq, peptides, min_len, max_len)

    # Variants carrying at most one variable modification (third argument).
    modified_peptides = []
    for p in peptides:
        oms.ModifiedPeptideGenerator.applyVariableModifications(modifications, p, 1, modified_peptides, False)

    # De-duplicate, then sort: iterating a bare set of strings would place
    # rows in different chunk files from one run to the next.
    all_peptides = sorted({p.toString() for p in peptides} | {p.toString() for p in modified_peptides})
    all_peptides = [pep for pep in all_peptides if "X" not in pep]

    for pep_str in all_peptides:
        try:
            seq_obj = oms.AASequence.fromString(pep_str)
            formula = seq_obj.getFormula().toString()
            mass = seq_obj.getMonoWeight()
        except Exception as e:
            # Keep the row so the failure is visible in the output files.
            formula = f"Error: {e}"
            mass = ""

        name = f"{protein_name}@{pep_str}"
        row = ["", formula, name, mass] + blank_tail
        all_rows.append(row)

# ----------- Step 8: Write into Chunks of 50,000 for MetaboScape -----------
chunk_size = 50000
total_parts = math.ceil(len(all_rows) / chunk_size)  # 0 when there are no rows, so no file is written
output_prefix = "metaboscape_peptides_part"

print(f" Writing {len(all_rows)} peptides into {total_parts} MetaboScape-compatible files...")

for i in range(total_parts):
    chunk = all_rows[i * chunk_size:(i + 1) * chunk_size]
    filename = f"{output_prefix}{i + 1}.csv"  # part numbers start at 1
    with open(filename, "w", newline="") as f:
        writer = csv.writer(f)
        writer.writerow(headers)  # every chunk file repeats the header row
        writer.writerows(chunk)
    print(f" {filename} saved with {len(chunk)} rows")

print("\n Digestion complete. All peptide chunks saved.")
