In [2]:
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio import SeqIO
import pandas as pd

# List of sequences as SeqRecord objects
sequences = [
    SeqRecord(Seq("MKWVTFISLLFLFSSAYS"), id="Protein1", description="Description of Protein1"),
    SeqRecord(Seq("LLLKAGAGAGAWWDAGVVF"), id="Protein2", description="Description of Protein2"),
    SeqRecord(Seq("VVVVKKKRRRNNNDDD"), id="Protein3", description="Description of Protein3"),
]

# Write to FASTA file
with open("proteins.fasta", "w") as output_file:
    SeqIO.write(sequences, output_file, "fasta")


In [3]:
import csv

# Load the CSV file
file_path = "proteins.csv"  # Update with the path to your file
output_fasta = "proteins.fasta"

# Read CSV and write to FASTA
with open(file_path, "r") as csv_file, open(output_fasta, "w") as fasta_file:
    reader = csv.DictReader(csv_file)
    for row in reader:
        protein_id = row['Id']  # Column name for protein ID
        sequence = row['Sequence']    # Column name for sequence
        fasta_file.write(f">{protein_id}\n")  # Write header
        fasta_file.write(f"{sequence}\n")    # Write sequence


Validating Fasta Sequence


In [22]:
from Bio import SeqIO

def validate_fasta(file_path):
    try:
        # Try parsing the FASTA file
        for record in SeqIO.parse(file_path, "fasta"):
            if not record.seq:  # Check for empty sequences
                print(f"Warning: Sequence for {record.id} is empty.")
            else:
                print(f"Sequence {record.id} is valid.")
    except Exception as e:
        print(f"Error: {e}")

# Example usage
file_path = "insulin.fasta"
validate_fasta(file_path)


Sequence Insulin is valid.


Trypsin digestion | Output a CSV file

In [17]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "6503_trypsin.fasta"  # Replace with your FASTA file name
output_csv = "digested_peptides_6503_trypsin.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(2)

# Perform digestion with length constraints (7-40 residues)
min_length = 7
max_length = 40
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Write peptides to a CSV file
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Peptide Sequence"])
    for peptide in peptides:
        writer.writerow([peptide])

print(f"Digestion complete. {len(peptides)} peptides written to {output_csv}")



Digestion complete. 10199 peptides written to digested_peptides_6503_trypsin.csv


Lys-C Digestion

In [13]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "6503_lysC.fasta"  # Replace with your FASTA file name
output_csv = "digested_peptides_6503_lysC.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme to Lys-C
digestion.setEnzyme("Lys-C")

# Set the number of missed cleavages
digestion.setMissedCleavages(2)

# Perform digestion with length constraints (7-40 residues)
min_length = 7
max_length = 40
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Write peptides to a CSV file
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Peptide Sequence"])
    for peptide in peptides:
        writer.writerow([peptide])

print(f"Lys-C digestion complete. {len(peptides)} peptides written to {output_csv}")


Lys-C digestion complete. 1093 peptides written to digested_peptides_6503_lysC.csv


Digested peptides to a structured csv


In [30]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "insulin.fasta"  # Replace with your FASTA file name
output_csv = "digested_peptides_insulin_3.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(0)

# Perform digestion with length constraints (7-40 residues)
min_length = 0
max_length = 400
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Write peptides to a CSV file
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["id", "sequence"])
    for idx, peptide in enumerate(peptides, 1):
        writer.writerow([idx, peptide])

print(f"Digestion complete. {len(peptides)} peptides written to {output_csv}")


Digestion complete. 2 peptides written to digested_peptides_insulin_3.csv


Generate Molecular Formulas

In [32]:
import pandas as pd
import pyopenms as oms

# Load the CSV file
input_file = "md_bsa.csv"  # Replace with your file name
output_file = "mol_bsa.csv"
data = pd.read_csv(input_file)

# Function to compute molecular formula
def get_molecular_formula(sequence):
    try:
        
        seq_obj = oms.AASequence.fromString(sequence)
        return str(seq_obj.getFormula())
    except Exception as e:
        return f"Error: {e}"

# Apply the function to the Sequence column
data["MolecularFormula"] = data["sequence"].apply(get_molecular_formula)

# Save the results to a new CSV
data.to_csv(output_file, index=False)
print(f"Results saved to {output_file}")


Results saved to mol_bsa.csv


Added Oxidated Methionine modification


In [33]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "insulin.fasta"  # Replace with your FASTA file name
output_csv = "md_insulin.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(0)

# Perform digestion with length constraints (7-40 residues)
min_length = 0
max_length = 400
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Define the oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Apply the variable modifications to the digested peptides
modified_peptides = []
for peptide in result:
    oms.ModifiedPeptideGenerator.applyVariableModifications(variable_modifications, peptide, 1, modified_peptides, False)

# Combine original and modified peptides
all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

# Write peptides to a CSV file
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["id", "sequence"])
    for idx, peptide in enumerate(all_peptides, 1):
        writer.writerow([idx, peptide])

print(f"Digestion complete. {len(all_peptides)} peptides written to {output_csv}")


Digestion complete. 2 peptides written to md_insulin.csv


Modified column Names | outputs Csv with Name and Molecular Formulae

In [40]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "6504_lysC.fasta"  # Replace with your FASTA file name
output_csv = "6504_lysC_formulas.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(1)

# Perform digestion with length constraints (7-40 residues)
min_length = 7
max_length = 40
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Define the oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Apply the variable modifications to the digested peptides
modified_peptides = []
for peptide in result:
    oms.ModifiedPeptideGenerator.applyVariableModifications(variable_modifications, peptide, 1, modified_peptides, False)

# Combine original and modified peptides
all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

# Function to compute molecular formula
def get_molecular_formula(sequence):
    try:
        # Convert the sequence to an AASequence object
        seq_obj = oms.AASequence.fromString(sequence)
        # Get the molecular formula
        return str(seq_obj.getFormula())
    except Exception as e:
        return f"Error: {e}"

# Write peptides to a CSV file with Name and Molecular Formula columns
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Name", "Molecular Formula"])
    for peptide in all_peptides:
        name = f"{fasta_file.split('.')[0]}_{peptide}"
        molecular_formula = get_molecular_formula(peptide)
        writer.writerow([name, molecular_formula])

print(f"Digestion complete. {len(all_peptides)} peptides written to {output_csv}")


Digestion complete. 2338 peptides written to 6504_lysC_formulas.csv


Removed peptides with unknown amino acids X

In [44]:
import pyopenms as oms
import csv

# Specify the local FASTA file
fasta_file = "master.fasta"  # Replace with your FASTA file name
output_csv = "peptides_with_formulas.csv"

# Load the protein sequence from the FASTA file
with open(fasta_file, "r") as f:
    lines = f.readlines()
    protein_sequence = "".join(line.strip() for line in lines if not line.startswith(">"))  # Exclude the header

# Convert the protein sequence to an AASequence object
protein = oms.AASequence.fromString(protein_sequence)

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()

# Set digestion enzyme (default is trypsin)
digestion.getEnzymeName()  # Should return "Trypsin"

# Set the number of missed cleavages
digestion.setMissedCleavages(1)

# Perform digestion with length constraints (7-40 residues)
min_length = 6
max_length = 40
result = []
digestion.digest(protein, result, min_length, max_length)

# Prepare peptide data for CSV output
peptides = [peptide.toString() for peptide in result]

# Define the oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Apply the variable modifications to the digested peptides
modified_peptides = []
for peptide in result:
    oms.ModifiedPeptideGenerator.applyVariableModifications(variable_modifications, peptide, 1, modified_peptides, False)

# Combine original and modified peptides
all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

# Remove sequences with 'X' amino acid
filtered_peptides = [peptide for peptide in all_peptides if 'X' not in peptide]

# Function to compute molecular formula
def get_molecular_formula(sequence):
    try:
        # Convert the sequence to an AASequence object
        seq_obj = oms.AASequence.fromString(sequence)
        # Get the molecular formula
        return str(seq_obj.getFormula())
    except Exception as e:
        return f"Error: {e}"

# Write peptides to a CSV file with Name and Molecular Formula columns
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    writer.writerow(["Name", "Molecular Formula"])
    for peptide in filtered_peptides:
        name = f"{fasta_file.split('.')[0]}_{peptide}"
        molecular_formula = get_molecular_formula(peptide)
        writer.writerow([name, molecular_formula])

print(f"Digestion complete. {len(filtered_peptides)} peptides written to {output_csv}")


Digestion complete. 24843 peptides written to peptides_with_formulas.csv


Read files seperately and output to one file

In [None]:
import pyopenms as oms
import csv

# List of FASTA files to process
fasta_files = ["6503_lysC.fasta", "6503_trypsin.fasta", "6504_lysC.fasta", "6504_trypsin.fasta", "6507_lysC.fasta", "6507_trypsin.fasta"]
output_csv = "peptides_with_formulas_seperate.csv"

# Function to read the protein sequence from a FASTA file
def read_protein_sequence(fasta_file):
    with open(fasta_file, "r") as f:
        lines = f.readlines()
        # Concatenate all non-header lines
        return "".join(line.strip() for line in lines if not line.startswith(">"))

# Function to compute molecular formula for a sequence
def get_molecular_formula(sequence):
    try:
        # Convert the sequence to an AASequence object
        seq_obj = oms.AASequence.fromString(sequence)
        # Get the molecular formula
        return str(seq_obj.getFormula())
    except Exception as e:
        return f"Error: {e}"

# Initialize the protease digestion class
digestion = oms.ProteaseDigestion()
digestion.setMissedCleavages(1)  # Allow one missed cleavage

# Define digestion parameters
min_length = 6
max_length = 40

# Define oxidation modification for methionine
variable_mod_names = [b"Oxidation (M)"]
variable_modifications = oms.ModifiedPeptideGenerator.getModifications(variable_mod_names)

# Open the CSV file for writing
with open(output_csv, mode="w", newline="") as file:
    writer = csv.writer(file)
    # Write header
    writer.writerow(["Name", "Molecular Formula"])

    # Process each FASTA file
    for fasta_file in fasta_files:
        # Read protein sequence
        protein_sequence = read_protein_sequence(fasta_file)
        protein = oms.AASequence.fromString(protein_sequence)

        # Perform digestion
        result = []
        digestion.digest(protein, result, min_length, max_length)

        # Prepare peptides
        peptides = [peptide.toString() for peptide in result]

        # Apply variable modifications
        modified_peptides = []
        for peptide in result:
            oms.ModifiedPeptideGenerator.applyVariableModifications(
                variable_modifications, peptide, 1, modified_peptides, False
            )

        # Combine original and modified peptides
        all_peptides = peptides + [peptide.toString() for peptide in modified_peptides]

        # Filter peptides (remove those containing 'X')
        filtered_peptides = [peptide for peptide in all_peptides if 'X' not in peptide]

        # Write peptides and formulas to the CSV
        for peptide in filtered_peptides:
            name = f"{fasta_file.split('.')[0]}_{peptide}"
            molecular_formula = get_molecular_formula(peptide)
            writer.writerow([name, molecular_formula])

print(f"Digestion complete. Results written to {output_csv}.")


PermissionError: [Errno 13] Permission denied: 'peptides_with_formulas.csv'