In [12]:
#Combined Glycosylation data
def parse_glycosylation_file(file_path, glyco_type):
    """Parse a tab-separated glycosylation-site file into a list of records.

    Only rows with exactly six tab-separated columns are used, read in this
    order: protein ID, UniProt accession, site position, modification type,
    ';'-separated PubMed IDs, peptide sequence.

    Args:
        file_path: Path of the text file to read.
        glyco_type: Label stored under ``'glyco_type'`` in every returned record.

    Returns:
        A list of dicts with keys ``protein_id``, ``uniprot_id``,
        ``site_position`` (int), ``modification_type``, ``pubmed_ids`` (list of
        str), ``peptide_sequence`` and ``glyco_type``. If the file cannot be
        opened or read, the error is printed and whatever was collected so far
        is returned.
    """
    glycosylation_sites = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.strip().split('\t')
                if len(parts) != 6:
                    continue

                protein_id, uniprot_id, position_text, modification_type, pubmed_text, peptide_sequence = parts

                # A non-numeric position (e.g. a header row) invalidates only
                # this row. Letting the ValueError reach the outer handler
                # would silently drop every remaining line of the file.
                try:
                    site_position = int(position_text)
                except ValueError:
                    continue

                glycosylation_sites.append({
                    'protein_id': protein_id,
                    'uniprot_id': uniprot_id,
                    'site_position': site_position,
                    'modification_type': modification_type,
                    'pubmed_ids': pubmed_text.split(';'),
                    'peptide_sequence': peptide_sequence,
                    'glyco_type': glyco_type
                })

    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

    return glycosylation_sites

def generate_glycosylation_entries(glycosylation_sites, uniprot_sequences):
    """Create one annotated FASTA entry per glycosylation site.

    For every site whose ``uniprot_id`` is a key of ``uniprot_sequences``, the
    full protein sequence is copied with ``(oG)`` (when ``glyco_type`` is
    ``"O-linked Glycosylation"``) or ``(nG)`` (any other ``glyco_type``)
    inserted right after the modified residue. The header's first
    space-delimited token gets ``|<residue><position><oG|nG>|`` appended.

    Args:
        glycosylation_sites: Iterable of dicts with at least ``uniprot_id``,
            ``site_position`` (1-based int) and ``glyco_type``.
        uniprot_sequences: Dict mapping accession to a dict with ``header``
            and ``sequence`` strings.

    Returns:
        List of ``(header, annotated_sequence)`` tuples, in input order.
    """
    glycosylation_entries = []

    for site in glycosylation_sites:
        uniprot_id = site['uniprot_id']
        if uniprot_id not in uniprot_sequences:
            continue

        protein_data = uniprot_sequences[uniprot_id]
        protein_sequence = protein_data['sequence']
        site_position = site['site_position']

        # Positions are 1-based. Anything outside 1..len(sequence) is skipped;
        # without the lower bound, 0 or a negative position would wrap around
        # via negative indexing and annotate a residue counted from the end.
        if not 1 <= site_position <= len(protein_sequence):
            continue

        modification = "(oG)" if site['glyco_type'] == "O-linked Glycosylation" else "(nG)"
        residue = protein_sequence[site_position - 1]
        modified_protein_sequence = (
            protein_sequence[:site_position] + modification + protein_sequence[site_position:]
        )

        # The tag without its parentheses ("oG"/"nG") goes into the header.
        mod_annotation = f"|{residue}{site_position}{modification[1:3]}|"
        first_token, _, description = protein_data['header'].partition(' ')
        new_header = f"{first_token}{mod_annotation} {description}"

        glycosylation_entries.append((new_header, modified_protein_sequence))

    return glycosylation_entries

def load_uniprot_sequences(fasta_file):
    """Load a FASTA file into a dict keyed by the second '|'-field of each record ID.

    Args:
        fasta_file: FASTA file (path or handle) passed to ``SeqIO.parse``.

    Returns:
        Dict mapping ``record.id.split('|')[1]`` to
        ``{'header': record.description, 'sequence': str(record.seq)}``.
        When several records share a key, the last one wins.

    Raises:
        IndexError: If a record ID contains no ``|`` separator.
    """
    # Imported locally because this cell has no Bio import of its own; without
    # it the function only works after another cell has imported SeqIO.
    from Bio import SeqIO

    uniprot_sequences = {}
    for record in SeqIO.parse(fasta_file, "fasta"):
        uniprot_sequences[record.id.split('|')[1]] = {
            'header': record.description,
            'sequence': str(record.seq)
        }
    return uniprot_sequences

def format_fasta_sequence(sequence, line_length=60):
    """Wrap ``sequence`` into newline-joined chunks of at most ``line_length`` characters."""
    chunks = []
    for start in range(0, len(sequence), line_length):
        chunks.append(sequence[start:start + line_length])
    return '\n'.join(chunks)


def write_fasta(output_file, protein_entries, modification_entries):
    """Write the unmodified proteins followed by the annotated entries as FASTA.

    Args:
        output_file: Path of the file to create or overwrite.
        protein_entries: Dict whose values are dicts with ``header`` and
            ``sequence`` strings; written first, in dict order.
        modification_entries: Iterable of ``(header, sequence)`` tuples,
            written after the protein entries.
    """
    with open(output_file, 'w') as fasta_out:

        def emit(header, sequence):
            # One record: ">" + header line, then the wrapped sequence.
            fasta_out.write(f">{header}\n{format_fasta_sequence(sequence)}\n")

        for entry in protein_entries.values():
            emit(entry['header'], entry['sequence'])

        for header, sequence in modification_entries:
            emit(header, sequence)

# Driver for the combined glycosylation database.
# Input/output locations are hard-coded, machine-specific Windows paths.
o_glycosylation_file = 'C:\\Users\\maitr\\Downloads\\O-linked Glycosylation\\O-linked Glycosylation.txt'
n_glycosylation_file = 'C:\\Users\\maitr\\Downloads\\N-linked Glycosylation\\N-linked Glycosylation.txt'
uniprot_fasta_file = 'C:\\Users\\maitr\\Downloads\\uniprot_sprot.fasta\\uniprot_sprot.fasta'
output_fasta_file = 'C:\\Users\\maitr\\Downloads\\Glycosylation_combined.fasta'

# Parse both site tables; the second argument is the label stored on each record
# and later decides between the "(oG)" and "(nG)" tags.
o_glycosylation_sites = parse_glycosylation_file(o_glycosylation_file, "O-linked Glycosylation")
n_glycosylation_sites = parse_glycosylation_file(n_glycosylation_file, "N-linked Glycosylation")

# Load the reference sequences, keyed by accession
uniprot_sequences = load_uniprot_sequences(uniprot_fasta_file)

# Build one annotated (header, sequence) entry per site
o_glycosylation_entries = generate_glycosylation_entries(o_glycosylation_sites, uniprot_sequences)
n_glycosylation_entries = generate_glycosylation_entries(n_glycosylation_sites, uniprot_sequences)

# Write every original protein first, then the O-linked and N-linked entries
all_modification_entries = o_glycosylation_entries + n_glycosylation_entries
write_fasta(output_fasta_file, uniprot_sequences, all_modification_entries)


In [1]:
#Comprehensive database
import re
from Bio import SeqIO

def parse_modification_file(file_path, mod_type):
    """Parse a tab-separated modification-site file into a list of records.

    Only rows with exactly six tab-separated columns are used, read in this
    order: protein ID, UniProt accession, site position, modification type,
    ';'-separated PubMed IDs, peptide sequence.

    Args:
        file_path: Path of the text file to read.
        mod_type: Label stored under ``'mod_type'`` in every returned record.

    Returns:
        A list of dicts with keys ``protein_id``, ``uniprot_id``,
        ``site_position`` (int), ``modification_type``, ``pubmed_ids`` (list of
        str), ``peptide_sequence`` and ``mod_type``. If the file cannot be
        opened or read, the error is printed and whatever was collected so far
        is returned.
    """
    modifications = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.strip().split('\t')
                if len(parts) != 6:
                    continue

                protein_id, uniprot_id, position_text, modification_type, pubmed_text, peptide_sequence = parts

                # A non-numeric position (e.g. a header row) invalidates only
                # this row. Letting the ValueError reach the outer handler
                # would silently drop every remaining line of the file.
                try:
                    site_position = int(position_text)
                except ValueError:
                    continue

                modifications.append({
                    'protein_id': protein_id,
                    'uniprot_id': uniprot_id,
                    'site_position': site_position,
                    'modification_type': modification_type,
                    'pubmed_ids': pubmed_text.split(';'),
                    'peptide_sequence': peptide_sequence,
                    'mod_type': mod_type
                })

    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

    return modifications

def load_uniprot_sequences(fasta_file):
    """Index the records of a FASTA file by the second '|'-field of their ID.

    Each value is ``{'header': record.description, 'sequence': str(record.seq)}``;
    a later record with the same key replaces an earlier one.
    """
    sequences_by_accession = {}
    for rec in SeqIO.parse(fasta_file, "fasta"):
        entry = {'header': rec.description, 'sequence': str(rec.seq)}
        accession = rec.id.split('|')[1]
        sequences_by_accession[accession] = entry
    return sequences_by_accession

def format_fasta_sequence(sequence, line_length=60):
    """Wrap ``sequence`` into newline-joined chunks of at most ``line_length`` characters."""
    chunks = []
    for start in range(0, len(sequence), line_length):
        chunks.append(sequence[start:start + line_length])
    return '\n'.join(chunks)

def generate_modification_entries(modifications, uniprot_sequences):
    """Create one annotated FASTA entry per modification site.

    For every record whose ``uniprot_id`` is a key of ``uniprot_sequences`` and
    whose ``mod_type`` is one of the supported labels, the full protein
    sequence is copied with ``(<tag>)`` inserted right after the modified
    residue. The header's first space-delimited token gets
    ``|<residue><position><tag>|`` appended.

    Supported ``mod_type`` labels and their tags: Phosphorylation -> P,
    O-linked Glycosylation -> oG, N-linked Glycosylation -> nG,
    Acetylation -> A, Ubiquitination -> U.

    Args:
        modifications: Iterable of dicts with at least ``uniprot_id``,
            ``site_position`` (1-based int) and ``mod_type``.
        uniprot_sequences: Dict mapping accession to a dict with ``header``
            and ``sequence`` strings.

    Returns:
        List of ``(header, annotated_sequence)`` tuples, in input order.
    """
    tags_by_mod_type = {
        "Phosphorylation": "P",
        "O-linked Glycosylation": "oG",
        "N-linked Glycosylation": "nG",
        "Acetylation": "A",
        "Ubiquitination": "U",
    }
    modification_entries = []

    for mod in modifications:
        uniprot_id = mod['uniprot_id']
        if uniprot_id not in uniprot_sequences:
            continue

        # Unsupported labels are skipped. An if/elif chain without a fallback
        # would leave the tag unbound, or carry over the previous record's tag.
        tag = tags_by_mod_type.get(mod['mod_type'])
        if tag is None:
            continue

        protein_data = uniprot_sequences[uniprot_id]
        protein_sequence = protein_data['sequence']
        site_position = mod['site_position']

        # Positions are 1-based. Anything outside 1..len(sequence) is skipped;
        # without the lower bound, 0 or a negative position would wrap around
        # via negative indexing and annotate a residue counted from the end.
        if not 1 <= site_position <= len(protein_sequence):
            continue

        residue = protein_sequence[site_position - 1]
        mod_annotation = f"|{residue}{site_position}{tag}|"
        modified_protein_sequence = (
            protein_sequence[:site_position] + f"({tag})" + protein_sequence[site_position:]
        )

        first_token, _, description = protein_data['header'].partition(' ')
        new_header = f"{first_token}{mod_annotation} {description}"

        modification_entries.append((new_header, modified_protein_sequence))

    return modification_entries

def write_fasta(output_file, protein_entries, modification_entries):
    """Write the unmodified proteins followed by the annotated entries as FASTA.

    Args:
        output_file: Path of the file to create or overwrite.
        protein_entries: Dict whose values are dicts with ``header`` and
            ``sequence`` strings; written first, in dict order.
        modification_entries: Iterable of ``(header, sequence)`` tuples,
            written after the protein entries.
    """
    with open(output_file, 'w') as fasta_out:

        def emit(header, sequence):
            # One record: ">" + header line, then the wrapped sequence.
            fasta_out.write(f">{header}\n{format_fasta_sequence(sequence)}\n")

        for entry in protein_entries.values():
            emit(entry['header'], entry['sequence'])

        for header, sequence in modification_entries:
            emit(header, sequence)

# Driver for the comprehensive (all modification types) database.
# Input/output locations are hard-coded, machine-specific Windows paths.
phosphosite_file = 'C:\\Users\\maitr\\Downloads\\Phosphorylation\\Phosphorylation.txt'
o_glycosylation_file = 'C:\\Users\\maitr\\Downloads\\O-linked Glycosylation\\O-linked Glycosylation.txt'
n_glycosylation_file = 'C:\\Users\\maitr\\Downloads\\N-linked Glycosylation\\N-linked Glycosylation.txt'
acetylation_file = 'C:\\Users\\maitr\\Downloads\\Acetylation\\Acetylation.txt'
ubiquitination_file = 'C:\\Users\\maitr\\Downloads\\Ubiquitination\\Ubiquitination.txt'
uniprot_fasta_file = 'C:\\Users\\maitr\\Downloads\\uniprot_sprot.fasta\\uniprot_sprot.fasta'
output_fasta_file = 'C:\\Users\\maitr\\Downloads\\Combined_database.fasta'

# Parse each site table; the second argument is the label stored on every record
# and must match one of the mod_type strings handled by generate_modification_entries.
phosphosites = parse_modification_file(phosphosite_file, "Phosphorylation")
o_glycosylation_sites = parse_modification_file(o_glycosylation_file, "O-linked Glycosylation")
n_glycosylation_sites = parse_modification_file(n_glycosylation_file, "N-linked Glycosylation")
acetylation_sites = parse_modification_file(acetylation_file, "Acetylation")
ubiquitination_sites = parse_modification_file(ubiquitination_file, "Ubiquitination")

# Load the reference sequences, keyed by accession
uniprot_sequences = load_uniprot_sequences(uniprot_fasta_file)

# Build one annotated (header, sequence) entry per site, per modification type
phosphopeptide_entries = generate_modification_entries(phosphosites, uniprot_sequences)
o_glycosylation_entries = generate_modification_entries(o_glycosylation_sites, uniprot_sequences)
n_glycosylation_entries = generate_modification_entries(n_glycosylation_sites, uniprot_sequences)
acetylation_entries = generate_modification_entries(acetylation_sites, uniprot_sequences)
ubiquitination_entries = generate_modification_entries(ubiquitination_sites, uniprot_sequences)

# Write every original protein first, then all annotated entries in the order concatenated here
all_modification_entries = phosphopeptide_entries + o_glycosylation_entries + n_glycosylation_entries + acetylation_entries + ubiquitination_entries
write_fasta(output_fasta_file, uniprot_sequences, all_modification_entries)


In [5]:
# Phosphorylation database
import re
from Bio import SeqIO

def parse_modification_file(file_path, mod_type):
    """Parse a tab-separated modification-site file into a list of records.

    Only rows with exactly six tab-separated columns are used, read in this
    order: protein ID, UniProt accession, site position, modification type,
    ';'-separated PubMed IDs, peptide sequence.

    Args:
        file_path: Path of the text file to read.
        mod_type: Label stored under ``'mod_type'`` in every returned record.

    Returns:
        A list of dicts with keys ``protein_id``, ``uniprot_id``,
        ``site_position`` (int), ``modification_type``, ``pubmed_ids`` (list of
        str), ``peptide_sequence`` and ``mod_type``. If the file cannot be
        opened or read, the error is printed and whatever was collected so far
        is returned.
    """
    modifications = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.strip().split('\t')
                if len(parts) != 6:
                    continue

                protein_id, uniprot_id, position_text, modification_type, pubmed_text, peptide_sequence = parts

                # A non-numeric position (e.g. a header row) invalidates only
                # this row. Letting the ValueError reach the outer handler
                # would silently drop every remaining line of the file.
                try:
                    site_position = int(position_text)
                except ValueError:
                    continue

                modifications.append({
                    'protein_id': protein_id,
                    'uniprot_id': uniprot_id,
                    'site_position': site_position,
                    'modification_type': modification_type,
                    'pubmed_ids': pubmed_text.split(';'),
                    'peptide_sequence': peptide_sequence,
                    'mod_type': mod_type
                })

    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

    return modifications

def load_uniprot_sequences(fasta_file):
    """Index the records of a FASTA file by the second '|'-field of their ID.

    Each value is ``{'header': record.description, 'sequence': str(record.seq)}``;
    a later record with the same key replaces an earlier one.
    """
    sequences_by_accession = {}
    for rec in SeqIO.parse(fasta_file, "fasta"):
        entry = {'header': rec.description, 'sequence': str(rec.seq)}
        accession = rec.id.split('|')[1]
        sequences_by_accession[accession] = entry
    return sequences_by_accession

def format_fasta_sequence(sequence, line_length=60):
    """Wrap ``sequence`` into newline-joined chunks of at most ``line_length`` characters."""
    chunks = []
    for start in range(0, len(sequence), line_length):
        chunks.append(sequence[start:start + line_length])
    return '\n'.join(chunks)

def generate_phosphosite_entries(modifications, uniprot_sequences):
    """Create one annotated FASTA entry per phosphorylation site.

    For every record whose ``mod_type`` is ``"Phosphorylation"`` and whose
    ``uniprot_id`` is a key of ``uniprot_sequences``, the full protein sequence
    is copied with ``(P)`` inserted right after the modified residue. The
    annotation ``|<residue><position>P|`` is inserted between the second and
    third ``|``-separated fields of the header's first space-delimited token.

    Args:
        modifications: Iterable of dicts with at least ``uniprot_id``,
            ``site_position`` (1-based int) and ``mod_type``.
        uniprot_sequences: Dict mapping accession to a dict with ``header``
            and ``sequence`` strings.

    Returns:
        List of ``(header, annotated_sequence)`` tuples, in input order.

    Raises:
        IndexError: If a header's first token has fewer than three
            ``|``-separated fields.
    """
    phosphosite_entries = []

    for mod in modifications:
        uniprot_id = mod['uniprot_id']
        if uniprot_id not in uniprot_sequences or mod['mod_type'] != "Phosphorylation":
            continue

        protein_data = uniprot_sequences[uniprot_id]
        protein_sequence = protein_data['sequence']
        site_position = mod['site_position']

        # Positions are 1-based. Anything outside 1..len(sequence) is skipped;
        # without the lower bound, 0 or a negative position would wrap around
        # via negative indexing and annotate a residue counted from the end.
        if not 1 <= site_position <= len(protein_sequence):
            continue

        residue = protein_sequence[site_position - 1]
        mod_annotation = f"|{residue}{site_position}P|"
        modified_protein_sequence = (
            protein_sequence[:site_position] + "(P)" + protein_sequence[site_position:]
        )

        # Only the first three '|'-fields of the first header token are kept,
        # with the annotation placed between the second and the third.
        first_token, _, description = protein_data['header'].partition(' ')
        id_fields = first_token.split('|')
        new_header = f"{id_fields[0]}|{id_fields[1]}{mod_annotation}{id_fields[2]} {description}"

        phosphosite_entries.append((new_header, modified_protein_sequence))

    return phosphosite_entries

def write_fasta(output_file, phosphosite_entries):
    """Write ``(header, sequence)`` pairs to ``output_file`` as FASTA records.

    The file is created or overwritten; each sequence is wrapped with
    ``format_fasta_sequence`` before being written.
    """
    with open(output_file, 'w') as fasta_out:
        fasta_out.writelines(
            f">{entry_header}\n{format_fasta_sequence(entry_sequence)}\n"
            for entry_header, entry_sequence in phosphosite_entries
        )

# Driver for the phosphorylation-only database.
# Input/output locations are hard-coded, machine-specific Windows paths.
phosphosite_file = 'C:\\Users\\maitr\\Downloads\\Phosphorylation\\Phosphorylation.txt'
uniprot_fasta_file = 'C:\\Users\\maitr\\Downloads\\uniprot_sprot.fasta\\uniprot_sprot.fasta'
output_fasta_file = 'C:\\Users\\maitr\\Downloads\\Phosphosite.fasta'

# Parse the site table; the label must equal the mod_type string that
# generate_phosphosite_entries filters on.
phosphosites = parse_modification_file(phosphosite_file, "Phosphorylation")

# Load the reference sequences, keyed by accession
uniprot_sequences = load_uniprot_sequences(uniprot_fasta_file)

# Build one annotated (header, sequence) entry per phosphosite
phosphosite_entries = generate_phosphosite_entries(phosphosites, uniprot_sequences)

# Write only the annotated entries (the unmodified proteins are not included)
write_fasta(output_fasta_file, phosphosite_entries)


In [2]:
#N-Linked Glycosylation database

import re
from Bio import SeqIO

def parse_modification_file(file_path, mod_type):
    """Parse a tab-separated modification-site file into a list of records.

    Only rows with exactly six tab-separated columns are used, read in this
    order: protein ID, UniProt accession, site position, modification type,
    ';'-separated PubMed IDs, peptide sequence.

    Args:
        file_path: Path of the text file to read.
        mod_type: Label stored under ``'mod_type'`` in every returned record.

    Returns:
        A list of dicts with keys ``protein_id``, ``uniprot_id``,
        ``site_position`` (int), ``modification_type``, ``pubmed_ids`` (list of
        str), ``peptide_sequence`` and ``mod_type``. If the file cannot be
        opened or read, the error is printed and whatever was collected so far
        is returned.
    """
    modifications = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.strip().split('\t')
                if len(parts) != 6:
                    continue

                protein_id, uniprot_id, position_text, modification_type, pubmed_text, peptide_sequence = parts

                # A non-numeric position (e.g. a header row) invalidates only
                # this row. Letting the ValueError reach the outer handler
                # would silently drop every remaining line of the file.
                try:
                    site_position = int(position_text)
                except ValueError:
                    continue

                modifications.append({
                    'protein_id': protein_id,
                    'uniprot_id': uniprot_id,
                    'site_position': site_position,
                    'modification_type': modification_type,
                    'pubmed_ids': pubmed_text.split(';'),
                    'peptide_sequence': peptide_sequence,
                    'mod_type': mod_type
                })

    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

    return modifications

def load_uniprot_sequences(fasta_file):
    """Index the records of a FASTA file by the second '|'-field of their ID.

    Each value is ``{'header': record.description, 'sequence': str(record.seq)}``;
    a later record with the same key replaces an earlier one.
    """
    sequences_by_accession = {}
    for rec in SeqIO.parse(fasta_file, "fasta"):
        entry = {'header': rec.description, 'sequence': str(rec.seq)}
        accession = rec.id.split('|')[1]
        sequences_by_accession[accession] = entry
    return sequences_by_accession

def format_fasta_sequence(sequence, line_length=60):
    """Wrap ``sequence`` into newline-joined chunks of at most ``line_length`` characters."""
    chunks = []
    for start in range(0, len(sequence), line_length):
        chunks.append(sequence[start:start + line_length])
    return '\n'.join(chunks)

def generate_glycosite_entries(modifications, uniprot_sequences):
    """Create one annotated FASTA entry per N-linked glycosylation site.

    For every record whose ``mod_type`` is ``"N-linked Glycosylation"`` and
    whose ``uniprot_id`` is a key of ``uniprot_sequences``, the full protein
    sequence is copied with ``(nG)`` inserted right after the modified residue.
    The annotation ``|<residue><position>nG|`` is inserted between the second
    and third ``|``-separated fields of the header's first space-delimited token.

    Args:
        modifications: Iterable of dicts with at least ``uniprot_id``,
            ``site_position`` (1-based int) and ``mod_type``.
        uniprot_sequences: Dict mapping accession to a dict with ``header``
            and ``sequence`` strings.

    Returns:
        List of ``(header, annotated_sequence)`` tuples, in input order.

    Raises:
        IndexError: If a header's first token has fewer than three
            ``|``-separated fields.
    """
    glycosite_entries = []

    for mod in modifications:
        uniprot_id = mod['uniprot_id']
        if uniprot_id not in uniprot_sequences or mod['mod_type'] != "N-linked Glycosylation":
            continue

        protein_data = uniprot_sequences[uniprot_id]
        protein_sequence = protein_data['sequence']
        site_position = mod['site_position']

        # Positions are 1-based. Anything outside 1..len(sequence) is skipped;
        # without the lower bound, 0 or a negative position would wrap around
        # via negative indexing and annotate a residue counted from the end.
        if not 1 <= site_position <= len(protein_sequence):
            continue

        residue = protein_sequence[site_position - 1]
        mod_annotation = f"|{residue}{site_position}nG|"
        modified_protein_sequence = (
            protein_sequence[:site_position] + "(nG)" + protein_sequence[site_position:]
        )

        # Only the first three '|'-fields of the first header token are kept,
        # with the annotation placed between the second and the third.
        first_token, _, description = protein_data['header'].partition(' ')
        id_fields = first_token.split('|')
        new_header = f"{id_fields[0]}|{id_fields[1]}{mod_annotation}{id_fields[2]} {description}"

        glycosite_entries.append((new_header, modified_protein_sequence))

    return glycosite_entries

def write_fasta(output_file, glycosite_entries):
    """Write ``(header, sequence)`` pairs to ``output_file`` as FASTA records.

    The file is created or overwritten; each sequence is wrapped with
    ``format_fasta_sequence`` before being written.
    """
    with open(output_file, 'w') as fasta_out:
        fasta_out.writelines(
            f">{entry_header}\n{format_fasta_sequence(entry_sequence)}\n"
            for entry_header, entry_sequence in glycosite_entries
        )

# Driver for the N-linked glycosylation database.
# Input/output locations are hard-coded, machine-specific Windows paths.
glycosite_file = 'C:\\Users\\maitr\\Downloads\\N-linked Glycosylation\\N-linked Glycosylation.txt'
uniprot_fasta_file = 'C:\\Users\\maitr\\Downloads\\uniprot_sprot.fasta\\uniprot_sprot.fasta'
output_fasta_file = 'C:\\Users\\maitr\\Downloads\\N-linked_Glycosite.fasta'

# Parse the site table; the label must equal the mod_type string that
# generate_glycosite_entries filters on.
glycosites = parse_modification_file(glycosite_file, "N-linked Glycosylation")

# Load the reference sequences, keyed by accession
uniprot_sequences = load_uniprot_sequences(uniprot_fasta_file)

# Build one annotated (header, sequence) entry per glycosite
glycosite_entries = generate_glycosite_entries(glycosites, uniprot_sequences)

# Write only the annotated entries (the unmodified proteins are not included)
write_fasta(output_fasta_file, glycosite_entries)


In [3]:
# O-linked Glycosylation database

import re
from Bio import SeqIO

def parse_modification_file(file_path, mod_type):
    """Parse a tab-separated modification-site file into a list of records.

    Only rows with exactly six tab-separated columns are used, read in this
    order: protein ID, UniProt accession, site position, modification type,
    ';'-separated PubMed IDs, peptide sequence.

    Args:
        file_path: Path of the text file to read.
        mod_type: Label stored under ``'mod_type'`` in every returned record.

    Returns:
        A list of dicts with keys ``protein_id``, ``uniprot_id``,
        ``site_position`` (int), ``modification_type``, ``pubmed_ids`` (list of
        str), ``peptide_sequence`` and ``mod_type``. If the file cannot be
        opened or read, the error is printed and whatever was collected so far
        is returned.
    """
    modifications = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.strip().split('\t')
                if len(parts) != 6:
                    continue

                protein_id, uniprot_id, position_text, modification_type, pubmed_text, peptide_sequence = parts

                # A non-numeric position (e.g. a header row) invalidates only
                # this row. Letting the ValueError reach the outer handler
                # would silently drop every remaining line of the file.
                try:
                    site_position = int(position_text)
                except ValueError:
                    continue

                modifications.append({
                    'protein_id': protein_id,
                    'uniprot_id': uniprot_id,
                    'site_position': site_position,
                    'modification_type': modification_type,
                    'pubmed_ids': pubmed_text.split(';'),
                    'peptide_sequence': peptide_sequence,
                    'mod_type': mod_type
                })

    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

    return modifications

def load_uniprot_sequences(fasta_file):
    """Index the records of a FASTA file by the second '|'-field of their ID.

    Each value is ``{'header': record.description, 'sequence': str(record.seq)}``;
    a later record with the same key replaces an earlier one.
    """
    sequences_by_accession = {}
    for rec in SeqIO.parse(fasta_file, "fasta"):
        entry = {'header': rec.description, 'sequence': str(rec.seq)}
        accession = rec.id.split('|')[1]
        sequences_by_accession[accession] = entry
    return sequences_by_accession

def format_fasta_sequence(sequence, line_length=60):
    """Wrap ``sequence`` into newline-joined chunks of at most ``line_length`` characters."""
    chunks = []
    for start in range(0, len(sequence), line_length):
        chunks.append(sequence[start:start + line_length])
    return '\n'.join(chunks)

def generate_oglycosite_entries(modifications, uniprot_sequences):
    """Create one annotated FASTA entry per O-linked glycosylation site.

    For every record whose ``mod_type`` is ``"O-linked Glycosylation"`` and
    whose ``uniprot_id`` is a key of ``uniprot_sequences``, the full protein
    sequence is copied with ``(oG)`` inserted right after the modified residue.
    The annotation ``|<residue><position>oG|`` is inserted between the second
    and third ``|``-separated fields of the header's first space-delimited token.

    Args:
        modifications: Iterable of dicts with at least ``uniprot_id``,
            ``site_position`` (1-based int) and ``mod_type``.
        uniprot_sequences: Dict mapping accession to a dict with ``header``
            and ``sequence`` strings.

    Returns:
        List of ``(header, annotated_sequence)`` tuples, in input order.

    Raises:
        IndexError: If a header's first token has fewer than three
            ``|``-separated fields.
    """
    oglycosite_entries = []

    for mod in modifications:
        uniprot_id = mod['uniprot_id']
        if uniprot_id not in uniprot_sequences or mod['mod_type'] != "O-linked Glycosylation":
            continue

        protein_data = uniprot_sequences[uniprot_id]
        protein_sequence = protein_data['sequence']
        site_position = mod['site_position']

        # Positions are 1-based. Anything outside 1..len(sequence) is skipped;
        # without the lower bound, 0 or a negative position would wrap around
        # via negative indexing and annotate a residue counted from the end.
        if not 1 <= site_position <= len(protein_sequence):
            continue

        residue = protein_sequence[site_position - 1]
        mod_annotation = f"|{residue}{site_position}oG|"
        modified_protein_sequence = (
            protein_sequence[:site_position] + "(oG)" + protein_sequence[site_position:]
        )

        # Only the first three '|'-fields of the first header token are kept,
        # with the annotation placed between the second and the third.
        first_token, _, description = protein_data['header'].partition(' ')
        id_fields = first_token.split('|')
        new_header = f"{id_fields[0]}|{id_fields[1]}{mod_annotation}{id_fields[2]} {description}"

        oglycosite_entries.append((new_header, modified_protein_sequence))

    return oglycosite_entries

def write_fasta(output_file, oglycosite_entries):
    """Write ``(header, sequence)`` pairs to ``output_file`` as FASTA records.

    The file is created or overwritten; each sequence is wrapped with
    ``format_fasta_sequence`` before being written.
    """
    with open(output_file, 'w') as fasta_out:
        fasta_out.writelines(
            f">{entry_header}\n{format_fasta_sequence(entry_sequence)}\n"
            for entry_header, entry_sequence in oglycosite_entries
        )

# Driver for the O-linked glycosylation database.
# Input/output locations are hard-coded, machine-specific Windows paths.
oglycosite_file = 'C:\\Users\\maitr\\Downloads\\O-linked Glycosylation\\O-linked Glycosylation.txt'
uniprot_fasta_file = 'C:\\Users\\maitr\\Downloads\\uniprot_sprot.fasta\\uniprot_sprot.fasta'
output_fasta_file = 'C:\\Users\\maitr\\Downloads\\O-linked_Glycosite.fasta'

# Parse the site table; the label must equal the mod_type string that
# generate_oglycosite_entries filters on.
oglycosites = parse_modification_file(oglycosite_file, "O-linked Glycosylation")

# Load the reference sequences, keyed by accession
uniprot_sequences = load_uniprot_sequences(uniprot_fasta_file)

# Build one annotated (header, sequence) entry per O-linked glycosite
oglycosite_entries = generate_oglycosite_entries(oglycosites, uniprot_sequences)

# Write only the annotated entries (the unmodified proteins are not included)
write_fasta(output_fasta_file, oglycosite_entries)


In [4]:
# Acetylation database

import re
from Bio import SeqIO

def parse_modification_file(file_path, mod_type):
    """Parse a tab-separated modification-site file into a list of records.

    Only rows with exactly six tab-separated columns are used, read in this
    order: protein ID, UniProt accession, site position, modification type,
    ';'-separated PubMed IDs, peptide sequence.

    Args:
        file_path: Path of the text file to read.
        mod_type: Label stored under ``'mod_type'`` in every returned record.

    Returns:
        A list of dicts with keys ``protein_id``, ``uniprot_id``,
        ``site_position`` (int), ``modification_type``, ``pubmed_ids`` (list of
        str), ``peptide_sequence`` and ``mod_type``. If the file cannot be
        opened or read, the error is printed and whatever was collected so far
        is returned.
    """
    modifications = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.strip().split('\t')
                if len(parts) != 6:
                    continue

                protein_id, uniprot_id, position_text, modification_type, pubmed_text, peptide_sequence = parts

                # A non-numeric position (e.g. a header row) invalidates only
                # this row. Letting the ValueError reach the outer handler
                # would silently drop every remaining line of the file.
                try:
                    site_position = int(position_text)
                except ValueError:
                    continue

                modifications.append({
                    'protein_id': protein_id,
                    'uniprot_id': uniprot_id,
                    'site_position': site_position,
                    'modification_type': modification_type,
                    'pubmed_ids': pubmed_text.split(';'),
                    'peptide_sequence': peptide_sequence,
                    'mod_type': mod_type
                })

    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

    return modifications

def load_uniprot_sequences(fasta_file):
    """Index the records of a FASTA file by the second '|'-field of their ID.

    Each value is ``{'header': record.description, 'sequence': str(record.seq)}``;
    a later record with the same key replaces an earlier one.
    """
    sequences_by_accession = {}
    for rec in SeqIO.parse(fasta_file, "fasta"):
        entry = {'header': rec.description, 'sequence': str(rec.seq)}
        accession = rec.id.split('|')[1]
        sequences_by_accession[accession] = entry
    return sequences_by_accession

def format_fasta_sequence(sequence, line_length=60):
    """Wrap ``sequence`` into newline-joined chunks of at most ``line_length`` characters."""
    chunks = []
    for start in range(0, len(sequence), line_length):
        chunks.append(sequence[start:start + line_length])
    return '\n'.join(chunks)

def generate_acetylation_entries(modifications, uniprot_sequences):
    """Create one annotated FASTA entry per acetylation site.

    For every record whose ``mod_type`` is ``"Acetylation"`` and whose
    ``uniprot_id`` is a key of ``uniprot_sequences``, the full protein sequence
    is copied with ``(A)`` inserted right after the modified residue. The
    annotation ``|<residue><position>A|`` is inserted between the second and
    third ``|``-separated fields of the header's first space-delimited token.

    Args:
        modifications: Iterable of dicts with at least ``uniprot_id``,
            ``site_position`` (1-based int) and ``mod_type``.
        uniprot_sequences: Dict mapping accession to a dict with ``header``
            and ``sequence`` strings.

    Returns:
        List of ``(header, annotated_sequence)`` tuples, in input order.

    Raises:
        IndexError: If a header's first token has fewer than three
            ``|``-separated fields.
    """
    acetylation_entries = []

    for mod in modifications:
        uniprot_id = mod['uniprot_id']
        if uniprot_id not in uniprot_sequences or mod['mod_type'] != "Acetylation":
            continue

        protein_data = uniprot_sequences[uniprot_id]
        protein_sequence = protein_data['sequence']
        site_position = mod['site_position']

        # Positions are 1-based. Anything outside 1..len(sequence) is skipped;
        # without the lower bound, 0 or a negative position would wrap around
        # via negative indexing and annotate a residue counted from the end.
        if not 1 <= site_position <= len(protein_sequence):
            continue

        residue = protein_sequence[site_position - 1]
        mod_annotation = f"|{residue}{site_position}A|"
        modified_protein_sequence = (
            protein_sequence[:site_position] + "(A)" + protein_sequence[site_position:]
        )

        # Only the first three '|'-fields of the first header token are kept,
        # with the annotation placed between the second and the third.
        first_token, _, description = protein_data['header'].partition(' ')
        id_fields = first_token.split('|')
        new_header = f"{id_fields[0]}|{id_fields[1]}{mod_annotation}{id_fields[2]} {description}"

        acetylation_entries.append((new_header, modified_protein_sequence))

    return acetylation_entries

def write_fasta(output_file, acetylation_entries):
    """Write ``(header, sequence)`` pairs to ``output_file`` as FASTA records.

    The file is created or overwritten; each sequence is wrapped with
    ``format_fasta_sequence`` before being written.
    """
    with open(output_file, 'w') as fasta_out:
        fasta_out.writelines(
            f">{entry_header}\n{format_fasta_sequence(entry_sequence)}\n"
            for entry_header, entry_sequence in acetylation_entries
        )

# Driver for the acetylation database.
# Input/output locations are hard-coded, machine-specific Windows paths.
acetylation_file = 'C:\\Users\\maitr\\Downloads\\Acetylation\\Acetylation.txt'
uniprot_fasta_file = 'C:\\Users\\maitr\\Downloads\\uniprot_sprot.fasta\\uniprot_sprot.fasta'
output_fasta_file = 'C:\\Users\\maitr\\Downloads\\Acetylation.fasta'

# Parse the site table; the label must equal the mod_type string that
# generate_acetylation_entries filters on.
acetylations = parse_modification_file(acetylation_file, "Acetylation")

# Load the reference sequences, keyed by accession
uniprot_sequences = load_uniprot_sequences(uniprot_fasta_file)

# Build one annotated (header, sequence) entry per acetylation site
acetylation_entries = generate_acetylation_entries(acetylations, uniprot_sequences)

# Write only the annotated entries (the unmodified proteins are not included)
write_fasta(output_fasta_file, acetylation_entries)


In [6]:
# Ubiquitination database
import re
from Bio import SeqIO

def parse_modification_file(file_path, mod_type):
    """Parse a tab-separated modification-site file into a list of records.

    Only rows with exactly six tab-separated columns are used, read in this
    order: protein ID, UniProt accession, site position, modification type,
    ';'-separated PubMed IDs, peptide sequence.

    Args:
        file_path: Path of the text file to read.
        mod_type: Label stored under ``'mod_type'`` in every returned record.

    Returns:
        A list of dicts with keys ``protein_id``, ``uniprot_id``,
        ``site_position`` (int), ``modification_type``, ``pubmed_ids`` (list of
        str), ``peptide_sequence`` and ``mod_type``. If the file cannot be
        opened or read, the error is printed and whatever was collected so far
        is returned.
    """
    modifications = []

    try:
        with open(file_path, 'r') as file:
            for line in file:
                parts = line.strip().split('\t')
                if len(parts) != 6:
                    continue

                protein_id, uniprot_id, position_text, modification_type, pubmed_text, peptide_sequence = parts

                # A non-numeric position (e.g. a header row) invalidates only
                # this row. Letting the ValueError reach the outer handler
                # would silently drop every remaining line of the file.
                try:
                    site_position = int(position_text)
                except ValueError:
                    continue

                modifications.append({
                    'protein_id': protein_id,
                    'uniprot_id': uniprot_id,
                    'site_position': site_position,
                    'modification_type': modification_type,
                    'pubmed_ids': pubmed_text.split(';'),
                    'peptide_sequence': peptide_sequence,
                    'mod_type': mod_type
                })

    except Exception as e:
        print(f"Error reading file {file_path}: {e}")

    return modifications

def load_uniprot_sequences(fasta_file):
    """Index the records of a FASTA file by the second '|'-field of their ID.

    Each value is ``{'header': record.description, 'sequence': str(record.seq)}``;
    a later record with the same key replaces an earlier one.
    """
    sequences_by_accession = {}
    for rec in SeqIO.parse(fasta_file, "fasta"):
        entry = {'header': rec.description, 'sequence': str(rec.seq)}
        accession = rec.id.split('|')[1]
        sequences_by_accession[accession] = entry
    return sequences_by_accession

def format_fasta_sequence(sequence, line_length=60):
    """Wrap ``sequence`` into newline-joined chunks of at most ``line_length`` characters."""
    chunks = []
    for start in range(0, len(sequence), line_length):
        chunks.append(sequence[start:start + line_length])
    return '\n'.join(chunks)

def generate_ubiquitination_entries(modifications, uniprot_sequences):
    """Create one annotated FASTA entry per ubiquitination site.

    For every record whose ``mod_type`` is ``"Ubiquitination"`` and whose
    ``uniprot_id`` is a key of ``uniprot_sequences``, the full protein sequence
    is copied with ``(U)`` inserted right after the modified residue. The
    annotation ``|<residue><position>U|`` is inserted between the second and
    third ``|``-separated fields of the header's first space-delimited token.

    Args:
        modifications: Iterable of dicts with at least ``uniprot_id``,
            ``site_position`` (1-based int) and ``mod_type``.
        uniprot_sequences: Dict mapping accession to a dict with ``header``
            and ``sequence`` strings.

    Returns:
        List of ``(header, annotated_sequence)`` tuples, in input order.

    Raises:
        IndexError: If a header's first token has fewer than three
            ``|``-separated fields.
    """
    ubiquitination_entries = []

    for mod in modifications:
        uniprot_id = mod['uniprot_id']
        if uniprot_id not in uniprot_sequences or mod['mod_type'] != "Ubiquitination":
            continue

        protein_data = uniprot_sequences[uniprot_id]
        protein_sequence = protein_data['sequence']
        site_position = mod['site_position']

        # Positions are 1-based. Anything outside 1..len(sequence) is skipped;
        # without the lower bound, 0 or a negative position would wrap around
        # via negative indexing and annotate a residue counted from the end.
        if not 1 <= site_position <= len(protein_sequence):
            continue

        residue = protein_sequence[site_position - 1]
        mod_annotation = f"|{residue}{site_position}U|"
        modified_protein_sequence = (
            protein_sequence[:site_position] + "(U)" + protein_sequence[site_position:]
        )

        # Only the first three '|'-fields of the first header token are kept,
        # with the annotation placed between the second and the third.
        first_token, _, description = protein_data['header'].partition(' ')
        id_fields = first_token.split('|')
        new_header = f"{id_fields[0]}|{id_fields[1]}{mod_annotation}{id_fields[2]} {description}"

        ubiquitination_entries.append((new_header, modified_protein_sequence))

    return ubiquitination_entries

def write_fasta(output_file, ubiquitination_entries):
    """Write ``(header, sequence)`` pairs to ``output_file`` as FASTA records.

    The file is created or overwritten; each sequence is wrapped with
    ``format_fasta_sequence`` before being written.
    """
    with open(output_file, 'w') as fasta_out:
        fasta_out.writelines(
            f">{entry_header}\n{format_fasta_sequence(entry_sequence)}\n"
            for entry_header, entry_sequence in ubiquitination_entries
        )

# Driver for the ubiquitination database.
# Input/output locations are hard-coded, machine-specific Windows paths.
ubiquitination_file = 'C:\\Users\\maitr\\Downloads\\Ubiquitination\\Ubiquitination.txt'
uniprot_fasta_file = 'C:\\Users\\maitr\\Downloads\\uniprot_sprot.fasta\\uniprot_sprot.fasta'
output_fasta_file = 'C:\\Users\\maitr\\Downloads\\Ubiquitinsite.fasta'

# Parse the site table; the label must equal the mod_type string that
# generate_ubiquitination_entries filters on.
ubiquitinations = parse_modification_file(ubiquitination_file, "Ubiquitination")

# Load the reference sequences, keyed by accession
uniprot_sequences = load_uniprot_sequences(uniprot_fasta_file)

# Build one annotated (header, sequence) entry per ubiquitination site
ubiquitination_entries = generate_ubiquitination_entries(ubiquitinations, uniprot_sequences)

# Write only the annotated entries (the unmodified proteins are not included)
write_fasta(output_fasta_file, ubiquitination_entries)


In [18]:
from Bio import SeqIO
import pandas as pd

# Define file paths (hard-coded, machine-specific locations).
# Every backslash is doubled so that no pair such as "\S" is parsed as an
# invalid escape sequence; the resulting path strings are unchanged.
matrix_file_path = 'C:\\Users\\maitr\\Documents\\JHU\\Software_development\\2024\\Phospho1\\step6-peptide_protein_quantitation\\2024_0404_P_aga_A_E1.txt'
fasta_file_path = 'C:\\Users\\maitr\\Downloads\\O-linked Glycosylation\\draft5.fasta'

# Load the tab-separated matrix file
matrix_df = pd.read_csv(matrix_file_path, delimiter='\t')

# Extract unique protein IDs from the matrix file. Each cell holds one or more
# ';'-separated accessions; entries containing '|' are reduced to their second
# '|'-field. Missing cells are skipped because NaN has no .split().
matrix_protein_ids = set()
for accession_list in matrix_df['Protein.Group.Accessions'].dropna():
    for accession in accession_list.split(';'):
        protein_id = accession.split('|')[1] if '|' in accession else accession
        matrix_protein_ids.add(protein_id)

# Print matrix protein IDs
print(f"Total unique protein IDs in matrix: {len(matrix_protein_ids)}")

# Collect the second '|'-field of every record ID in the FASTA file
fasta_protein_ids = set()
for record in SeqIO.parse(fasta_file_path, "fasta"):
    protein_id = record.id.split('|')[1]
    fasta_protein_ids.add(protein_id)

# Print FASTA protein IDs
print(f"Total unique protein IDs in FASTA: {len(fasta_protein_ids)}")

# Find protein IDs in FASTA but not in matrix
not_in_matrix = fasta_protein_ids - matrix_protein_ids
# Find protein IDs in matrix but not in FASTA
not_in_fasta = matrix_protein_ids - fasta_protein_ids

# Print discrepancies
print(f"Total protein IDs in FASTA but not in matrix: {len(not_in_matrix)}")
print(f"Total protein IDs in matrix but not in FASTA: {len(not_in_fasta)}")

# Collect the discrepancies into DataFrames for display (nothing is written to
# disk). Sorting makes the row order reproducible; set iteration order is not.
not_in_matrix_df = pd.DataFrame(sorted(not_in_matrix), columns=['Protein IDs not in Matrix'])

not_in_fasta_df = pd.DataFrame(sorted(not_in_fasta), columns=['Protein IDs not in FASTA'])

# Print the resulting DataFrames
print("Protein IDs in FASTA but not in Matrix:")
print(not_in_matrix_df)
print("Protein IDs in Matrix but not in FASTA:")
print(not_in_fasta_df)

# Count the FASTA entries as the number of lines that start with ">".
# Splitting the whole text on ">" would also count any ">" occurring inside a
# header line and would be off by one if the file did not start with ">".
with open(fasta_file_path, 'r') as file:
    fasta_entries = [line for line in file if line.startswith('>')]
    total_entries = len(fasta_entries)

print(f"Total number of entries in the FASTA file: {total_entries}")


Total unique protein IDs in matrix: 4226
Total unique protein IDs in FASTA: 4225
Total protein IDs in FASTA but not in matrix: 0
Total protein IDs in matrix but not in FASTA: 1
Protein IDs in FASTA but not in Matrix:
Empty DataFrame
Columns: [Protein IDs not in Matrix]
Index: []
Protein IDs in Matrix but not in FASTA:
  Protein IDs not in FASTA
0                   P0DPQ4
Total number of entries in the FASTA file: 14219
