In [None]:
import pandas as pd

def load_and_preprocess_ptm_file(filepath, ptm_types=None):
    """Load a headerless, tab-separated PTM table and clean it.

    The file is read with six positional columns: Protein_Name, UniProt_ID,
    Modified_Position, PTM_Type, PubMed_IDs and Local_Sequence.

    Parameters
    ----------
    filepath : str, path-like or file-like
        Anything accepted by ``pandas.read_csv``.
    ptm_types : iterable of str, str or None, optional
        PTM_Type values to keep. ``None`` (the default) keeps every type.
        A single string is treated as a one-element list.

    Returns
    -------
    pandas.DataFrame
        Rows that have a UniProt_ID, a numeric Modified_Position and a
        Local_Sequence. Modified_Position is int64 and Local_Sequence has
        leading/trailing "-" characters removed. The original row index is
        preserved (it is not reset).
    """
    columns = ["Protein_Name", "UniProt_ID", "Modified_Position", "PTM_Type", "PubMed_IDs", "Local_Sequence"]
    df = pd.read_csv(filepath, sep="\t", header=None, names=columns, engine="python")

    # Coerce BEFORE dropping missing values: unparseable positions become NaN
    # and are then removed together with the other incomplete rows. Coercing
    # after dropna would leave NaN positions in the result.
    df["Modified_Position"] = pd.to_numeric(df["Modified_Position"], errors="coerce")
    df = df.dropna(subset=["UniProt_ID", "Modified_Position", "Local_Sequence"])

    # "-" is stripped from both ends of the local sequence window.
    df = df.assign(Local_Sequence=df["Local_Sequence"].str.strip("-"))

    if ptm_types is not None:
        if isinstance(ptm_types, str):
            # isin() rejects a bare string; accept it as a single PTM type.
            ptm_types = [ptm_types]
        df = df[df["PTM_Type"].isin(ptm_types)]

    # No NaN is left, so the column can be a true integer column regardless
    # of whether coercion temporarily turned it into floats.
    # NOTE(review): assumes positions are whole numbers; a fractional value
    # would be truncated by this cast.
    return df.astype({"Modified_Position": "int64"})


In [None]:
# PTM_Type values that the loader keeps when filtering each file.
my_ptms = ["Phosphorylation", "Acetylation", "Ubiquitination", "Methylation", "Sumoylation"]
# The first argument is the input file path: here a file literally named
# "Phosphorylation" (no extension), resolved relative to the working directory.
phospho_df = load_and_preprocess_ptm_file("Phosphorylation", ptm_types=my_ptms)
# Plain-text preview of the first rows.
print(phospho_df.head())

  Protein_Name UniProt_ID  Modified_Position         PTM_Type  \
0   MYSC_ACACA     P10569                311  Phosphorylation   
1   MYSB_ACACA     P19706                315  Phosphorylation   
2  14333_ARATH     P42644                162  Phosphorylation   
3  14333_ARATH     P42644                238  Phosphorylation   
4  14335_ARATH     P42645                267  Phosphorylation   

                                          PubMed_IDs         Local_Sequence  
0                                            2530230  TTGEQGRGRSSVYSCPQDPLG  
1                                            2530230  NTGGAGAKKMSTYNVPQNVEQ  
2                         30395287;22092075;25561503  VAYKSASDIATAELAPTHPIR  
3  30291188;24924143;30395287;19880383;20466843;2...  DNLTLWTSDMTDEAGDEIKEA  
4  30291188;23776212;19376835;27531888;30395287;2...           KVDEQAQPPPSQ  


In [None]:

# `phospho_df` is reused from the earlier cell rather than re-read here, so
# that cell must have been run first. The remaining PTM files are named after
# their PTM type and are loaded in this fixed order.
acetyl_df, ubiq_df, methy_df, sumoy_df = (
    load_and_preprocess_ptm_file(ptm_file, ptm_types=my_ptms)
    for ptm_file in ("Acetylation", "Ubiquitination", "Methylation", "Sumoylation")
)

# Stack all per-type tables into one frame with a fresh 0..n-1 index.
all_ptms = pd.concat([phospho_df, acetyl_df, ubiq_df, methy_df, sumoy_df], ignore_index=True)


In [None]:
# Plain-text dump of the combined table; pandas abbreviates long frames with "..." rows when printing.
print(all_ptms)

        Protein_Name UniProt_ID  Modified_Position         PTM_Type  \
0         MYSC_ACACA     P10569                311  Phosphorylation   
1         MYSB_ACACA     P19706                315  Phosphorylation   
2        14333_ARATH     P42644                162  Phosphorylation   
3        14333_ARATH     P42644                238  Phosphorylation   
4        14335_ARATH     P42645                267  Phosphorylation   
...              ...        ...                ...              ...   
2123238   ZSC9_HUMAN     O15535                215      Sumoylation   
2123239   ZSC9_HUMAN     O15535                238      Sumoylation   
2123240   ZZZ3_HUMAN     Q8IYH5                276      Sumoylation   
2123241   ZZZ3_HUMAN     Q8IYH5                647      Sumoylation   
2123242   ZZZ3_HUMAN     Q8IYH5                708      Sumoylation   

                                                PubMed_IDs  \
0                                                  2530230   
1                      

In [None]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m27.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [None]:
from Bio import SeqIO
def load_uniprot_fasta(fasta_path):
    """Parse a FASTA file into a ``{identifier: sequence}`` dictionary.

    For record ids containing "|" the second "|"-separated field is used as
    the key; otherwise the whole record id is used. Sequences are stored as
    plain strings. If several records share a key, the last one wins.
    """
    def key_for(record_id):
        # Ids with "|" separators: take the middle (second) field.
        if "|" in record_id:
            return record_id.split("|")[1]
        return record_id

    return {
        key_for(entry.id): str(entry.seq)
        for entry in SeqIO.parse(fasta_path, "fasta")
    }

def map_ptms_to_sequences(ptm_df, seq_dict, window_size=10):
    """Attach full protein sequences and a window around each PTM site.

    Two columns are added to ``ptm_df`` IN PLACE (the same object is also
    returned):

    * ``Full_Sequence``   - ``seq_dict[UniProt_ID]``, or NaN if the id is absent.
    * ``Sequence_Window`` - up to ``window_size`` residues on each side of the
      modified residue (clipped at the sequence ends), or ``None`` when the
      sequence is missing, the position is missing, or the position falls
      outside the sequence.

    Parameters
    ----------
    ptm_df : pandas.DataFrame
        Must contain ``UniProt_ID`` and ``Modified_Position`` (1-based) columns.
    seq_dict : dict
        Mapping from UniProt id to sequence string.
    window_size : int, optional
        Number of flanking residues on each side (default 10, i.e. windows of
        at most 21 residues).

    Returns
    -------
    pandas.DataFrame
        ``ptm_df`` itself, with the two columns added.
    """
    ptm_df["Full_Sequence"] = ptm_df["UniProt_ID"].map(seq_dict)

    def extract_window(seq, position):
        # Check for missing values BEFORE int(): int(NaN) raises ValueError.
        if pd.isna(seq) or pd.isna(position):
            return None
        pos = int(position) - 1  # 1-based site position -> 0-based index
        if pos < 0 or pos >= len(seq):
            return None
        start = max(0, pos - window_size)
        end = min(len(seq), pos + window_size + 1)
        return seq[start:end]

    # Iterating the two needed columns avoids the per-row Series construction
    # of DataFrame.apply(axis=1) and also works on an empty frame.
    ptm_df["Sequence_Window"] = [
        extract_window(seq, position)
        for seq, position in zip(ptm_df["Full_Sequence"], ptm_df["Modified_Position"])
    ]
    return ptm_df


In [None]:
# Build the identifier -> sequence lookup from the FASTA file in the working directory.
seqs = load_uniprot_fasta("uniprot_sprot.fasta")
# map_ptms_to_sequences adds its columns to the frame it is given and returns
# that same frame, so `ptm_df` and `all_ptms` refer to one object after this line.
ptm_df = map_ptms_to_sequences(all_ptms, seqs)
# Plain-text dump of the annotated table.
print(ptm_df)

        Protein_Name UniProt_ID  Modified_Position         PTM_Type  \
0         MYSC_ACACA     P10569                311  Phosphorylation   
1         MYSB_ACACA     P19706                315  Phosphorylation   
2        14333_ARATH     P42644                162  Phosphorylation   
3        14333_ARATH     P42644                238  Phosphorylation   
4        14335_ARATH     P42645                267  Phosphorylation   
...              ...        ...                ...              ...   
2123238   ZSC9_HUMAN     O15535                215      Sumoylation   
2123239   ZSC9_HUMAN     O15535                238      Sumoylation   
2123240   ZZZ3_HUMAN     Q8IYH5                276      Sumoylation   
2123241   ZZZ3_HUMAN     Q8IYH5                647      Sumoylation   
2123242   ZZZ3_HUMAN     Q8IYH5                708      Sumoylation   

                                                PubMed_IDs  \
0                                                  2530230   
1                      

In [None]:
# Write the annotated table as comma-separated text. The output file name has
# no extension, and with to_csv's defaults the row index is written as the
# first (unnamed) column.
ptm_df.to_csv("all_ptms_fullsequences")