### (1) import data

In [23]:
import pandas as pd

In [24]:
# Names of the TSV columns to load; every other column is skipped on read.
columns = [
    "Sequence",
    "Catalytic activity",
]

In [25]:
# Read the tab-separated export, keeping only the columns listed in `columns`.
df = pd.read_csv(
    "uniprotkb_AND_reviewed_true_2023_10_23.tsv",
    sep="\t",
    usecols=columns,
)


In [26]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,Sequence,Catalytic activity
0,MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENA...,CATALYTIC ACTIVITY: Reaction=NAD(+) = 2'cADPR ...
1,MRFQVIVAAATITMITSYIPGVASQSTSDGDDLFVPVSNFDPKSIF...,CATALYTIC ACTIVITY: Reaction=Hydrolysis of (1-...


# "sequence,[reaction,[chebi],[Rhea],EC],[reaction,[chebi],[Rhea],EC]"

In [27]:

import pandas as pd
import re


def extract_data(activity):
    """Parse a single "CATALYTIC ACTIVITY" annotation chunk.

    Parameters
    ----------
    activity : str
        Text of one annotation, normally starting with
        ``CATALYTIC ACTIVITY: Reaction=``.

    Returns
    -------
    list
        ``[reaction, chebi_numbers, rhea, ec]`` where

        * ``reaction`` is the text between ``Reaction=`` and the first ``;``,
        * ``chebi_numbers`` is the list of digits following each
          ``ChEBI:CHEBI:``,
        * ``rhea`` is the list of digits following each ``Rhea:RHEA:`` or
          ``Rhea:RHEA-COMP:``,
        * ``ec`` is the EC number following ``EC=``.

        A field that is absent is reported as ``" "`` (or ``[" "]`` for the
        two list fields).
    """
    # Reaction equation: non-greedy up to the first ';' after "Reaction=".
    reaction_match = re.search(r"CATALYTIC ACTIVITY: Reaction=(.*?);", activity)
    reaction = reaction_match.group(1) if reaction_match else " "

    # All ChEBI identifiers mentioned in the chunk (digits only).
    chebi_numbers = re.findall(r"ChEBI:CHEBI:(\d+)", activity) or [" "]

    # All Rhea identifiers, including the "RHEA-COMP" variant.
    rhea = re.findall(r"Rhea:RHEA(?:-COMP)?:(\d+)", activity) or [" "]

    # EC number. Besides fully numeric "a.b.c.d", also accept partial
    # ("3.2.2.-") and preliminary ("3.4.24.n2") EC numbers; a purely numeric
    # pattern silently reports those as missing.
    ec_match = re.search(r"EC=(\d+(?:\.(?:\d+|n\d+|-)){3});", activity)
    ec = ec_match.group(1) if ec_match else " "

    return [reaction, chebi_numbers, rhea, ec]

def process_row(row):
    """Turn one table row into ``[sequence, parsed_activities]``.

    ``row`` is indexed by the keys ``"Sequence"`` and ``"Catalytic activity"``.
    When the catalytic-activity value is not a string, the parsed list is
    empty. Otherwise the text is cut in front of every
    ``CATALYTIC ACTIVITY: Reaction=`` marker (the marker stays attached to its
    chunk), and each non-empty chunk is passed to ``extract_data``.
    """
    sequence = row["Sequence"]
    annotation = row["Catalytic activity"]

    # Guard clause: a non-string value yields no activities.
    if not isinstance(annotation, str):
        return [sequence, []]

    # The zero-width lookahead splits without consuming the marker itself.
    chunks = re.split(r"(?=CATALYTIC ACTIVITY: Reaction=)", annotation)
    parsed = [extract_data(chunk) for chunk in chunks if chunk]
    return [sequence, parsed]

# Apply process_row to every row (axis=1) and collect the per-row outputs
# into a plain Python list.
result = (
    df
    .apply(process_row, axis=1)
    .tolist()
)

# # Printing the result
# for item in result:
#     print(item)

    

In [28]:
# Flatten `result` so that each entry pairs a sequence with exactly one
# reaction record; sequences without any reaction get a placeholder record.
exploded_data = []

for sequence, reactions in result:
    if not reactions:
        # NOTE(review): this placeholder holds a single empty string, whereas
        # records produced by extract_data have four fields - confirm that
        # later code only reads index 0 of the record.
        exploded_data.append([sequence, ["", ]])
        continue
    exploded_data.extend([sequence, reaction] for reaction in reactions)

# Print first 10 exploded data points to inspect
for entry in exploded_data[:10]:
    print(entry)

['MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSPNLADKVALNTSVNSIEEIAHQLADVILNR', ["NAD(+) = 2'cADPR + H(+) + nicotinamide", ['15378', '17154', '57540', '194248'], ['75299', '75300'], ' ']]
['MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYELDGLVAREMNGHKMILPIWHKITKNDVLDYSPNLADKVALNTSVNSIEEIAHQLADVILNR', ['H2O + NAD(+) = ADP-D-ribose + H(+) + nicotinamide', ['15377', '15378', '17154', '57540', '57967'], ['16301', '16302'], '3.2.2.6']]
['MSLEQKKGADIISKILQIQNSIGKTTSPSTLKTKLSEISRKEQENARIQSKLSDLQKKKIDIDNKLLKEKQNLIKEEILERKKLEVLTKKQQKDEIEHQKKLKREIDAIKASTQYITDVSISSYNNTIPETEPEYDLFISHASEDKEDFVRPLAETLQQLGVNVWYDEFTLKVGDSLRQKIDSGLRNSKYGTVVLSTDFIKKDWTNYEL

In [29]:
# Number of [sequence, reaction-record] entries in exploded_data.
print(len(exploded_data))

652507


In [30]:
# Each entry of exploded_data is [sequence, record]; record[0] is the
# reaction text.
seqs_list = [seq for seq, _ in exploded_data]
rxn_list = [record[0] for _, record in exploded_data]

# De-duplicate through a set (resulting order is arbitrary).
seqs_set_list = [*set(seqs_list)]
rxn_set_list = [*set(rxn_list)]

print("len(seqs_set_list): ", len(seqs_set_list))
print("len(rxn_set_list): ", len(rxn_set_list))

len(seqs_set_list):  482036
len(rxn_set_list):  12355
