In [1]:

import pickle
import os
from tqdm import tqdm
import re

import requests
from urllib import request
from zeep import Client
import hashlib

from bs4 import BeautifulSoup
import ast
import pubchempy as pcp
from Bio import Entrez

import pandas as pd
import numpy as np
import math

from utils import *

# 1.Downloading origin data from BRENDA
Downloading data from BRENDA can take a couple of hours. To download the data from BRENDA, a registration is needed (https://www.brenda-enzymes.org/register.php). 

In [None]:
wsdl = "https://www.brenda-enzymes.org/soap/brenda_zeep.wsdl"

# Credentials are read from the BRENDA_EMAIL / BRENDA_PASSWORD environment variables when
# they are set, so real secrets never have to be typed into (or committed with) the notebook.
# The placeholders below are used otherwise and must then be edited by hand.
email = os.environ.get("BRENDA_EMAIL", "your email address") # register in https://www.brenda-enzymes.org/
brenda_password = os.environ.get("BRENDA_PASSWORD", 'password')
# The SOAP calls below are given the hex SHA-256 digest of the password, not the plain text.
password = hashlib.sha256(brenda_password.encode("utf-8")).hexdigest()
client = Client(wsdl)
parameters = (email,password)
EC_numbers = client.service.getEcNumbersFromEcNumber(*parameters)
print("There exist %s different EC numbers in the BRENDA database." % len(EC_numbers))

In [None]:
web_data_save_path = "./brenda_data_cache/EC_number_web_result/"
os.makedirs(web_data_save_path,exist_ok=True)

# Download one BRENDA enzyme page per EC number into the cache directory.
# Pages that are already cached (non-empty file) are skipped so an interrupted run can resume.
failed_ecs = []
for ec in EC_numbers:
    target_file = web_data_save_path+f"{ec}.web"
    if os.path.exists(target_file) and os.path.getsize(target_file) > 0:
        continue
    url = f"https://www.brenda-enzymes.org/enzyme.php?ecno={ec}#TURNOVER%20NUMBER%20[1/s]"
    try:
        # A timeout keeps one stalled connection from hanging the whole download.
        response = requests.get(url, timeout=60)
    except requests.RequestException:
        failed_ecs.append(ec)
        continue
    if response.status_code==200:
        with open(target_file,'w') as f:
            f.write(response.text)
    else:
        failed_ecs.append(ec)
print(f"{len(failed_ecs)} EC pages could not be downloaded.")

In [None]:
def process_kcat_km(target):
    """Parse the cached BRENDA pages and collect one table of kinetic records.

    Args:
        target: 'kcat' selects the div with id 'tab44'; any other value selects 'tab12'.

    Returns:
        A list of rows
        [EC_number, organism, uniprot, literature, substrate, kinetic, commentary],
        all strings; the commentary is lower-cased.
    """
    dataset = []
    target_DivId = 'tab44' if target=='kcat' else 'tab12'
    for EC_web_file_name in tqdm(os.listdir(web_data_save_path)):
        # Context manager so the handle is closed after each page instead of leaking.
        with open(web_data_save_path+EC_web_file_name,'r') as web_file:
            html = web_file.read()

        EC_number = EC_web_file_name.split('.web')[0]
        soup = BeautifulSoup(html, "html.parser")
        table_div = soup.find('div', id=target_DivId)
        if not table_div:
            # No such table on this page: report the EC number and move on.
            print(EC_number)
            continue

        # Table rows are the direct child divs of the table container.
        for row in table_div.find_all('div', recursive=False):
            cells = row.find_all('div', class_='cell')
            if len(cells) != 7:  # only rows with exactly 7 cells are data rows
                continue
            kinetic = cells[0].text.strip()  # kcat/km value
            substrate = cells[1].text.strip()
            organism = cells[2].text.strip()
            uniprot = cells[3].text.strip()  # UniProt ID
            commentary = cells[4].text.strip().lower()
            literature = cells[5].text.strip()  # Literature number
            if 'entries' in organism:
                continue
            dataset.append([EC_number,organism, uniprot,literature,substrate, kinetic, commentary])
    return dataset

kcat_dataset = process_kcat_km('kcat')
km_dataset = process_kcat_km('km')

# 2. Data cleaning
Retrieve SMILES strings for the substrates and remove entries that are missing or abnormal. Downloading data from PubChem can take a couple of hours.

In [None]:
def _clean_kinetic_rows(rows, lower=0.00001, upper=100000):
    """Return the rows whose kinetic value (row[5]) is a usable single number.

    A row is dropped when
      * its value is the placeholder 'additional information',
      * its organism field (row[1]) contains 'entries',
      * its UniProt field (row[2]) lists several IDs separated by ';' or ',' (multi chain),
      * its value contains '-' or cannot be parsed as a float,
      * its value is not strictly between `lower` and `upper`.
    """
    cleaned = []
    for row in rows:
        value = row[5]
        if value == 'additional information' or 'entries' in row[1]:
            continue
        if ';' in row[2] or ',' in row[2]:  # multi chain
            continue
        if '-' in value:
            continue
        try:
            number = float(value)
        except ValueError:
            # Non-numeric text: drop the row instead of aborting the whole cleaning step.
            continue
        if lower < number < upper:
            cleaned.append(row)
    return cleaned


print(len(kcat_dataset))
kcat_dataset = _clean_kinetic_rows(kcat_dataset)
print(len(kcat_dataset))

print(len(km_dataset))
km_dataset = _clean_kinetic_rows(km_dataset)
print(len(km_dataset))

In [None]:
# Delete data for which SMILES cannot be obtained based on the substrate or for which the SMILES format is abnormal.
subs = list(set([row[4] for row in kcat_dataset]+[row[4] for row in km_dataset]))
# Maps substrate name -> compound object returned by get_comp, or -1 when the substrate
# cannot be used (lookup failed twice, no SMILES, or a SMILES containing '.').
smiles_dict={}
for sub in tqdm(subs):
    comp=get_comp(sub)
    if comp == -1: # retry once if failed
        comp=get_comp(sub)
    if comp == -1:
        smiles_dict[sub] = -1
        continue
    smiles = comp.canonical_smiles
    smiles_dict[sub]=-1 if not smiles or '.' in smiles else comp
# Context manager so the cache file is flushed and closed.
with open("./brenda_data_cache/smiles_dict.pkl",'wb') as smiles_file:
    pickle.dump(smiles_dict,smiles_file)

kcat_dataset = [row for row in kcat_dataset if smiles_dict[row[4]]!=-1]
km_dataset = [row for row in km_dataset if smiles_dict[row[4]]!=-1]
print(len(kcat_dataset))
print(len(km_dataset))

In [None]:
# Clean comments by normalizing stray Windows-1252 characters
# `sanitize_column_text` comes from the star import of `utils`; presumably it returns
# rows in the same column layout as its input — TODO confirm against utils.
kcat_dataset_clean = sanitize_column_text(kcat_dataset)
km_dataset_clean = sanitize_column_text(km_dataset)

# 3. Parse experimental conditions
Extract pH, temperature, cosubstrate and buffer from the **comments** using regular expressions and an LLM, then perform an initial clustering.

Cluster records that share the same **EC number, species, UniProt ID, substrate, reference, temperature, pH, cosubstrate and buffer**. Then, discard clusters that do not contain both a wild-type and a mutant entry.

#### (a) Extract pH and temperature fields.

In [None]:
# `Add_temperature_pH_fieds` is provided by `utils`. Later cells read row[7] as the
# temperature and row[8] as the pH, so the helper presumably appends those two
# fields to every row — TODO confirm against utils.
kcat_dataset_with_ph_temp = Add_temperature_pH_fieds(kcat_dataset_clean)
km_dataset_with_ph_temp = Add_temperature_pH_fieds(km_dataset_clean)

#### (b) Initial clustering
The initial clustering is intended to exclude unreasonable clusters and reduce the workload of subsequent LLM extraction and manual review.

In [None]:
def get_cluster(dataset):
    """Group rows into clusters and keep only clusters holding wild-type and mutant data.

    Rows are keyed by columns 0, 1, 2, 3, 4, 7 and 8 joined with ';;;'. Within each
    cluster a row survives only when exactly one of these holds: its comment (row[6])
    contains 'wild', or `extract_mutations` finds at least one mutation in it. A cluster
    whose surviving rows mention different cosubstrate fragments is split into one
    cluster per fragment (key suffix ';;;<fragment>', '-1' when none is mentioned).
    Finally, clusters with fewer than two rows, or lacking a wild-type or a mutant row,
    are removed.

    Returns:
        dict mapping cluster key -> list of rows.
    """
    key_columns = (0, 1, 2, 3, 4, 7, 8)
    cosub_markers = ('cosubstrate', 'co-substrate')

    def cosub_fragment(comment):
        # First comma-separated fragment mentioning a cosubstrate, or -1 when there is none.
        for fragment in comment.split(','):
            if any(marker in fragment for marker in cosub_markers):
                return fragment
        return -1

    cluster_dict = {}
    for row in dataset:
        key = ';;;'.join([row[idx] for idx in key_columns])
        cluster_dict.setdefault(key, []).append(row)
    print("Same EC number, species, uniprotid, substrate, reference, temperature, pH, cluster: ",len(cluster_dict))

    for key in list(cluster_dict):
        # Keep a row when "has no extracted mutation" and "is wild-type" agree.
        kept = [
            row for row in cluster_dict[key]
            if (len(extract_mutations(row[6])) == 0) == ('wild' in row[6])
        ]
        cluster_dict[key] = kept

        labels = [cosub_fragment(row[6]) for row in kept]
        distinct_labels = set(labels)
        if len(distinct_labels) <= 1:  # zero or one cosubstrate description: nothing to split
            continue
        del cluster_dict[key]
        for label in distinct_labels:
            cluster_dict[key + ";;;" + str(label)] = []
        for label, row in zip(labels, kept):
            cluster_dict[key + ";;;" + str(label)].append(row)

    # Delete clusters without wildtype or mutant
    for key in list(cluster_dict):
        members = cluster_dict[key]
        has_wildtype = any('wild' in row[6] for row in members)
        has_mutant = any(len(extract_mutations(row[6])) > 0 for row in members)
        if len(members) < 2 or not has_wildtype or not has_mutant:
            del cluster_dict[key]
    print("remain cluster: ",len(cluster_dict))

    return cluster_dict


# Initial clusters (before buffer information is taken into account) for both kinetic parameters.
kcat_cluster_init = get_cluster(kcat_dataset_with_ph_temp)
km_cluster_init = get_cluster(km_dataset_with_ph_temp)

#### (c) Extract buffer information

In [None]:
# Collect every distinct comment from both cluster sets. The list is sorted so that
# comment.txt has the same line order on every run (set iteration order is not stable),
# which keeps the manual / LLM annotation reproducible.
comments = {row[6] for rows in kcat_cluster_init.values() for row in rows}
comments |= {row[6] for rows in km_cluster_init.values() for row in rows}
comments = sorted(comments)

with open("./brenda_data_cache/comment.txt",'w') as f:
    for com in comments:
        f.write(com+'\n')

Use an LLM and manual review on the saved `comment.txt` to extract buffer information from each comment, producing a dictionary mapping comment to buffer information. Then use this dictionary to further subdivide the existing clusters.

In [None]:
def buffer_split_cluster(cluster_dict, target):
    """Attach buffer information to every row and split clusters that mix buffers.

    The comment -> buffer mapping is loaded from
    ./brenda_data_cache/buffer_mapping.pkl and stored in column 9 of each row
    (appended on the first call, overwritten on a re-run so the cell is idempotent).
    A cluster whose rows all share one buffer is left untouched. Otherwise it is
    replaced by one cluster per buffer (key suffix ';;;<buffer>'), keeping only the
    sub-clusters that contain both a wild-type row ('wild' in the comment) and a
    non-wild-type row.

    Args:
        cluster_dict: dict mapping cluster key -> list of rows; modified in place.
        target: unused; kept so existing calls keep working.

    Returns:
        The same `cluster_dict` object.

    Raises:
        KeyError: if a row's comment is missing from the buffer mapping.
    """
    # NOTE: pickle can execute arbitrary code on load; only use a mapping file you produced yourself.
    with open("./brenda_data_cache/buffer_mapping.pkl",'rb') as mapping_file:
        buffer_dict = pickle.load(mapping_file)

    for rows in cluster_dict.values():
        for row in rows:
            buffer_info = buffer_dict[row[6]]
            if len(row) > 9:
                row[9] = buffer_info
            else:
                row.append(buffer_info)

    for pair_name in list(cluster_dict.keys()):
        rows = cluster_dict[pair_name]
        buffers = [row[9] for row in rows]
        # All data have the same buffer
        if len(set(buffers))==1:
            continue
        # There are different buffers in a cluster
        cluster_dict.pop(pair_name)
        for buffer in dict.fromkeys(buffers):  # distinct buffers, first-seen order
            new_rows = [row for row in rows if row[9] == buffer]
            is_wildtype = ['wild' in row[6] for row in new_rows]
            if all(is_wildtype) or not any(is_wildtype): # exclude lacking wildtype or mutant cluster
                continue
            cluster_dict[pair_name+";;;"+str(buffer)] = new_rows
    return cluster_dict
# Both calls start from the initial clusters; `km_cluster` does not exist yet on a fresh kernel.
kcat_cluster = buffer_split_cluster(kcat_cluster_init,"kcat")
km_cluster = buffer_split_cluster(km_cluster_init,'km')
len(kcat_cluster),len(km_cluster)

# 4. Enzyme information retrieval

In [None]:
# Load the cached cluster dictionaries; these replace the `kcat_cluster` / `km_cluster`
# values computed in the previous section.
# NOTE: pickle can execute arbitrary code on load; only open cache files you produced yourself.
with open("./brenda_data_cache/06kcat_cluster.pkl",'rb') as cluster_file:
    kcat_cluster = pickle.load(cluster_file)
with open("./brenda_data_cache/06km_cluster.pkl",'rb') as cluster_file:
    km_cluster = pickle.load(cluster_file)

#### (a) Download Uniprot ID by EC number and species

In [None]:
# Contact address attached to the NCBI Entrez requests made below; replace it with your own before running.
Entrez.email = "dkroundz@gmail.com"
def get_taxonomy_id(organism_name):
    """Look up the NCBI taxonomy lineage for a species name.

    Despite its name, this function returns the lineage, not the taxonomy ID.
    The first hit of an Entrez taxonomy search is fetched and the "Lineage" field of
    its first record is returned. Up to five attempts are made when a request raises.

    Args:
        organism_name: search term passed to the taxonomy database.

    Returns:
        The lineage of the first hit, or -1 when the search returns no ID or every
        attempt failed (the organism name is printed in the latter case).
    """
    max_attempts = 5
    res = -1
    for attempt in range(max_attempts):
        try:
            handle = Entrez.esearch(db="taxonomy", term=organism_name)
            try:
                record = Entrez.read(handle)
            finally:
                handle.close()  # close even when parsing fails

            if record["IdList"]:
                tax_id = record["IdList"][0]
                handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode="xml")
                try:
                    records = Entrez.read(handle)
                finally:
                    handle.close()
                res = records[0]["Lineage"]
            break
        except Exception:
            # `Exception` (not a bare except) so Ctrl-C / kernel interrupt still stops the loop.
            if attempt == max_attempts-1:
                print(organism_name)
    return res

In [None]:
# 0: EC number; 1: species
# Only clusters whose UniProt field is '-' need a sequence lookup.
ec_organism_pair = {";;;".join([rows[0][0],rows[0][1]]):-1  for rows in list(kcat_cluster.values())+list(km_cluster.values()) if rows[0][2] == '-'}

# Construct the BRENDA client in step 1.
max_attempts = 5
for ec_org in tqdm(list(ec_organism_pair.keys())):
    ec,org = ec_org.split(";;;")
    parameters = (email,password,f"ecNumber*{ec}", "sequence*","noOfAminoAcids*", "firstAccessionCode*","source*","id*",
                f"organism*{org}")
    sequence = -1  # stays -1 when every attempt fails
    for attempt in range(max_attempts):
        try:
            sequence = client.service.getSequence(*parameters)
            break
        except Exception as exc:
            if attempt == max_attempts - 1:
                # Report the caught exception itself, not the Exception class.
                print(f"Attempt {attempt + 1} times. Exception: {exc}")
    # Keep the pair only when exactly one sequence entry is returned.
    if sequence is None or sequence==-1 or len(list(sequence))!=1:
        ec_organism_pair.pop(ec_org)
        continue

    # Check if the species is bacteria
    lineage = get_taxonomy_id(org)
    # get_taxonomy_id returns -1 on failure; `'Bacteria' in -1` would raise TypeError.
    if isinstance(lineage, str) and ('Bacteria' in lineage or 'bacteria' in lineage):
        ec_organism_pair[ec_org]= list(sequence)[0]
    else:
        ec_organism_pair.pop(ec_org)
len(ec_organism_pair)

In [None]:
# To fill in the missing Uniprot IDs and discard those without IDs
def add_uniprotId(cluster_dict):
    """Fill in missing UniProt IDs and drop clusters for which none is known.

    A cluster is treated as lacking an ID when column 2 of its first row is '-'.
    Its ID is then looked up in the module-level `ec_organism_pair` under the key
    '<EC number>;;;<organism>' and written into column 2 of every row; clusters
    without such an entry are removed.

    Args:
        cluster_dict: dict mapping cluster key -> list of rows; modified in place.

    Returns:
        The same `cluster_dict` object.
    """
    for pair_name in list(cluster_dict.keys()):
        rows = cluster_dict[pair_name]
        if rows[0][2] != '-':
            continue
        ec_org = ";;;".join([rows[0][0],rows[0][1]])
        if ec_org not in ec_organism_pair:
            cluster_dict.pop(pair_name)
            continue
        record = ec_organism_pair[ec_org]
        # The lookup cell stores a single sequence record per key; a list of records is
        # also accepted (first element used) so either stored shape works.
        if isinstance(record, (list, tuple)):
            record = record[0]
        new_uniprot = record['firstAccessionCode']
        for row in rows:
            row[2] = new_uniprot
    return cluster_dict

# Fill in UniProt IDs for both cluster sets, then show how many clusters remain in each.
kcat_cluster = add_uniprotId(kcat_cluster)
km_cluster = add_uniprotId(km_cluster)
len(kcat_cluster),len(km_cluster)

#### (b) Download sequences and verify mutation.

In [None]:
UIDs = list(set([rows[0][2] for _,rows in kcat_cluster.items()] + [rows[0][2] for _,rows in km_cluster.items()]))
# Maps UniProt ID -> amino-acid sequence (FASTA body with the header line removed).
UID_Seq_dict = dict()
for uid in tqdm(UIDs):
    url = "https://www.uniprot.org/uniprot/%s.fasta" % uid
    try:
        # Context manager closes the HTTP response; the timeout avoids hanging on a stalled request.
        with request.urlopen(url, timeout=60) as data:
            respdata = data.read().decode("utf-8").strip()
        UID_Seq_dict[uid] = ''.join(respdata.split('\n')[1:])
    except Exception:
        # `Exception` (not a bare except) so a kernel interrupt still stops the loop.
        print(uid, "can not find from uniprot!")

In [None]:
# Residue letters that disqualify a sequence.
error_aa = ['U','O','X','B','J','Z']
def check_aa(seq):
    """Return True when `seq` contains none of the letters listed in `error_aa`."""
    return not any(aa in error_aa for aa in seq)

def _residue_matches(seq, index, residue):
    """True when `index` is a valid non-negative index of `seq` holding `residue`."""
    return 0 <= index < len(seq) and seq[index] == residue

def _mutations_fit_sequence(mut_loc, seq):
    """True when one shift of -1, 0 or +1 places every mutation on its wild-type residue.

    Each mutation is read as <wild-type letter><number><new letter>; the number plus
    the shift is used as an index into `seq`. As in the original check, a mutation
    whose number is >= len(seq) - 1 is rejected outright.
    """
    positions = [int(mut[1:-1]) for mut in mut_loc]
    if any(loc >= len(seq) - 1 for loc in positions):
        return False
    return any(
        all(_residue_matches(seq, loc + dev, mut[0]) for mut, loc in zip(mut_loc, positions))
        for dev in (-1, 0, 1)
    )

def check_mutant(cluster_dict):
    """Drop mutant rows that do not match the UniProt sequence, and clusters left without mutants.

    For every cluster the sequence of the first row's UniProt ID (column 2) is taken
    from the module-level `UID_Seq_dict`. The cluster is removed when no sequence is
    available or the sequence contains a letter from `error_aa`. Rows without
    extracted mutations are kept; rows with mutations are kept only when all their
    mutations fit the sequence under one common shift of -1, 0 or +1. Clusters with
    no remaining mutant row are removed.

    Args:
        cluster_dict: dict mapping cluster key -> list of rows; modified in place.

    Returns:
        The same `cluster_dict` object.
    """
    for pair_name in list(cluster_dict.keys()):
        rows = cluster_dict[pair_name]
        seq = UID_Seq_dict.get(rows[0][2])
        if seq is None or not check_aa(seq):
            cluster_dict.pop(pair_name)
            # Skip the rest: the cluster must be neither re-added nor popped a second time.
            continue

        wt_rows = []
        mut_rows_new = []
        for row in rows:
            mut_loc = extract_mutations(row[6])
            if len(mut_loc) == 0:
                wt_rows.append(row)
            elif _mutations_fit_sequence(mut_loc, seq):
                mut_rows_new.append(row)

        if len(mut_rows_new)==0:
            cluster_dict.pop(pair_name)
        else:
            cluster_dict[pair_name] = wt_rows + mut_rows_new
    return cluster_dict
# Validate mutations against the downloaded sequences, then show how many clusters remain in each set.
kcat_cluster = check_mutant(kcat_cluster)
km_cluster = check_mutant(km_cluster)
len(kcat_cluster),len(km_cluster)

#### (c) Deduplication of identical data
A cluster may contain several wild-type records or several records for the same mutant. Deduplicating them requires manual review, because automatic merging can lose data when the duplicates are not truly equivalent. For clusters with such ambiguous duplicates, we write the comments of the cluster to a text file and resolve them by hand. For example, some comments carry modification suffixes that need to be removed.

In [None]:
def remove_duplicate(cluster_dict):
    """Collapse duplicate wild-type rows and duplicate mutant rows within each cluster.

    Wild-type rows are those whose comment (row[6]) contains 'wild'; mutant rows are
    those for which `extract_mutations` returns at least one mutation. When several
    rows describe the same thing (wild type, or the same joined mutation list), their
    values (row[5]) are joined with ',' and stored in the first such row, which is the
    only one kept. Mutant rows keep the order in which each mutation first appears,
    so the result does not depend on set iteration order.

    Args:
        cluster_dict: dict mapping cluster key -> list of rows; rows are modified in place.

    Returns:
        The same `cluster_dict` object, each cluster being [wild-type row] + mutant rows.

    Raises:
        IndexError: if a cluster has no wild-type row.
    """
    for pair_name in cluster_dict:
        rows = cluster_dict[pair_name]

        # wt
        wt_rows = [row for row in rows if 'wild' in row[6]]
        if not wt_rows:
            raise IndexError(f"cluster {pair_name!r} has no wild-type row")
        if len(wt_rows) != 1:
            wt_rows[0][5] = ",".join([row[5] for row in wt_rows])
        wt_row = wt_rows[0]

        # mut: group rows by their joined mutation list in a single pass
        grouped_mut_rows = {}
        for row in rows:
            mutations = extract_mutations(row[6])
            if len(mutations) > 0:
                grouped_mut_rows.setdefault(",".join(mutations), []).append(row)

        mut_rows = []
        for same_mutation_rows in grouped_mut_rows.values():
            if len(same_mutation_rows) > 1:
                # The first record stores all values measured for this mutation.
                same_mutation_rows[0][5] = ",".join([row[5] for row in same_mutation_rows])
            mut_rows.append(same_mutation_rows[0])

        cluster_dict[pair_name] = [wt_row] + mut_rows
    return cluster_dict
    
# The cluster here is the cluster that has been manually confirmed
# After this step each cluster holds one wild-type row followed by one row per distinct mutation.
kcat_cluster = remove_duplicate(kcat_cluster)
km_cluster = remove_duplicate(km_cluster)
len(kcat_cluster),len(km_cluster)

# 5. Construct mutation effect pairs

In [None]:
def revised_mut_loc(mut_loc,seq):
    """Re-number mutations so that each number is the index of its residue in `seq`.

    Each mutation is read as <wild-type letter><number><new letter>. The shifts
    -1, 0 and +1 are tried in that order; the first shift for which every mutation's
    wild-type letter equals seq[number + shift] is applied to all mutations.
    Indices outside the sequence (including negative ones, which would otherwise
    wrap around to the end) never match.

    Args:
        mut_loc: list of mutation strings, e.g. ['A2G'].
        seq: protein sequence string.

    Returns:
        The list of re-numbered mutation strings (empty list for empty input), or -1
        when no common shift fits.
    """
    # If it is a multiple mutation, all sites must agree on the same shift (-1, 0 or +1).
    for dev in (-1, 0, 1):
        shifted_positions = []
        for mut in mut_loc:
            loc = int(mut[1:-1]) + dev
            if not (0 <= loc < len(seq)) or seq[loc] != mut[0]:
                break
            shifted_positions.append(loc)
        else:
            return [mut[0] + str(loc) + mut[-1] for mut, loc in zip(mut_loc, shifted_positions)]
    return -1

In [None]:
def create_df(cluster_dict,target):
    """Build the table of wild-type / mutant pairs for one kinetic parameter.

    For every cluster the first row whose comment contains 'wild' is the reference.
    Its value string (row[5], possibly several comma-separated numbers) is turned into
    the mean of the log10 values. Every row with extracted mutations yields one output
    record, unless `revised_mut_loc` cannot place its mutations on the sequence (the
    mutations and sequence are printed and the row is skipped).

    Args:
        cluster_dict: dict mapping cluster key -> list of rows.
        target: label used in the value column names, e.g. 'kcat' or 'km'.

    Returns:
        pandas.DataFrame with the columns EcNumber, Organism, Substrate, UniprotId,
        brenda_Ref_Id, Temperature, pH, buffer, sequence, mutant, wt_<target>_log10,
        mut_<target>_log10 and delta_<target>_log10 (mutant minus wild type).
    """
    wt_col, mut_col, delta_col = f'wt_{target}_log10', f'mut_{target}_log10', f'delta_{target}_log10'
    columns = ['EcNumber','Organism','Substrate','UniprotId','brenda_Ref_Id',
               'Temperature','pH','buffer','sequence','mutant',
               wt_col, mut_col, delta_col]

    def mean_log10(values):
        # "1.2,3.4" -> mean of log10 over the comma-separated numbers
        return np.mean([math.log10(float(value)) for value in values.split(',')])

    records = []
    for pair_name in cluster_dict:
        rows = cluster_dict[pair_name]
        wt_row = [row for row in rows if 'wild' in row[6]][0]
        UID = wt_row[2]
        wt_K = mean_log10(wt_row[5])
        # Sequences are stored in UID_Seq_dict by the UniProt download cell.
        seq = UID_Seq_dict[UID]

        for mut_row in rows:
            mutations = extract_mutations(mut_row[6])
            if len(mutations) == 0:
                continue
            mutant_info = revised_mut_loc(mutations,seq)
            if mutant_info==-1: # Dislocation mutation point
                print(mutations,seq)
                continue

            mut_K = mean_log10(mut_row[5])
            records.append({
                'EcNumber': wt_row[0],
                'Organism': wt_row[1].lower(),
                'Substrate': wt_row[4].lower(),
                'UniprotId': UID,
                'brenda_Ref_Id': wt_row[3],
                'Temperature': wt_row[7],
                'pH': wt_row[8],
                'buffer': wt_row[9],
                'sequence': seq,
                'mutant': ",".join(mutant_info),
                wt_col: wt_K,
                mut_col: mut_K,
                delta_col: mut_K-wt_K,
            })
    # `columns` keeps the column set and order even when there are no records.
    df = pd.DataFrame(records, columns=columns)
    return df
# One table per kinetic parameter; `target` only affects the names of the value columns.
kcat_df = create_df(kcat_cluster,'kcat')
km_df = create_df(km_cluster,'km')

In [None]:
def _average_delta(df, target):
    """Group `df` by every descriptive column plus the wt/mut values and average the delta column."""
    group_columns = [
        'EcNumber', 'Organism','Substrate','UniprotId',
        'brenda_Ref_Id','Temperature','pH','buffer','sequence','mutant',
        f'wt_{target}_log10',f'mut_{target}_log10',
    ]
    grouped = df.groupby(group_columns, as_index=False)
    return grouped.agg({f'delta_{target}_log10': 'mean'})


kcat_df_avg = _average_delta(kcat_df, 'kcat')
km_df_avg = _average_delta(km_df, 'km')

# Write the final tables next to the other cached artefacts.
kcat_df_avg.to_csv("./brenda_data_cache/brenda_delta_kcat_df.csv",index=False)
km_df_avg.to_csv("./brenda_data_cache/brenda_delta_km_df.csv",index=False)