In [1]:
import os
from os.path import join
import pandas as pd
import numpy as np
# Working directory of the running kernel; printed so the reader can see
# which directory the notebook was executed from.
CURRENT_DIR = os.getcwd()
print(CURRENT_DIR)

d:\Python\aox\enzyme-mining-aox


The graph consists of three kinds of nodes, namely reference literature (ref), organism (org) and enzyme sequence accession (acc), and the edges between them. The data sources are BRENDA and UniProt: the former mainly provides (ref, org) associations, while the latter complements the (ref, org) associations and provides the (ref, acc) and (acc, org) associations. The code here generates data suitable for import into [Cytoscape](https://cytoscape.org/), i.e. an `edge` table and a `node` table.

The files are based on the files generated by the `[5-7]-[ref/seq/org]-annotation.ipynb` scripts.

In [2]:
# Directory layout, all rooted at the current working directory:
#   DATADIR   -> <cwd>/data/aox
#   GRAPHDIR  -> DATADIR/graph/ref_org_acc
#   CURATEDIR -> GRAPHDIR/curate
DATADIR = join(CURRENT_DIR, "data", "aox")
GRAPHDIR = join(DATADIR, "graph", "ref_org_acc")
CURATEDIR = join(GRAPHDIR, "curate")

# Logical name -> file path for every file this notebook reads or writes.
filenames = dict(
    # downloaded or curated from the database websites
    ## edges
    brenda_ref_organism=join(DATADIR, "raw", "brenda_ref_organism.txt"),  # the publication list from BRENDA (BRENDA reference IDs)
    pubmed_ref_organism=join(DATADIR, "raw", "pubmed_ref_organism.txt"),  # the publication list from UniProt (PubMed IDs)
    pubmed_ref_sequence=join(DATADIR, "raw", "pubmed_ref_sequence.txt"),  # the sequence list from UniProt (UniProt accessions)
    brenda_reference=join(DATADIR, "raw", "brenda_reference.tsv"),
    uniprot_sequence=join(DATADIR, "raw", "uniprot_sequence.tsv"),

    ## nodes
    references=join(CURATEDIR, "references.tsv"),
    organisms=join(CURATEDIR, "organisms.tsv"),
    sequences=join(CURATEDIR, "sequences.tsv"),

    # results
    edge=join(GRAPHDIR, "edges.tsv"),
    node=join(GRAPHDIR, "nodes.tsv"),
)

### The edges

In [3]:
def org_split(str_orgs):
    """Extract "Genus species" names from a comma-separated organism string.

    Square brackets are removed, each comma-separated entry is split on
    whitespace, and only the first two words of an entry are kept. Entries
    with fewer than two words are dropped.

    Parameters
    ----------
    str_orgs : str or any
        Organism field, e.g. ``"[Candida] boidinii, Homo sapiens (Human)"``.
        Anything that is not a string (such as the NaN pandas yields for an
        empty cell) is treated as "no organisms".

    Returns
    -------
    list of str
        One ``"Genus species"`` string per usable entry, in input order.
    """
    # Empty table cells arrive as float NaN; they carry no organism names.
    if not isinstance(str_orgs, str):
        return []

    cleaned = str_orgs.replace('[', '').replace(']', '')
    orgs = []
    for str_org in cleaned.split(','):
        # split() without an argument collapses runs of whitespace, so a
        # doubled space cannot produce an empty "species" word.
        words = str_org.split()
        if len(words) >= 2:
            orgs.append(f"{words[0]} {words[1]}")
    return orgs

def refs_split(str_refs):
    """Parse a reference-ID field into a list of integers.

    Parameters
    ----------
    str_refs : str, number or None
        Either a ``;``-separated string of IDs (e.g. ``"123; 456"``), a single
        numeric ID, or a missing value (``None`` / NaN).

    Returns
    -------
    list of int
        The parsed IDs; empty when the field is missing or contains no IDs.

    Raises
    ------
    ValueError
        If a non-empty token of the string is not an integer literal.
    """
    if isinstance(str_refs, str):
        # Skip empty tokens so a trailing or doubled ';' does not reach int('').
        tokens = (part.strip() for part in str_refs.split(';'))
        return [int(token) for token in tokens if token]

    # None must be tested first: np.isnan(None) raises TypeError.
    if str_refs is None or np.isnan(str_refs):
        return []
    return [int(str_refs)]

In [4]:
def _read_id_set(path):
    """Read a text file with one integer ID per line and return the IDs as a set.

    Blank lines (including the one produced by a trailing newline at the end
    of the file) are skipped instead of being passed to ``int``.
    """
    with open(path, "r") as f:
        return {int(line) for line in f if line.strip()}


# Each edge is a (source, target, db) tuple; sets remove duplicates.
edge_reference_organism = set()
edge_reference_accession = set()
edge_accession_organism = set()

# make the edge between reference and organism (from brenda)
brenda_ref_organism = _read_id_set(filenames['brenda_ref_organism'])

ec_reference = pd.read_csv(filenames['brenda_reference'], sep='\t')
# The column headers are stripped of surrounding whitespace before lookup.
ec_reference = ec_reference.rename(columns=str.strip)

for ref, orgs in zip(ec_reference['REF'], ec_reference['ORGANISM (UNIPROT)']):
    if ref in brenda_ref_organism:
        pre = f"Brenda{ref}"
        for org in org_split(orgs):
            edge_reference_organism.add((pre, org, "brenda"))

print(f"Number of edge(ref, org): {len(edge_reference_organism)} (Adding Brenda)")

# make the edge between reference and organism (from uniprot)
pubmed_ref_organism = _read_id_set(filenames['pubmed_ref_organism'])

uniprot_ec = pd.read_csv(filenames['uniprot_sequence'], sep="\t")
for refs, orgs in zip(uniprot_ec['PubMed ID'], uniprot_ec['Organism']):
    for ref in refs_split(refs):
        if ref in pubmed_ref_organism:
            pre = f"Pubmed{ref}"
            for org in org_split(orgs):
                edge_reference_organism.add((pre, org, "uniprot"))

print(f"Number of edge(ref, org): {len(edge_reference_organism)} (Adding UniProt)")

# make the edge between reference and accession (sequence) (from uniprot)
pubmed_ref_sequence = _read_id_set(filenames['pubmed_ref_sequence'])

for refs, accession in zip(uniprot_ec['PubMed ID'], uniprot_ec['Entry Name']):
    for ref in refs_split(refs):
        if ref in pubmed_ref_sequence:
            pre = f"Pubmed{ref}"
            edge_reference_accession.add((pre, accession, "uniprot"))

print(f"Number of edge(ref, seq): {len(edge_reference_accession)} (Adding UniProt)")


# make the edge between accession and organism (from uniprot)
for orgs, accession in zip(uniprot_ec['Organism'], uniprot_ec['Entry Name']):
    for org in org_split(orgs):
        edge_accession_organism.add((org, accession, "uniprot"))
print(f"Number of edge(seq, org): {len(edge_accession_organism)} (Adding UniProt)")


# Collect all edges in one table. The sets are sorted so the row order is the
# same on every run (set iteration order of strings varies between Python
# processes), and the column names are given per frame so an empty edge set
# still yields a frame with the right columns.
edge_columns = ['source', 'target', 'db']
edges = pd.concat([
    pd.DataFrame(sorted(edge_reference_organism), columns=edge_columns),
    pd.DataFrame(sorted(edge_reference_accession), columns=edge_columns),
    pd.DataFrame(sorted(edge_accession_organism), columns=edge_columns),
], ignore_index=True)
# Not written here: filenames['edge'] is written once the edges have been
# checked against the reviewed node table.
# edges.to_csv(filenames['edge'], sep="\t", index=False)


Number of edge(ref, org): 117 (Adding Brenda)
Number of edge(ref, org): 135 (Adding UniProt)
Number of edge(ref, seq): 16 (Adding UniProt)
Number of edge(seq, org): 104 (Adding UniProt)


### The nodes

In [5]:
# EC number used to build the BRENDA literature links.
ec = "1.1.3.13"

# URL templates; "{}" is filled with the database-specific identifier.
urls = {
    "brenda": f"https://www.brenda-enzymes.org/literature.php?e={ec}" + "&r={}",
    "pubmed": "https://pubmed.ncbi.nlm.nih.gov/{}/",
    "uniprot": "https://www.uniprot.org/uniprotkb/{}/entry",
    "ncbi_taxonomy": "https://www.ncbi.nlm.nih.gov/taxonomy/?term={}"
}

# Prefix of a reference representation -> key of its template in `urls`.
_REFERENCE_PREFIXES = (("Pubmed", "pubmed"), ("Brenda", "brenda"))


def make_url(_id:str, _type:str):
    """Build a web link for a graph node.

    Parameters
    ----------
    _id : str
        Node representation: ``"Pubmed<id>"`` or ``"Brenda<id>"`` for a
        reference, an organism name for an organism, an accession for a
        sequence.
    _type : str
        ``"reference"``, ``"organism"`` or ``"accession"``. Any other value,
        or a reference whose prefix is not recognised, produces a generic
        web-search link instead.

    Returns
    -------
    str
        The URL, with every space replaced by ``+``.
    """
    url = None
    if _type == "reference":
        for prefix, key in _REFERENCE_PREFIXES:
            if _id.startswith(prefix):
                # Drop the prefix; the remainder is the database identifier.
                url = urls[key].format(_id[len(prefix):])
                break
    elif _type == "organism":
        url = urls['ncbi_taxonomy'].format(_id)
    elif _type == "accession":
        url = urls['uniprot'].format(_id)

    if url is None:
        # Unknown type, or a reference with an unknown prefix: previously the
        # latter left `url` unassigned and raised UnboundLocalError.
        url = f"https://cn.bing.com/search?q={_id}+{_type}"
    return url.replace(" ", "+")

In [6]:
# # make the nodes from the edges
# # using the set to store the nodes
# node_reference = set()
# node_organism = set()
# node_accession = set()

# for r,o in edge_reference_organism:
#     node_reference.add((r, "reference", make_url(r, "reference")))
#     node_organism.add((o, "organism", make_url(o, "organism")))
# for r,a in edge_reference_accession:
#     node_reference.add((r, "reference", make_url(r, "reference")))
#     node_accession.add((a, "accession", make_url(a, "accession")))
# for a, o in edge_accession_organism:
#     node_organism.add((o, "organism", make_url(o, "organism")))
#     node_accession.add((a, "accession", make_url(a, "accession")))


# # output the nodes to file
# df_node_reference = pd.DataFrame(node_reference, columns = ["name", "type", "url"])
# df_node_reference['representation'] = df_node_reference.index

# df_node_organism = pd.DataFrame(node_organism, columns = ["name", "type", "url"])
# df_node_organism['representation'] = df_node_organism.index

# df_node_accession = pd.DataFrame(node_accession, columns = ["name", "type", "url"])
# df_node_accession['representation'] = df_node_accession.index

# nodes = pd.concat([
#     df_node_reference,
#     df_node_organism,
#     df_node_accession
# ])
# nodes.to_csv(filenames['node'], sep="\t", index=False)


In [7]:
references = pd.read_csv(filenames['references'], sep='\t')

# One node row per curated reference.
content = []
for _, row in references.iterrows():
    ref_type = row['type']
    # Exclusion of irrelevant literature (type 2); skipped before any other
    # field of the row is processed.
    if ref_type == 2:
        continue

    _id = f"r{row['rid']}"
    # Use the BRENDA id when the row has one, otherwise the PubMed id.
    # pd.notna is an explicit missing-value test; the previous `~ np.isnan(...)`
    # was a bitwise invert that only acted as a logical NOT on numpy booleans.
    if pd.notna(row['brenda id']):
        representation = f"Brenda{int(row['brenda id'])}"
    else:
        representation = f"Pubmed{row['pubmed id']}"
    _type = "reference"
    url = make_url(representation, _type)
    # org_type / seq_type do not apply to reference nodes and are set to -1.
    content.append([_id, representation, _type, url, ref_type, -1, -1])

node_reference = pd.DataFrame(
    content,
    columns = ['id', 'representation', 'type', 'url', 'ref_type', 'org_type', 'seq_type']
)
print(len(node_reference)) # 91
node_reference.head(5)

91


Unnamed: 0,id,representation,type,url,ref_type,org_type,seq_type
0,r0,Brenda484905,reference,https://www.brenda-enzymes.org/literature.php?...,1,-1,-1
1,r1,Brenda484906,reference,https://www.brenda-enzymes.org/literature.php?...,1,-1,-1
2,r2,Brenda484907,reference,https://www.brenda-enzymes.org/literature.php?...,1,-1,-1
3,r3,Brenda484908,reference,https://www.brenda-enzymes.org/literature.php?...,1,-1,-1
4,r4,Brenda484909,reference,https://www.brenda-enzymes.org/literature.php?...,1,-1,-1


In [8]:
organisms = pd.read_csv(filenames['organisms'], sep='\t')

# One record per curated organism; ref_type / seq_type do not apply to
# organism nodes and are set to -1.
organism_records = []
for _, organism in organisms.iterrows():
    # use the first two words as representation (Genus + Species),
    # with square brackets removed
    genus_species = " ".join(organism['name'].split(' ')[:2])
    genus_species = genus_species.replace('[', '').replace(']', '')
    organism_records.append({
        'id': f"t{organism['taxid']}",
        'representation': genus_species,
        'type': "organism",
        'url': make_url(genus_species, "organism"),
        'ref_type': -1,
        'org_type': organism['type'],
        'seq_type': -1,
    })

# Several rows can share one Genus + Species representation; only the first
# row per representation is kept.
node_organism = pd.DataFrame(
    organism_records,
    columns=['id', 'representation', 'type', 'url', 'ref_type', 'org_type', 'seq_type'],
).drop_duplicates(subset=['representation'])
print(len(node_organism)) # 71
node_organism.head(5)

71


Unnamed: 0,id,representation,type,url,ref_type,org_type,seq_type
0,t1442373,Achatina achatina,organism,https://www.ncbi.nlm.nih.gov/taxonomy/?term=Ac...,-1,1,-1
1,t145126,Arion ater,organism,https://www.ncbi.nlm.nih.gov/taxonomy/?term=Ar...,-1,1,-1
2,t1665,Arthrobacter globiformis,organism,https://www.ncbi.nlm.nih.gov/taxonomy/?term=Ar...,-1,1,-1
3,t40380,Aspergillus ochraceus,organism,https://www.ncbi.nlm.nih.gov/taxonomy/?term=As...,-1,1,-1
4,t33178,Aspergillus terreus,organism,https://www.ncbi.nlm.nih.gov/taxonomy/?term=As...,-1,1,-1


In [9]:
sequences = pd.read_csv(filenames['sequences'], sep='\t')

# One node row per curated sequence, in the column order
# id, representation, type, url, ref_type, org_type, seq_type.
# ref_type / org_type do not apply to sequence nodes and are set to -1.
sequence_rows = [
    [
        f"s{seq['sid']}",
        seq['uniprot id'],
        "accession",
        make_url(seq['uniprot id'], "accession"),
        -1,
        -1,
        seq['type'],
    ]
    for _, seq in sequences.iterrows()
]

node_sequence = pd.DataFrame(
    sequence_rows,
    columns=['id', 'representation', 'type', 'url', 'ref_type', 'org_type', 'seq_type'],
)
print(len(node_sequence)) # 104
node_sequence.head(5)

104


Unnamed: 0,id,representation,type,url,ref_type,org_type,seq_type
0,s0,I7CMK2_MONPR,accession,https://www.uniprot.org/uniprotkb/I7CMK2_MONPR...,-1,-1,1
1,s1,A8DPS4_GLOTR,accession,https://www.uniprot.org/uniprotkb/A8DPS4_GLOTR...,-1,-1,1
2,s2,A0A977TIR6_PHACH,accession,https://www.uniprot.org/uniprotkb/A0A977TIR6_P...,-1,-1,1
3,s3,Q5S057_CANBO,accession,https://www.uniprot.org/uniprotkb/Q5S057_CANBO...,-1,-1,1
4,s4,ALOX1_KOMPC,accession,https://www.uniprot.org/uniprotkb/ALOX1_KOMPC/...,-1,-1,1


In [10]:
# Reviewed node table: reference, organism and sequence nodes stacked into a
# single frame with a fresh 0..n-1 index; missing values become empty strings.
nodes = pd.concat(
    (node_reference, node_organism, node_sequence),
    axis=0,
    ignore_index=True,
    sort=False,
)
nodes = nodes.fillna("")

nodes.to_csv(filenames['node'], sep="\t", index=False)

### The edges (after review)

In [11]:
# Distinct node representations, used for membership tests on edge endpoints.
set_node_representations = set(nodes['representation'].tolist())
len(set_node_representations) # 266

266

In [12]:
# References that have both a PubMed id (anything other than '-') and a BRENDA id.
df_pubmed2brenda = references.loc[
    references['pubmed id'].ne('-') & references['brenda id'].notna(),
    ['pubmed id', 'brenda id'],
]

# "Pubmed<id>" -> "Brenda<id>" for those references.
pubmed2brenda = {}
for pubmed_id, brenda_id in zip(df_pubmed2brenda['pubmed id'], df_pubmed2brenda['brenda id']):
    # .0f prints the BRENDA id without a decimal part.
    pubmed2brenda[f"Pubmed{pubmed_id}"] = f"Brenda{brenda_id:.0f}"

rename_dict = pubmed2brenda

In [13]:
# Keep only the edges whose two endpoints, after renaming, both exist in the
# reviewed node table.
new_edges = []
# itertuples yields plain (source, target, db) tuples; this replaces the
# positional `row[0]` lookups on a string-labelled Series, which pandas has
# deprecated.
for source, target, db in edges.itertuples(index=False, name=None):
    # rename
    _edge = [rename_dict.get(source, source), rename_dict.get(target, target), db]
    # check the existence
    if (_edge[0] in set_node_representations) and (_edge[1] in set_node_representations):
        new_edges.append(_edge)
reviewed_edges = pd.DataFrame(new_edges, columns=edges.columns)
print(len(reviewed_edges)) # 227 in the recorded run
reviewed_edges.to_csv(filenames['edge'], sep="\t", index=False)

227
