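This notebook identifies and disambiguates the organisms reported in BRENDA and UniProt: it merges both organism lists, maps each taxonomy ID to its taxonomic rank, and writes the curated organism table.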

In [1]:
import os
from os.path import join
import pandas as pd
import numpy as np
# Working directory of the kernel process; the data paths defined below are built from it.
CURRENT_DIR = os.getcwd()
# Echo the directory so the reader can see which checkout the notebook ran in.
print(CURRENT_DIR)

d:\Python\aox\enzyme-mining-aox


In [2]:
# Directory layout, rooted at the notebook's working directory.
DATADIR = join(CURRENT_DIR, "data", "aox")
CURATEDIR = join(DATADIR, "graph", "ref_org_acc", "curate")
CACHEDIR = join(DATADIR, "cache")  # cache used by sequence mining and analysis

# Inputs downloaded or curated from the database websites; all live in <DATADIR>/raw.
filenames = {
    key: join(DATADIR, "raw", basename)
    for key, basename in [
        ("brenda_organism_xml", "brenda_organism.xml"),  # organism section of the BRENDA XML
        ("uniprot_sequence", "uniprot_sequence.tsv"),
        ("uniprot_sequence_enhanced", "uniprot_sequence_enhanced.tsv"),  # same accessions, plus the organism ID
    ]
}

# Result table written by this notebook.
filenames["organism"] = join(CURATEDIR, "organisms.tsv")

In [3]:
import re

# Each organism in the BRENDA export appears as a link of the form
#   javascript:Org('<taxid>')"><name></a>
# Group 1 captures the taxonomy ID (digits, optionally with '-'), group 2 the organism name.
pattern = r'javascript:Org\(\'([-\d]+)\'\)\">([^<]+)</a>'
with open(filenames['brenda_organism_xml'], 'r', encoding='utf-8') as handle:
    content = handle.read()

# List of (taxid, name) string tuples, in document order.
matches = re.findall(pattern, content)

len(matches) # 107

107

In [14]:
# One row per BRENDA organism link; both columns hold the strings captured by the regex.
brenda_organism = pd.DataFrame(matches, columns=['taxid', 'name'])
# Number of distinct taxonomy IDs among the BRENDA entries.
brenda_organism['taxid'].nunique(dropna=False) # 61

61

In [11]:
# Keep only the organism name and ID from the UniProt table and
# rename them to match the BRENDA frame's column names.
uniprot_organism = (
    pd.read_csv(filenames['uniprot_sequence_enhanced'], sep='\t')
    .loc[:, ['Organism', 'Organism (ID)']]
    .rename(columns={'Organism': 'name', 'Organism (ID)': 'taxid'})
)
# Number of distinct taxonomy IDs among the UniProt entries.
uniprot_organism['taxid'].nunique(dropna=False)

57

In [6]:
# Stack the BRENDA and UniProt organism lists, normalise taxids to strings so
# both sources compare equal, and keep one row per (taxid, name) pair.
organisms = (
    pd.concat([brenda_organism, uniprot_organism], axis=0, ignore_index=True, sort=False)
    .fillna("")
    .astype({'taxid': str})
    .drop_duplicates(subset=['taxid', 'name'])
)
organisms # 105

Unnamed: 0,taxid,name
0,1442373,Achatina achatina
1,145126,Arion ater
2,1665,Arthrobacter globiformis
3,40380,Aspergillus ochraceus
5,-777,Aspergillus ochraceus AIU 031
...,...,...
180,425011,Aspergillus niger (strain ATCC MYA-4892 / CBS ...
195,104355,Gloeophyllum trabeum (Brown rot fungus) (Agari...
196,6945,Ixodes scapularis (Black-legged tick) (Deer tick)
197,441959,Talaromyces stipitatus (strain ATCC 10500 / CB...


Map each taxonomy ID to its taxonomic rank.

In [7]:
# The taxid -> rank table is read from the project cache directory
# (presumably a copy exported from a local NCBI taxdump -- confirm its provenance).
TAXDUMPDIR_L = join(CACHEDIR, "taxdump_db")
taxid2rank = pd.read_csv(join(TAXDUMPDIR_L, "taxid_rank.tsv"), sep="\t", index_col=0)['rank'].to_dict()


def _rank_of(taxid):
    """Return the rank recorded for ``taxid`` in ``taxid2rank``, or "" if there is none.

    ``taxid`` is the string form stored in ``organisms['taxid']``. Missing IDs
    arrive as "" (they are filled with "" when the frame is built), and an ID
    column that was upcast to float stringifies as e.g. "9606.0"; a bare
    ``int()`` raises ValueError on both, so they are handled explicitly.
    Anything that is not a whole number is treated as unmapped.
    """
    text = str(taxid).strip()
    try:
        key = int(text)
    except ValueError:
        try:
            as_float = float(text)
        except ValueError:
            return ""
        # Rejects "nan", "inf" and fractional values.
        if not as_float.is_integer():
            return ""
        key = int(as_float)
    return taxid2rank.get(key, "")


organisms['rank'] = organisms['taxid'].apply(_rank_of)

In [15]:
# Rank -> type code; the code decides which organisms are exported:
#    1  'species' or 'no rank'
#   -1  '' or 'strain'  (treated as not mapped in the NCBI taxonomy; excluded from the output)
#    2  any other rank (e.g. kingdom, phylum)
RANK_TO_TYPE = {'species': 1, 'no rank': 1, '': -1, 'strain': -1}
DEFAULT_TYPE = 2

# Vectorized lookup: ranks absent from the table map to NaN and then fall back to the default.
organisms['type'] = organisms['rank'].map(RANK_TO_TYPE).fillna(DEFAULT_TYPE).astype('int64')
organisms_sorted = organisms.sort_values(by=['name'], ascending=[True]).reset_index(drop=True)

# Create the output directory first so a fresh checkout does not fail inside to_csv.
os.makedirs(os.path.dirname(filenames["organism"]), exist_ok=True)
organisms_sorted[organisms_sorted['type'] != -1].to_csv(filenames["organism"], sep='\t', index=False)