Download the files from the [NCBI Taxonomy FTP](https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/) site and preprocess them to obtain the required mapping files.

The script only needs to be run once for the downloaded database; the cached results can then be reused for each evaluation.

In [1]:
import os
from os.path import join
import pandas as pd

# Directory holding the original .dmp files.
OLD_DATADIR = "D:\\bioinformatics\\taxdump\\new_taxdump"
# Directory that receives the cached, processed .tsv tables.
NEW_DATADIR = "D:\\bioinformatics\\taxdump\\db"

# Lookup table: short key -> full file path. Keys are listed in the order
# original inputs first, cached outputs second.
paths = {
    # original
    **{
        key: join(OLD_DATADIR, filename)
        for key, filename in (
            ("name", "names.dmp"),
            ("rankedlineage", "rankedlineage.dmp"),
        )
    },
    # cached, processed
    **{
        key: join(NEW_DATADIR, filename)
        for key, filename in (
            ("taxid_name", "taxid_name.tsv"),
            ("name_taxid", "name_taxid.tsv"),
            ("taxid_rankedlineage", "taxidrankedlineage.tsv"),
            ("taxid_counts", "taxid_counts.tsv"),
        )
    },
}

name -> taxid and taxid -> name

In [None]:
# Load the raw names table. The file is split on '|' only, so the text fields
# still carry surrounding tab characters, and a fifth column (4) is produced
# that holds nothing we need.
names = pd.read_csv(paths['name'], sep='|', header=None)
names = names.drop(columns=[4])
names.columns = ["tax_id", "name_txt", "unique name", "name class"]

# Strip the tab padding from every text column. The vectorised .str accessor
# is used instead of a per-element lambda so that a missing value is passed
# through rather than raising AttributeError.
for col in ['name_txt', 'unique name', 'name class']:
    names[col] = names[col].str.strip("\t")

# taxid -> name: keep only the rows whose name class is "scientific name".
is_scientific = names['name class'] == "scientific name"
taxid2name = names.loc[is_scientific, ['tax_id', 'name_txt']]
taxid2name.to_csv(paths['taxid_name'], sep="\t", index=False)

# name -> taxid: when a name occurs on several rows, only its first occurrence
# (in file order) is kept, regardless of name class.
is_first_occurrence = ~names['name_txt'].duplicated()
name2taxid = names.loc[is_first_occurrence, ["name_txt", "tax_id"]]
name2taxid.to_csv(paths['name_taxid'], sep="\t", index=False)

taxid -> rankedlineage

In [None]:
import pandas as pd

# name -> tax_id lookup built from the cached name_taxid table.
# NOTE(review): pandas' default NA parsing turns names such as "NA" or "nan"
# into missing values in both reads below — confirm whether such taxon names
# occur in the dump and need keep_default_na=False.
name2taxid_dict = pd.read_csv(paths['name_taxid'], sep="\t", index_col=0)['tax_id'].to_dict()

# Load the ranked lineage table. Splitting on tabs leaves every odd column
# as a separator column, which is dropped; column 0 (the row's tax_id)
# becomes the index and empty rank cells become ''.
rankedlineage = pd.read_csv(paths["rankedlineage"], sep='\t', header=None)
rankedlineage = rankedlineage.drop(columns=[1,3,5,7,9,11,13,15,17,19]).set_index(0).fillna('')
rankedlineage.columns = ["tax_name", "species", "genus", "family", "order", "class", "phylum", "kingdom", "superkingdom"]

# Snapshot the name columns before any *_tax_id column is added.
lineage_cols = list(rankedlineage.columns)

# The row's own tax_id is already known (it is the index). Looking it up again
# by name would return the first taxid that owns that name, which is the wrong
# id for every taxon that shares its name with an earlier entry.
rankedlineage["tax_name_tax_id"] = rankedlineage.index

# Ancestor ranks are only available as names, so translate them through the
# lookup; names that are absent (including the '' placeholder) map to 0.
for c in lineage_cols:
    if c == "tax_name":
        continue
    rankedlineage[f"{c}_tax_id"] = rankedlineage[c].map(lambda x: name2taxid_dict.get(x, 0))

# Write only the id columns, in rank order; the index is not written, so
# tax_name_tax_id is the first column of the output file.
rankedlineage[[f"{c}_tax_id" for c in lineage_cols]].to_csv(paths['taxid_rankedlineage'], sep="\t", index=False)

taxid -> counts

In [2]:
# Reload the cached lineage-id table; its first column is used as the index
# and then copied back in as a regular column so it is counted as well.
taxidrankedlineage = pd.read_csv(paths['taxid_rankedlineage'], sep="\t", index_col=0)
taxidrankedlineage = taxidrankedlineage.assign(tax_name_tax_id=taxidrankedlineage.index)

# Flatten every cell into one long Series and count how often each id occurs.
taxidcounts = taxidrankedlineage.stack().value_counts().to_frame(name='count')

# score is the reciprocal of the count.
taxidcounts['score'] = 1 / taxidcounts['count']
# The placeholder id 0 gets a score of 0.
taxidcounts.loc[0, "score"] = 0

taxidcounts.to_csv(paths['taxid_counts'], sep='\t', index=True)