## Do something interesting with NCBI's taxonomy data

I'm not sure where I'm going with this yet; maybe I'll create embeddings that could be useful for a future AI model.

## Import useful libraries

In [17]:
import pandas as pd
from neo4j import GraphDatabase

## User settings

In [2]:
import os

# Locations of the taxonomy dump files read by the cells below.
path_nodes = '/Volumes/LaCie/EDA-Data/taxonomy/nodes.dmp'
path_names = '/Volumes/LaCie/EDA-Data/taxonomy/names.dmp'

# Neo4j connection settings. Each value can be overridden with an environment
# variable (NEO4J_URI, NEO4J_USER, NEO4J_PASSWORD) so that credentials do not
# have to be edited into the notebook; the previous literals are the fallbacks.
# NOTE(review): the fallback password is still stored in this notebook --
# consider rotating it and removing the default.
URI = os.environ.get('NEO4J_URI', 'bolt://localhost:7687')
AUTH = (
    os.environ.get('NEO4J_USER', 'neo4j'),
    os.environ.get('NEO4J_PASSWORD', 'aoeuI823'),
)

## Process the taxonomy names file

In [3]:
# Read the names file, splitting on '|'. Only the first four fields are used,
# so the remaining ones are skipped at parse time via `usecols` instead of
# being parsed and then thrown away.
df_names = pd.read_csv(path_names, sep='|', header=None, usecols=[0, 1, 2, 3])
df_names.columns = ['tax_id', 'name_txt', 'unique_name', 'name_class']

# Splitting on '|' alone leaves tab characters inside each text field; remove
# them and trim surrounding whitespace. The vectorized .str methods pass
# missing values through rather than raising on a non-string entry.
for column_name in ['name_txt', 'unique_name', 'name_class']:
    df_names[column_name] = (
        df_names[column_name].str.replace('\t', '', regex=False).str.strip()
    )

In [4]:
# Preview the (tax_id, name_txt) pairs of rows whose name_class is 'scientific name'.
df_names.loc[df_names['name_class'] == 'scientific name', ['tax_id', 'name_txt']]

Unnamed: 0,tax_id,name_txt
1,1,root
2,2,Bacteria
12,6,Azorhizobium
15,7,Azorhizobium caulinodans
27,9,Buchnera aphidicola
...,...,...
4016655,3137766,Vreelandella
4016657,3137844,Acrophialophora guangdongensis
4016670,3137845,Acrophialophora minuta
4016683,3137846,Acrophialophora multiforma


In [5]:
# Spot-check: every name record stored for tax_id 9605.
df_names.loc[df_names['tax_id'] == 9605]

Unnamed: 0,tax_id,name_txt,unique_name,name_class
41215,9605,"Homo Linnaeus, 1758",,authority
41216,9605,Homo,,scientific name
41217,9605,humans,,common name


## Process the taxonomy relationships file

In [6]:
# Read the nodes file, splitting on '|'. Only the first three fields are used,
# so the remaining ones are skipped at parse time via `usecols` instead of
# being parsed and then thrown away.
df_relation = pd.read_csv(path_nodes, sep='|', header=None, usecols=[0, 1, 2])
df_relation.columns = ['tax_id', 'parent_tax_id', 'rank']

# Splitting on '|' alone leaves tab characters inside the rank field; remove
# them and trim surrounding whitespace. The vectorized .str methods pass
# missing values through rather than raising on a non-string entry.
df_relation['rank'] = df_relation['rank'].str.replace('\t', '', regex=False).str.strip()

In [7]:
# Display the parsed (tax_id, parent_tax_id, rank) table.
df_relation

Unnamed: 0,tax_id,parent_tax_id,rank
0,1,1,no rank
1,2,131567,superkingdom
2,6,335928,genus
3,7,6,species
4,9,32199,species
...,...,...,...
2570965,3137766,28256,genus
2570966,3137844,389487,species
2570967,3137845,389487,species
2570968,3137846,389487,species


In [8]:
# Spot-check: the parent/rank record for tax_id 9605.
df_relation.loc[df_relation['tax_id'] == 9605]

Unnamed: 0,tax_id,parent_tax_id,rank
7723,9605,207598,genus


## Merge

In [9]:
# Attach each taxon's rank to its scientific name. The left join keeps every
# scientific-name row even if no matching tax_id exists in df_relation.
df_taxonomy_nodes = (
    df_names
    .loc[df_names['name_class'] == 'scientific name', ['tax_id', 'name_txt']]
    .merge(df_relation[['tax_id', 'rank']], how='left', on='tax_id')
)

In [10]:
# Spot-check: the merged node row for tax_id 9605.
df_taxonomy_nodes.loc[df_taxonomy_nodes['tax_id'] == 9605]

Unnamed: 0,tax_id,name_txt,rank
7723,9605,Homo,genus


## Final nodes to load into Neo4j

In [11]:
# Display the merged (tax_id, name_txt, rank) node table.
df_taxonomy_nodes

Unnamed: 0,tax_id,name_txt,rank
0,1,root,no rank
1,2,Bacteria,superkingdom
2,6,Azorhizobium,genus
3,7,Azorhizobium caulinodans,species
4,9,Buchnera aphidicola,species
...,...,...,...
2570965,3137766,Vreelandella,genus
2570966,3137844,Acrophialophora guangdongensis,species
2570967,3137845,Acrophialophora minuta,species
2570968,3137846,Acrophialophora multiforma,species


## Final relationships to load into Neo4j

In [12]:
# Child -> parent pairs, leaving out rows where a taxon is listed as its own
# parent (tax_id equal to parent_tax_id).
df_final_relationships = (
    df_relation
    .loc[df_relation['tax_id'] != df_relation['parent_tax_id'], ['tax_id', 'parent_tax_id']]
    .copy()
)

In [13]:
# Display the (tax_id, parent_tax_id) pairs that will become edges.
df_final_relationships

Unnamed: 0,tax_id,parent_tax_id
1,2,131567
2,6,335928
3,7,6
4,9,32199
5,10,1706371
...,...,...
2570965,3137766,28256
2570966,3137844,389487
2570967,3137845,389487
2570968,3137846,389487


## QA

In [14]:
# Pool every id that appears as a child or as a parent, then reduce the pool
# to a sorted list of distinct ids.
all_ids = [*df_relation['tax_id'], *df_relation['parent_tax_id']]
unique_tax_id_list = sorted(set(all_ids))

In [15]:
# Number of distinct ids seen across the tax_id and parent_tax_id columns.
print(len(unique_tax_id_list))

2570970


## Load into Neo4j

#### Nodes

Only run this once! I should have used a MERGE statement instead of a CREATE statement.

In [None]:
def load_taxonomy_nodes_tx(tx, tax_id, name, rank):
    """Upsert a single ``Taxonomy`` node inside a transaction.

    The node is matched on ``ncbi_tax_id`` with ``MERGE`` rather than created
    unconditionally, so running the load more than once updates the existing
    node instead of inserting a duplicate.

    Args:
        tx: Transaction object exposing ``run(query, **parameters)``.
        tax_id: NCBI taxonomy identifier; coerced to a built-in ``int``.
        name: Value stored in the node's ``name`` property.
        rank: Value stored in the node's ``rank`` property.

    Returns:
        None.
    """
    # NOTE(review): MERGE looks the node up by ncbi_tax_id on every call; a
    # uniqueness constraint (or index) on :Taxonomy(ncbi_tax_id) is presumably
    # needed to keep that lookup fast on the full data set -- confirm.
    tx.run(
        """
        MERGE (t:Taxonomy {ncbi_tax_id: $tax_id})
        SET t.name = $name, t.rank = $rank
        """,
        # int() turns numpy integer scalars into native Python ints.
        tax_id=int(tax_id),
        name=name,
        rank=rank,
    )

# Write every taxonomy node to Neo4j, one managed write transaction per row.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        # The merged node table is named df_taxonomy_nodes (the name
        # df_final_taxonomy_nodes is never defined and raised NameError).
        # itertuples avoids building a Series object for every row.
        for node in df_taxonomy_nodes.itertuples(index=False):
            session.execute_write(load_taxonomy_nodes_tx, node.tax_id, node.name_txt, node.rank)

#### Edges

In [None]:
def load_taxonomy_edges_tx(tx, tax_id, parent_tax_id):
    """Link a ``Taxonomy`` node to its parent inside a transaction.

    Both nodes are looked up by ``ncbi_tax_id`` with ``MATCH``; when both are
    found, a ``HAS_TAXONOMY_PARENT`` relationship from child to parent is
    merged, so an already existing relationship is not duplicated. If either
    node is missing, the ``MATCH`` yields nothing and no relationship is
    written.

    Args:
        tx: Transaction object exposing ``run(query, **parameters)``.
        tax_id: NCBI taxonomy identifier of the child; coerced to ``int``.
        parent_tax_id: NCBI taxonomy identifier of the parent; coerced to
            ``int``.

    Returns:
        None.
    """
    tx.run(
        """
        MATCH (t:Taxonomy {ncbi_tax_id: $tax_id}), (p:Taxonomy {ncbi_tax_id: $parent_tax_id})
        MERGE (t)-[r:HAS_TAXONOMY_PARENT]->(p)
        RETURN t.ncbi_tax_id AS ncbi_tax_id, p.ncbi_tax_id AS parent_tax_id
        """,
        # int() turns numpy integer scalars into native Python ints.
        tax_id=int(tax_id),
        parent_tax_id=int(parent_tax_id),
    )
    
# Write every child -> parent edge to Neo4j, one managed write transaction per row.
with GraphDatabase.driver(URI, auth=AUTH) as driver:
    with driver.session(database="neo4j") as session:
        # itertuples avoids building a Series object for every row and drops
        # the unused index value that iterrows returned.
        for edge in df_final_relationships.itertuples(index=False):
            session.execute_write(load_taxonomy_edges_tx, edge.tax_id, edge.parent_tax_id)