In [1]:
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
import pandas as pd

In [2]:
# Read the metadata table and keep only the rows that name an organism.
df = (
    pd.read_csv('./data/230106_frozen_metadata.csv.gz', low_memory=False)
    .dropna(subset=['organism_name'])
    .reset_index(drop=True)
)

In [3]:
# Collapse the raw table to one row per (organism, molecule) pair.
# `reference_wikidata` becomes the number of raw rows in each pair ('size');
# every taxonomy / chemical-class column keeps the group's first value.
pair_keys = ['organism_name', 'structure_smiles_2D']
first_value_columns = [
    'organism_taxonomy_08genus',
    'organism_taxonomy_06family',
    'organism_taxonomy_05order',
    'organism_taxonomy_04class',
    'organism_taxonomy_03phylum',
    'organism_taxonomy_02kingdom',
    'organism_taxonomy_01domain',
    'structure_taxonomy_classyfire_01kingdom',
    'structure_taxonomy_classyfire_02superclass',
    'structure_taxonomy_classyfire_03class',
    'structure_taxonomy_classyfire_04directparent',
    # add other columns here as needed
]
df_agg = (
    df.groupby(pair_keys)
    .agg({'reference_wikidata': 'size',
          **{column: 'first' for column in first_value_columns}})
    .reset_index()
)

# Per-molecule and per-organism totals of the pair counts computed above.
df_agg['total_papers_molecule'] = df_agg.groupby(
    'structure_smiles_2D')['reference_wikidata'].transform('sum')
df_agg['total_papers_species'] = df_agg.groupby(
    'organism_name')['reference_wikidata'].transform('sum')

# Work on a reproducible random subset of the pairs.
# Set SUBSET_SIZE to None to keep the full database.
SUBSET_SIZE = 50000
if SUBSET_SIZE is not None:
    # Clamp to the number of available rows: DataFrame.sample raises ValueError
    # when asked for more rows than exist (sampling without replacement).
    df_agg = df_agg.sample(n=min(SUBSET_SIZE, len(df_agg)),
                           random_state=42).reset_index(drop=True)
#df_agg = df_agg.dropna(subset=['organism_taxonomy_01domain']).reset_index(drop=True)

In [4]:
# Lineage columns, ordered from the broadest rank down to the organism itself.
taxonomy_columns = [
    'organism_taxonomy_01domain',
    'organism_taxonomy_02kingdom',
    'organism_taxonomy_03phylum',
    'organism_taxonomy_04class',
    'organism_taxonomy_05order',
    'organism_taxonomy_06family',
    'organism_taxonomy_08genus',
    'organism_name',
]

# One row per organism, then only that organism's lineage columns.
unique_species_df = df_agg.drop_duplicates(subset=['organism_name'])
species_features_df = unique_species_df[taxonomy_columns]

In [5]:
import networkx as nx
import pandas as pd
import numpy as np

# Build a directed taxonomy graph in which every edge points from a parent
# taxon to its child taxon.
phylo_tree = nx.DiGraph()

# Ranks ordered from the most specific (the organism itself) up to the domain.
rank_columns_leaf_to_root = [
    'organism_name',
    'organism_taxonomy_08genus',
    'organism_taxonomy_06family',
    'organism_taxonomy_05order',
    'organism_taxonomy_04class',
    'organism_taxonomy_03phylum',
    'organism_taxonomy_02kingdom',
    'organism_taxonomy_01domain',
]

for lineage in unique_species_df[rank_columns_leaf_to_root].itertuples(index=False, name=None):
    # Skip missing ranks (None / NaN) so that each known taxon is linked to its
    # nearest known ancestor. A missing rank must never become a graph node:
    # a NaN node can join unrelated lineages that merely lack the same rank,
    # which distorts every path length computed on the graph.
    known_ranks = [taxon for taxon in lineage if pd.notna(taxon)]
    if not known_ranks:
        continue

    # Register the most specific taxon even when no ancestor is known, so the
    # organism is still present in the graph (as an isolated node).
    phylo_tree.add_node(known_ranks[0])

    for child, parent in zip(known_ranks, known_ranks[1:]):
        # Adjacent ranks can carry the same label (e.g. organism name equal to
        # its genus); skip those to avoid self-loops.
        if child != parent:
            phylo_tree.add_edge(parent, child)

# NOTE: lineages with no known taxon in common are in separate components, so
# shortest-path distances between them do not exist.


In [28]:
# The leaves of the tree are the organisms themselves.
leaf_nodes = unique_species_df.organism_name.tolist()

# Path lengths are measured without regard to edge direction.
undirected_phylo_tree = phylo_tree.to_undirected()

# Hop counts between every pair of nodes, stored as {source: {target: hops}}.
all_path_lengths = {
    source: hops_by_target
    for source, hops_by_target in nx.all_pairs_shortest_path_length(undirected_phylo_tree)
}

KeyboardInterrupt: 

In [None]:
# Keep only organism-to-organism distances.
# Membership is tested against a set: `leaf_nodes` is a list, and scanning it
# once per (source, target) pair makes this filter needlessly slow.
leaf_nodes_set = set(leaf_nodes)
all_path_lengths = {
    source: {target: hops for target, hops in hops_by_target.items()
             if target in leaf_nodes_set}
    for source, hops_by_target in all_path_lengths.items()
    if source in leaf_nodes_set
}

In [9]:
# Create a set for efficient lookup
leaf_nodes_set = set(leaf_nodes)

# Fix the row/column order to the order of `leaf_nodes`. Iterating the set
# instead gives an arbitrary order (string hashing can differ between
# interpreter runs), so matrix rows could not be matched back to organisms by
# position and the result would not be reproducible.
# The same leaves are used for both axes so the matrix is always square.
ordered_leaves = [leaf for leaf in leaf_nodes if leaf in all_path_lengths]

# Pairwise path lengths between organisms; pairs with no recorded path get inf.
leaf_distance_matrix = pd.DataFrame(
    {
        source: {
            target: all_path_lengths[source].get(target, float('inf'))
            for target in ordered_leaves
        }
        for source in ordered_leaves
    }
)

# Pin both axes explicitly rather than relying on how pandas orders
# dict-of-dict input.
leaf_distance_matrix = leaf_distance_matrix.reindex(index=ordered_leaves,
                                                    columns=ordered_leaves)

KeyboardInterrupt: 

In [None]:
from sklearn.manifold import MDS

# Embed the organisms in 10 dimensions from the precomputed pairwise distances
# held in `leaf_distance_matrix`.
mds = MDS(
    n_components=10,
    dissimilarity='precomputed',
    random_state=42,
    n_jobs=-1,
)
embeddings = mds.fit_transform(leaf_distance_matrix)

In [None]:
# Wrap the MDS output in a DataFrame (one row per embedded organism).
embeddings = pd.DataFrame(data=embeddings)

In [None]:
# Label each embedding row with the organism it belongs to.
# The rows of `embeddings` follow the row order of the matrix passed to MDS, so
# the labels must come from `leaf_distance_matrix` itself; taking them from
# another frame attaches the wrong organism whenever the two orders differ.
embeddings.index = leaf_distance_matrix.index

In [None]:
# Write the embeddings to a gzip-compressed CSV file.
embeddings.to_csv(
    "./data/species_Multi_dim_scale.csv.gz",
    compression="gzip",
)