In [1]:
import networkx as nx
import itertools
import pandas as pd
import numpy as np
from stellargraph import StellarGraph
from rdkit.Chem import AllChem, DataStructs
import category_encoders as ce

# Read the frozen metadata table, discard records that have no organism name,
# and renumber the surviving rows from zero.
df = (
    pd.read_csv('./data/230106_frozen_metadata.csv.gz', low_memory=False)
    .dropna(subset=['organism_name'])
    .reset_index(drop=True)
)

2023-07-25 11:34:02.550618: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [9]:
# Display the column names of the loaded metadata table.
df.columns

Index(['structure_wikidata', 'structure_inchikey', 'structure_inchi',
       'structure_smiles', 'structure_molecular_formula',
       'structure_exact_mass', 'structure_xlogp', 'structure_smiles_2D',
       'structure_cid', 'structure_nameIupac', 'structure_nameTraditional',
       'structure_stereocenters_total', 'structure_stereocenters_unspecified',
       'structure_taxonomy_npclassifier_01pathway',
       'structure_taxonomy_npclassifier_02superclass',
       'structure_taxonomy_npclassifier_03class',
       'structure_taxonomy_classyfire_chemontid',
       'structure_taxonomy_classyfire_01kingdom',
       'structure_taxonomy_classyfire_02superclass',
       'structure_taxonomy_classyfire_03class',
       'structure_taxonomy_classyfire_04directparent', 'organism_wikidata',
       'organism_name', 'organism_taxonomy_gbifid', 'organism_taxonomy_ncbiid',
       'organism_taxonomy_ottid', 'organism_taxonomy_01domain',
       'organism_taxonomy_02kingdom', 'organism_taxonomy_03phylum'

In [83]:
# Remove duplicate organism-molecule pairs: collapse `df` to one row per
# (organism_name, structure_smiles_2D) combination.
#   * reference_wikidata -> number of source rows for the pair ('size')
#   * taxonomy / ClassyFire columns -> first value found in the group
#     ('first'; pandas' GroupBy.first takes the first non-null entry)
# A single groupby is enough; an earlier size-only aggregation that was
# immediately overwritten has been dropped.
df_agg = df.groupby(['organism_name', 'structure_smiles_2D']).agg({
    'reference_wikidata': 'size',
    'organism_taxonomy_08genus': 'first',
    'organism_taxonomy_06family': 'first',
    'organism_taxonomy_05order': 'first',
    'organism_taxonomy_04class': 'first',
    'organism_taxonomy_03phylum': 'first',
    'organism_taxonomy_02kingdom': 'first',
    'organism_taxonomy_01domain': 'first',
    'structure_taxonomy_classyfire_01kingdom': 'first',
    'structure_taxonomy_classyfire_02superclass': 'first',
    'structure_taxonomy_classyfire_03class': 'first',
    'structure_taxonomy_classyfire_04directparent': 'first',
    # add other columns here as needed
}).reset_index()

# Per-molecule and per-species totals of the pair counts, broadcast back onto
# every row of the aggregated table.
df_agg['total_papers_molecule'] = df_agg.groupby(
    'structure_smiles_2D')['reference_wikidata'].transform('sum')
df_agg['total_papers_species'] = df_agg.groupby(
    'organism_name')['reference_wikidata'].transform('sum')

# Get a random subset of the database (leave commented to use the full DB).
#df_agg = df_agg.sample(n=50000, random_state=42).reset_index(drop=True)

In [82]:
# Load the GBIF table, using the first CSV column as the DataFrame index.
# NOTE(review): presumably this holds one row per organism_name (it is joined
# on that column afterwards) — confirm there are no duplicate names.
gbif = pd.read_csv("./data_gbif/GBIF.csv.gz", index_col=0)

In [84]:
# Attach the GBIF columns to every organism-molecule row. This is an inner
# join on organism_name, so rows whose organism is absent from `gbif` are
# dropped. Re-running this cell on an already merged `df_agg` would duplicate
# the GBIF columns with _x/_y suffixes.
df_agg = pd.merge(df_agg, gbif, how='inner', on='organism_name')

In [104]:
# Show the rows whose `genus` column is null.
df_agg[df_agg['genus'].isna()]

Unnamed: 0,organism_name,structure_smiles_2D,reference_wikidata,organism_taxonomy_08genus,organism_taxonomy_06family,organism_taxonomy_05order,organism_taxonomy_04class,organism_taxonomy_03phylum,organism_taxonomy_02kingdom,organism_taxonomy_01domain,...,classKey,orderKey,familyKey,genusKey,synonym,class,species,speciesKey,acceptedUsageKey,note
1493,Acacia glauca,CC1OC(Oc2c(-c3ccc(O)c(O)c3)oc3cc(O)cc(O)c3c2=O...,1,Acaciella,Fabaceae,Fabales,Magnoliopsida,Streptophyta,Archaeplastida,Eukaryota,...,220.0,1370.0,5386.0,,False,Magnoliopsida,,,,
1494,Acacia glauca,O=C(O)C1CCC(O)CN1,4,Acaciella,Fabaceae,Fabales,Magnoliopsida,Streptophyta,Archaeplastida,Eukaryota,...,220.0,1370.0,5386.0,,False,Magnoliopsida,,,,
1519,Acacia horrida,COc1cc(-c2oc3cc(O)cc(O)c3c(=O)c2OC2OC(COC3OC(C...,1,Vachellia,Fabaceae,Fabales,Magnoliopsida,Streptophyta,Archaeplastida,Eukaryota,...,220.0,1370.0,5386.0,,False,Magnoliopsida,,,,
1520,Acacia horrida,O=C(O)c1cc(O)c(O)c(O)c1,2,Vachellia,Fabaceae,Fabales,Magnoliopsida,Streptophyta,Archaeplastida,Eukaryota,...,220.0,1370.0,5386.0,,False,Magnoliopsida,,,,
1521,Acacia horrida,Oc1cc(O)c2c(c1)OC(c1cc(O)c(O)c(O)c1)C(O)C2,1,Vachellia,Fabaceae,Fabales,Magnoliopsida,Streptophyta,Archaeplastida,Eukaryota,...,220.0,1370.0,5386.0,,False,Magnoliopsida,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
437953,Zygophyllum coccineum,CC1CCC2(C(=O)OC3OC(CO)C(O)C(O)C3O)CCC3(C(=O)O)...,2,Tetraena,Zygophyllaceae,Zygophyllales,Magnoliopsida,Streptophyta,Archaeplastida,Eukaryota,...,220.0,7219293.0,2394.0,,False,Magnoliopsida,,,,
437954,Zygophyllum coccineum,CC1CCC2(C(=O)OC3OC(CO)C(O)C(O)C3O)CCC3(C(=O)O)...,1,Tetraena,Zygophyllaceae,Zygophyllales,Magnoliopsida,Streptophyta,Archaeplastida,Eukaryota,...,220.0,7219293.0,2394.0,,False,Magnoliopsida,,,,
437955,Zygophyllum coccineum,CC1CCC2(C(=O)OC3OC(CO)C(O)C(O)C3O)CCC3(C(=O)O)...,1,Tetraena,Zygophyllaceae,Zygophyllales,Magnoliopsida,Streptophyta,Archaeplastida,Eukaryota,...,220.0,7219293.0,2394.0,,False,Magnoliopsida,,,,
437956,Zygophyllum coccineum,CCC(CCC(C)C1CCC2C3CC=C4CC(OC5OC(CO)C(O)C(O)C5O...,1,Tetraena,Zygophyllaceae,Zygophyllales,Magnoliopsida,Streptophyta,Archaeplastida,Eukaryota,...,220.0,7219293.0,2394.0,,False,Magnoliopsida,,,,


In [100]:
# Display the column names of the aggregated table.
df_agg.columns

Index(['organism_name', 'structure_smiles_2D', 'reference_wikidata',
       'organism_taxonomy_08genus', 'organism_taxonomy_06family',
       'organism_taxonomy_05order', 'organism_taxonomy_04class',
       'organism_taxonomy_03phylum', 'organism_taxonomy_02kingdom',
       'organism_taxonomy_01domain', 'structure_taxonomy_classyfire_01kingdom',
       'structure_taxonomy_classyfire_02superclass',
       'structure_taxonomy_classyfire_03class',
       'structure_taxonomy_classyfire_04directparent', 'total_papers_molecule',
       'total_papers_species', 'usageKey', 'scientificName', 'canonicalName',
       'rank', 'status', 'confidence', 'matchType', 'kingdom', 'phylum',
       'order', 'family', 'genus', 'kingdomKey', 'phylumKey', 'classKey',
       'orderKey', 'familyKey', 'genusKey', 'synonym', 'class', 'species',
       'speciesKey', 'acceptedUsageKey', 'note'],
      dtype='object')

In [None]:
# Export the 2D SMILES column (one entry per organism-molecule row, together
# with the row index) to CSV.
df_agg['structure_smiles_2D'].to_csv("./data/smiles_struct.csv")

In [None]:
# One representative row per species and per molecule (first occurrence kept).
unique_species_df = df_agg.drop_duplicates(subset='organism_name')
unique_molecules_df = df_agg.drop_duplicates(subset='structure_smiles_2D')

# Columns used as categorical features for each node type.
species_feature_cols = [
    'organism_taxonomy_01domain',
    'organism_taxonomy_02kingdom',
    'organism_taxonomy_03phylum',
    'organism_taxonomy_04class',
    'organism_taxonomy_05order',
    'organism_taxonomy_06family',
    'organism_taxonomy_08genus',
    'organism_name',
]
molecule_feature_cols = [
    'structure_taxonomy_classyfire_01kingdom',
    'structure_taxonomy_classyfire_02superclass',
    'structure_taxonomy_classyfire_03class',
    'structure_taxonomy_classyfire_04directparent',
]

species_features_df = unique_species_df[species_feature_cols]
molecule_features_df = unique_molecules_df[molecule_feature_cols]

# Binary-encode every species feature column.
encoder = ce.BinaryEncoder(cols=list(species_feature_cols))
species_features_dummy = encoder.fit_transform(species_features_df)

# Index the encoded species features by species name.
species_features_dummy.index = unique_species_df['organism_name'].tolist()

# One-hot molecule features are currently disabled:
#molecule_features_dummy = pd.get_dummies(molecule_features_df)
#molecule_features_dummy.index = [i for i in unique_molecules_df['structure_smiles_2D']]

In [27]:
# Build a directed graph with one node per molecule (2D SMILES) and one node
# per species. Each organism-molecule row of `df_agg` contributes two edges:
#   molecule -> species  (label "present_in")
#   species  -> molecule (label "has")
# Nodes carry a "label" attribute of either 'molecule' or 'species'.
#
# The two columns are iterated directly instead of using DataFrame.iterrows(),
# which builds a full Series for every row, and node labels are set when the
# node is added instead of through a per-row nx.set_node_attributes call.
g = nx.DiGraph()
for molecule, organism in zip(df_agg['structure_smiles_2D'],
                              df_agg['organism_name']):
    g.add_node(molecule, label='molecule')
    g.add_node(organism, label='species')

    g.add_edge(molecule, organism, label="present_in")
    # create the edge in the opposite direction
    g.add_edge(organism, molecule, label="has")

# Edge weights are currently not set. An earlier variant attached the row's
# 'reference_wikidata' value as a 'weight' attribute on both directions; to
# restore that, also iterate over df_agg['reference_wikidata'] and pass
# weight=<value> to both add_edge calls.

In [28]:
from itertools import combinations

# Keep only the rows where 'organism_taxonomy_08genus' is not null.
df_agg_sub = df_agg[df_agg['organism_taxonomy_08genus'].notnull()]

# Group the rows by genus.
grouped = df_agg_sub.groupby('organism_taxonomy_08genus')

# For every genus, link all pairs of distinct species in both directions.
for genus, group in grouped:
    # `group` has one row per organism-molecule pair, so a species name is
    # repeated once per molecule. Deduplicate before pairing: otherwise
    # combinations() yields (species, species) self-loops and a number of
    # pairs that is quadratic in rows rather than in species.
    species_in_genus = group['organism_name'].unique()
    for species1, species2 in combinations(species_in_genus, 2):
        g.add_edge(species1, species2, label="same_genus")
        g.add_edge(species2, species1, label="same_genus")

In [None]:
# Morgan fingerprint features for every unique molecule.
# Naming caveat: `fps` holds the parsed RDKit molecules and `mols` holds the
# fingerprint bit vectors (the names are kept as-is for other cells).
# NOTE(review): MolFromSmiles returns None for SMILES it cannot parse, which
# would make the fingerprint call fail — assumes every SMILES here is valid.
fps = list(map(AllChem.MolFromSmiles, unique_molecules_df['structure_smiles_2D']))
mols = [
    AllChem.GetMorganFingerprintAsBitVect(parsed, radius=2, nBits=1024)
    for parsed in fps
]

# One row per molecule, one column per fingerprint bit, indexed by SMILES.
mol_dum = pd.DataFrame(
    [np.array(bit_vector) for bit_vector in mols],
    index=list(unique_molecules_df['structure_smiles_2D']),
)

# Disabled: molecule-molecule "similar_to" edges based on Tanimoto similarity.
# Iterate over each pair of molecules
#for i, j in itertools.combinations(range(len(fps)), 2):
#    # Calculate the Tanimoto Similarity
#    similarity = DataStructs.TanimotoSimilarity(mols[i], mols[j])
#    # If the similarity is at least 0.8, add an edge in both directions
#    if similarity >= 0.8:
#        g.add_edge(unique_molecules_df['structure_smiles_2D'].values[i],
#                  unique_molecules_df['structure_smiles_2D'].values[j],
#                  label="similar_to")
#        g.add_edge(unique_molecules_df['structure_smiles_2D'].values[j],
#                  unique_molecules_df['structure_smiles_2D'].values[i],
#                  label="similar_to")
#        nx.set_edge_attributes(g,
#                              {(unique_molecules_df['structure_smiles_2D'].values[i],
#                              unique_molecules_df['structure_smiles_2D'].values[j]):{'weight': similarity}})
#        nx.set_edge_attributes(g,
#                              {(unique_molecules_df['structure_smiles_2D'].values[j],
#                              unique_molecules_df['structure_smiles_2D'].values[i]):{'weight': similarity}})

In [None]:
# Serialize the graph to disk in GraphML format.
# NOTE(review): the target file has a ".gml" extension although the content is
# GraphML, not GML — it has to be read back with nx.read_graphml. Confirm
# whether the file name is intentional before renaming (other code may rely
# on this path).
nx.write_graphml(g, "./graph/lotus_DB_as_graph.gml")

In [None]:
# Persist the derived tables. The index is written in every case: species
# names for the species features, SMILES for the fingerprint table.
# (Export of the one-hot molecule features is disabled.)
#molecule_features_dummy.to_csv("./data/molecule_features_dummy.csv.gz", compression="gzip")
species_features_dummy.to_csv(
    path_or_buf="./data/species_features_dummy.csv.gz",
    index=True,
    compression="gzip",
)
# The aggregated organism-molecule table is stored uncompressed.
df_agg.to_csv(path_or_buf="./data/lotus_aggregated.csv", index=True)
mol_dum.to_csv(
    path_or_buf="./data/mol_dummy_rdkit.csv.gz",
    index=True,
    compression="gzip",
)