In [1]:
import pandas as pd
import numpy as np
import torch
from torch_geometric.data import Data, Dataset
from sklearn.model_selection import train_test_split

In [2]:
# Load the frozen metadata table.
# low_memory=False makes pandas infer each column's dtype from the whole file
# rather than chunk by chunk, so a column cannot end up with mixed types.
# NOTE(review): the saved output of this cell looks like the tail of a parse
# warning (presumably DtypeWarning) - confirm it disappears with this option.
df = pd.read_csv('./data/230106_frozen_metadata.csv.gz', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [3]:
# Collapse the raw table to one row per (species, molecule) pair.
#   - 'reference_wikidata' becomes the number of raw rows for the pair
#     (treated below as a paper count - assumes one raw row per reference,
#     TODO confirm).
#   - every taxonomy / NPClassifier column keeps its first non-null value
#     within the pair.
# groupby drops rows whose species or SMILES key is missing (default dropna).
df_agg = df.groupby(['organism_taxonomy_09species', 'structure_smiles_2D']).agg({
    'reference_wikidata': 'size',
    'organism_taxonomy_08genus': 'first',
    'organism_taxonomy_06family': 'first',
    'organism_taxonomy_05order': 'first',
    'organism_taxonomy_04class': 'first',
    'organism_taxonomy_03phylum': 'first',
    'organism_taxonomy_02kingdom': 'first',
    'organism_taxonomy_01domain': 'first',
    'structure_taxonomy_npclassifier_01pathway': 'first',
    'structure_taxonomy_npclassifier_02superclass': 'first',
    'structure_taxonomy_npclassifier_03class': 'first'
    # add other columns here as needed
}).reset_index()

# Per-molecule and per-species totals of the pair counts. They are computed
# BEFORE sub-sampling, so they describe the full database.
df_agg['total_papers_molecule'] = df_agg.groupby(
    'structure_smiles_2D')['reference_wikidata'].transform('sum')
df_agg['total_papers_species'] = df_agg.groupby(
    'organism_taxonomy_09species')['reference_wikidata'].transform('sum')

# Random subset of the database. Set SAMPLE_SIZE = None to keep the full DB.
# The seed makes Restart & Run All reproduce the same subset, and min() keeps
# sample() from raising when fewer than SAMPLE_SIZE pairs exist.
SAMPLE_SIZE = 100000
SAMPLE_SEED = 42
if SAMPLE_SIZE is not None:
    df_agg = df_agg.sample(
        n=min(SAMPLE_SIZE, len(df_agg)),
        random_state=SAMPLE_SEED,
    ).reset_index(drop=True)

In [4]:
# Persist the aggregated pair table as a gzip-compressed CSV (the row index is
# written as the first, unnamed column).
df_agg.to_csv(
    path_or_buf="./data/lotus_aggregated.csv.gz",
    compression="gzip",
)

In [5]:
# Keep only the two columns that define an edge: the species name and the
# molecule SMILES. Nothing is read from disk here; the CSV loads below are
# disabled and kept for reference only.
#df_agg = pd.read_csv("./data/lotus_aggregated.csv")
df_molecules_species = df_agg.loc[
    :, ['organism_taxonomy_09species', 'structure_smiles_2D']
]
#df_species_features = pd.read_csv("./data/species_features_dummy.csv", index_col=0) # Considering species name as index
#df_species_features = df_species_features[df_species_features.columns[::-1]]
#df_molecules_features = pd.read_csv("./data/mol_dummy_rdkit.csv", index_col=0) # Considering molecule structure as index

In [6]:
# The graph is built from species/molecule co-occurrence: every distinct
# species and every distinct molecule becomes a node. unique() keeps the
# values in order of first appearance.
species = df_molecules_species.organism_taxonomy_09species.unique()
molecules = df_molecules_species.structure_smiles_2D.unique()
num_species, num_molecules = len(species), len(molecules)

In [7]:
# Node table for species: one row per distinct species name, with node IDs
# 0 .. n_species-1 and a constant "species" type tag.
# The names are recomputed from df_molecules_species instead of being read from
# the `species` variable: this cell rebinds `species` from an array to a
# DataFrame, so reading the old binding would break the cell when re-run.
species_names = df_molecules_species["organism_taxonomy_09species"].unique()
species = pd.DataFrame(data={
    'name': species_names,
    "ID": pd.RangeIndex(len(species_names)),
    "type": "species"
})

In [8]:
# Node table for molecules: one row per distinct SMILES string and a constant
# "molecule" type tag. Molecule IDs continue right after the species IDs, so
# both node kinds share one ID space.
# The names are recomputed from df_molecules_species instead of being read from
# the `molecules` variable: this cell rebinds `molecules` from an array to a
# DataFrame, so reading the old binding would break the cell when re-run.
molecule_names = df_molecules_species["structure_smiles_2D"].unique()
first_molecule_id = len(species)  # same value whether `species` is the array or the table
molecules = pd.DataFrame(data={
    'name': molecule_names,
    "ID": pd.RangeIndex(start=first_molecule_id,
                        stop=first_molecule_id + len(molecule_names)),
    "type": "molecule"
})

In [9]:
# Look up the node ID of the species on every edge row. A left join keeps one
# output row per input row, in the original row order.
species_to_species_id = df_molecules_species[['organism_taxonomy_09species']].merge(
    species,
    how='left',
    left_on='organism_taxonomy_09species',
    right_on='name',
)

In [10]:
# Look up the node ID of the molecule on every edge row. A left join keeps one
# output row per input row, in the original row order.
mol_to_mol_id = df_molecules_species[['structure_smiles_2D']].merge(
    molecules,
    how='left',
    left_on='structure_smiles_2D',
    right_on='name',
)

In [11]:
# Reduce both lookup tables to plain arrays of node IDs, one entry per edge row.
# Both names are rebound from DataFrame to array here, so the two merge cells
# must be re-run before this cell can be run again.
species_to_species_id, mol_to_mol_id = (
    species_to_species_id.loc[:, 'ID'].values,
    mol_to_mol_id.loc[:, 'ID'].values,
)

In [12]:
# Edge list: source species ID, target molecule ID, and the pair's row count
# from df_agg as the edge weight. The weight is a Series, so the frame takes
# its index from df_agg; the two ID arrays are laid against it positionally.
edges = pd.DataFrame(
    dict(
        species=species_to_species_id,
        molecule=mol_to_mol_id,
        weight=df_agg['reference_wikidata'],
    )
)

In [13]:
# Save the edge list (the row index is written as the first, unnamed column).
edges.to_csv(path_or_buf="./data/edges_list.csv")

In [14]:
# Stack the species rows on top of the molecule rows to get one node table.
# Each table keeps its own 0-based row labels, so index labels repeat across
# the two halves; the "ID" column is the unique node identifier.
node_types_and_names = pd.concat((species, molecules))

In [15]:
# Save the combined node table (name, ID, type) next to the edge list.
node_types_and_names.to_csv(path_or_buf="./data/node_types_and_names.csv")

In [21]:
# Export the distinct molecule SMILES strings, in order of first appearance in
# df_agg, as a single-column CSV.
unique_smiles = df_agg["structure_smiles_2D"].unique()
pd.DataFrame({"structure_smiles_2D": unique_smiles}).to_csv(
    path_or_buf="./data/molecule_SMILES.csv"
)

In [17]:
# One representative row per species (the first occurrence in df_agg).
unique_species_df = df_agg.drop_duplicates(subset='organism_taxonomy_09species')

# Taxonomic lineage of each species, from species name up to domain.
species_features_df = unique_species_df.loc[:, [
    'organism_taxonomy_09species',
    'organism_taxonomy_08genus',
    'organism_taxonomy_06family',
    'organism_taxonomy_05order',
    'organism_taxonomy_04class',
    'organism_taxonomy_03phylum',
    'organism_taxonomy_02kingdom',
    'organism_taxonomy_01domain',
]]

# One-hot encode the selected columns. The species column is among them, so
# its values are candidates for encoding too.
species_features_dummy = pd.get_dummies(species_features_df)

# Label every feature row with its species name, then write the table out.
species_features_dummy.index = list(unique_species_df['organism_taxonomy_09species'])
species_features_dummy.to_csv(
    path_or_buf="./data/species_features_dummy.csv.gz",
    compression="gzip",
)