In [1]:
import networkx as nx
import itertools
import pandas as pd
import numpy as np
from stellargraph import StellarGraph
from rdkit.Chem import AllChem, DataStructs
import json
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow import keras
import tensorflow as tf

import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_regression, link_classification
from tensorflow.keras import Model, optimizers, losses, metrics

import multiprocessing
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from tensorflow.keras import mixed_precision
# Set the global Keras dtype policy to 'float32'.
# NOTE(review): 'float32' is presumably already the default policy, so this
# call may be redundant — confirm before removing.
mixed_precision.set_global_policy('float32')

2023-06-27 10:31:55.500893: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-27 10:32:01.808058: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
import networkx as nx
import itertools
import pandas as pd
import numpy as np
from stellargraph import StellarGraph
from rdkit.Chem import AllChem, DataStructs
import category_encoders as ce

# Read the frozen metadata table, discard every row that has no organism name,
# and renumber the surviving rows from 0.
df = (
    pd.read_csv('./data/230106_frozen_metadata.csv.gz', low_memory=False)
    .dropna(subset=['organism_name'])
    .reset_index(drop=True)
)

In [35]:
# Collapse the metadata to one row per unique organism-molecule pair.
#
# For every (organism_name, structure_smiles_2D) group:
#   * 'reference_wikidata' is replaced by the group size, i.e. the number of
#     source rows (references) reporting that pair;
#   * each taxonomy / chemical-class column keeps the group's 'first' value
#     (pandas' GroupBy 'first' returns the first non-null entry).
#
# The earlier stand-alone `groupby(...).size()` pass was removed: its result
# was overwritten immediately by this aggregation, so it only cost an extra
# full scan of the table.
df_agg = df.groupby(['organism_name', 'structure_smiles_2D']).agg({
    'reference_wikidata': 'size',
    'organism_taxonomy_08genus': 'first',
    'organism_taxonomy_06family': 'first',
    'organism_taxonomy_05order': 'first',
    'organism_taxonomy_04class': 'first',
    'organism_taxonomy_03phylum': 'first',
    'organism_taxonomy_02kingdom': 'first',
    'organism_taxonomy_01domain': 'first',
    'structure_taxonomy_classyfire_01kingdom': 'first',
    'structure_taxonomy_classyfire_02superclass': 'first',
    'structure_taxonomy_classyfire_03class': 'first',
    'structure_taxonomy_classyfire_04directparent': 'first'
    # add other columns here as needed
}).reset_index()

# Per-molecule and per-species totals: sum of the per-pair reference counts,
# broadcast back onto every row of the corresponding molecule / species.
df_agg['total_papers_molecule'] = df_agg.groupby(
    'structure_smiles_2D')['reference_wikidata'].transform('sum')
df_agg['total_papers_species'] = df_agg.groupby(
    'organism_name')['reference_wikidata'].transform('sum')

#get random subset of the database (comment to have the full DB)
#df_agg = df_agg.sample(n=20000).reset_index(drop=True)

In [36]:
# Keep a single row per organism, then select the taxonomy columns
# (domain down to genus) together with the organism name itself.
species_feature_columns = [
    'organism_taxonomy_01domain',
    'organism_taxonomy_02kingdom',
    'organism_taxonomy_03phylum',
    'organism_taxonomy_04class',
    'organism_taxonomy_05order',
    'organism_taxonomy_06family',
    'organism_taxonomy_08genus',
    'organism_name',
]
unique_species_df = df_agg.drop_duplicates(subset=['organism_name'])
species_features_df = unique_species_df[species_feature_columns]

In [5]:
# Import the necessary modules
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import hashlib

# Hierarchical hash encoding of the species taxonomy.
#
# Step 1: label-encode every column and chain the codes left to right, so a
#         cell at a given level carries the codes of all columns before it
#         (e.g. "0_3_12"). The first column holds its plain integer code.
# Step 2: turn each chained label into a number with MD5 and min-max scale
#         every column independently into [0, 1].

# One LabelEncoder per column, kept so the fitted encoders stay available
encoders = {col: LabelEncoder() for col in species_features_df.columns}

# Work on a copy so species_features_df itself is left untouched
encoded_df = species_features_df.copy()

# `prev_col` is None only for the first column. Tracking it explicitly
# (instead of testing for a hard-coded column name) keeps the cell correct if
# the column order changes and avoids picking up a stale `prev_col` left in
# the kernel by an earlier run.
prev_col = None
for col in species_features_df.columns:
    # Integer codes for this column
    encoded_labels = encoders[col].fit_transform(species_features_df[col])

    if prev_col is None:
        encoded_df[col] = encoded_labels
    else:
        # Prefix with the already-chained label of the previous column
        encoded_df[col] = encoded_df[prev_col].astype(str) + "_" + encoded_labels.astype(str)

    prev_col = col

# Initialize a MinMaxScaler to scale the hashed values
scaler = MinMaxScaler()

# Convert the chained labels to floats: hash, then scale
for col in encoded_df.columns:
    # The MD5 hex digest parsed as a base-16 integer is always non-negative
    # (up to 128 bits), so no sign handling is needed.
    hashed = [int(hashlib.md5(str(x).encode()).hexdigest(), 16) for x in encoded_df[col]]

    # Convert to float64 explicitly (the integers do not fit in int64) and
    # flatten the scaler's (n, 1) output before storing it as a column.
    hashed_column = np.asarray(hashed, dtype=np.float64).reshape(-1, 1)
    encoded_df[col] = scaler.fit_transform(hashed_column).ravel()


In [6]:
# Label every encoded row with the organism name from the un-encoded frame.
encoded_df.index = list(species_features_df['organism_name'])

In [7]:
# Write the hash-encoded species features to a gzip-compressed CSV.
encoded_df.to_csv(
    "./data/species_features_encoded_with_hash.csv.gz",
    compression="gzip",
)

In [37]:
# One representative row per species and per molecule
unique_species_df = df_agg.drop_duplicates(subset=['organism_name'])
unique_molecules_df = df_agg.drop_duplicates(subset=['structure_smiles_2D'])

# Columns that serve as features on each side
species_feature_columns = [
    'organism_taxonomy_01domain',
    'organism_taxonomy_02kingdom',
    'organism_taxonomy_03phylum',
    'organism_taxonomy_04class',
    'organism_taxonomy_05order',
    'organism_taxonomy_06family',
    'organism_taxonomy_08genus',
    'organism_name',
]
molecule_feature_columns = [
    'structure_taxonomy_classyfire_01kingdom',
    'structure_taxonomy_classyfire_02superclass',
    'structure_taxonomy_classyfire_03class',
    'structure_taxonomy_classyfire_04directparent',
]
species_features_df = unique_species_df[species_feature_columns]
molecule_features_df = unique_molecules_df[molecule_feature_columns]

# Binary-encode every species feature column
encoder = ce.BinaryEncoder(cols=species_features_df.columns.tolist())
species_features_dummy = encoder.fit_transform(species_features_df)

In [38]:
# BaseN encoder over all species feature columns, with the base set to one
# more than the number of rows in the species frame.
# NOTE(review): presumably chosen so every category fits in a single base-N
# digit per column — confirm against the category_encoders documentation.
n_species_rows = len(species_features_df)
enc = ce.basen.BaseNEncoder(
    cols=list(species_features_df.columns),
    base=n_species_rows + 1,
)

In [39]:
# Fit the BaseN encoder on the species feature frame and encode that same
# frame in one call.
features = enc.fit_transform(species_features_df)

In [40]:
# Label every encoded row with the organism name from the un-encoded frame.
features.index = list(species_features_df['organism_name'])

In [41]:
# Write the BaseN-encoded species features to a gzip-compressed CSV.
features.to_csv(
    "./data/species_BaseNEncoder.csv.gz",
    compression="gzip",
)