In [1]:
import networkx as nx
import itertools
import pandas as pd
import numpy as np
from stellargraph import StellarGraph
from rdkit.Chem import AllChem, DataStructs
import json
from sklearn import preprocessing, feature_extraction, model_selection
from sklearn.metrics import mean_absolute_error, mean_squared_error
from tensorflow import keras
import tensorflow as tf

import stellargraph as sg
from stellargraph.data import EdgeSplitter
from stellargraph.mapper import HinSAGELinkGenerator
from stellargraph.layer import HinSAGE, link_classification
from tensorflow.keras import Model, optimizers, losses, metrics

import multiprocessing
from stellargraph import datasets
from IPython.display import display, HTML
import matplotlib.pyplot as plt
from tensorflow.keras import mixed_precision
mixed_precision.set_global_policy('float32')

2023-06-27 13:33:53.016065: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-06-27 13:34:00.799683: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  SSE4.1 SSE4.2 AVX AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
g = nx.read_graphml("./graph/lotus_DB_as_graph.gml")
#species_features_dummy = pd.read_csv("./data/species_features_dummy.csv.gz", index_col=0)
species_features_dummy = pd.read_csv("./data/species_BaseNEncoder.csv.gz", index_col=0)
molecule_features_dummy = pd.read_csv("./data/mol_dummy_rdkit.csv.gz", index_col=0).astype("int8")
df_agg = pd.read_csv("./data/lotus_aggregated.csv", index_col=0)
#molecule_features_dummy = pd.read_csv("./data/molecule_features_dummy.csv.gz", index_col=0).astype('int8')

In [4]:
G = StellarGraph.from_networkx(g,
                               node_features={#'species':species_features_dummy,
                                              'molecule': molecule_features_dummy})
print(G.info())
G.check_graph_for_ml()

StellarDiGraph: Directed multigraph
 Nodes: 184990, Edges: 876144

 Node types:
  molecule: [148190]
    Features: float32 vector, length 1024
    Edge types: molecule-present_in->species
  species: [36800]
    Features: none
    Edge types: species-has->molecule

 Edge types:
    species-has->molecule: [438072]
        Weights: all 1 (default)
        Features: none
    molecule-present_in->species: [438072]
        Weights: all 1 (default)
        Features: none


In [5]:
batch_size = 256 #default: 200
epochs = 30 #default: 20
num_samples = [1, 1]
num_workers = multiprocessing.cpu_count()-2

In [6]:
# Define an edge splitter on the original graph G:
edge_splitter_test = EdgeSplitter(G)

# Randomly sample a fraction p=0.3 of all positive links, and same number of negative links, from G, and obtain the
# reduced graph G_test with the sampled links removed:
G_test, edge_ids_test, edge_labels_test = edge_splitter_test.train_test_split(
    p=0.1, method="global", keep_connected=False, edge_label="present_in"
)

Network has 438072 edges of type present_in
Network has 438072 edges of type present_in
** Sampled 43807 positive and 43807 negative edges. **


In [7]:
# Define an edge splitter on the reduced graph G_test:
edge_splitter_train = EdgeSplitter(G_test)

# Randomly sample a fraction p=0.3 of all positive links, and same number of negative links, from G_test, and obtain the
# reduced graph G_train with the sampled links removed:
G_train, edge_ids_train, edge_labels_train = edge_splitter_train.train_test_split(
    p=0.1, method="global", keep_connected=False, edge_label="present_in"
)

Network has 394265 edges of type present_in
Network has 394265 edges of type present_in
** Sampled 39426 positive and 39426 negative edges. **


In [8]:
print(G_train.info())

StellarDiGraph: Directed multigraph
 Nodes: 184990, Edges: 792911

 Node types:
  molecule: [148190]
    Features: float32 vector, length 1024
    Edge types: molecule-present_in->species
  species: [36800]
    Features: none
    Edge types: species-has->molecule

 Edge types:
    species-has->molecule: [438072]
        Weights: all 1 (default)
        Features: none
    molecule-present_in->species: [354839]
        Weights: all 1 (default)
        Features: none


In [9]:
print(G_test.info())

StellarDiGraph: Directed multigraph
 Nodes: 184990, Edges: 832337

 Node types:
  molecule: [148190]
    Features: float32 vector, length 1024
    Edge types: molecule-present_in->species
  species: [36800]
    Features: none
    Edge types: species-has->molecule

 Edge types:
    species-has->molecule: [438072]
        Weights: all 1 (default)
        Features: none
    molecule-present_in->species: [394265]
        Weights: all 1 (default)
        Features: none


In [10]:
train_gen = HinSAGELinkGenerator(G_train,
                                 batch_size=batch_size,
                                 num_samples=num_samples,
                                 head_node_types=["molecule", "species"])
train_flow = train_gen.flow(edge_ids_train, edge_labels_train, shuffle=True)

In [11]:
test_gen = HinSAGELinkGenerator(G_test,
                                batch_size=batch_size,
                                num_samples=num_samples,
                                head_node_types=["molecule", "species"])
test_flow = test_gen.flow(edge_ids_test, edge_labels_test)

In [12]:
hinsage_layer_sizes = [128, 128]
hinsage = HinSAGE(layer_sizes=hinsage_layer_sizes,
                  generator=train_gen,
                  bias=True,
                  dropout=0.3,
                 activations=['elu','selu'])

In [13]:
# Build the model and expose input and output sockets of graphsage model
# for link prediction
x_inp, x_out = hinsage.in_out_tensors()



In [14]:
prediction = link_classification(output_dim=1,
                                 output_act="sigmoid",
                                 edge_embedding_method="l1")(x_out)

link_classification: using 'l1' method to combine node embeddings into edge embeddings


In [15]:
model = keras.Model(inputs=x_inp, outputs=prediction)
lr_schedule = keras.optimizers.schedules.ExponentialDecay(
    initial_learning_rate=1e-1,
    decay_steps=10000,
    decay_rate=0.95)

optimizer=keras.optimizers.Adam(learning_rate=lr_schedule)
#optimizer = mixed_precision.LossScaleOptimizer(optimizer)

model.compile(
    optimizer=optimizer,
    loss=keras.losses.binary_crossentropy,
    metrics=["AUC"],
)

In [16]:
init_train_metrics = model.evaluate(train_flow, workers=num_workers, verbose=2)
init_test_metrics = model.evaluate(test_flow, workers=num_workers, verbose=2)

print("\nTrain Set Metrics of the initial (untrained) model:")
for name, val in zip(model.metrics_names, init_train_metrics):
    print("\t{}: {:0.4f}".format(name, val))

print("\nTest Set Metrics of the initial (untrained) model:")
for name, val in zip(model.metrics_names, init_test_metrics):
    print("\t{}: {:0.4f}".format(name, val))

ValueError: cannot reshape array of size 0 into shape (256,newaxis,0)

In [None]:
callbacks = keras.callbacks.EarlyStopping(monitor="val_loss",
                                          patience=10,
                                          mode="auto",
                                          restore_best_weights=True)

history = model.fit(train_flow,
                    epochs=epochs,
                    workers=num_workers,
                    validation_data=test_flow,
                    verbose=2,
                    callbacks=[callbacks],
                    validation_split=0.0,
                    shuffle=True
                   )

In [None]:
sg.utils.plot_history(history)

In [None]:
train_metrics = model.evaluate(train_flow, verbose=2)
test_metrics = model.evaluate(test_flow, verbose=2)

print("\nTrain Set Metrics of the trained model:")
for name, val in zip(model.metrics_names, train_metrics):
    print("\t{}: {:0.4f}".format(name, val))

print("\nTest Set Metrics of the trained model:")
for name, val in zip(model.metrics_names, test_metrics):
    print("\t{}: {:0.4f}".format(name, val))

In [None]:
test_pred = HinSAGELinkGenerator(G,
                                 batch_size=128,
                                num_samples=num_samples,
                                head_node_types=["molecule", "species"]).flow(edge_ids_test, edge_labels_test)

In [None]:
predictions = model.predict(test_pred, workers=3)

In [None]:
predictions = predictions.flatten()

In [None]:
test = predictions[(predictions>0.9) | (predictions<0.1)]
test = test>0.5

In [None]:
test = test.astype('int8')

In [None]:
plt.hist(predictions)

In [None]:
np.where((predictions>0.9) | (predictions<0.1))

In [None]:
sum(test == edge_labels_test[np.where((predictions>0.9) | (predictions<0.1))])/len(test)

In [None]:
len(test)/len(predictions)

In [None]:
df_agg = pd.read_csv("./data/lotus_aggregated.csv", index_col=0)

In [None]:
finaly_test = np.array(list(zip(df_agg.structure_smiles_2D[:1000],df_agg.organism_name[::-1][:1000])))

In [None]:
final_test = HinSAGELinkGenerator(G,
                                batch_size=128,
                                num_samples=num_samples,
                                head_node_types=["molecule", "species"]).flow(finaly_test, np.ones(len(finaly_test)).reshape(-1, 1))

In [None]:
res = model.predict(final_test).flatten()

In [None]:
res[res>0.9]

In [None]:
len(res[res>0.9])/len(res)

In [None]:
finaly_test[res>0.9]

In [None]:
plt.hist(res)

In [None]:
org = df_agg.organism_name.unique()[:100]

In [None]:
mol = df_agg.structure_smiles_2D.unique()[:1000]

In [None]:
len(df_agg.structure_smiles_2D.unique())

In [None]:
len(df_agg.organism_name.unique())

In [None]:
%%time
combinations = list(itertools.product(mol, org))

In [None]:
%%time
A1, A2 = np.meshgrid(mol, org)

In [None]:
df_combinations = pd.DataFrame(combinations, columns=['structure_smiles_2D', 'organism_name'])

In [None]:
edge_test_final = df_combinations.to_numpy()

In [None]:
%%time
final_final_test = HinSAGELinkGenerator(G,
                                batch_size=256,
                                num_samples=num_samples,
                                head_node_types=["molecule", "species"]).flow(edge_test_final, np.ones(len(edge_test_final)).reshape(-1, 1))

In [None]:
final_final_pred = model.predict(final_final_test).flatten()

In [None]:
len(final_final_pred)

In [None]:
len(final_final_pred[final_final_pred>0.9])

In [None]:
results = edge_test_final[final_final_pred>0.9]

In [None]:
results[10000:10100]

In [None]:
df_agg

In [None]:
model.save(f"./model/batch_{batch_size}_layer_{hinsage_layer_sizes[0]}")

In [None]:
try_test = np.array([['C=C(C)C1CCC2(C)CCC3(C)C(CCC4C5(C)CCC(O)C(C)(C)C5CCC43C)C12', 'Homo sapiens']])

In [None]:
try_test = HinSAGELinkGenerator(G,
                                 batch_size=128,
                                num_samples=num_samples,
                                head_node_types=["molecule", "species"]).flow(try_test, np.ones(len(try_test)).reshape(-1, 1))

In [None]:
model.predict(try_test)

In [None]:
df_agg[df_agg.structure_smiles_2D=='C=C(C)C1CCC2(C)CCC3(C)C(CCC4C5(C)CCC(O)C(C)(C)C5CCC43C)C12']