In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import networkx as nx
from sklearn.cluster import KMeans

# `sys` is not imported anywhere earlier in the notebook, so import it here;
# without it the path tweak below raises NameError on a fresh kernel.
import sys

# Put the directory that holds analysis_tools.py at the front of the import path
# so the module can be imported by its bare name.
# NOTE(review): this is a hard-coded absolute path to one user's checkout —
# confirm whether it should be derived from the repository location instead.
sys.path.insert(0,'/global/homes/b/bpb/repos/envnet/envnet/use')
import analysis_tools as at

INFO:rdkit:Enabling RDKit 2023.09.1 jupyter extensions


In [2]:
import os
from IPython import get_ipython
from pathlib import Path

def get_notebook_dir():
    """Return the directory this session was started from.

    When an IPython shell is active, its ``starting_dir`` is returned;
    otherwise the current working directory is used as a fallback.
    """
    shell = get_ipython()
    if not shell:
        # No interactive shell available: fall back to the process cwd.
        return os.getcwd()
    return shell.starting_dir

# Project root as a plain string: two directory levels above the directory
# the notebook session was started from.
module_path = os.fspath(Path(get_notebook_dir()).parents[1])

# Load the network from <module_path>/data/envnet.graphml.
graphml_path = os.path.join(module_path, 'data/envnet.graphml')
G = nx.read_graphml(graphml_path)

# To write G back out to the same GraphML file:
# nx.write_graphml(G,os.path.join(module_path, 'data/envnet.graphml'))




In [None]:
# Learn node2vec embeddings for the nodes of G. The embeddings are clustered
# later on to assign a category to each node.
from node2vec import Node2Vec
from sklearn.cluster import KMeans

from gensim.models import Word2Vec

# Precompute transition probabilities and generate the random walks.
# **ON WINDOWS ONLY WORKS WITH workers=1**
node2vec = Node2Vec(
    G,
    dimensions=64,
    walk_length=20,
    num_walks=100,
    workers=16,
)

# Fit the embedding model on the generated walks.
model = node2vec.fit(
    window=10,
    min_count=1,
    batch_words=4,
)

# Optional persistence / reload snippets, kept for reference:
#
# Save embeddings for later use
# model.wv.save_word2vec_format(os.path.join(module_path, 'data/envnet.emb'))
#
# Save model for later use
# model.save(os.path.join(module_path, 'data/envnet.model'))
#
# Load embeddings
# model = Word2Vec.load('node2vec.model')
# model.wv.load_word2vec_format('node2vec.emb')
#
# Look for most similar nodes
# model.wv.most_similar('2')  # Output node names are always strings


Computing transition probabilities: 100%|██████████| 14340/14340 [00:26<00:00, 538.41it/s] 
Generating walks (CPU: 1): 100%|██████████| 7/7 [00:58<00:00,  8.36s/it]]
Generating walks (CPU: 2): 100%|██████████| 7/7 [00:55<00:00,  7.96s/it]]
Generating walks (CPU: 3): 100%|██████████| 7/7 [00:59<00:00,  8.46s/it]]
Generating walks (CPU: 5): 100%|██████████| 6/6 [00:50<00:00,  8.42s/it]
Generating walks (CPU: 6): 100%|██████████| 6/6 [00:48<00:00,  8.13s/it]]
Generating walks (CPU: 4): 100%|██████████| 7/7 [01:00<00:00,  8.62s/it]]
Generating walks (CPU: 7): 100%|██████████| 6/6 [00:46<00:00,  7.81s/it]
Generating walks (CPU: 8): 100%|██████████| 6/6 [00:47<00:00,  7.95s/it]]
Generating walks (CPU: 9): 100%|██████████| 6/6 [00:47<00:00,  7.89s/it]]
Generating walks (CPU: 10): 100%|██████████| 6/6 [00:47<00:00,  7.91s/it]
Generating walks (CPU: 11): 100%|██████████| 6/6 [00:47<00:00,  7.85s/it]
Generating walks (CPU: 12): 100%|██████████| 6/6 [00:47<00:00,  7.88s/it]
Generating walks (CPU:

In [None]:
# Collect the node2vec embeddings into a DataFrame: a 'node' column holding the
# node id followed by one column per embedding dimension.
# gensim 4 replaced KeyedVectors.index2word with index_to_key (reading the old
# attribute raises AttributeError there), so use whichever one is available.
wv = model.wv
node_ids = list(wv.index_to_key) if hasattr(wv, 'index_to_key') else list(wv.index2word)
emb = wv.vectors
emb_df = pd.DataFrame(emb)
emb_df.insert(0, 'node', node_ids)

# Numeric feature matrix used for every KMeans fit below; built once instead of
# being re-derived on each loop iteration.
features = emb_df.drop(columns=['node'])

# Elbow method to find the optimal number of clusters: record the
# within-cluster sum of squares (inertia) for k = 1..max_clusters.
wcss = []
max_clusters = 30  # You can adjust this value as needed
for i in range(1, max_clusters + 1):
    kmeans = KMeans(n_clusters=i, random_state=42)
    kmeans.fit(features)
    wcss.append(kmeans.inertia_)

# Plot the elbow graph
plt.figure(figsize=(10, 6))
plt.plot(range(1, max_clusters + 1), wcss, marker='o')
plt.title('Elbow Method for Optimal Number of Clusters')
plt.xlabel('Number of Clusters')
plt.ylabel('WCSS')
plt.xticks(range(1, max_clusters + 1))
plt.grid(True)
plt.show()

# Choose the optimal number of clusters (e.g., based on the elbow plot)
optimal_clusters = 20  # Replace this with the optimal number of clusters from the elbow plot

# Cluster the embeddings into the optimal number of categories
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
emb_df['category'] = kmeans.fit_predict(features)

# Use ONE attribute key for both the default and the assigned cluster label.
# Nodes without an embedding keep the sentinel, which cannot collide with a
# real KMeans label (those are 0..optimal_clusters-1).
UNASSIGNED_CATEGORY = -1
nx.set_node_attributes(G, UNASSIGNED_CATEGORY, 'node2vec_category')

# Assign categories to nodes in the graph. Labels are cast to plain int so the
# attribute type is a built-in Python type when the graph is serialised.
for node, category in zip(emb_df['node'], emb_df['category']):
    G.nodes[node]['node2vec_category'] = int(category)


In [5]:
# Display the loop variables left over from the category-assignment loop,
# i.e. the last (node id, cluster label) pair that was written to the graph.
# NOTE(review): this only works while those names are still alive in the
# kernel — looks like leftover debugging; confirm whether it is still needed.
node,category

('6947.0', 3)

In [6]:
# Save the graph (now carrying the node2vec cluster attribute) as GraphML.
# The path is relative, so the file is written to the current working directory.
nx.write_graphml(G,'envnet_node2vec.graphml')

In [None]:
# Fetch the per-node table from the project helper module.
# NOTE(review): graph_to_df() is called without arguments, so it is not handed
# the in-memory G — presumably it loads the graph from its own default
# location; confirm against analysis_tools.
node_data = at.graph_to_df()


In [None]:

# Coverage summary of the node annotations. Boolean masks are built once and
# counted with the vectorised Series.sum() rather than Python's builtin sum(),
# which iterates the Series element by element.
has_identity = node_data['inchi_key_identity'].notna()
has_class = node_data['class_results'].notna()
has_class_propagated = node_data['class_results_propagated'].notna()

# Displayed tuple, in order:
#   number of distinct inchi_key_identity values,
#   rows with class_results_propagated,
#   rows with class_results,
#   rows with inchi_key_identity,
#   rows with both inchi_key_identity and class_results_propagated,
#   rows with both inchi_key_identity and class_results.
(
    node_data['inchi_key_identity'].nunique(),
    has_class_propagated.sum(),
    has_class.sum(),
    has_identity.sum(),
    (has_identity & has_class_propagated).sum(),
    (has_identity & has_class).sum(),
)

In [None]:
# Reduce the node table to the two class-annotation columns, indexed by node id
# as a string. It is built as one chain that returns a new frame, so no column
# is assigned on a slice (avoids SettingWithCopyWarning) and nothing is mutated
# in place.
# NOTE(review): the string ids are later used as graph node keys — presumably
# astype(str) yields exactly the GraphML node names (e.g. '6947.0'); confirm.
cols = ['node_id','class_results','class_results_propagated']
node_data = (
    at.graph_to_df()
    .loc[:, cols]
    .assign(node_id=lambda df: df['node_id'].astype(str))
    .set_index('node_id')
)
node_data

In [None]:
# Colour nodes by compound class. The palette is keyed on the 10 most frequent
# values of class_results_propagated, and that same mapping is applied to both
# class_results and class_results_propagated.
top_classes_propagated = node_data['class_results_propagated'].value_counts().index[:10]

# Build the class -> hex colour lookup from the tab10 colormap.
import matplotlib.colors as mcolors
import matplotlib.cm as cm
colors = [mcolors.to_hex(rgb) for rgb in cm.tab10.colors]
top_class_colors_propagated = dict(zip(top_classes_propagated, colors))

# For each class column: map classes to colours, give everything outside the
# top 10 (including missing values) '#FFFFFF' (white), store the result in
# node_data and copy it onto the graph as a node attribute of the same name.
for class_col, color_col in (
    ('class_results', 'color_compound_class'),
    ('class_results_propagated', 'color_compound_class_propagated'),
):
    node_data[color_col] = node_data[class_col].map(top_class_colors_propagated).fillna('#FFFFFF')
    nx.set_node_attributes(G, node_data[color_col].to_dict(), color_col)

# Write the network back to GraphML. This is the same path the graph was read
# from, so the file on disk is overwritten with the augmented graph.
nx.write_graphml(G,os.path.join(module_path, 'data/envnet.graphml'))

In [None]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches


def make_class_legend(top_class_colors,save_file=None):
    """Draw a stand-alone, two-column colour legend.

    Parameters
    ----------
    top_class_colors : mapping
        Maps each class name (legend label) to the colour of its swatch.
    save_file : path-like, optional
        When truthy, the figure is also saved to this path before it is shown.
    """
    fig, ax = plt.subplots()
    # One coloured patch per class, in the mapping's iteration order.
    handles = [
        mpatches.Patch(color=swatch, label=label)
        for label, swatch in top_class_colors.items()
    ]
    ax.legend(handles=handles, ncol=2)
    # Only the legend should be visible, so hide the axes themselves.
    ax.axis('off')
    if save_file:
        fig.savefig(save_file)
    plt.show()
# Draw the legend for the top-class colour mapping and save it as
# data/compound_class_legend.pdf under module_path.
make_class_legend(top_class_colors_propagated,os.path.join(module_path, 'data/compound_class_legend.pdf'))

In [None]:
# Spot-check a random handful of nodes whose class_results received a real
# colour, i.e. anything other than the '#FFFFFF' fill value. An exact
# comparison is used instead of a regex substring match, and the sample size is
# capped at the number of matching rows so the cell does not raise when fewer
# than 20 nodes are coloured.
colored_nodes = node_data[node_data['color_compound_class'] != '#FFFFFF']
colored_nodes.sample(min(20, len(colored_nodes)))