In [1]:
from pathlib import Path

import torch_geometric
from torch_geometric.datasets import MoleculeNet

import networkx as nx
import pandas as pd
from IPython.display import display

from rdkit import Chem
from rdkit.Chem import Draw

In [None]:
# Load the HIV dataset from MoleculeNet.
# The root is given relative to the notebook's working directory, matching the
# relative "./data/..." and "./examples/..." paths used by later cells, so the
# notebook is not tied to one user's home directory.
# NOTE(review): assumes the notebook is run from the project root — confirm.
dataset = MoleculeNet(root="./data/", name="HIV")

In [None]:
# Dataset-level summary: feature dimensionality of nodes and edges, plus the
# number of graphs. `num_graphs` is reused by the label-counting cell below.
num_node_features = dataset.num_node_features
num_edge_features = dataset.num_edge_features
num_graphs = len(dataset)

# Labels fixed ("Festures" typo) and formatted consistently.
print('Number of Node Features:', num_node_features)
print('Number of Edge Features:', num_edge_features)
print('Number of Graphs:', num_graphs)

In [None]:
# Per-graph node and edge counts, gathered in a single pass so the dataset is
# only iterated once instead of once per statistic.
num_nodes, num_edges = [], []
for graph in dataset:
    num_nodes.append(graph.num_nodes)
    num_edges.append(graph.num_edges)

# Arithmetic means over all graphs.
avg_nodes = sum(num_nodes) / len(num_nodes)
avg_edges = sum(num_edges) / len(num_edges)

print(f"Average nodes per graph: {avg_nodes}")
print(f"Average edges per graph: {avg_edges}")

In [None]:
# Use the first graph in the dataset as a worked example.
data = dataset[0]

# Build one report line per attribute, then emit them together.
# The edge-attribute line falls back to a placeholder when edge_attr is None.
summary_lines = [
    f"Number of nodes: {data.num_nodes}",
    f"Number of edges: {data.num_edges}",
    f"Node features shape: {data.x.shape}",
    f"Edge index shape: {data.edge_index.shape}",
    f"Edge attributes shape: {data.edge_attr.shape if data.edge_attr is not None else 'No edge attributes'}",
]
print("\n".join(summary_lines))

In [None]:
# Convert the example graph (`data`) into an undirected NetworkX graph.
G = torch_geometric.utils.to_networkx(
    data,
    to_undirected=True,
)

# Draw it with each node labelled.
draw_options = {"with_labels": True}
nx.draw(G, **draw_options)

In [None]:
# Split molecule indices by label and report the class balance.
# `pos_indexes` / `neg_indexes` are reused by the cell that draws examples.
pos_indexes, neg_indexes = [], []
for index, molecule in enumerate(dataset):
    # `.item()` extracts the single label value; any truthy (non-zero) label
    # is counted as positive.
    label = molecule.y.item()
    if label:
        pos_indexes.append(index)
    else:
        neg_indexes.append(index)

# The positive count is just the number of collected positive indices.
number_molecules_with_hiv = len(pos_indexes)
number_molecules_without_hiv = num_graphs - number_molecules_with_hiv
pct_molecules_with_hiv = (100 * number_molecules_with_hiv) / num_graphs

# The printed value is a percentage, so the label says so.
print('Percentage of Molecules with HIV CM or CA: {:.2f}%'.format(pct_molecules_with_hiv))

In [None]:
# Draw a few example molecules from each class and save the grids as PNGs.

# Relative path, consistent with the "./data/..." and "./examples/..." paths
# used elsewhere in this notebook.
# NOTE(review): assumes the notebook is run from the project root — confirm.
df = pd.read_csv('./data/hiv/raw/HIV.csv')
smiles = df['smiles'].tolist()

num_examples = 5

# Only parse the SMILES strings that are actually drawn, rather than building
# an RDKit molecule for every index in the dataset.
# NOTE(review): assumes CSV row i corresponds to dataset[i] — confirm.
pos_molecules_sample = [
    Chem.MolFromSmiles(smiles[index]) for index in pos_indexes[:num_examples]
]
neg_molecules_sample = [
    Chem.MolFromSmiles(smiles[index]) for index in neg_indexes[:num_examples]
]

pos_images = Draw.MolsToGridImage(pos_molecules_sample, molsPerRow=num_examples)
neg_images = Draw.MolsToGridImage(neg_molecules_sample, molsPerRow=num_examples)

print('HIV Positive Molecules:')
display(pos_images)
print('HIV Negative Molecules:')
display(neg_images)

# Make sure the output directory exists before writing into it.
examples_dir = Path("./examples")
examples_dir.mkdir(parents=True, exist_ok=True)

# NOTE(review): `.data` is presumably the raw PNG bytes of the image object
# returned inside a notebook session — confirm for the installed RDKit version.
with open(examples_dir / "pos_images.png", "wb") as f:
    f.write(pos_images.data)

with open(examples_dir / "neg_images.png", "wb") as f:
    f.write(neg_images.data)

In [None]:
# Report the absolute class counts computed by the label-counting cell above.
# (The CSV re-read that used to sit here was dropped: its result was never
# used in this cell.)
print('Number of Molecules HIV Positive:', number_molecules_with_hiv)
print('Number of Molecules HIV Negative:', number_molecules_without_hiv)