# Setup

In [1]:
# Mount Google Drive at /content/gdrive so the notebook can read files stored there.
from google.colab import drive

drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [2]:
%cd gdrive/MyDrive/News+LPReports/Graph/guide

/content/gdrive/MyDrive/News+LPReports/Graph/guide


In [3]:
!pip install rdkit-pypi

Collecting rdkit-pypi
[?25l  Downloading https://files.pythonhosted.org/packages/6c/f9/708cb804901775aeac7cbc2e3f966eac086e4d4caa1df0515aa365617e11/rdkit_pypi-2021.3.3-cp37-cp37m-manylinux2014_x86_64.whl (18.6MB)
[K     |████████████████████████████████| 18.6MB 1.2MB/s 
Installing collected packages: rdkit-pypi
Successfully installed rdkit-pypi-2021.3.3


In [4]:
!pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
!pip install torch-geometric

Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
Collecting torch-scatter
[?25l  Downloading https://pytorch-geometric.com/whl/torch-1.9.0%2Bcu102/torch_scatter-2.0.7-cp37-cp37m-linux_x86_64.whl (2.6MB)
[K     |████████████████████████████████| 2.6MB 12.4MB/s 
[?25hInstalling collected packages: torch-scatter
Successfully installed torch-scatter-2.0.7
Looking in links: https://pytorch-geometric.com/whl/torch-1.9.0+cu102.html
Collecting torch-sparse
[?25l  Downloading https://pytorch-geometric.com/whl/torch-1.9.0%2Bcu102/torch_sparse-0.6.10-cp37-cp37m-linux_x86_64.whl (1.4MB)
[K     |████████████████████████████████| 1.4MB 12.0MB/s 
Installing collected packages: torch-sparse
Successfully installed torch-sparse-0.6.10
Collecting torch-geometric
[?25l  Downloading https://files.pythonhosted.org/packages/91/61/b3f23832120c404673f6759008312ffe8269524a29bf6116d9980e44517b/torch_geometric-1.7.2.tar.gz (222kB)
[K     |████████████████████████████████| 225kB 14

In [5]:
import pandas as pd
import rdkit
from rdkit import Chem
from rdkit.Chem import Draw
import torch
import torch_geometric
from torch_geometric.data import Dataset, Data
import numpy as np 
import os
from tqdm import tqdm

In [6]:
# Report the library versions and CUDA availability of this runtime.
print(
    f"Torch version: {torch.__version__}",
    f"Cuda available: {torch.cuda.is_available()}",
    f"Torch geometric version: {torch_geometric.__version__}",
    sep="\n",
)

Torch version: 1.9.0+cu102
Cuda available: False
Torch geometric version: 1.7.2


# Data

In [7]:
# Load the HIV dataset; the path is relative to the working directory set by
# the %cd cell above.
# NOTE(review): the preview shows an "Unnamed: 0" column, which looks like a
# saved index -- consider index_col=0 if nothing downstream relies on it.
df = pd.read_csv('HIV.csv')
df.head(3)

Unnamed: 0,smiles,activity,HIV_active
0,CCC1=[O+][Cu-3]2([O+]=C(CC)C1)[O+]=C(CC)CC(CC)...,CI,0
1,C(=Cc1ccccc1)C1=[O+][Cu-3]2([O+]=C(C=Cc3ccccc3...,CI,0
2,CC(=O)N1c2ccccc2Sc2c1ccc1ccccc21,CI,0


In [12]:
def _get_node_features(mol):
  """ 
  This will return a matrix / 2d array of the shape
  [Number of Nodes, Node Feature size]
  """
  all_node_feats = []

  for atom in mol.GetAtoms():
      node_feats = []
      # Feature 1: Atomic number       
      node_feats.append(atom.GetAtomicNum())
      # Feature 2: Atom degree
      node_feats.append(atom.GetDegree())
      # Feature 3: Formal charge
      node_feats.append(atom.GetFormalCharge())
      # Feature 4: Hybridization
      node_feats.append(atom.GetHybridization())
      # Feature 5: Aromaticity
      node_feats.append(atom.GetIsAromatic())
      # Feature 6: Total Num Hs
      node_feats.append(atom.GetTotalNumHs())
      # Feature 7: Radical Electrons
      node_feats.append(atom.GetNumRadicalElectrons())
      # Feature 8: In Ring
      node_feats.append(atom.IsInRing())
      # Feature 9: Chirality
      node_feats.append(atom.GetChiralTag())

      # Append node features to matrix
      all_node_feats.append(node_feats)

  all_node_feats = np.asarray(all_node_feats)
  return torch.tensor(all_node_feats, dtype=torch.float)

In [10]:
# Parse the SMILES string of every row. Nothing is collected, so after the
# loop `mol_obj` holds only the molecule of the last row; the cells below use
# it as the example molecule.
# NOTE(review): the parse result is not checked -- presumably MolFromSmiles
# can return None for an unparsable string; confirm before relying on mol_obj.
for index, mol in tqdm(df.iterrows(), total=df.shape[0]):
  mol_obj = Chem.MolFromSmiles(mol["smiles"])

100%|██████████| 41127/41127 [00:09<00:00, 4166.74it/s]


In [13]:
# Display the node feature matrix of the example molecule (one row per atom).
_get_node_features(mol_obj)

tensor([[ 6.,  1.,  0.,  4.,  0.,  3.,  0.,  0.,  0.],
        [ 6.,  2.,  0.,  4.,  0.,  2.,  0.,  0.,  0.],
        [ 6.,  2.,  0.,  4.,  0.,  2.,  0.,  0.,  0.],
        [ 6.,  2.,  0.,  4.,  0.,  2.,  0.,  0.,  0.],
        [ 6.,  2.,  0.,  4.,  0.,  2.,  0.,  0.,  0.],
        [ 6.,  2.,  0.,  3.,  0.,  1.,  0.,  0.,  0.],
        [ 6.,  3.,  0.,  3.,  0.,  0.,  0.,  0.,  0.],
        [ 6.,  3.,  0.,  3.,  1.,  0.,  0.,  1.,  0.],
        [ 6.,  2.,  0.,  3.,  1.,  1.,  0.,  1.,  0.],
        [ 6.,  3.,  0.,  3.,  1.,  0.,  0.,  1.,  0.],
        [17.,  1.,  0.,  4.,  0.,  0.,  0.,  0.,  0.],
        [ 6.,  3.,  0.,  3.,  1.,  0.,  0.,  1.,  0.],
        [ 8.,  2.,  0.,  3.,  0.,  0.,  0.,  0.,  0.],
        [ 6.,  1.,  0.,  4.,  0.,  3.,  0.,  0.,  0.],
        [ 6.,  3.,  0.,  3.,  1.,  0.,  0.,  1.,  0.],
        [ 6.,  3.,  0.,  3.,  1.,  0.,  0.,  1.,  0.],
        [ 7.,  2.,  0.,  3.,  1.,  0.,  0.,  1.,  0.],
        [ 6.,  3.,  0.,  3.,  1.,  0.,  0.,  1.,  0.],
        [ 

In [14]:
# Number of rows in the node feature matrix, i.e. the number of atoms.
print(len(_get_node_features(mol_obj)))

37


In [15]:
def _get_edge_features(mol):
  """ 
  This will return a matrix / 2d array of the shape
  [Number of edges, Edge Feature size]
  """
  all_edge_feats = []

  for bond in mol.GetBonds():
      edge_feats = []
      # Feature 1: Bond type (as double)
      edge_feats.append(bond.GetBondTypeAsDouble())
      # Feature 2: Rings
      edge_feats.append(bond.IsInRing())
      # Append node features to matrix (twice, per direction)
      all_edge_feats += [edge_feats, edge_feats]

  all_edge_feats = np.asarray(all_edge_feats)
  return torch.tensor(all_edge_feats, dtype=torch.float)

In [16]:
# Display the edge feature matrix of the example molecule (two rows per bond).
_get_edge_features(mol_obj)

tensor([[1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [2.0000, 0.0000],
        [2.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.5000, 1.0000],
        [1.5000, 1.0000],
        [1.5000, 1.0000],
        [1.5000, 1.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.5000, 1.0000],
        [1.5000, 1.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.5000, 1.0000],
        [1.5000, 1.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.5000, 1.0000],
        [1.5000, 1.0000],
        [1.5000, 1.0000],
        [1.5000, 1.0000],
        [1.0000, 0.0000],
        [1.0000, 0.0000],
        [1.5000, 1.0000],
        [1.5000, 1.0000],
        [1.5

In [17]:
# Number of rows in the edge feature matrix (each bond is counted in both directions).
print(len(_get_edge_features(mol_obj)))

80


In [27]:
def _get_adjacency_info(mol):
    """
    We could also use rdmolops.GetAdjacencyMatrix(mol)
    but we want to be sure that the order of the indices
    matches the order of the edge features
    """
    edge_indices = []
    for bond in mol.GetBonds():
        i = bond.GetBeginAtomIdx()
        j = bond.GetEndAtomIdx()
        edge_indices += [[i, j], [j, i]]

    edge_indices = torch.tensor(edge_indices)
    edge_indices = edge_indices.t().to(torch.long).view(2, -1)
    return edge_indices

In [28]:
# Display the edge index of the example molecule (row 0: source atoms, row 1: target atoms).
_get_adjacency_info(mol_obj)

tensor([[ 0,  1,  1,  2,  2,  3,  3,  4,  4,  5,  5,  6,  6,  7,  6,  8,  8,  9,
          9, 10, 10, 11, 10, 12, 12, 13, 12, 14, 14, 15, 15, 16, 15, 17, 17, 18,
         18, 19, 19, 20, 20, 21, 21, 22, 22, 23, 23, 24, 23, 25, 25, 26, 18, 27,
         27, 28, 27, 29, 29, 30, 30, 31, 31, 32, 17, 33, 33, 34, 34, 35, 34, 36,
         36, 37, 36, 38, 38, 39, 39, 40, 40, 41, 41, 42, 41, 43, 43, 44, 43, 45,
         45, 46, 45, 47, 47, 48, 47, 49, 49, 50, 49, 51, 51, 52, 52, 53, 53, 54,
         53, 55, 51, 56, 56, 57, 56,  2, 31,  6, 30,  9, 29, 14, 26, 20],
        [ 1,  0,  2,  1,  3,  2,  4,  3,  5,  4,  6,  5,  7,  6,  8,  6,  9,  8,
         10,  9, 11, 10, 12, 10, 13, 12, 14, 12, 15, 14, 16, 15, 17, 15, 18, 17,
         19, 18, 20, 19, 21, 20, 22, 21, 23, 22, 24, 23, 25, 23, 26, 25, 27, 18,
         28, 27, 29, 27, 30, 29, 31, 30, 32, 31, 33, 17, 34, 33, 35, 34, 36, 34,
         37, 36, 38, 36, 39, 38, 40, 39, 41, 40, 42, 41, 43, 41, 44, 43, 45, 43,
         46, 45, 47, 45, 48, 47, 49