# Studying data
The blood-brain barrier (BBB) controls the entry of chemicals from the blood to the brain. Since brain drugs need to penetrate the BBB, rapid and reliable prediction of BBB penetration (BBBP) is helpful for drug development. In this study, free-form and in-blood-form datasets were prepared by modifying the original BBBP dataset, and the effects of the data modification were investigated.

The original BBBP dataset contains 2053 items with four attributes: the index number from 1 to 2053 (“num”), the name of the compound (“name”), the penetrating or non-penetrating properties (“p_np”), and the SMILES string of the compound (“smiles”).

Sakiyama H, Fukuda M, Okuno T. Prediction of Blood-Brain Barrier Penetration (BBBP) Based on Molecular Descriptors of the Free-Form and In-Blood-Form Datasets. Molecules. 2021 Dec 7;26(24):7428. doi: 10.3390/molecules26247428. PMID: 34946509; PMCID: PMC8708321.


In [54]:
import pandas as pd

# Read in the data
drug_data=pd.read_csv("BBBP.csv")

#sum all values equal to 1 in p_np column of drug_data
total_unitary_pnp = sum(drug_data.p_np==1)
print ("Total number of unitary p_np values: ", total_unitary_pnp)

#sum all values equal to 0 in p_np column of drug_data
total_zero_pnp = sum(drug_data.p_np==0)
print ("Total number of zero p_np values: ", total_zero_pnp)

# Print the total number of p_np values
print ("Total number of p_np values: ", len(drug_data.p_np))

# Calculate the percentage of unitary p_np values
print ("Percentage of unitary p_np values: ", round(total_unitary_pnp*100/len(drug_data.p_np),2), "%")

#calculate the percentage of zero p_np values
print ("Percentage of zero p_np values: ", round (total_zero_pnp*100/len(drug_data.p_np),2), "%")


Total number of unitary p_np values:  1567
Total number of zero p_np values:  483
Total number of p_np values:  2050
Percentage of unitary p_np values:  76.44 %
Percentage of zero p_np values:  23.56 %


# Watching data in more detail to be given to GAE

The GAE should classify in two

In [55]:
from rdkit import Chem
from rdkit import RDLogger
from utils import *
from matplotlib import colors
from rdkit.Chem.Draw import MolToImage

# check the data frame
drug_data.head()

#extract smiles from the data frame and check the first 10 smiles
smiles = drug_data['smiles']
smiles.head()

#convert smiles to mols and disable warnings
RDLogger.DisableLog('rdApp.*')
mols = [Chem.MolFromSmiles(s) for s in smiles]
mols
print (type(mols[0]))

#resizing the drug_data to 
end_of_array = 10
smiles=smiles[:end_of_array]
print(smiles)

# Create a graph representation of the first molecule
for i in range(0, end_of_array):
    name = drug_data['name'][i]
    img = get_image(mols[i],None , name)

<class 'rdkit.Chem.rdchem.Mol'>
0                     [Cl].CC(C)NCC(O)COc1cccc2ccccc12
1             C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl
2    c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO...
3                     C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C
4    Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)...
5    CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(...
6    CN(C)[C@H]1[C@@H]2C[C@H]3C(=C(O)c4c(O)cccc4[C@...
7                  Cn1c2CCC(Cn3ccnc3C)C(=O)c2c4ccccc14
8    COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@...
9                         NC(N)=NC(=O)c1nc(Cl)c(N)nc1N
Name: smiles, dtype: object


In [56]:
from torch.utils.data import DataLoader
import networkx as nx
# Load the drug data and create the graph representation
smiles = drug_data['smiles'].to_list()
smiles = smiles[:end_of_array]
print (smiles)
labels = drug_data['p_np'].to_list()
labels = labels[:end_of_array]
print (labels)
data_list = create_pytorch_geometric_graph_data_list_from_smiles_and_labels(smiles, labels)
print (data_list[0])

# Create a dataloader for training
dataloader = DataLoader(dataset = data_list, batch_size = 2**7)
print (dataloader)
# Define the loss function
loss_function = nn.MSELoss()

class MyDataset(torch.utils.data.Dataset):
    def __init__(self, x_smiles):
        self.data_list = create_pytorch_geometric_graph_data_list_from_smiles_and_labels(x_smiles,labels)

    def __len__(self):
        return len(self.data_list)

    def __getitem__(self, idx):
        return self.data_list[idx]
    
dataset = MyDataset(smiles)
print (dataset[0])
# Define the number of channels for the input and output of the model
in_channels = dataset[0].x.shape[1]
out_channels = 16

model = GAE(in_channels, out_channels) 


['[Cl].CC(C)NCC(O)COc1cccc2ccccc12', 'C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl', 'c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO3)=O', 'C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C', 'Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)[C@@H](N4C3=O)C(O)=O', 'CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(N3C2=O)C(O)=O)CSc4nnnn4C)c5ccc(O)cc5)C(=O)C1=O', 'CN(C)[C@H]1[C@@H]2C[C@H]3C(=C(O)c4c(O)cccc4[C@@]3(C)O)C(=O)[C@]2(O)C(=O)\\C(=C(/O)NCN5CCCC5)C1=O', 'Cn1c2CCC(Cn3ccnc3C)C(=O)c2c4ccccc14', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@@H]2OC(C)=O', 'NC(N)=NC(=O)c1nc(Cl)c(N)nc1N']
[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
Data(x=[20, 79], edge_index=[2, 40], edge_attr=[40, 10], y=[1])
<torch.utils.data.dataloader.DataLoader object at 0x288ca5060>
Data(x=[20, 79], edge_index=[2, 40], edge_attr=[40, 10], y=[1])


NameError: name 'GAE' is not defined

In [None]:
smiles = drug_data['smiles'].to_list()
smiles = smiles[:end_of_array]
print(smiles)
#G = mols
print ("creating graph")
G = create_graph(drug_data['smiles'][:end_of_array])
print ("graph created")

dataloader = DataLoader(data_list , batch_size = 2**7)
#print(dataloader)
# Convert the graph to an adjacency matrix
#A = nx.adjacency_matrix(G).todense()
#print ("adjacency matrix created")
#print (A)

num_features = 50
hidden_dim = 32
learning_rate = 0.01
num_epochs = 100

# print()
# dataloader = DataLoader(A, batch_size=num_features, shuffle=False)

# Define the model, criterion,
model = GraphAutoencoder(input_dim=num_features, hidden_dim=hidden_dim, output_dim=num_features)

# Define the loss function and optimizer
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
for epoch in range(num_epochs):
    print ("Epoch: " + str(epoch))
    for i, data in enumerate(dataloader):
        print("iteration:" + str(i))
        # Extract the features and labels
        print("data before casting to float32:")
        print(data)
        print("featrues and labels before casting to float32:")
        features, labels = data
        # data = data.to(torch.float32)
        # print("data after casting to float32:")
        # print(data)
        # features, labels = data
        # print("featrues and labels after casting to float32:")
        # print(features)
        
        # Forward pass
        output = model(features)
        loss = criterion(output, features)
        
        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

# Extract the embeddings from the trained model
embeddings = model.encoder.weight.data.cpu().numpy()
print(embeddings)

['[Cl].CC(C)NCC(O)COc1cccc2ccccc12', 'C(=O)(OC(C)(C)C)CCCc1ccc(cc1)N(CCCl)CCCl', 'c12c3c(N4CCN(C)CC4)c(F)cc1c(c(C(O)=O)cn2C(C)CO3)=O', 'C1CCN(CC1)Cc1cccc(c1)OCCCNC(=O)C', 'Cc1onc(c2ccccc2Cl)c1C(=O)N[C@H]3[C@H]4SC(C)(C)[C@@H](N4C3=O)C(O)=O', 'CCN1CCN(C(=O)N[C@@H](C(=O)N[C@H]2[C@H]3SCC(=C(N3C2=O)C(O)=O)CSc4nnnn4C)c5ccc(O)cc5)C(=O)C1=O', 'CN(C)[C@H]1[C@@H]2C[C@H]3C(=C(O)c4c(O)cccc4[C@@]3(C)O)C(=O)[C@]2(O)C(=O)\\C(=C(/O)NCN5CCCC5)C1=O', 'Cn1c2CCC(Cn3ccnc3C)C(=O)c2c4ccccc14', 'COc1ccc(cc1)[C@@H]2Sc3ccccc3N(CCN(C)C)C(=O)[C@@H]2OC(C)=O', 'NC(N)=NC(=O)c1nc(Cl)c(N)nc1N']
creating graph
graph created
Epoch: 0


TypeError: default_collate: batch must contain tensors, numpy arrays, numbers, dicts or lists; found <class 'torch_geometric.data.data.Data'>

In [None]:


import numpy as np
import tensorflow as tf
import deepchem as dc
import pandas as pd

from google.colab import files
drugs=files.upload()
drugs=pd.read_csv("BBBP.csv")

# Load the MoleculeNet dataset
tasks, datasets, transformers =  dc.molnet.load_muv()
train_dataset, valid_dataset, test_dataset = datasets

# Build the model
n_features = train_dataset.get_data_shape()[0]
n_tasks = len(tasks)
graph_conv_filters = [[64,64], [128,128]]
dense_layer_size = 128

# Create the graph autoencoder model
model = dc.models.GraphConvModel(n_tasks, graph_conv_filters, dense_layer_size,
                                 batch_size=128, learning_rate=1e-4)

# Fit the model
model.fit(train_dataset, nb_epoch=100)

# Evaluate the model
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
print(model.evaluate(test_dataset, [metric], transformers))

# Get the embeddings
embeddings = model.get_embeddings(train_dataset)


Saving BBBP.csv to BBBP (7).csv


AttributeError: ignored

In [None]:
import deepchem as dc
import pandas as pd
import numpy as np

# Load the drug dataset
from google.colab import files
data=files.upload()
data=pd.read_csv("BBBP.csv")

smiles = data['smiles'].to_list()

# Featurizing the data


# Splitting the data into train, validation and test

tasks = ['p_np']

featurizer = dc.feat.ConvMolFeaturizer()
loader = dc.data.CSVLoader(
    tasks=tasks, smiles_field="smiles", featurizer=featurizer)
dataset = loader.featurize("BBBP.csv")
X = featurizer.featurize(smiles)

# Splitting the data into train, validation and test
y = data['p_np'].values
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


# Initializing the model 
model = dc.models.GraphConvModel(len(tasks), batch_size=50, mode="regression")
train_dataset = dc.data.NumpyDataset(X_train, y_train)
test_dataset = dc.data.NumpyDataset(X_test, y_test)
tasks = ['p_np']
featurizer = dc.feat.ConvMolFeaturizer()
test_dataset = dc.data.DiskDataset.from_dataframe(data, tasks)
test_embeddings = model.predict(test_dataset)


# Get the embeddings for the test set


print(test_embeddings)


    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)


Saving BBBP.csv to BBBP (15).csv


    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=T

AttributeError: ignored

In [None]:
!pip install deepchem

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepchem
  Downloading deepchem-2.7.1-py3-none-any.whl (693 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.2/693.2 KB[0m [31m12.3 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit
  Downloading rdkit-2022.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m50.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit, deepchem
Successfully installed deepchem-2.7.1 rdkit-2022.9.3


In [None]:
from google.colab import files

df = files.upload()
import pandas as pd
df=pd.read_csv("BBBP.csv")
smiles=df['smiles']
!pip install dgl
!pip install torch
import dgl
import torch

g = dgl.DGLGraph()
# Add nodes to the graph
g.add_nodes(len(df))

# Add edges to the graph
for i, row in df.iterrows():
  for j, col in row.iteritems():
    if i != j and col == 1:
      g.add_edge(i, j)
import torch.nn as nn

class GraphAutoEncoder(nn.Module):
  def _init_(self, in_dim, hidden_dim, out_dim):
    super(GraphAutoEncoder, self)._init_()
    self.encoder = nn.Linear(in_dim, hidden_dim)
    self.decoder = nn.Linear(hidden_dim, out_dim)

  def forward(self, x):
    x = self.encoder(x)
    x = self.decoder(x)
    return x
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_fingerprint(smiles):
  # Convert the SMILES string to a RDKit molecule
  mol = Chem.MolFromSmiles(smiles)

  # Calculate the Morgan fingerprint for the molecule
  fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2)

  # Convert the fingerprint to a NumPy array
  fingerprint = np.array(fingerprint)

  return fingerprint

# Convert the SMILES strings in the BBBP dataset to fingerprints
fingerprints = df['smiles'].apply(smiles_to_fingerprint)

# Convert the fingerprints to a NumPy array
X = np.stack(fingerprints.values)
in_dim = X.shape[1]
out_dim = X.shape[1]


# Create an instance of the GraphAutoEncoder model
model = GraphAutoEncoder(in_dim=df.shape[1], hidden_dim=64, out_dim=df.shape[1])

# Define the loss function and the optimizer
loss_fn = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
# Set the model to training mode
model.train()

# Loop over the number of epochs
for epoch in range(100):
  # Clear the gradients
  optimizer.zero_grad()

  # # Extract the encoder part of the model
encoder = model.encoder

# Set the model to evaluation mode
model.eval()

# Transform the input data into the latent space
latent = encoder(torch.Tensor(df.values))
embedding = latent.detach().numpy()
from rdkit import Chem
from rdkit.Chem import AllChem

def smiles_to_fingerprint(smiles):
  # Convert the SMILES string to a RDKit molecule
  mol = Chem.MolFromSmiles(smiles)

  # Calculate the Morgan fingerprint for the molecule
  fingerprint = AllChem.GetMorganFingerprintAsBitVect(mol, 2)

  # Convert the fingerprint to a NumPy array
  fingerprint = np.array(fingerprint)

  return fingerprint

# Convert the SMILES strings in the BBBP dataset to fingerprints
fingerprints = df['smiles'].apply(smiles_to_fingerprint)

# Convert the fingerprints to a NumPy array
X = np.stack(fingerprints.values)
in_dim = X.shape[1]
out_dim = X.shape[1]


Saving BBBP.csv to BBBP (2).csv
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/




TypeError: ignored

In [None]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m49.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.3


In [None]:


import numpy as np
import tensorflow as tf
import deepchem as dc

# Load the BBBP dataset
tasks, datasets, transformers = dc.molnet.load_bbbp()
train_dataset, valid_dataset, test_dataset = datasets

# Build the model
n_features = train_dataset.get_data_shape()[0]
n_tasks = len(tasks)
graph_conv_filters = [[64,64], [128,128]]
dense_layer_size = 128

# Create the graph autoencoder model
model = dc.models.GraphConvModel(n_tasks, graph_conv_filters, dense_layer_size,
                                 batch_size=128, learning_rate=1e-4)

# Fit the model
model.fit(train_dataset, nb_epoch=100)

# Evaluate the model
metric = dc.metrics.Metric(dc.metrics.roc_auc_score, np.mean)
print(model.evaluate(test_dataset, [metric], transformers))

# Get the embeddings
embeddings = model.get_embeddings(train_dataset)


    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=True)
    rdkit.Chem.rdmolfiles.CanonicalRankAtoms(NoneType)
did not match C++ signature:
    CanonicalRankAtoms(RDKit::ROMol mol, bool breakTies=True, bool includeChirality=True, bool includeIsotopes=T

AttributeError: ignored

In [None]:
from google.colab import files

df = files.upload()
import pandas as pd
df=pd.read_csv("HIV.csv")
smiles_list=df['smiles']



from rdkit.Chem import AllChem
from torch_geometric.data import Data, DataLoader

def get_mol_graph(smiles):
  # Use RDKit to parse the SMILES string and get the molecular graph
  mol = Chem.MolFromSmiles(smiles)
  if mol is None: return None
  adjacency_matrix = Chem.rdmolops.GetAdjacencyMatrix(mol)
  atoms = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
  return adjacency_matrix, atoms
  
  
valid_smiles = list(filter(lambda x: Chem.MolFromSmiles(x) is not None, smiles_list))


def get_data_from_smiles(valid_smiles):
  data_list = []
  for smiles in smiles_list:
    graph = get_mol_graph(smiles)
    if graph is not None:
      data_list.append(Data(x=torch.tensor(graph[1]).unsqueeze(0), edge_index=torch.tensor(graph[0]).unsqueeze(0)))
  return data_list
  import rdkit
from rdkit import Chem
from rdkit.Chem import AllChem
from torch_geometric.data import Data, DataLoader

def get_mol_graph(valid_smiles):
    mol = Chem.MolFromSmiles(valid_smiles)
    if mol is None: return None
    adjacency_matrix = Chem.rdmolops.GetAdjacencyMatrix(mol)
    atoms = [atom.GetAtomicNum() for atom in mol.GetAtoms()]
    return adjacency_matrix, atoms

def get_data_from_smiles(valid_smiles):
    data_list = []
    for smiles in valid_smiles:
        graph = get_mol_graph(smiles)
        if graph is not None:
            data_list.append(Data(x=torch.tensor(graph[1]).unsqueeze(0), edge_index=torch.tensor(graph[0]).unsqueeze(0)))
    return data_list




import torch
import torch.nn as nn
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

class GraphAutoEncoder(nn.Module):
  def __init__(self, input_dim, hidden_dim):
    super(GraphAutoEncoder, self).__init__()
    self.conv1 = GCNConv(input_dim, hidden_dim)
    self.conv2 = GCNConv(hidden_dim, input_dim)

  def forward(self, data):
    x, edge_index = data.x, data.edge_index
    x = self.conv1(x, edge_index)
    x = F.relu(x)
    x = self.conv2(x, edge_index)
    return x


import random

# Set the device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Set the random seed
random.seed
# Load the model and move it to the appropriate device
input_dim = 50
hidden_dim = 32
model = GraphAutoEncoder(input_dim, hidden_dim).to(device)


# Load the input data
data = get_data_from_smiles(smiles_list)

# Extract the embeddings
# Extract the embeddings
batch_size = len(valid_smiles) 
hidden_dim = 32.
embeddings = []
data_loader = DataLoader(data, batch_size=len(data), shuffle=False)

for i, datum in enumerate(data_loader):
    datum.to(device)
    embeddings.append(model.conv1(datum.x, datum.edge_index))
    
# Print the embeddings
print(embeddings)





Saving BBBP.csv to BBBP (26).csv


RuntimeError: ignored

In [None]:
from deepchem.models import TransformerXM
from deepchem.utils import get_features
from deepchem.data import DiskDataset

# load dataset
tasks = ["p_np"]
from google.colab import files

data = files.upload()
import pandas as pd
data=pd.read_csv("BBBP.csv")
data = pd.read_csv("BBBP.csv")
X = get_features(data.smiles)
y = data[tasks].values
dataset = DiskDataset.from_numpy(X, y, w=None, ids=None)

# Split the dataset
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)

# Build the model
n_embedding = 128
model = TransformerXM(n_embedding=n_embedding, n_layers=3, n_heads=4, batch_size=32,
                      dropout=0.1, use_attn_loss=True, use_mask=True)

# Fit the model
model.fit(train_dataset, nb_epoch=100)

# Generate embeddings
embeddings = model.embed(dataset)

# Print the embeddings
print(embeddings)




ImportError: ignored

In [None]:
from deepchem.models import GraphAutoEncoder
from deepchem.utils import get_features

# load dataset
from google.colab import files

data = files.upload()
import pandas as pd
data=pd.read_csv("BBBP.csv")
data = pd.read_csv("BBBP.csv")

X = get_features(data.smiles)

# Build the model
latent_dim = 256
model = GraphAutoEncoder(latent_dim, graph_conv_layers=[64, 64], 
                         generator_layers=[512, 512])

# Fit the model
model.fit(X, nb_epoch=100)

# Generate embeddings
embeddings = model.encode(X)

# Print the embeddings
print(embeddings)


ImportError: ignored

In [None]:
from torch_geometric.nn import GCNConv, Autoencoder
import torch.nn.functional as F
import rdkit.Chem as Chem
from rdkit import Chem
from rdkit.Chem import Descriptors
from rdkit.Chem import rdMolDescriptors
import numpy as np
from torch_geometric.data import Data
from torch_geometric.datasets import QM9

class Autoencoder(torch.nn.Module):
    def __init__(self, in_channels, hidden_channels):
        super(Autoencoder, self).__init__()
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, in_channels)

    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = F.relu(x)
        x = self.conv2(x, edge_index)
        return x

# Prepare the dataset
mols = []
for i in range(50000):
    mols.append(Chem.MolFromSmiles(qm9[i]["smiles"]))

data = []
for i, mol in enumerate(mols):
    mol = mol.GetMol()
    if mol is None:
        continue
    x = rdMolDescriptors.GetHashedAtomPairFingerprintAsBitVect(mol)
    x = np.array(x)
    x = x / np.sum(x)
    edge_index = Chem.GetAdjacencyMatrix(mol).nonzero()
    edge_index = torch.tensor(np.array(edge_index).T, dtype=torch.long)
    data.append(Data(x=torch.tensor(x, dtype=torch.float), edge_index=edge_index))

#Build the Autoencoder model
model = Autoencoder(16, 256)
model = model.cuda()

# Fit the model
for i in range(100):
    for data in data:
        data = data.to("cuda")
        optimizer.zero_grad()
        x_hat = model(data.x, data.edge)
        def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
x = F.relu(x)
x = self.conv2(x, edge_index)
encoded = self.conv1.weight
 return x, encoded
embeddings = model(data.x, data.edge_index)





In [None]:
!pip install dgl

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dgl
  Downloading dgl-0.9.1-cp38-cp38-manylinux1_x86_64.whl (4.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.9/4.9 MB[0m [31m80.7 MB/s[0m eta [36m0:00:00[0m
Collecting psutil>=5.8.0
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 KB[0m [31m29.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: psutil, dgl
  Attempting uninstall: psutil
    Found existing installation: psutil 5.4.8
    Uninstalling psutil-5.4.8:
      Successfully uninstalled psutil-5.4.8
Successfully installed dgl-0.9.1 psutil-5.9.4


In [None]:
import dgl
import dgl.function as fn
import torch
import torch.nn as nn
from dgl.data import SMILESGraphDataset
from google.colab import files

dataset = files.upload()
import pandas as pd
dataset=pd.read_csv("BBBP.csv")

# Prepare the dataset
dataset = SMILESGraphDataset("BBBP.csv")

# Define the Graph Auto-Encoder model
class GAE(nn.Module):
    def __init__(self, in_feats, hidden_size, k):
        super(GAE, self).__init__()
        self.conv1 = dgl.nn.SAGEConv(in_feats, hidden_size, k)
        self.conv2 = dgl.nn.SAGEConv(hidden_size, hidden_size, k)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, in_feats)

    def forward(self, g):
        g.ndata['h'] = g.ndata['x']
        g.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'h'))
        g.ndata['h'] = self.conv1(g, g.ndata['h'])
    def forward(self, g):
        g.ndata['h'] = g.ndata['x']
        g.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'h'))
        g.ndata['h'] = self.conv1(g, g.ndata['h'])
        g.ndata['h'] = self.fc1(g.ndata['h'])
        embeddings = g.ndata['h']
        return embeddings
print(embeddings)

       


ModuleNotFoundError: ignored

In [None]:

import deepchem as dc
from deepchem.models import GraphConvModel
from deepchem.feat import smiles_to_bigraph
from deepchem.utils import remove_missing_entries
from deepchem.utils.evaluate import Evaluator
from rdkit import Chem

# Load your drug dataset as a Pandas dataframe
from google.colab import files

df = files.upload()
import pandas as pd
df=pd.read_csv("BBBP.csv")

# Convert SMILES strings to molecular graphs
mols = df['smiles'].apply(Chem.MolFromSmiles)
bigraphs = smiles_to_bigraph(mols)

# Split the dataset into training, validation, and test sets
train_frac = 0.8
valid_frac = 0.1
test_frac = 0.1
splitters = {
    'train': train_frac,
    'valid': valid_frac,
    'test': test_frac
}
dataset = dc.data.NumpyDataset(bigraphs,df['p_np'],  splitters=splitters)

# Create a GraphConvModel for training
model = GraphConvModel(n_tasks=1, mode='classification', dropout=0.2)

# Fit the model to the dataset
model.fit(dataset, nb_epoch=10)

# Extract the embeddings of the molecules
embeddings = model.get_embeddings(dataset)





ImportError: ignored

In [None]:

import deepchem as dc
from deepchem.models import GraphConvModel

from deepchem.utils import remove_missing_entries
from deepchem.utils.evaluate import Evaluator
from rdkit import Chem
import pandas as pd

from google.colab import files

df = files.upload()
import pandas as pd
df=pd.read_csv("BBBP.csv")

# Print the length of the arrays before removing invalid SMILES strings
print("Original length of smiles array: ", len(df['smiles']))
print("Original length of activity array: ", len(df['p_np']))

# Remove invalid SMILES strings
valid_smiles = []
valid_activity = []
for i,smiles in enumerate(df['smiles']):
    try:
        mol = Chem.MolFromSmiles(smiles)
        if mol != None:
            valid_smiles.append(smiles)
            valid_activity.append(df['p_np'][i])
    except:
        continue
        
# Print the length of the arrays after removing invalid SMILES strings
print("Valid length of smiles array: ", len(valid_smiles))
print("Valid length of activity array: ", len(valid_activity))
dataset = dc.data.NumpyDataset(valid_smiles, valid_activity)
featurizer = dc.feat.ConvMolFeaturizer()
train_data = featurizer.featurize(valid_smiles)
train_dataset = dc.data.NumpyDataset(train_data, valid_activity)




from deepchem.splits import RandomSplitter
scaffold_splitter = RandomSplitter()
train_frac = 0.8
valid_frac = 0.1
test_frac = 0.1
dataset = dc.data.NumpyDataset(valid_smiles, valid_activity)
train_dataset, valid_dataset, test_dataset = scaffold_splitter.train_valid_test_split(dataset, train_frac, valid_frac, test_frac)
featurizer = dc.feat.ConvMolFeaturizer()
train_data = featurizer.featurize(valid_smiles)
train_dataset = dc.data.NumpyDataset(train_data, train_dataset.y)




# Print the shapes of the arrays in the datasets
print("train_dataset X shape: ", train_dataset.X.shape)
print("train_dataset y shape: ", train_dataset.y.shape)
print("valid_dataset X shape: ", valid_dataset.X.shape)
print("valid_dataset y shape: ", valid_dataset.y.shape)
print("test_dataset X shape: ", test_dataset.X.shape)
print("test_dataset y shape: ", test_dataset.y.shape)





# Create a GraphConvModel for training
model = GraphConvModel(n_tasks=1, mode='classification', dropout=0.2)


featurizer = dc.feat.ConvMolFeaturizer()
train_data = featurizer.featurize(valid_smiles)
train_X_reshaped = train_dataset.X.reshape((train_dataset.X.shape[0], -1))
train_y_reshaped = train_dataset.y.reshape((train_dataset.y.shape[0], -1))
train_dataset_reshaped = dc.data.NumpyDataset(train_X_reshaped, train_y_reshaped)

valid_X_reshaped = valid_dataset.X.reshape((valid_dataset.X.shape[0], -1))
valid_y_reshaped = valid_dataset.y.reshape((valid_dataset.y.shape[0], -1))
valid_dataset_reshaped = dc.data.NumpyDataset(valid_X_reshaped, valid_y_reshaped)



model = GraphConvModel(n_tasks=1, mode='classification', dropout=0.2)
model.fit(train_dataset, nb_epoch=10)


# Extract the embeddings of the molecules
from keras.layers import Input, Dropout
from deepchem.models import GraphConvModel
print(train_data.shape)


input_layer = Input(shape=(None, train_data.shape[2], train_data.shape[3]), name="input_layer")

output_embedding = model.get_layer(index=-2)(input_layer)
embedding_model = model(input_layer, output_embedding)
embeddings = embedding_model.predict(dataset.X)
print(embeddings)





Saving BBBP.csv to BBBP (21).csv
Original length of smiles array:  2050
Original length of activity array:  2050
Valid length of smiles array:  2039
Valid length of activity array:  2039
train_dataset X shape:  (2039,)
train_dataset y shape:  (1631,)
valid_dataset X shape:  (204,)
valid_dataset y shape:  (204,)
test_dataset X shape:  (204,)
test_dataset y shape:  (204,)


IndexError: ignored

In [None]:
import torch
import torch.nn as nn
import torch_geometric.nn as gnn
from torch_geometric.data import Data
from rdkit import Chem
import pandas as pd

from google.colab import files

df = files.upload()
import pandas as pd
df=pd.read_csv("BBBP.csv")
valid_smiles = []
for smiles in df["smiles"]:

                                  mol = Chem.MolFromSmiles(smiles)
                                  if mol is not None:
                                   valid_smiles.append(smiles)
def smiles_to_graph(valid_smiles):
    mol = Chem.MolFromSmiles(valid_smiles)
    adj = Chem.rdmolops.GetAdjacencyMatrix(mol)
    x = mol.GetConformer().GetPositions()
    edge_index = torch.nonzero(adj)
    return Data(x=torch.tensor(x), edge_index=edge_index)
    class Encoder(nn.Module):
        def __init__(self):
            super(Encoder, self).__init__()
            self.conv1 = gnn.GCNConv(3, 64)
            self.conv2 = gnn.GCNConv(64, 128)
def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.conv1 = gnn.GCNConv(128, 64)
        self.conv2 = gnn.GCNConv(64, 3)
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x
class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
    def forward(self, data):
        x = self.encoder(data)
        x = self.decoder(x, data.edge_index)
        return x

autoencoder = Autoencoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters())
# Convert a new SMILES string to a graph
for smiles in valid_smiles:
                          graph = smiles_to_graph(smiles)

# Pass the graph through the encoder to generate an embedding
embedding = autoencoder.encoder(graph)



Saving BBBP.csv to BBBP (4).csv


NameError: ignored

In [None]:
from google.colab import files

df = files.upload()
import pandas as pd
df=pd.read_csv("BBBP.csv")
from rdkit import Chem

# Create an empty list to store the valid SMILES strings
valid_smiles = []

# Iterate through all the SMILES strings in your dataset
for smiles in df["smiles"]:
    # Attempt to parse the SMILES string using the RDKit
    mol = Chem.MolFromSmiles(smiles)
    # If the SMILES string is valid, add it to the list of valid SMILES
    if mol is not None:
        valid_smiles.append(smiles)
    # If the SMILES string is invalid, it will return None and you can ignore it
import torch
import torch.nn as nn
from torch_geometric.nn import GCNConv, GAE, VGAE

class GF_VAE(nn.Module):
    def __init__(self, in_feats, hidden_size):
        super(GF_VAE, self).__init__()
        self.encoder = GCNConv(in_feats, hidden_size)
        self.decoder = GAE(hidden_size, in_feats)

    def forward(self, x, edge_index):
        z = self.encoder(x, edge_index)
        return self.decoder(z, edge_index), z
in_feats = 2039 # example value, you should use the size of your fingerprints representation
hidden_size = 128 

# create the model
from rdkit import Chem
from rdkit.Chem import AllChem, DataStructs

fingerprints = []
for smiles in valid_smiles:
    mol = Chem.MolFromSmiles(smiles)
    # calculate morgan fingerprint
    fp = AllChem.GetMorganFingerprintAsBitVect(mol, 2)
    arr = np.zeros((1,))
    DataStructs.ConvertToNumpyArray(fp, arr)
    fingerprints.append(arr)
from sklearn.preprocessing import OneHotEncoder
# Create an instance of the one-hot encoder
enc = OneHotEncoder()

# Convert the fingerprints to one-hot encoding
x = enc.fit_transform(fingerprints)
from scipy.spatial import distance

# calculate pairwise distance between fingerprints
pairwise_distance = distance.cdist(fingerprints, fingerprints, 'euclidean')

# define threshold for connecting edges
threshold = 0.5

# create edges where distance is less than the threshold
edge_index = []
for i in range(len(pairwise_distance)):
    for j in range(i+1, len(pairwise_distance)):
        if pairwise_distance[i,j] < threshold:
            edge_index.append([i, j])

# convert to a tensor
edge_index = torch.tensor(edge_index).transpose(0,1)


import torch.nn as nn

# define the reconstruction loss
reconstruction_loss = nn.MSELoss()
# define the KL divergence loss
kl_loss = nn.KLDivLoss(reduction='batchmean')

# define the combined loss function
def loss_function(recon_x, x, mu, logvar):
    BCE = reconstruction_loss(recon_x, x)
    KLD = -0.5 * torch.mean(1 + logvar - mu.pow(2) - logvar.exp())
    return BCE + KLD




# prepare the edges
n_epochs = 50

# train the model
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
model.train()
for epoch in range(n_epochs):
    z, _ = model(x, edge_index)
    loss = loss_function(z, x)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
# Create the model, load trained weights and set it to eval mode
model.eval()
import os

path_to_saved_weights = 'path/to/save/weights'
# if directory does not exist create it
if not os.path.exists(os.path.dirname(path_to_saved_weights)):
    os.makedirs(os.path.dirname(path_to_saved_weights))

# save model weights
torch.save(model.state_dict(), path_to_saved_weights)

model.load_state_dict(torch.load(path_to_saved_weights))
# create the model
model = GF_VAE(in_feats, hidden_size)
# load the weights
model.load_state_dict(torch.load(path_to_saved_weights))
import torch

# convert the one-hot encoded fingerprints to a Pytorch tensor
input_features = torch.from_numpy(x.toarray()).float()


# Prepare the input data
x = input_features


# get the embeddings
_, z = model(x, edge_index)

# print the embeddings
print(z)



Saving BBBP.csv to BBBP (6).csv


TypeError: ignored

In [None]:
!pip install torch_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_geometric
  Downloading torch_geometric-2.2.0.tar.gz (564 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.0/565.0 KB[0m [31m32.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting psutil>=5.8.0
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 KB[0m [31m35.9 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.2.0-py3-none-any.whl size=773302 sha256=3e6d329b865d2cab3bd93c38de007c57567f823dea21a789a71391522bb56e9a
  Stored in directory: /root/.cache/pip/wheels/59/a3/20/198928106d

In [None]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m56.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.3


In [None]:
import torch
import torch.nn as nn
import torch_geometric.nn as gnn
from torch_geometric.data import Data
from rdkit import Chem
import pandas as pd

from google.colab import files

df = files.upload()
import pandas as pd
df=pd.read_csv("BBBP.csv")
valid_smiles = []
for smiles in df["smiles"]:
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        valid_smiles.append(smiles)
        
class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.conv1 = gnn.GCNConv(3, 64)
        self.conv2 = gnn.GCNConv(64, 128)
    def forward(self, data):
        x, edge_index = data.x, data.edge_index
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

class Decoder(nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.conv1 = gnn.GCNConv(128, 64)
        self.conv2 = gnn.GCNConv(64, 3)
    def forward(self, x, edge_index):
        x = self.conv1(x, edge_index)
        x = torch.relu(x)
        x = self.conv2(x, edge_index)
        return x

class Autoencoder(nn.Module):
    def __init__(self):
        super(Autoencoder, self).__init__()
        self.encoder = Encoder()
        self.decoder = Decoder()
    def forward(self, data):
        x = self.encoder(data)
        x = self.decoder(x, data.edge_index)
        return x



from rdkit.Chem import Draw

def smiles_to_graph(valid_smiles):
    mol = Chem.MolFromSmiles(valid_smiles)
    img = Draw.MolToImage(mol)
    adj = Chem.rdmolops.GetAdjacencyMatrix(mol)
    x = mol.GetConformer().GetPositions()
    edge_index = torch.nonzero(adj)
    return Data(x=torch.tensor(x), edge_index=edge_index)


for smiles in valid_smiles:
    graph = smiles_to_graph(smiles)

autoencoder = Autoencoder()
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(autoencoder.parameters())
for smiles in valid_smiles:
                          graph = smiles_to_graph(smiles)

# Pass the graph through the encoder to generate an embedding
embedding = autoencoder.encoder(graph)



Saving BBBP.csv to BBBP (9).csv


ValueError: ignored

In [None]:
import torch
import torch.nn as nn
from rdkit import Chem
from rdkit.Chem import Draw
import pandas as pd

from google.colab import files

Dataset = files.upload()
import pandas as pd
Dataset=pd.read_csv("BBBP.csv")
valid_smiles = []
for smiles in df["smiles"]:
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        valid_smiles.append(smiles)
        

class DrugDataset(Dataset):
    def __init__(self, smiles):
        self.smiles = smiles

    def __len__(self):
        return len(self.smiles)

    def __getitem__(self, idx):
        mol = Chem.MolFromSmiles(self.smiles[idx])
        img = Draw.MolToImage(mol)
        return img

class Encoder(nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(64, 64, kernel_size=3, stride=1, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2)
        self.fc1 = nn.Linear(64 * 64 * 64, 100)
        self.fc2 = nn.Linear(100, 2)

    def forward(self, x):
        x = self.conv1(x)
        # Initialize the encoder
encoder = Encoder()

# Initialize the dataset
dataset = DrugDataset(valid_smiles)

# Iterate through the dataset to obtain embeddings
embeddings = []
for i in range(len(dataset)):
    img = dataset[i]
    img = img.reshape(1, 3, img.shape[0], img.shape[1])  # Reshape the image to fit the input of the encoder
    img = img.float() / 255  # Normalize the pixel values
    embedding = encoder.fc2(encoder(img)).detach().numpy()  # Obtain the embedding for the current sample
    embeddings.append(embedding)

# The list `embeddings` now contains the embeddings for each sample in the dataset



Saving BBBP.csv to BBBP (10).csv


ValueError: ignored

In [None]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m44.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.3


In [None]:
!pip install torch_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_geometric
  Downloading torch_geometric-2.2.0.tar.gz (564 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.0/565.0 KB[0m [31m12.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting psutil>=5.8.0
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 KB[0m [31m30.5 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.2.0-py3-none-any.whl size=773302 sha256=bd9ffce4c64ab5fbc051fd3dd13b213c8f016ec86ff19aebf05f87d53c5cc8e6
  Stored in directory: /root/.cache/pip/wheels/59/a3/20/198928106d

In [None]:
!pip install deepchem

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepchem
  Downloading deepchem-2.7.1-py3-none-any.whl (693 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.2/693.2 KB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: deepchem
Successfully installed deepchem-2.7.1


In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

import dgl
import rdkit.Chem as Chem
import torch.nn as nn
from dgl.nn import SAGEConv
from google.colab import files
df=files.upload()
# Load the SMILES dataset
df = pd.read_csv("BBBP.csv")
smiles_list = df["smiles"].tolist()

# Convert the SMILES strings into DGL graphs
from rdkit import Chem
from rdkit.Chem import rdMolDescriptors
from dgl import DGLGraph

graphs = []
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        adj_matrix = Chem.rdmolops.GetAdjacencyMatrix(mol)
        src, dst = np.where(adj_matrix)
        graph = dgl.DGLGraph((src, dst))
        graphs.append(graph)
    else:
        print("Invalid SMILES:{}".format(smiles))

# Define the Graph Auto-Encoder model
class GAE(nn.Module):
    def __init__(self, in_feats, hidden_size, k):
        super(GAE, self).__init__()
        self.conv1 = SAGEConv(in_feats, hidden_size, k)
        self.conv2 = SAGEConv(hidden_size, hidden_size, k)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, in_feats)

    def forward(self, g):
        g.ndata['h'] = g.ndata['x']
        g.update_all(fn.copy_src('h', 'm'), fn.sum('m', 'h'))
        g.ndata['h'] = self.conv1(g, g.ndata['h'])
        g.ndata['h'] = self.fc1(g.ndata['h'])
        embeddings = g.ndata['h']
        return embeddings
        graphs = [g.to(torch.device('cuda:0')) for g in graphs]
    

# Build the model
for i, graph in enumerate(graphs):
    x = torch.randn((graph.number_of_nodes(), in_feats))
    graph.ndata['x'] = x
    # pass graph through the model
    embeddings = model(graph)
    # calculate the loss
    target = torch.Tensor(embeddings.size())
    loss = criterion(embeddings, target)
    # backward pass and optimization
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

in_feats = graph.ndata["x"].size()[1]
hidden_size = 256

# define the loss function
criterion = nn.MSELoss()

# define the optimizer
optimizer = torch.optim.Adam(model.parameters())







Saving BBBP.csv to BBBP (12).csv
Invalid SMILES:O=N([O-])C1=C(CN=C1NCCSCc2ncccc2)Cc3ccccc3
Invalid SMILES:c1(nc(NC(N)=[NH2])sc1)CSCCNC(=[NH]C#N)NC
Invalid SMILES:Cc1nc(sc1)\[NH]=C(\N)N
Invalid SMILES:s1cc(CSCCN\C(NC)=[NH]\C#N)nc1\[NH]=C(\N)N
Invalid SMILES:c1c(c(ncc1)CSCCN\C(=[NH]\C#N)NCC)Br
Invalid SMILES:n1c(csc1\[NH]=C(\N)N)c1ccccc1
Invalid SMILES:n1c(csc1\[NH]=C(\N)N)c1cccc(c1)N
Invalid SMILES:n1c(csc1\[NH]=C(\N)N)c1cccc(c1)NC(C)=O
Invalid SMILES:n1c(csc1\[NH]=C(\N)N)c1cccc(c1)N\C(NC)=[NH]\C#N
Invalid SMILES:s1cc(nc1\[NH]=C(\N)N)C
Invalid SMILES:c1(cc(N\C(=[NH]\c2cccc(c2)CC)C)ccc1)CC


KeyError: ignored

In [None]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn

import dgl
import rdkit.Chem as Chem
import torch.nn as nn
from dgl.nn import SAGEConv
from google.colab import files
df=files.upload()
# Load the SMILES dataset
df = pd.read_csv("BBBP.csv")
smiles_list = df["smiles"].tolist()

# Convert the SMILES strings into DGL graphs
graphs = []
for smiles in smiles_list:
    mol = Chem.MolFromSmiles(smiles)
    if mol is not None:
        adj_matrix = Chem.rdmolops.GetAdjacencyMatrix(mol)
        src, dst = np.where(adj_matrix)
        graph = dgl.DGLGraph((src, dst))
        graphs.append(graph)
    else:
        print("Invalid SMILES:{}".format(smiles))
        
#Convert the graphs to PyTorch tensors
graphs = [g.to(torch.device('cuda:0')) for g in graphs]

# Define the Graph Auto-Encoder model
class GAE(nn.Module):
    def __init__(self, in_feats, hidden_size, k):
        super(GAE, self).__init__()
        self.conv1 = SAGEConv(in_feats, hidden_size, k)
        self.conv2 = SAGEConv(hidden_size, hidden_size, k)
        self.fc1 = nn.Linear(hidden_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, in_feats)
embeddings = []
for graph in graphs:
    graph.ndata['x'] = torch.randn((graph.number_of_nodes(), in_feats)).to(torch.device('cuda:0'))
    emb = model(graph)
    emb = emb.detach().cpu().numpy()
    embeddings.append(emb)


Using backend: pytorch


OSError: ignored

In [None]:
!pip install dgl


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip freeze | grep cudatoolkit

In [None]:
!pip install dgl-cu100


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
!pip install cudatoolkit==11.0


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement cudatoolkit==11.0 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cudatoolkit==11.0[0m[31m
[0m

In [None]:
!pip install cudatoolkit==11.1


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
[31mERROR: Could not find a version that satisfies the requirement cudatoolkit==11.1 (from versions: none)[0m[31m
[0m[31mERROR: No matching distribution found for cudatoolkit==11.1[0m[31m
[0m

In [None]:
!pip install deepchem

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting deepchem
  Downloading deepchem-2.7.1-py3-none-any.whl (693 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m693.2/693.2 KB[0m [31m48.4 MB/s[0m eta [36m0:00:00[0m
Collecting rdkit
  Downloading rdkit-2022.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m50.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit, deepchem
Successfully installed deepchem-2.7.1 rdkit-2022.9.3


In [None]:
!pip install torch_geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch_geometric
  Downloading torch_geometric-2.2.0.tar.gz (564 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.0/565.0 KB[0m [31m25.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting psutil>=5.8.0
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 KB[0m [31m32.4 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: torch_geometric
  Building wheel for torch_geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch_geometric: filename=torch_geometric-2.2.0-py3-none-any.whl size=773302 sha256=41ea3b1b0c441b1f7a4a0d77f8dae5fa0bd0ec6b724076d7764c8138f04d8c09
  Stored in directory: /root/.cache/pip/wheels/59/a3/20/198928106d

In [None]:
!pip install rdkit

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting rdkit
  Downloading rdkit-2022.9.3-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m29.3/29.3 MB[0m [31m56.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: rdkit
Successfully installed rdkit-2022.9.3


In [None]:
!pip install torch-sparse


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-sparse
  Using cached torch_sparse-0.6.16.tar.gz (208 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-sparse
  Building wheel for torch-sparse (setup.py) ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by user[0m[31m
[0mTraceback (most recent call last):
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/cli/base_command.py", line 167, in exc_logging_wrapper
    status = run_func(*args)
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/cli/req_command.py", line 199, in wrapper
    return func(self, options, args)
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/commands/install.py", line 361, in run
    _, build_failures = build(
  File "/usr/local/lib/python3.8/dist-packages/pip/_internal/wheel_builder.py", line 348, in build
    wheel_file = _build_one(
  File "/usr/local

In [None]:
!pip uninstall torch-sparse
!pip uninstall torch-sparse
!pip install --verbose torch-sparse==0.4.3

[0mUsing pip 22.0.4 from /usr/local/lib/python3.8/dist-packages/pip (python 3.8)
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-sparse==0.4.3
  Downloading torch_sparse-0.4.3.tar.gz (11 kB)
  Running command python setup.py egg_info
  running egg_info
  creating /tmp/pip-pip-egg-info-51rjwnu_/torch_sparse.egg-info
  writing /tmp/pip-pip-egg-info-51rjwnu_/torch_sparse.egg-info/PKG-INFO
  writing dependency_links to /tmp/pip-pip-egg-info-51rjwnu_/torch_sparse.egg-info/dependency_links.txt
  writing requirements to /tmp/pip-pip-egg-info-51rjwnu_/torch_sparse.egg-info/requires.txt
  writing top-level names to /tmp/pip-pip-egg-info-51rjwnu_/torch_sparse.egg-info/top_level.txt
  writing manifest file '/tmp/pip-pip-egg-info-51rjwnu_/torch_sparse.egg-info/SOURCES.txt'
  reading manifest file '/tmp/pip-pip-egg-info-51rjwnu_/torch_sparse.egg-info/SOURCES.txt'
  reading manifest template 'MANIFEST.in'
  adding license file 'LIC

In [None]:
!pip install torch-geometric \
  torch-sparse \
  torch-scatter \
  torch-cluster \
  -f https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://pytorch-geometric.com/whl/torch-1.8.0+cu101.html
Collecting torch-sparse
  Using cached torch_sparse-0.6.16.tar.gz (208 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-scatter
  Downloading torch_scatter-2.1.0.tar.gz (106 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m106.8/106.8 KB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting torch-cluster
  Downloading torch_cluster-1.6.0.tar.gz (43 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.4/43.4 KB[0m [31m4.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: torch-sparse, torch-scatter, torch-cluster
  Building wheel for torch-sparse (setup.py) ... [?25l[?25hcanceled
[31mERROR: Operation cancelled by us

In [None]:
import torch
print(torch.__version__)
!pip install torch-scatter torch-sparse -f https://data.pyg.org/whl/torch-{torch.__version__}.html

1.13.0+cu116
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in links: https://data.pyg.org/whl/torch-1.13.0+cu116.html
Collecting torch-scatter
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu116/torch_scatter-2.1.0%2Bpt113cu116-cp38-cp38-linux_x86_64.whl (9.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.4/9.4 MB[0m [31m43.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting torch-sparse
  Downloading https://data.pyg.org/whl/torch-1.13.0%2Bcu116/torch_sparse-0.6.16%2Bpt113cu116-cp38-cp38-linux_x86_64.whl (4.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m4.5/4.5 MB[0m [31m82.4 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: torch-scatter, torch-sparse
Successfully installed torch-scatter-2.1.0+pt113cu116 torch-sparse-0.6.16+pt113cu116


In [None]:
!pip install torch-geometric

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting torch-geometric
  Downloading torch_geometric-2.2.0.tar.gz (564 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m565.0/565.0 KB[0m [31m29.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting psutil>=5.8.0
  Downloading psutil-5.9.4-cp36-abi3-manylinux_2_12_x86_64.manylinux2010_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (280 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m280.2/280.2 KB[0m [31m31.7 MB/s[0m eta [36m0:00:00[0m
Building wheels for collected packages: torch-geometric
  Building wheel for torch-geometric (setup.py) ... [?25l[?25hdone
  Created wheel for torch-geometric: filename=torch_geometric-2.2.0-py3-none-any.whl size=773302 sha256=7e0ce44f9762b8700f83511e1b106fbf75b97a91795bc94c544e17917463c439
  Stored in directory: /root/.cache/pip/wheels/59/a3/20/198928106d