In [1]:
import deepchem as dc
import pandas as pd
from deepchem.feat import ConvMolFeaturizer
from deepchem.models import GraphConvModel
import os
from rdkit import Chem
os.chdir("..")
os.chdir("Preprocessing")
from preprocessing import *
import numpy as np

df = preprocessing("C:\\Users\Gilbert\Documents\BCB_Research\Kcat_Benchmark_ML_Models\Data\kcat_transferase.csv")

In [2]:

data = df.copy()
comp = data[["Compound", "Kcat"]]
comp["Kcat"] = np.log10(comp["Kcat"])

# Generate molecular objects
comp["mol"] = comp["Compound"].apply(lambda x: Chem.MolFromSmiles(x))

# Featurize the molecules
featurizer = ConvMolFeaturizer()
comp["Graph_features"] = comp["mol"].apply(featurizer.featurize).apply(lambda x: x[0])

# Extract features and labels
X = list(comp["Graph_features"].values)
y = comp["Kcat"].values.reshape(-1, 1)

# Create a DeepChem Dataset
dataset = dc.data.NumpyDataset(X, y)

# Splitting the dataset
splitter = dc.splits.RandomSplitter()
train_dataset, valid_dataset, test_dataset = splitter.train_valid_test_split(dataset)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp["Kcat"] = np.log10(comp["Kcat"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp["mol"] = comp["Compound"].apply(lambda x: Chem.MolFromSmiles(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  comp["Graph_features"] = comp["mol"].apply(featurizer.featurize).apply(lambda x: x[0])


In [5]:
# Model initialization remains unchanged
batch_size = 50
n_task = 1
model = GraphConvModel(n_tasks=n_task, mode="regression", batch_size=batch_size, dropout=0.3)

# Training
num_epochs = 10
losses = []
train_r2_scores = []
valid_r2_scores = []
metric = dc.metrics.Metric(dc.metrics.pearson_r2_score)

for epoch in range(num_epochs):
    # Training the model for one epoch
    loss = model.fit(train_dataset, nb_epoch=1)
    losses.append(loss)
    
    # Evaluating on training and validation datasets
    train_scores = model.evaluate(train_dataset, [metric], transformers=[])
    valid_scores = model.evaluate(valid_dataset, [metric], transformers=[])
    
    train_r2_scores.append(train_scores['pearson_r2_score'])
    valid_r2_scores.append(valid_scores['pearson_r2_score'])
    
    # Printing the details for the epoch
    print(f"Epoch {epoch+1}/{num_epochs}")
    print(f"Loss: {loss:.4f}")
    print(f"Train R^2 Score: {train_scores['pearson_r2_score']:.4f}")
    print(f"Valid R^2 Score: {valid_scores['pearson_r2_score']:.4f}")
    print("---------------------------------------------------")

# If you want to print final scores
print(f"Final Train R^2 Score: {train_r2_scores[-1]:.4f}")
print(f"Final Valid R^2 Score: {valid_r2_scores[-1]:.4f}")



Epoch 1/10
Loss: 2.7608
Train R^2 Score: 0.0969
Valid R^2 Score: 0.0988
---------------------------------------------------
Epoch 2/10
Loss: 2.3146
Train R^2 Score: 0.0599
Valid R^2 Score: 0.0371
---------------------------------------------------
Epoch 3/10
Loss: 0.2238
Train R^2 Score: 0.1175
Valid R^2 Score: 0.0748
---------------------------------------------------
Epoch 4/10
Loss: 2.2120
Train R^2 Score: 0.1367
Valid R^2 Score: 0.1015
---------------------------------------------------
Epoch 5/10
Loss: 2.2033
Train R^2 Score: 0.1403
Valid R^2 Score: 0.1130
---------------------------------------------------
Epoch 6/10
Loss: 1.3415
Train R^2 Score: 0.1596
Valid R^2 Score: 0.1272
---------------------------------------------------
Epoch 7/10
Loss: 2.1307
Train R^2 Score: 0.1366
Valid R^2 Score: 0.0827
---------------------------------------------------
Epoch 8/10
Loss: 2.1115
Train R^2 Score: 0.2007
Valid R^2 Score: 0.1553
---------------------------------------------------
Epoch 9/

In [11]:
# Extracting atom features from the ConvMol object and averaging them to get a single vector per molecule
features_df = comp["Graph_features"].apply(lambda x: x.get_atom_features().mean(axis=0)).apply(pd.Series)
features_df.columns = [f'feature_{i}' for i in range(features_df.shape[1])]


Unnamed: 0,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,feature_8,feature_9,...,feature_65,feature_66,feature_67,feature_68,feature_69,feature_70,feature_71,feature_72,feature_73,feature_74
count,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0,...,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0,4136.0
mean,0.471789,0.109466,0.354762,0.014072,0.001949,0.0,0.039118,0.001745,0.000335,0.002987,...,0.457731,0.536969,0.0,0.004696,0.194509,0.452374,0.369363,0.143541,0.03448,0.000242
std,0.158142,0.095921,0.151353,0.09316,0.036065,0.0,0.046861,0.022105,0.016004,0.054038,...,0.241687,0.242521,0.0,0.06783,0.205738,0.170961,0.169277,0.136435,0.068114,0.015549
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.37037,0.0,0.253968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.333333,0.4,0.0,0.0,0.0,0.375,0.285714,0.064516,0.0,0.0
50%,0.474937,0.117647,0.37037,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.428571,0.571429,0.0,0.0,0.176471,0.464466,0.375,0.117647,0.0,0.0
75%,0.555556,0.16129,0.454545,0.0,0.0,0.0,0.076923,0.0,0.0,0.0,...,0.6,0.644589,0.0,0.0,0.333333,0.548387,0.45,0.1875,0.052632,0.0
max,0.96875,1.0,0.8,1.0,1.0,0.0,0.235294,1.0,1.0,1.0,...,1.0,1.0,0.0,1.0,1.0,1.0,0.857143,1.0,1.0,1.0


graph_conv_10
graph_conv_11
batch_normalization_15
batch_normalization_16
batch_normalization_17
dropout_15
dropout_16
dropout_17
graph_pool_10
graph_pool_11
dense_10
graph_gather_5
trim_graph_output_5
dense_11
