In [1]:
import pandas as pd
import numpy as np
import scipy
import sklearn

import json
import joblib
from pathlib import Path

import deepchem as dc
import rdkit
from rdkit import Chem
import torch 

import sys
sys.path.append('../')


No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'torch_geometric'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (/mnt/Supermicro/data2/test-syntelly/.deepchem/lib/python3.11/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorc

In [2]:
from config.experiment_config import config

# Graph Conv

Построим простой эксперимент с помощью deepchem, чтобы улучшить результаты с использованием GCN

In [13]:
# Load the raw table "melt_clean.csv" from the directory given by config.DATA_PATH.
df = pd.read_csv(config.DATA_PATH / "melt_clean.csv")

def is_valid_smiles(smiles):
    """Return True when ``smiles`` parses with RDKit into a molecule with more than one atom.

    Args:
        smiles: Candidate SMILES value, e.g. one cell of a DataFrame column.

    Returns:
        bool: False for non-string or empty values (such as NaN/None coming
        from missing cells), for strings RDKit cannot parse, and for molecules
        with at most one atom; True otherwise.
    """
    # Missing cells arrive as float NaN / None; passing a non-string to
    # Chem.MolFromSmiles raises instead of returning None, so reject them here.
    if not isinstance(smiles, str) or not smiles:
        return False
    mol = Chem.MolFromSmiles(smiles)
    return mol is not None and mol.GetNumAtoms() > 1

# Keep only the rows whose SMILES value passes is_valid_smiles.
valid_mask = df["canonical_smiles"].apply(is_valid_smiles)
df_clean = df[valid_mask].copy()
print(f"Before: {len(df)}, After cleaning: {len(df_clean)}")
# Persist the filtered table; later cells load it by this file name.
filtered_csv_path = config.DATA_PATH / "melt_clean_filtered.csv"
df_clean.to_csv(filtered_csv_path, index=False)


Before: 11485, After cleaning: 11484


In [15]:
# Graph featurizer with edge features enabled; shared by the loader below.
graph_featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)

# CSV loader: target column "melt_value", SMILES column "canonical_smiles",
# sample ids taken from the "index" column.
loader = dc.data.CSVLoader(
    tasks=["melt_value"],
    feature_field="canonical_smiles",
    id_field='index',
    featurizer=graph_featurizer,
)

filtered_csv = config.DATA_PATH / 'melt_clean_filtered.csv'
dataset = loader.create_dataset(filtered_csv)

In [16]:
# Display the featurized representation of the first sample as a sanity check.
dataset.X[0]

GraphData(node_features=[53, 30], edge_index=[2, 126], edge_features=[126, 11])

In [17]:
from deepchem.models.torch_models import GCN, GCNModel
# Evaluation metric: mean squared error in regression mode.
metric = dc.metrics.Metric(dc.metrics.mean_squared_error, mode="regression")
results = []
N_EPOCHES = 5

# Read the fold assignments once; the context manager guarantees the file
# handle is closed (a bare `.open()` passed to json.load is never closed).
with (config.DATA_PATH / 'melt_split.json').open() as split_file:
    split_data = json.load(split_file)

# The device choice does not depend on the fold, so resolve it once.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

for fold in range(5):
    # split_dict maps a sample id (as a string) to "train" / "val" / "test".
    split_dict = split_data[fold]
    train_ids, val_ids, test_ids = [], [], []
    for index, idx in enumerate(dataset.ids):
        split = split_dict.get(str(idx))
        if split == "train":
            train_ids.append(index)
        elif split == "val":
            val_ids.append(index)
        elif split == "test":
            test_ids.append(index)

    # Positional indices select the matching rows of the featurized dataset.
    train_dataset = dataset.select(train_ids)
    val_dataset   = dataset.select(val_ids)
    test_dataset  = dataset.select(test_ids)

    # A fresh model per fold so no weights leak between folds.
    model = GCNModel(
        n_tasks=1,
        mode="regression",
        graph_conv_layers=[64, 128, 128, 64],
        activation=torch.nn.LeakyReLU(),
        batchnorm=True,
        dropout=0.2,
        predictor_dropout=0.2,
        batch_size=32,
        learning_rate=1e-3,
        device=device
    )
    print(f'Started fold: {fold+1}')
    model.fit(train_dataset, nb_epoch=N_EPOCHES)

    val_score = model.evaluate(val_dataset, [metric])
    test_score = model.evaluate(test_dataset, [metric])

    print(f"Fold {fold} | Val MSE: {val_score['mean_squared_error']:.4f} | Test MSE: {test_score['mean_squared_error']:.4f}")
    results.append((val_score, test_score))


  from .autonotebook import tqdm as notebook_tqdm


Started fold: 1
Fold 0 | Val MSE: 4207.1428 | Test MSE: 4540.8182
Started fold: 2
Fold 1 | Val MSE: 3934.1291 | Test MSE: 4989.6489
Started fold: 3
Fold 2 | Val MSE: 5231.6817 | Test MSE: 14763.9235
Started fold: 4
Fold 3 | Val MSE: 4877.7864 | Test MSE: 4234.8061
Started fold: 5
Fold 4 | Val MSE: 4251.5373 | Test MSE: 5399.3662


In [18]:
# Each entry of `results` is a (validation scores, test scores) pair of dicts.
val_mse, test_mse = [], []
for fold_val_scores, fold_test_scores in results:
    val_mse.append(fold_val_scores['mean_squared_error'])
    test_mse.append(fold_test_scores['mean_squared_error'])

# Report mean and standard deviation across folds.
print("\n=== Cross-Validation Summary ===")
print(f"Validation MSE: Mean = {np.mean(val_mse):.4f}, STD = {np.std(val_mse):.4f}")
print(f"Test MSE:       Mean = {np.mean(test_mse):.4f}, STD = {np.std(test_mse):.4f}")


=== Cross-Validation Summary ===
Validation MSE: Mean = 4500.4555, STD = 478.7042
Test MSE:       Mean = 6785.7126, STD = 4008.6488


# Custom GCN

In [1]:
import torch

from torch import nn
from torch_geometric.nn import GCNConv, BatchNorm, global_mean_pool, Sequential
from torch_geometric.loader import DataLoader
from torch_geometric.data import Data
import torch.nn.functional as F
from torch.utils.data import Subset, Dataset

from sklearn.metrics import root_mean_squared_error, mean_absolute_error
from scipy.stats import pearsonr

import deepchem as dc
import pandas as pd
import numpy as np

import mlflow
import json

import sys
sys.path.append('../')

from config.experiment_config import config


No normalization for SPS. Feature removed!
No normalization for AvgIpc. Feature removed!
No normalization for NumAmideBonds. Feature removed!
No normalization for NumAtomStereoCenters. Feature removed!
No normalization for NumBridgeheadAtoms. Feature removed!
No normalization for NumHeterocycles. Feature removed!
No normalization for NumSpiroAtoms. Feature removed!
No normalization for NumUnspecifiedAtomStereoCenters. Feature removed!
No normalization for Phi. Feature removed!
Skipped loading some Tensorflow models, missing a dependency. No module named 'tensorflow'
Skipped loading modules with pytorch-geometric dependency, missing a dependency. No module named 'dgl'
Skipped loading modules with transformers dependency. No module named 'transformers'
cannot import name 'HuggingFaceModel' from 'deepchem.models.torch_models' (/mnt/Supermicro/data2/test-syntelly/.torch/lib/python3.11/site-packages/deepchem/models/torch_models/__init__.py)
Skipped loading modules with pytorch-lightning dep

In [2]:
import os
import random

def set_seed(SEED):
    """Seed Python, NumPy and PyTorch RNGs and switch PyTorch to deterministic mode.

    Args:
        SEED: Integer seed applied to every random number generator.
    """
    # Set together with deterministic algorithms below (cuBLAS workspace config).
    os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for apply_seed in seeders:
        apply_seed(SEED)

    # Force deterministic kernels and disable cuDNN autotuning.
    torch.use_deterministic_algorithms(True)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def seed_worker(worker_id):
    """Seed NumPy and Python ``random`` from the current PyTorch initial seed.

    Passed to the data loaders as ``worker_init_fn``.

    Args:
        worker_id: Worker index supplied by the loader; not used.
    """
    # Reduce the torch seed to 32 bits before reusing it for the other RNGs.
    derived_seed = torch.initial_seed() % (2 ** 32)
    for reseed in (np.random.seed, random.seed):
        reseed(derived_seed)

# Apply the seed configured in config.RANDOM_SEED to all RNGs.
set_seed(config.RANDOM_SEED)


In [3]:
# dataset = pd.read_csv(config.DATA_PATH / "melt_clean_filtered.csv")
# dataset.iloc[8247]['melt_value']

# smiles = df[df['index'] == 9648]['canonical_smiles'].values[0]
# smiles

# featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
# featurizer.featurize(smiles)[0]

In [4]:
# df[df['index'] == 9648]

In [5]:
class MeltDataset(Dataset):
    """Dataset that converts rows of ``melt_clean_filtered.csv`` into PyG ``Data`` graphs.

    Every item is featurized on access with DeepChem's
    ``MolGraphConvFeaturizer(use_edges=True)``; the regression target is read
    from the ``melt_value`` column.
    """

    def __init__(self):
        super().__init__()
        self.featurizer = dc.feat.MolGraphConvFeaturizer(use_edges=True)
        self.dataset = pd.read_csv(config.DATA_PATH / "melt_clean_filtered.csv")

    def __getitem__(self, index):
        """Build the graph for the row at positional ``index``.

        Args:
            index: Positional row index into the underlying DataFrame.

        Returns:
            Data: graph with node features ``x``, target ``y``, connectivity
            ``edge_index`` and edge features ``edge_attr``.

        Raises:
            IndexError: if ``index`` is out of range or featurization yields no
                result. Previously the error was printed and swallowed, which
                then surfaced as an unrelated UnboundLocalError.
        """
        try:
            row = self.dataset.iloc[index]
            smiles = row['canonical_smiles']
            value = row['melt_value']
            data = self.featurizer.featurize(smiles)[0]
        except IndexError as err:
            raise IndexError(f"MeltDataset: could not load sample at index {index}") from err

        return Data(
            x=torch.tensor(data.node_features, dtype=torch.float32),
            y=torch.tensor(value, dtype=torch.float32),
            # PyG documents edge_index as torch.long (int64).
            edge_index=torch.tensor(data.edge_index, dtype=torch.long),
            edge_attr=torch.tensor(data.edge_features, dtype=torch.float32)
        )

    def __len__(self):
        """Return the number of rows in the underlying table."""
        return len(self.dataset)


In [6]:
class GCNBlock(torch.nn.Module):
    """One graph-convolution stage: GCNConv -> BatchNorm -> dropout -> ReLU."""

    def __init__(self, in_channels: int, out_channels: int, dropout_rate: float):
        """Create the convolution and normalization layers.

        Args:
            in_channels: Width of the incoming node features.
            out_channels: Width of the node features produced by the block.
            dropout_rate: Dropout probability applied after normalization.
        """
        super().__init__()
        self.dropout_rate = dropout_rate
        self.conv = GCNConv(in_channels, out_channels)
        self.bn = BatchNorm(out_channels)

    def forward(self, x, edge_index):
        """Apply the block to node features ``x`` using connectivity ``edge_index``."""
        normalized = self.bn(self.conv(x, edge_index))
        # Dropout is only active while the module is in training mode.
        dropped = F.dropout(normalized, p=self.dropout_rate, training=self.training)
        return F.relu(dropped)


In [7]:
class Net(torch.nn.Module):
    """Stack of GCN blocks followed by mean pooling and an MLP regression head."""

    def __init__(self, hidden_dims: list[int], dropout_rate: float, head_dropout_rate: float = 0.2):
        """Build the network.

        Args:
            hidden_dims: Layer widths; consecutive pairs define the
                (in, out) channels of each GCN block, and the last entry is the
                input width of the regression head.
            dropout_rate: Dropout probability used inside every GCN block.
            head_dropout_rate: Dropout probability used in the regression head.
                Defaults to 0.2, the value that was previously hard-coded.
        """
        super().__init__()
        self.hidden_dims = hidden_dims
        self.dropout_rate = dropout_rate
        self.head_dropout_rate = head_dropout_rate

        self.gcns = nn.ModuleList([
            GCNBlock(in_channels=in_dim,
                     out_channels=out_dim,
                     dropout_rate=self.dropout_rate)
            for in_dim, out_dim in zip(self.hidden_dims[:-1], self.hidden_dims[1:])
        ])

        pooled_dim = self.hidden_dims[-1]
        head_dim = pooled_dim // 2
        self.output_layer = nn.Sequential(
            nn.Linear(pooled_dim, head_dim),
            nn.ReLU(),
            nn.Dropout(p=self.head_dropout_rate),
            nn.Linear(head_dim, head_dim),
            nn.ReLU(),
            nn.Dropout(p=self.head_dropout_rate),
            nn.Linear(head_dim, 1)
        )

    def forward(self, data):
        """Return one prediction per graph in the batch as a 1-D tensor.

        Args:
            data: Batch object exposing ``x``, ``edge_index`` and ``batch``.
        """
        x, edge_index, batch = data.x, data.edge_index, data.batch
        for block in self.gcns:
            x = block(x, edge_index)

        # Average node embeddings per graph before the regression head.
        x = global_mean_pool(x, batch)

        out = self.output_layer(x).view(-1)
        return out


In [8]:
def train_one_epoch(model, loader, optimizer, device):
    """Run one optimization pass over ``loader`` and return the mean MSE per sample.

    Args:
        model: Module called with a batch; its output is compared to ``batch.y``.
        loader: Iterable of batches that also exposes ``dataset`` (used for its length).
        optimizer: Optimizer updating ``model``'s parameters.
        device: Device each batch is moved to.

    Returns:
        Sum of per-batch MSE weighted by ``num_graphs``, divided by ``len(loader.dataset)``.
    """
    model.train()
    running_loss = 0
    for graphs in loader:
        graphs = graphs.to(device)
        optimizer.zero_grad()
        batch_loss = F.mse_loss(model(graphs), graphs.y)
        batch_loss.backward()
        # Cap the global gradient norm at 2.0 before the parameter update.
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=2.0)
        running_loss += batch_loss.item() * graphs.num_graphs
        optimizer.step()

    return running_loss / len(loader.dataset)


def evaluate(model, loader, device):
    """Evaluate ``model`` on ``loader`` without gradient tracking.

    Args:
        model: Module called with a batch; its output is compared to ``batch.y``.
        loader: Iterable of batches that also exposes ``dataset`` (used for its length).
        device: Device each batch is moved to.

    Returns:
        Tuple ``(mean MSE per sample, RMSE, Pearson r, MAE)`` computed over all
        predictions collected from the loader.
    """
    model.eval()
    loss_sum = 0
    targets, predictions = [], []

    with torch.no_grad():
        for graphs in loader:
            graphs = graphs.to(device)
            output = model(graphs)
            targets.append(graphs.y.cpu().numpy())
            predictions.append(output.cpu().numpy())
            loss_sum += F.mse_loss(output, graphs.y).item() * graphs.num_graphs

    # Merge the per-batch arrays so the metrics see the whole split at once.
    y_true = np.concatenate(targets)
    y_pred = np.concatenate(predictions)

    rmse = root_mean_squared_error(y_true, y_pred)
    pearson_r = pearsonr(y_true, y_pred)[0]
    mae = mean_absolute_error(y_true, y_pred)
    return loss_sum / len(loader.dataset), rmse, pearson_r, mae

In [9]:
def cross_validation(dataset, model_class, train_one_epoch_fn, evaluate_fn,
                device, generator, seed_worker, k_folds=5, batch_size=64, l2_norm=1e-4, momentum=0.97, epochs=None, 
                learning_rate=None, split_path=None, net_parameters=None, config=None):
    """Train and evaluate ``model_class`` on pre-defined folds, logging to MLflow.

    Args:
        dataset: Dataset exposing ``dataset['index']`` (sample ids) and usable with ``Subset``.
        model_class: Callable returning a fresh model from ``**net_parameters``.
        train_one_epoch_fn: ``fn(model, loader, optimizer, device) -> train loss``.
        evaluate_fn: ``fn(model, loader, device) -> (loss, rmse, pearson_r, mae)``.
        device: Device used for the model and for every batch.
        generator: ``torch.Generator`` handed to the data loaders.
        seed_worker: ``worker_init_fn`` for the data loaders.
        k_folds: Number of folds to run; ``split_path`` must hold at least this many.
        batch_size: Batch size of all loaders.
        l2_norm: Weight decay of the SGD optimizer.
        momentum: Momentum of the SGD optimizer.
        epochs: Number of training epochs per fold.
        learning_rate: Initial learning rate.
        split_path: Path to a JSON list with one ``{sample id: split name}`` dict per fold.
        net_parameters: Keyword arguments for ``model_class``.
        config: Object providing ``TIMESTAMP`` (used as the MLflow run name).

    Returns:
        dict: per-fold history, keyed ``'fold_<i>'``, with the per-epoch
        train/validation metrics and the final test metrics.
    """
    history_keys = ('loss_train', 'loss_val', 'loss_test',
                    'val_rmse', 'val_r', 'val_mae',
                    'test_rmse', 'test_r', 'test_mae',
                    'epoch')
    cv_results = {
        f'fold_{fold}': {key: [] for key in history_keys}
        for fold in range(k_folds)
    }

    # The split file does not change between folds: read it once and close it.
    with split_path.open() as split_file:
        data_split = json.load(split_file)

    # Sample ids as strings, because JSON object keys are always strings.
    sample_ids = dataset.dataset['index'].astype(str)

    def make_loader(subset, shuffle):
        # All loaders share batch size, workers, seeding hook and generator.
        return DataLoader(subset, batch_size=batch_size,
                          shuffle=shuffle, num_workers=4,
                          worker_init_fn=seed_worker, generator=generator)

    model = None
    mlflow.set_experiment('GCN_arch')
    with mlflow.start_run(run_name=f'{config.TIMESTAMP}'):
        for fold in range(k_folds):
            # Bound per fold so it exists even when `epochs` is 0.
            fold_key = f'fold_{fold}'
            split_dict = data_split[fold]

            train_idx, val_idx, test_idx = [], [], []
            for position, sample_id in enumerate(sample_ids):
                split = split_dict.get(sample_id)

                if split == "train":
                    train_idx.append(position)
                elif split == "val":
                    val_idx.append(position)
                elif split == "test":
                    test_idx.append(position)

            train_dataset = Subset(dataset, train_idx)
            val_dataset = Subset(dataset, val_idx)
            test_dataset = Subset(dataset, test_idx)
            print(f'train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}')

            train_loader = make_loader(train_dataset, shuffle=True)
            val_loader = make_loader(val_dataset, shuffle=False)
            test_loader = make_loader(test_dataset, shuffle=False)

            # Fresh model per fold, placed on the same device the batches use.
            model = model_class(**net_parameters)
            model.to(device)

            optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate,
                                        momentum=momentum,
                                        weight_decay=l2_norm)
            scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min',
                                        factor=0.7, threshold=0.05, patience=8,
                                        min_lr=1e-7)

            for epoch in range(epochs):
                train_loss = train_one_epoch_fn(model, train_loader, optimizer, device)
                val_loss, val_rmse, val_r, val_mae = evaluate_fn(model, val_loader, device)

                # The scheduler lowers the learning rate when validation loss plateaus.
                scheduler.step(val_loss)

                mlflow.log_metric(f'Loss/Train_MSE_fold_{fold}', train_loss, epoch)
                mlflow.log_metric(f'Loss/Validation_MSE_fold_{fold}', val_loss, epoch)
                mlflow.log_metric(f'Validation/MAE_fold_{fold}', val_mae, epoch)
                mlflow.log_metric(f'Validation/RMSE_fold_{fold}', val_rmse, epoch)
                mlflow.log_metric(f'Validation/PearsonR_fold_{fold}', val_r, epoch)
                mlflow.log_metric(f'LR/Learning Rate_fold_{fold}', optimizer.param_groups[0]['lr'], epoch)

                fold_history = cv_results[fold_key]
                fold_history['loss_train'].append(train_loss)
                fold_history['loss_val'].append(val_loss)
                fold_history['val_rmse'].append(val_rmse)
                fold_history['val_r'].append(val_r)
                fold_history['val_mae'].append(val_mae)
                fold_history['epoch'].append(epoch)

            # Log the trained model once per fold; logging it every epoch
            # rewrote the same artifact path on each iteration.
            mlflow.pytorch.log_model(model, artifact_path=f'model_fold_{fold}')

            test_loss, test_rmse, test_r, test_mae = evaluate_fn(model, test_loader, device)

            cv_results[fold_key]['loss_test'].append(test_loss)
            cv_results[fold_key]['test_rmse'].append(test_rmse)
            cv_results[fold_key]['test_r'].append(test_r)
            cv_results[fold_key]['test_mae'].append(test_mae)

        # Aggregate only after every fold has reported its test metrics, and
        # while the MLflow run is still active.
        calculate_test_results(cv_results=cv_results, epochs=epochs, mlflow=mlflow, model=model)

    return cv_results


def calculate_test_results(cv_results: dict, epochs: int, mlflow, model):
    """Aggregate per-fold test metrics and log their mean/std to MLflow.

    Args:
        cv_results: Mapping ``fold key -> {'test_rmse': [...], 'test_r': [...],
            'test_mae': [...], ...}``. Folds whose lists are still empty are
            ignored, so the function can be called before all folds finished.
        epochs: Unused; kept for interface compatibility.
        mlflow: Object providing ``log_metrics(dict)``.
        model: Unused; kept for interface compatibility.

    Returns:
        dict: the logged metrics, or an empty dict when no fold has test
        results yet (nothing is logged in that case).
    """
    def _collect(metric_name):
        # Flatten explicitly: per-fold lists may differ in length, which
        # np.array(list_of_lists) cannot turn into a regular array.
        return np.array(
            [value for fold_result in cv_results.values() for value in fold_result[metric_name]],
            dtype=float,
        )

    test_rmse_array = _collect('test_rmse')
    test_r_array = _collect('test_r')
    test_mae_array = _collect('test_mae')

    if test_rmse_array.size == 0 or test_r_array.size == 0 or test_mae_array.size == 0:
        return {}

    summary = {
        "Test/RMSE_mean": np.mean(test_rmse_array),
        "Test/RMSE_std": np.std(test_rmse_array),
        "Test/MAE_mean": np.mean(test_mae_array),
        "Test/MAE_std": np.std(test_mae_array),
        # NaN-aware statistics: NaN correlations are excluded from the aggregate.
        "Test/PearsonR_mean": np.nanmean(test_r_array),
        "Test/PearsonR_std": np.nanstd(test_r_array),
    }
    mlflow.log_metrics(summary)
    return summary

In [None]:
# Generator seeded from the config; handed to the data loaders.
generator = torch.Generator().manual_seed(config.RANDOM_SEED)

# Layer widths for Net. The first entry is assumed to equal the node-feature
# width produced by the featurizer (30 in the GraphData repr shown earlier)
# -- TODO confirm if the featurizer changes.
hidden_dims = [30, 64, 128]

net_params = dict(hidden_dims=hidden_dims, dropout_rate=0.2)

# Optimizer / training-loop settings forwarded to cross_validation.
run_params = dict(
    l2_norm=1e-4,
    batch_size=32,
    learning_rate=1e-3,
    momentum=0.97,
    device=config.DEVICE,
    epochs=300,
    k_folds=5,
)

cross_val_mean = cross_validation(
    dataset=MeltDataset(),
    model_class=Net,
    train_one_epoch_fn=train_one_epoch,
    evaluate_fn=evaluate,
    generator=generator,
    seed_worker=seed_worker,
    split_path=config.DATA_PATH / 'melt_split.json',
    net_parameters=net_params,
    config=config,
    **run_params,
)

2025/09/06 19:37:32 INFO mlflow.tracking.fluent: Experiment with name 'GCN_arch' does not exist. Creating a new experiment.


train: 9058, val: 1214, test: 1212






