In [None]:
!pip install rdkit pandas scikit-learn mordred




In [None]:
!pip install rdkit torch torch-geometric pandas numpy scikit-learn tqdm


Collecting rdkit
  Downloading rdkit-2024.9.6-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (4.0 kB)
Collecting torch-geometric
  Downloading torch_geometric-2.6.1-py3-none-any.whl.metadata (63 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m63.1/63.1 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading 

In [None]:
!pip install torch_geometric



In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric
from torch_geometric.loader import DataLoader  # Updated import
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
from tqdm import tqdm
import pandas as pd
import warnings

# Silence UserWarnings that originate from torch's LR-scheduler module.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="torch.optim.lr_scheduler",
)

# Maps a short assay name to (CSV file, name of the target column in that file).
datasets = {
    "HLM": (
        "ADMET_HLM_CONFIDENTIAL.csv",
        "in-vitro_HLM_bienta: CLint (Num) (uL/min/mg)",
    ),
    "KSOL": (
        "ADMET_KSOL_CONFIDENTIAL.csv",
        "in-vitro_KSOL-PBS_bienta: mean_solubility (Num) (uM)",
    ),
    "LogD": (
        "ADMET_LogD_CONFIDENTIAL.csv",
        "in-vitro_LogD_bienta: LogD (Num)",
    ),
    "MDR1": (
        "ADMET_MDR1-MDCK2_CONFIDENTIAL.csv",
        "in-vitro_MDR1-MDCKII-Papp_bienta: mean_Papp_A_to_B (Num) (10^-6 cm/s)",
    ),
    "MLM": (
        "ADMET_MLM_CONFIDENTIAL.csv",
        "in-vitro_MLM_bienta: CLint (Num) (uL/min/mg)",
    ),
}

def one_hot_encoding(x, allowable_set):
    """Return a one-hot list of booleans marking ``x`` within ``allowable_set``.

    A value that is not a member of ``allowable_set`` is encoded as the last
    entry of the set (used by callers as the "Unknown" bucket).
    """
    key = x if x in allowable_set else allowable_set[-1]
    return [key == candidate for candidate in allowable_set]

def smiles_to_graph(smiles, target):
    """Convert a SMILES string and one scalar target into a PyG ``Data`` graph.

    Node features are the concatenation of one-hot encodings of the atom
    symbol, degree, total hydrogen count and implicit valence, followed by an
    aromaticity flag. Every bond contributes two directed edges.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.
        target: Scalar target value stored as ``y`` with shape ``(1,)``.

    Returns:
        A ``torch_geometric.data.Data`` object, or ``None`` when the SMILES
        cannot be parsed or the parsed molecule contains no atoms.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would produce a 1-D empty feature tensor
    # (shape (0,)) instead of (0, num_features), which breaks feature-size
    # inference and batching downstream, so it is rejected like a parse failure.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    symbols = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe', 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd', 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In', 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
    degrees = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 'Unknown']
    hydrogen_counts = [0, 1, 2, 3, 4, 5, 6, 7, 8, 'Unknown']
    implicit_valences = [0, 1, 2, 3, 4, 5, 6, 7, 8, 'Unknown']

    atom_features_list = []
    for atom in mol.GetAtoms():
        atom_features = (
            one_hot_encoding(atom.GetSymbol(), symbols) +
            one_hot_encoding(atom.GetDegree(), degrees) +
            one_hot_encoding(atom.GetTotalNumHs(), hydrogen_counts) +
            one_hot_encoding(atom.GetImplicitValence(), implicit_valences) +
            [atom.GetIsAromatic()]
        )
        atom_features_list.append(atom_features)
    x = torch.tensor(atom_features_list, dtype=torch.float)

    # Store each bond in both directions so message passing is symmetric.
    edge_indices = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_indices += [[i, j], [j, i]]

    if edge_indices:
        edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    else:
        # Single-atom molecules have no bonds; keep the expected (2, 0) layout.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=torch.tensor([target], dtype=torch.float))

# Read every configured CSV and turn its rows into molecular graphs.
all_data = {}
for name, (file, target_col) in datasets.items():
    df = pd.read_csv(file)
    # Rows lacking either the structure or the measurement cannot be used.
    df = df.dropna(subset=["CXSMILES (CDD Compatible)", target_col])
    df[target_col] = df[target_col].astype(float)

    graph_data = []
    for _, row in tqdm(df.iterrows(), total=len(df)):
        graph_data.append(smiles_to_graph(row["CXSMILES (CDD Compatible)"], row[target_col]))
    # smiles_to_graph returns None for structures it could not convert.
    graph_data = [g for g in graph_data if g is not None]
    all_data[name] = graph_data

# Lower clamp applied to each assay's raw values before taking the logarithm.
detection_limits = {
    "HLM": 0.1,
    "KSOL": 0.1,
    "LogD": None,  # LogD is already in log units
    "MDR1": 0.1,
    "MLM": 0.1
}

# Replace every target (except LogD) by log(max(value, detection limit)).
for name in all_data:
    if name == "LogD":
        continue
    y_values = torch.cat([data.y for data in all_data[name]])
    y_log = torch.clamp(y_values, min=detection_limits[name]).log()
    for i, data in enumerate(all_data[name]):
        data.y = y_log[i].unsqueeze(0)

# Normalize target values.
# Every dataset gets its own fitted scaler, kept in `target_scalers`, so that
# evaluation can map standardized predictions back to that dataset's scale.
# (A single scaler refitted per dataset only remembers the last dataset.)
target_scalers = {}
for name in all_data:
    y_values = torch.cat([data.y for data in all_data[name]])
    scaler = StandardScaler()
    y_normalized = torch.tensor(scaler.fit_transform(y_values.reshape(-1, 1)).flatten(), dtype=torch.float)
    for i, data in enumerate(all_data[name]):
        all_data[name][i].y = y_normalized[i].unsqueeze(0)
    target_scalers[name] = scaler

class GNN(nn.Module):
    """Three GCN layers, mean pooling over each graph, and a three-layer MLP head.

    Args:
        num_features: Size of the per-node feature vector.
    """

    def __init__(self, num_features):
        super(GNN, self).__init__()
        self.conv1 = GCNConv(num_features, 64)
        self.conv2 = GCNConv(64, 128)
        self.conv3 = GCNConv(128, 256)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        """Return one prediction per graph in the batch (shape ``(num_graphs, 1)``)."""
        x, edge_index, batch = data.x, data.edge_index, data.batch
        x = torch.relu(self.conv1(x, edge_index))
        x = self.dropout(x)
        x = torch.relu(self.conv2(x, edge_index))
        x = self.dropout(x)
        x = torch.relu(self.conv3(x, edge_index))
        # Collapse node embeddings into one vector per graph.
        x = global_mean_pool(x, batch)
        x = torch.relu(self.fc1(x))
        x = self.dropout(x)
        x = torch.relu(self.fc2(x))
        return self.fc3(x)

def evaluate_model(model, data_loader, criterion, device, scaler, dataset_name):
    """Evaluate ``model`` on ``data_loader``.

    Args:
        model: Network to evaluate; switched to eval mode.
        data_loader: Loader yielding batches with ``y`` targets.
        criterion: Loss applied to the (standardized) predictions and targets.
        device: Device the batches are moved to.
        scaler: Fitted scaler whose ``inverse_transform`` undoes the target
            standardization.
        dataset_name: Dataset key; every dataset except ``"LogD"`` is
            exponentiated after inverse scaling.

    Returns:
        ``(total_loss, mae, r2)`` where ``total_loss`` is the sum of the batch
        losses and MAE / R2 are computed after the inverse transforms.
    """
    model.eval()
    total_loss = 0.0
    preds, actuals = [], []

    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            pred = model(batch).view(-1)
            loss = criterion(pred, batch.y)
            total_loss += loss.item()
            preds.extend(pred.cpu().numpy())
            actuals.extend(batch.y.cpu().numpy())

    preds = scaler.inverse_transform(np.array(preds).reshape(-1, 1)).flatten()
    actuals = scaler.inverse_transform(np.array(actuals).reshape(-1, 1)).flatten()

    # Undo the log transform that was applied to every dataset except LogD.
    if dataset_name != "LogD":
        preds = np.exp(preds)
        actuals = np.exp(actuals)

    mae = mean_absolute_error(actuals, preds)
    r2 = r2_score(actuals, preds)

    return total_loss, mae, r2

def train_model(dataset_name, data, epochs=100, batch_size=32, lr=0.001, patience=10, scaler=None):
    """Train a ``GNN`` on one dataset with LR scheduling and early stopping.

    The best weights (lowest validation loss) are written to
    ``f"{dataset_name}_best_model.pth"``.

    Args:
        dataset_name: Key used in log messages, the checkpoint name and to
            decide whether evaluation exponentiates values.
        data: List of graphs whose ``y`` holds the standardized target.
        epochs: Maximum number of epochs.
        batch_size: Batch size of both loaders.
        lr: Initial Adam learning rate.
        patience: Epochs without validation improvement before stopping.
        scaler: The fitted scaler that was used to standardize ``data``'s
            targets. Needed so MAE / R2 are reported on the original scale.
            When omitted, a scaler is fitted on ``data`` as received; if those
            targets are already standardized that scaler is close to the
            identity and the reported metrics stay in standardized units.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True)
    test_loader = DataLoader(test_data, batch_size=batch_size)

    model = GNN(num_features=data[0].num_features).to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr, weight_decay=1e-5)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    criterion = nn.MSELoss()

    best_loss = float("inf")
    early_stop_counter = 0

    if scaler is None:
        # Backward-compatible fallback (see the docstring for its limitation).
        scaler = StandardScaler()
        y_values = torch.cat([d.y for d in data])
        scaler.fit(y_values.reshape(-1, 1))

    # Summed batch losses grow with the number of batches, so the train and
    # validation figures are divided by their own batch counts to be comparable.
    n_train_batches = max(len(train_loader), 1)
    n_val_batches = max(len(test_loader), 1)

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            pred = model(batch).view(-1)
            loss = criterion(pred, batch.y)
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        train_loss = total_loss / n_train_batches
        print(f"{dataset_name} - Epoch {epoch+1}/{epochs}, Training Loss: {train_loss:.4f}")

        val_total, mae, r2 = evaluate_model(model, test_loader, criterion, device, scaler, dataset_name)
        val_loss = val_total / n_val_batches
        print(f"{dataset_name} - Validation Loss: {val_loss:.4f}, MAE: {mae:.4f}, R2: {r2:.4f}")

        scheduler.step(val_loss)
        current_lr = optimizer.param_groups[0]['lr']
        print(f"Current learning rate: {current_lr}")

        if val_loss < best_loss:
            best_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), f"{dataset_name}_best_model.pth")
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print(f"Early stopping triggered for {dataset_name}. Stopping training.")
                break

    print(f"{dataset_name} - Best Validation Loss: {best_loss:.4f}")

# Train model for each dataset, handing over the scaler that standardized it
for name, data in all_data.items():
    print(f"Training on {name} dataset...")
    train_model(name, data, scaler=target_scalers[name])




100%|██████████| 24/24 [00:00<00:00, 441.76it/s]


100%|██████████| 24/24 [00:00<00:00, 483.83it/s]


100%|██████████| 24/24 [00:00<00:00, 568.00it/s]


100%|██████████| 24/24 [00:00<00:00, 547.39it/s]


100%|██████████| 24/24 [00:00<00:00, 506.16it/s]


Training on HLM dataset...
HLM - Epoch 1/100, Training Loss: 0.9975
HLM - Validation Loss: 1.0621, MAE: 0.7825, R2: -0.3016
Current learning rate: 0.001
HLM - Epoch 2/100, Training Loss: 0.9993
HLM - Validation Loss: 1.0590, MAE: 0.7693, R2: -0.2741
Current learning rate: 0.001
HLM - Epoch 3/100, Training Loss: 0.9955
HLM - Validation Loss: 1.0580, MAE: 0.7582, R2: -0.2522
Current learning rate: 0.001
HLM - Epoch 4/100, Training Loss: 0.9938
HLM - Validation Loss: 1.0576, MAE: 0.7496, R2: -0.2358
Current learning rate: 0.001
HLM - Epoch 5/100, Training Loss: 0.9872
HLM - Validation Loss: 1.0578, MAE: 0.7416, R2: -0.2209
Current learning rate: 0.001
HLM - Epoch 6/100, Training Loss: 0.9865
HLM - Validation Loss: 1.0585, MAE: 0.7350, R2: -0.2092
Current learning rate: 0.001
HLM - Epoch 7/100, Training Loss: 0.9836
HLM - Validation Loss: 1.0593, MAE: 0.7285, R2: -0.1979
Current learning rate: 0.001
HLM - Epoch 8/100, Training Loss: 0.9827
HLM - Validation Loss: 1.0603, MAE: 0.7236, R2: -0

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

# Silence UserWarnings that originate from torch's LR-scheduler module.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="torch.optim.lr_scheduler",
)

# Training table and the assay columns that are predicted jointly.
# NOTE(review): the path ends in .tsv but is read below with pandas' default
# (comma) delimiter — confirm the file's actual separator.
dataset_path = "polaris-admet_train.tsv"
target_cols = [
    "MDR1-MDCKII",
    "KSOL",
    "LogD",
    "MLM",
    "HLM",
]

# Function: One-hot encoding for atoms
def one_hot_encoding(x, allowable_set):
    """Return a one-hot list of booleans marking ``x`` within ``allowable_set``.

    A value that is not a member of ``allowable_set`` is encoded as the last
    entry of the set (the "Unknown" bucket).
    """
    key = x if x in allowable_set else allowable_set[-1]
    return [key == candidate for candidate in allowable_set]

# Function: Convert SMILES to graph
def smiles_to_graph(smiles, targets):
    """Convert a SMILES string and its target values into a PyG ``Data`` graph.

    Node features are the concatenation of one-hot encodings of the atom
    symbol, degree, total hydrogen count and implicit valence, followed by an
    aromaticity flag. Every bond contributes two directed edges.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.
        targets: Iterable of values convertible to ``float``; stored as the
            1-D tensor ``y``.

    Returns:
        A ``torch_geometric.data.Data`` object, or ``None`` when the SMILES
        cannot be parsed, the molecule has no atoms, or a target value cannot
        be converted to ``float``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would produce a 1-D empty feature tensor
    # instead of (0, num_features) and break batching, so reject it as well.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    symbols = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
               'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
               'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
               'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
    degrees = list(range(11)) + ['Unknown']
    hydrogen_counts = list(range(9)) + ['Unknown']
    implicit_valences = list(range(9)) + ['Unknown']

    # Atom features
    atom_features_list = []
    for atom in mol.GetAtoms():
        atom_features = (
            one_hot_encoding(atom.GetSymbol(), symbols) +
            one_hot_encoding(atom.GetDegree(), degrees) +
            one_hot_encoding(atom.GetTotalNumHs(), hydrogen_counts) +
            one_hot_encoding(atom.GetImplicitValence(), implicit_valences) +
            [atom.GetIsAromatic()]
        )
        atom_features_list.append(atom_features)

    x = torch.tensor(atom_features_list, dtype=torch.float)

    # Edge connections: each bond is stored in both directions.
    edge_indices = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_indices += [[i, j], [j, i]]

    if edge_indices:
        edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    else:
        # No bonds (single atom): keep the expected (2, 0) layout.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Convert target values safely. float() raises ValueError for unparsable
    # strings and TypeError for values such as None; both mean "skip this row".
    try:
        y = torch.tensor([float(t) for t in targets], dtype=torch.float)
    except (TypeError, ValueError):
        return None  # Skip molecules with invalid targets

    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=y)

# Read the training table.
df = pd.read_csv(dataset_path)

# Coerce the targets to numbers (unparsable entries become NaN) and keep only
# rows that have a SMILES string and a value for every target.
df[target_cols] = df[target_cols].apply(pd.to_numeric, errors="coerce")
df = df.dropna(subset=["SMILES", *target_cols])

# Build one graph per remaining row; rows that cannot be converted are dropped.
graph_data = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    graph = smiles_to_graph(row["SMILES"], row[target_cols].values)
    if graph is not None:
        graph_data.append(graph)

# Standardize each target column and write the scaled values back to the graphs.
scaler = StandardScaler()
y_values = torch.cat([graph.y for graph in graph_data]).numpy().reshape(-1, len(target_cols))
y_normalized = scaler.fit_transform(y_values)

for graph, scaled_row in zip(graph_data, y_normalized):
    graph.y = torch.tensor(scaled_row, dtype=torch.float)

# Define GNN model
class GNN(nn.Module):
    """Three GCN layers, per-graph mean pooling and an MLP with one output per target.

    Args:
        num_features: Size of the per-node feature vector.
        num_targets: Number of values predicted for each graph.
    """

    def __init__(self, num_features, num_targets):
        super().__init__()
        # Graph convolutions widen the node embedding step by step.
        self.conv1 = GCNConv(num_features, 64)
        self.conv2 = GCNConv(64, 128)
        self.conv3 = GCNConv(128, 256)
        # Fully connected head applied to the pooled graph embedding.
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_targets)  # Multi-output layer
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        """Return predictions of shape ``(num_graphs, num_targets)`` for a batch."""
        edges = data.edge_index
        hidden = self.dropout(torch.relu(self.conv1(data.x, edges)))
        hidden = self.dropout(torch.relu(self.conv2(hidden, edges)))
        hidden = torch.relu(self.conv3(hidden, edges))
        # Average node embeddings within each graph of the batch.
        pooled = global_mean_pool(hidden, data.batch)
        pooled = self.dropout(torch.relu(self.fc1(pooled)))
        pooled = torch.relu(self.fc2(pooled))
        return self.fc3(pooled)

# Evaluate model
def evaluate_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader``.

    Relies on the module-level ``scaler`` and ``target_cols`` to map the
    standardized values back to their original scale.

    Returns:
        ``(total_loss, mae, r2)``: the sum of the batch losses, and MAE / R2
        computed on inverse-transformed predictions and targets.
    """
    model.eval()
    n_targets = len(target_cols)
    running_loss = 0.0
    collected_preds = []
    collected_targets = []

    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            output = model(batch)
            # batch.y is reshaped to one row per graph so it lines up with the output.
            expected = batch.y.view(-1, output.shape[1])
            running_loss += criterion(output, expected).item()
            collected_preds.extend(output.cpu().numpy())
            collected_targets.extend(batch.y.cpu().numpy())

    # Bring both collections to (num_graphs, n_targets) and undo the scaling.
    preds = scaler.inverse_transform(np.array(collected_preds).reshape(-1, n_targets))
    actuals = scaler.inverse_transform(np.array(collected_targets).reshape(-1, n_targets))

    return running_loss, mean_absolute_error(actuals, preds), r2_score(actuals, preds)

# Train model
def train_model(data, epochs=100, batch_size=32, lr=0.001, patience=10):
    """Train the multi-target ``GNN`` with LR scheduling and early stopping.

    The data is split 80/20 with a fixed seed; the weights with the lowest
    validation loss are written to ``best_model.pth``.

    Args:
        data: List of graphs whose ``y`` holds the standardized targets.
        epochs: Maximum number of epochs.
        batch_size: Batch size of both loaders.
        lr: Initial AdamW learning rate.
        patience: Epochs without validation improvement before stopping.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=2)

    num_features = data[0].x.shape[1]
    num_targets = len(target_cols)
    model = GNN(num_features, num_targets).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    criterion = nn.MSELoss()

    # Summed batch losses grow with the number of batches, which made the
    # printed training and validation losses incomparable. Dividing each by its
    # own batch count reports a mean per-batch loss instead.
    n_train_batches = max(len(train_loader), 1)
    n_val_batches = max(len(test_loader), 1)

    best_loss = float("inf")
    early_stop_counter = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            pred = model(batch)
            # batch.y is reshaped to one row per graph so it matches pred.
            loss = criterion(pred, batch.y.view(-1, pred.shape[1]))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        train_loss = total_loss / n_train_batches
        val_total, mae, r2 = evaluate_model(model, test_loader, criterion, device)
        val_loss = val_total / n_val_batches
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {train_loss:.4f} - Validation Loss: {val_loss:.4f} - MAE: {mae:.4f} - R2: {r2:.4f}")

        scheduler.step(val_loss)
        if val_loss < best_loss:
            best_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), "best_model.pth")
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered. Stopping training.")
                break

    print(f"Best Validation Loss: {best_loss:.4f}")

# Run training
train_model(graph_data)

100%|██████████| 434/434 [00:00<00:00, 512.92it/s]


Epoch 1/100 - Training Loss: 10.2118 - Validation Loss: 3.8720 - MAE: 86.4014 - R2: -0.0060
Epoch 2/100 - Training Loss: 10.1752 - Validation Loss: 3.8731 - MAE: 85.5260 - R2: -0.0053
Epoch 3/100 - Training Loss: 10.1446 - Validation Loss: 3.8739 - MAE: 84.7649 - R2: -0.0039
Epoch 4/100 - Training Loss: 10.1118 - Validation Loss: 3.8544 - MAE: 83.8260 - R2: 0.0039
Epoch 5/100 - Training Loss: 9.9209 - Validation Loss: 3.8044 - MAE: 82.0689 - R2: 0.0242
Epoch 6/100 - Training Loss: 9.6989 - Validation Loss: 3.7610 - MAE: 79.8450 - R2: 0.0423
Epoch 7/100 - Training Loss: 9.4662 - Validation Loss: 3.8088 - MAE: 81.4252 - R2: 0.0247
Epoch 8/100 - Training Loss: 9.4846 - Validation Loss: 3.7112 - MAE: 81.3202 - R2: 0.0581
Epoch 9/100 - Training Loss: 9.3003 - Validation Loss: 3.7186 - MAE: 80.6937 - R2: 0.0557
Epoch 10/100 - Training Loss: 9.0227 - Validation Loss: 3.7264 - MAE: 78.0379 - R2: 0.0568
Epoch 11/100 - Training Loss: 9.0661 - Validation Loss: 3.7806 - MAE: 81.0517 - R2: 0.0344
E

In [None]:
import torch
import torch.nn as nn
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler

# Silence UserWarnings that originate from torch's LR-scheduler module.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="torch.optim.lr_scheduler",
)

# Test table to predict on and the columns the predictions are written to.
test_dataset_path = "/content/polaris-admet_test.csv"  # Replace with your test CSV file path
target_cols = [
    "MDR1-MDCKII",
    "KSOL",
    "LogD",
    "MLM",
    "HLM",
]

# Function: One-hot encoding for atoms
def one_hot_encoding(x, allowable_set):
    """Return a one-hot list of booleans marking ``x`` within ``allowable_set``.

    A value that is not a member of ``allowable_set`` is encoded as the last
    entry of the set (the "Unknown" bucket).
    """
    key = x if x in allowable_set else allowable_set[-1]
    return [key == candidate for candidate in allowable_set]

# Function: Convert SMILES to graph
def smiles_to_graph(smiles, targets=None):
    """Convert a SMILES string (and optional targets) into a PyG ``Data`` graph.

    Node features are the concatenation of one-hot encodings of the atom
    symbol, degree, total hydrogen count and implicit valence, followed by an
    aromaticity flag. Every bond contributes two directed edges.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.
        targets: Optional iterable of values convertible to ``float``. When
            ``None`` the graph is built without labels (``y=None``).

    Returns:
        A ``torch_geometric.data.Data`` object, or ``None`` (after printing a
        diagnostic) when the SMILES cannot be parsed, the molecule has no
        atoms, or a target value cannot be converted to ``float``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would produce a 1-D empty feature tensor
    # instead of (0, num_features) and break batching, so reject it as well.
    if mol is None or mol.GetNumAtoms() == 0:
        print(f"Invalid SMILES: {smiles}")  # Debug: Print invalid SMILES
        return None

    symbols = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
               'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
               'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
               'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
    degrees = list(range(11)) + ['Unknown']
    hydrogen_counts = list(range(9)) + ['Unknown']
    implicit_valences = list(range(9)) + ['Unknown']

    # Atom features
    atom_features_list = []
    for atom in mol.GetAtoms():
        atom_features = (
            one_hot_encoding(atom.GetSymbol(), symbols) +
            one_hot_encoding(atom.GetDegree(), degrees) +
            one_hot_encoding(atom.GetTotalNumHs(), hydrogen_counts) +
            one_hot_encoding(atom.GetImplicitValence(), implicit_valences) +
            [atom.GetIsAromatic()]
        )
        atom_features_list.append(atom_features)

    x = torch.tensor(atom_features_list, dtype=torch.float)

    # Edge connections: each bond is stored in both directions.
    edge_indices = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_indices += [[i, j], [j, i]]

    if edge_indices:
        edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    else:
        # No bonds (single atom): keep the expected (2, 0) layout.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Convert target values safely (if targets are provided). float() raises
    # ValueError for unparsable strings and TypeError for values such as None.
    if targets is not None:
        try:
            y = torch.tensor([float(t) for t in targets], dtype=torch.float)
        except (TypeError, ValueError):
            print(f"Invalid targets for SMILES: {smiles}")  # Debug: Print invalid targets
            return None
    else:
        y = None

    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=y)

# Load test dataset
df_test = pd.read_csv(test_dataset_path)

# Check if the test dataset is empty
if df_test.empty:
    raise ValueError("The test dataset is empty. Please provide a valid CSV file.")

# Check if the 'SMILES' column exists
if "SMILES" not in df_test.columns:
    raise ValueError("The test dataset must contain a 'SMILES' column.")


# Convert dataset to graphs, remembering which rows produced one so that the
# predictions can be written back to exactly those rows.
test_graph_data = []
invalid_smiles = []
valid_positions = []

for position, smiles in enumerate(tqdm(df_test["SMILES"], total=len(df_test))):
    # Missing cells are not strings and cannot be parsed; treat them as invalid
    # instead of passing them to the SMILES parser. No targets are needed here.
    graph = smiles_to_graph(smiles, None) if isinstance(smiles, str) else None
    if graph is not None:
        test_graph_data.append(graph)
        valid_positions.append(position)
    else:
        invalid_smiles.append(smiles)

# Print invalid SMILES strings (if any)
if invalid_smiles:
    print(f"Warning: {len(invalid_smiles)} invalid SMILES strings were found and skipped.")
    print("Invalid SMILES:", invalid_smiles)

# Check if any valid graphs were created
if not test_graph_data:
    raise ValueError("No valid graphs were created from the test dataset. Check the SMILES strings.")

# Load the trained model.
# `GNN` and `scaler` are not defined in this cell; they must already exist in
# the kernel from the training cell that produced best_model.pth.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_features = test_graph_data[0].x.shape[1]
num_targets = len(target_cols)
model = GNN(num_features, num_targets).to(device)
# weights_only=True restricts unpickling to tensors/state dicts, which is all
# a saved state_dict needs.
model.load_state_dict(torch.load("best_model.pth", map_location=device, weights_only=True))
model.eval()

# Create DataLoader for test data (order preserved so rows stay aligned)
test_loader = DataLoader(test_graph_data, batch_size=32, shuffle=False, num_workers=2)

# Run predictions
predictions = []
with torch.no_grad():
    for batch in test_loader:
        batch = batch.to(device)
        pred = model(batch)
        predictions.extend(pred.cpu().numpy())

# Inverse transform predictions to original scale
predictions = scaler.inverse_transform(np.array(predictions).reshape(-1, len(target_cols)))

# Add predictions to the test DataFrame. Rows whose SMILES were skipped get
# NaN, so the assignment neither fails on a length mismatch nor shifts values
# onto the wrong molecules.
aligned_predictions = np.full((len(df_test), len(target_cols)), np.nan, dtype=predictions.dtype)
aligned_predictions[valid_positions] = predictions
df_test[target_cols] = aligned_predictions

# Save predictions to a new CSV file
output_path = "test_predictions.csv"
df_test.to_csv(output_path, index=False)
print(f"Predictions saved to {output_path}")

100%|██████████| 126/126 [00:00<00:00, 666.24it/s]
  model.load_state_dict(torch.load("best_model.pth", map_location=device))  # Load saved model


Predictions saved to test_predictions.csv


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

# Silence UserWarnings that originate from torch's LR-scheduler module.
warnings.filterwarnings(
    "ignore",
    category=UserWarning,
    module="torch.optim.lr_scheduler",
)

# Training table and the assay columns that are predicted jointly.
dataset_path = "/content/cleaned_imputed_polaris-admet_train.csv"
target_cols = [
    "MDR1-MDCKII",
    "KSOL",
    "LogD",
    "MLM",
    "HLM",
]

# Function: One-hot encoding for atoms
def one_hot_encoding(x, allowable_set):
    """Return a one-hot list of booleans marking ``x`` within ``allowable_set``.

    A value that is not a member of ``allowable_set`` is encoded as the last
    entry of the set (the "Unknown" bucket).
    """
    key = x if x in allowable_set else allowable_set[-1]
    return [key == candidate for candidate in allowable_set]

# Function: Convert SMILES to graph
def smiles_to_graph(smiles, targets):
    """Convert a SMILES string and its target values into a PyG ``Data`` graph.

    Node features are the concatenation of one-hot encodings of the atom
    symbol, degree, total hydrogen count and implicit valence, followed by an
    aromaticity flag. Every bond contributes two directed edges.

    Args:
        smiles: SMILES string handed to ``Chem.MolFromSmiles``.
        targets: Iterable of values convertible to ``float``; stored as the
            1-D tensor ``y``.

    Returns:
        A ``torch_geometric.data.Data`` object, or ``None`` when the SMILES
        cannot be parsed, the molecule has no atoms, or a target value cannot
        be converted to ``float``.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would produce a 1-D empty feature tensor
    # instead of (0, num_features) and break batching, so reject it as well.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    symbols = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
               'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
               'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
               'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
    degrees = list(range(11)) + ['Unknown']
    hydrogen_counts = list(range(9)) + ['Unknown']
    implicit_valences = list(range(9)) + ['Unknown']

    # Atom features
    atom_features_list = []
    for atom in mol.GetAtoms():
        atom_features = (
            one_hot_encoding(atom.GetSymbol(), symbols) +
            one_hot_encoding(atom.GetDegree(), degrees) +
            one_hot_encoding(atom.GetTotalNumHs(), hydrogen_counts) +
            one_hot_encoding(atom.GetImplicitValence(), implicit_valences) +
            [atom.GetIsAromatic()]
        )
        atom_features_list.append(atom_features)

    x = torch.tensor(atom_features_list, dtype=torch.float)

    # Edge connections: each bond is stored in both directions.
    edge_indices = []
    for bond in mol.GetBonds():
        i, j = bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()
        edge_indices += [[i, j], [j, i]]

    if edge_indices:
        edge_index = torch.tensor(edge_indices, dtype=torch.long).t().contiguous()
    else:
        # No bonds (single atom): keep the expected (2, 0) layout.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    # Convert target values safely. float() raises ValueError for unparsable
    # strings and TypeError for values such as None; both mean "skip this row".
    try:
        y = torch.tensor([float(t) for t in targets], dtype=torch.float)
    except (TypeError, ValueError):
        return None  # Skip molecules with invalid targets

    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=y)

# Read the training table.
df = pd.read_csv(dataset_path)

# Coerce the targets to numbers (unparsable entries become NaN) and keep only
# rows that have a SMILES string and a value for every target.
df[target_cols] = df[target_cols].apply(pd.to_numeric, errors="coerce")
df = df.dropna(subset=["SMILES", *target_cols])

# Build one graph per remaining row; rows that cannot be converted are dropped.
graph_data = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    graph = smiles_to_graph(row["SMILES"], row[target_cols].values)
    if graph is not None:
        graph_data.append(graph)

# Standardize each target column and write the scaled values back to the graphs.
scaler = StandardScaler()
y_values = torch.cat([graph.y for graph in graph_data]).numpy().reshape(-1, len(target_cols))
y_normalized = scaler.fit_transform(y_values)

for graph, scaled_row in zip(graph_data, y_normalized):
    graph.y = torch.tensor(scaled_row, dtype=torch.float)

# Define GNN model
class GNN(nn.Module):
    """Three GCN layers, per-graph mean pooling and an MLP with one output per target.

    Args:
        num_features: Size of the per-node feature vector.
        num_targets: Number of values predicted for each graph.
    """

    def __init__(self, num_features, num_targets):
        super().__init__()
        # Graph convolutions widen the node embedding step by step.
        self.conv1 = GCNConv(num_features, 64)
        self.conv2 = GCNConv(64, 128)
        self.conv3 = GCNConv(128, 256)
        # Fully connected head applied to the pooled graph embedding.
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, num_targets)  # Multi-output layer
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        """Return predictions of shape ``(num_graphs, num_targets)`` for a batch."""
        edges = data.edge_index
        hidden = self.dropout(torch.relu(self.conv1(data.x, edges)))
        hidden = self.dropout(torch.relu(self.conv2(hidden, edges)))
        hidden = torch.relu(self.conv3(hidden, edges))
        # Average node embeddings within each graph of the batch.
        pooled = global_mean_pool(hidden, data.batch)
        pooled = self.dropout(torch.relu(self.fc1(pooled)))
        pooled = torch.relu(self.fc2(pooled))
        return self.fc3(pooled)

# Evaluate model
def evaluate_model(model, data_loader, criterion, device):
    """Evaluate ``model`` on ``data_loader``.

    Relies on the module-level ``scaler`` and ``target_cols`` to map the
    standardized values back to their original scale.

    Returns:
        ``(total_loss, mae, r2)``: the sum of the batch losses, and MAE / R2
        computed on inverse-transformed predictions and targets.
    """
    model.eval()
    n_targets = len(target_cols)
    running_loss = 0.0
    collected_preds = []
    collected_targets = []

    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            output = model(batch)
            # batch.y is reshaped to one row per graph so it lines up with the output.
            expected = batch.y.view(-1, output.shape[1])
            running_loss += criterion(output, expected).item()
            collected_preds.extend(output.cpu().numpy())
            collected_targets.extend(batch.y.cpu().numpy())

    # Bring both collections to (num_graphs, n_targets) and undo the scaling.
    preds = scaler.inverse_transform(np.array(collected_preds).reshape(-1, n_targets))
    actuals = scaler.inverse_transform(np.array(collected_targets).reshape(-1, n_targets))

    return running_loss, mean_absolute_error(actuals, preds), r2_score(actuals, preds)

# Train model
def train_model(data, epochs=100, batch_size=32, lr=0.001, patience=10):
    """Train the multi-target ``GNN`` with LR scheduling and early stopping.

    The data is split 80/20 with a fixed seed; the weights with the lowest
    validation loss are written to ``best_model.pth``.

    Args:
        data: List of graphs whose ``y`` holds the standardized targets.
        epochs: Maximum number of epochs.
        batch_size: Batch size of both loaders.
        lr: Initial AdamW learning rate.
        patience: Epochs without validation improvement before stopping.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=2)

    num_features = data[0].x.shape[1]
    num_targets = len(target_cols)
    model = GNN(num_features, num_targets).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    criterion = nn.MSELoss()

    # Summed batch losses grow with the number of batches, which made the
    # printed training and validation losses incomparable. Dividing each by its
    # own batch count reports a mean per-batch loss instead.
    n_train_batches = max(len(train_loader), 1)
    n_val_batches = max(len(test_loader), 1)

    best_loss = float("inf")
    early_stop_counter = 0

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0

        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()
            pred = model(batch)
            # batch.y is reshaped to one row per graph so it matches pred.
            loss = criterion(pred, batch.y.view(-1, pred.shape[1]))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        train_loss = total_loss / n_train_batches
        val_total, mae, r2 = evaluate_model(model, test_loader, criterion, device)
        val_loss = val_total / n_val_batches
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {train_loss:.4f} - Validation Loss: {val_loss:.4f} - MAE: {mae:.4f} - R2: {r2:.4f}")

        scheduler.step(val_loss)
        if val_loss < best_loss:
            best_loss = val_loss
            early_stop_counter = 0
            torch.save(model.state_dict(), "best_model.pth")
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered. Stopping training.")
                break

    print(f"Best Validation Loss: {best_loss:.4f}")

# Run training
train_model(graph_data)

100%|██████████| 434/434 [00:00<00:00, 542.79it/s]


Epoch 1/100 - Training Loss: 10.2841 - Validation Loss: 3.8725 - MAE: 85.7979 - R2: -0.0064
Epoch 2/100 - Training Loss: 10.2176 - Validation Loss: 3.8721 - MAE: 85.0989 - R2: -0.0055
Epoch 3/100 - Training Loss: 10.1421 - Validation Loss: 3.8694 - MAE: 84.9237 - R2: -0.0030
Epoch 4/100 - Training Loss: 10.2306 - Validation Loss: 3.8514 - MAE: 83.4199 - R2: 0.0050
Epoch 5/100 - Training Loss: 9.8956 - Validation Loss: 3.8322 - MAE: 81.3150 - R2: 0.0169
Epoch 6/100 - Training Loss: 9.7107 - Validation Loss: 3.7566 - MAE: 80.7639 - R2: 0.0458
Epoch 7/100 - Training Loss: 9.3785 - Validation Loss: 3.7847 - MAE: 81.6576 - R2: 0.0380
Epoch 8/100 - Training Loss: 9.2034 - Validation Loss: 3.7311 - MAE: 80.7746 - R2: 0.0569
Epoch 9/100 - Training Loss: 9.0544 - Validation Loss: 3.7410 - MAE: 80.5604 - R2: 0.0500
Epoch 10/100 - Training Loss: 8.9870 - Validation Loss: 3.6946 - MAE: 80.6494 - R2: 0.0669
Epoch 11/100 - Training Loss: 9.1156 - Validation Loss: 3.7595 - MAE: 78.6657 - R2: 0.0472
E

In [None]:
#MDR1-MDCK2
import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

# Dataset used by this cell: regression target column and CSV location.
target_col = "in-vitro_MDR1-MDCKII-Papp_bienta: mean_Papp_A_to_B (Num) (10^-6 cm/s)"
dataset_path = "/content/mdr1-mdck2 (1).csv"

# Hide UserWarning messages that originate from torch's LR-scheduler module.
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="torch.optim.lr_scheduler",
)

# Function: One-hot encoding for atoms
def one_hot_encoding(x, allowable_set):
    """Return a list of booleans marking which entry of `allowable_set` equals `x`.

    A value that is not in `allowable_set` is replaced by the last entry of the
    set (the "Unknown" bucket by convention of the callers) before comparing.
    """
    key = x if x in allowable_set else allowable_set[-1]
    return [key == option for option in allowable_set]

# Function: Convert SMILES to graph
# Feature vocabularies, built once instead of once per atom. The trailing
# 'Unknown' entry is the fallback bucket that one_hot_encoding selects for
# values outside the list.
_ATOM_SYMBOLS = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
                 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
                 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
                 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
_ATOM_DEGREES = list(range(11)) + ['Unknown']
_ATOM_NUM_HS = list(range(9)) + ['Unknown']
_ATOM_IMPLICIT_VALENCES = list(range(9)) + ['Unknown']


def smiles_to_graph(smiles, target):
    """Build a torch_geometric `Data` graph for one molecule.

    Args:
        smiles: SMILES string of the molecule.
        target: regression label; anything `float()` accepts.

    Returns:
        `Data(x, edge_index, y)` where `x` has one row of one-hot atom features
        per atom (symbol, degree, total H count, implicit valence, aromatic
        flag), `edge_index` lists every bond in both directions, and `y` is a
        1-element float tensor. Returns None when the SMILES cannot be parsed,
        the molecule has no atoms, or the target is not a finite number.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would produce a feature tensor with no feature
    # dimension, which cannot be batched with real graphs, so reject it here.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Validate the label before doing any featurization work.
    try:
        y = torch.tensor([float(target)], dtype=torch.float)
    except (TypeError, ValueError):
        return None
    # NaN/inf labels would poison the target scaler and the loss downstream.
    if not torch.isfinite(y).item():
        return None

    atom_features_list = [
        one_hot_encoding(atom.GetSymbol(), _ATOM_SYMBOLS)
        + one_hot_encoding(atom.GetDegree(), _ATOM_DEGREES)
        + one_hot_encoding(atom.GetTotalNumHs(), _ATOM_NUM_HS)
        + one_hot_encoding(atom.GetImplicitValence(), _ATOM_IMPLICIT_VALENCES)
        + [atom.GetIsAromatic()]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(atom_features_list, dtype=torch.float)

    edge_indices = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
    if edge_indices:
        # Store each bond twice (a->b and b->a) so message passing is undirected.
        both_directions = edge_indices + [list(reversed(e)) for e in edge_indices]
        edge_index = torch.tensor(both_directions, dtype=torch.long).t().contiguous()
    else:
        # Molecules without bonds still need a well-formed (2, 0) index tensor.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=y)

# Load dataset
# Targets that do not parse as numbers become NaN; rows missing either the
# SMILES or the target are then dropped.
df = (
    pd.read_csv(dataset_path)
    .assign(**{target_col: lambda frame: pd.to_numeric(frame[target_col], errors="coerce")})
    .dropna(subset=["smiles", target_col])
)

# Convert dataset to graphs, discarding rows for which smiles_to_graph returns None
graph_data = [
    graph
    for graph in (
        smiles_to_graph(smi, value)
        for smi, value in tqdm(zip(df["smiles"], df[target_col]), total=len(df))
    )
    if graph is not None
]

# Normalize target values: fit the scaler on all targets, then write the
# scaled value back onto each graph as a 1-element float tensor.
scaler = StandardScaler()
y_values = np.asarray([graph.y.item() for graph in graph_data]).reshape(-1, 1)
y_normalized = scaler.fit_transform(y_values)
for i, data in enumerate(graph_data):
    data.y = torch.tensor(y_normalized[i], dtype=torch.float)

# Define GNN model
class GNN(nn.Module):
    """Graph-level regressor: three GCNConv layers, global_mean_pool, three linear layers."""

    def __init__(self, num_features):
        super().__init__()
        # Graph convolutions: num_features -> 64 -> 128 -> 256.
        self.conv1 = GCNConv(num_features, 64)
        self.conv2 = GCNConv(64, 128)
        self.conv3 = GCNConv(128, 256)
        # Dense head: 256 -> 128 -> 64 -> 1.
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        """Return a 1-D tensor of predictions computed from `data.x`, `data.edge_index` and `data.batch`."""
        h, edges, membership = data.x, data.edge_index, data.batch
        # The first two convolutions are followed by dropout; the third is not.
        for conv in (self.conv1, self.conv2):
            h = self.dropout(torch.relu(conv(h, edges)))
        h = torch.relu(self.conv3(h, edges))
        h = global_mean_pool(h, membership)
        h = self.dropout(torch.relu(self.fc1(h)))
        h = torch.relu(self.fc2(h))
        return self.fc3(h).view(-1)

# Evaluate model
def evaluate_model(model, data_loader, criterion, device):
    """Run `model` over `data_loader` in eval mode with gradients disabled.

    Returns:
        (loss, mae, r2): `loss` is the sum of the per-batch `criterion` values
        divided by the number of batches, measured on the targets as stored in
        the batches. `mae` and `r2` are computed after predictions and targets
        are passed through the module-level `scaler.inverse_transform`.
    """
    model.eval()
    running_loss = 0.0
    predicted, observed = [], []
    with torch.no_grad():
        for graphs in data_loader:
            graphs = graphs.to(device)
            output = model(graphs)
            running_loss += criterion(output, graphs.y.view(-1)).item()
            predicted.extend(output.cpu().numpy())
            observed.extend(graphs.y.cpu().numpy())

    def restore_units(values):
        # The scaler works on a 2-D column; flatten back to 1-D for the metrics.
        return scaler.inverse_transform(np.array(values).reshape(-1, 1)).flatten()

    predicted = restore_units(predicted)
    observed = restore_units(observed)
    mean_loss = running_loss / len(data_loader)
    return mean_loss, mean_absolute_error(observed, predicted), r2_score(observed, predicted)

# Train model
def train_model(data, epochs=100, batch_size=32, lr=0.001, patience=10,
                checkpoint_path="best_model.pth"):
    """Train a GNN on `data` with LR decay on plateau and early stopping.

    The graphs are split 80/20 with a fixed random_state; after every epoch the
    held-out part is scored with evaluate_model, the metrics are printed, and
    the weights with the lowest validation loss so far are saved.

    Args:
        data: sequence of graph objects; `data[0].x.shape[1]` sets the model's
            input width.
        epochs: maximum number of epochs.
        batch_size: graphs per mini-batch for both loaders.
        lr: initial AdamW learning rate.
        patience: consecutive epochs without a new best validation loss that
            trigger early stopping.
        checkpoint_path: file that receives the best `state_dict`. Previously
            hard-coded; the default keeps the original file name.
            NOTE(review): the preceding cell also writes "best_model.pth", so
            pass a distinct path here if both checkpoints must survive.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): the held-out 20% drives LR scheduling, early stopping and
    # checkpoint selection, so the printed metrics are validation scores.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=2)

    model = GNN(data[0].x.shape[1]).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    # Halve the learning rate after 5 epochs without validation improvement.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    criterion = nn.MSELoss()
    best_loss, early_stop_counter = float("inf"), 0

    for epoch in range(epochs):
        model.train()  # evaluate_model switches to eval mode, so re-enable dropout each epoch
        total_loss = 0.0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()  # Clear previous gradients
            pred = model(batch)
            loss = criterion(pred, batch.y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Evaluate on the held-out split
        val_loss, mae, r2 = evaluate_model(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {total_loss / len(train_loader):.4f} - Validation Loss: {val_loss:.4f} - MAE: {mae:.4f} - R2: {r2:.4f}")

        scheduler.step(val_loss)
        if val_loss < best_loss:
            best_loss, early_stop_counter = val_loss, 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered.")
                break

# Train the model on the graphs prepared above; only the dataset is passed, so
# epochs, batch_size, lr and patience keep their defaults.
train_model(graph_data)


 45%|████▍     | 641/1435 [00:02<00:01, 556.95it/s][10:06:26] SMILES Parse Error: syntax error while parsing: ERROR
[10:06:26] SMILES Parse Error: check for mistakes around position 1:
[10:06:26] ERROR
[10:06:26] ^
[10:06:26] SMILES Parse Error: Failed parsing SMILES 'ERROR' for input: 'ERROR'
 55%|█████▌    | 793/1435 [00:02<00:00, 652.18it/s][10:06:26] Explicit valence for atom # 18 C, 5, is greater than permitted
[10:06:26] Explicit valence for atom # 17 C, 5, is greater than permitted
 61%|██████    | 872/1435 [00:02<00:00, 686.79it/s][10:06:26] Explicit valence for atom # 4 C, 5, is greater than permitted
100%|██████████| 1435/1435 [00:04<00:00, 358.30it/s]


Epoch 1/100 - Training Loss: 1.0380 - Validation Loss: 0.8825 - MAE: 112732.6016 - R2: -0.0112
Epoch 2/100 - Training Loss: 1.0231 - Validation Loss: 0.8578 - MAE: 92252.7500 - R2: 0.0171
Epoch 3/100 - Training Loss: 0.9865 - Validation Loss: 0.8202 - MAE: 95075.6250 - R2: 0.0602
Epoch 4/100 - Training Loss: 0.9764 - Validation Loss: 0.8453 - MAE: 109098.2734 - R2: 0.0316
Epoch 5/100 - Training Loss: 0.9572 - Validation Loss: 0.8044 - MAE: 93602.4219 - R2: 0.0785
Epoch 6/100 - Training Loss: 0.9285 - Validation Loss: 0.8020 - MAE: 82133.9922 - R2: 0.0811
Epoch 7/100 - Training Loss: 0.9282 - Validation Loss: 0.8104 - MAE: 111987.3281 - R2: 0.0718
Epoch 8/100 - Training Loss: 0.9167 - Validation Loss: 0.7747 - MAE: 93981.0859 - R2: 0.1126
Epoch 9/100 - Training Loss: 0.9188 - Validation Loss: 0.7728 - MAE: 90530.6016 - R2: 0.1147
Epoch 10/100 - Training Loss: 0.9034 - Validation Loss: 0.8028 - MAE: 112124.7422 - R2: 0.0805
Epoch 11/100 - Training Loss: 0.8875 - Validation Loss: 0.7825 -

In [None]:
#MLM

import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

# Dataset used by this cell: regression target column and CSV location.
target_col = "in-vitro_MLM_bienta: CLint (Num) (uL/min/mg)"
dataset_path = "/content/finalMLM (1).csv"

# Hide UserWarning messages that originate from torch's LR-scheduler module.
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="torch.optim.lr_scheduler",
)

# Function: One-hot encoding for atoms
def one_hot_encoding(x, allowable_set):
    """Return a list of booleans marking which entry of `allowable_set` equals `x`.

    A value that is not in `allowable_set` is replaced by the last entry of the
    set (the "Unknown" bucket by convention of the callers) before comparing.
    """
    key = x if x in allowable_set else allowable_set[-1]
    return [key == option for option in allowable_set]

# Function: Convert SMILES to graph
# Feature vocabularies, built once instead of once per atom. The trailing
# 'Unknown' entry is the fallback bucket that one_hot_encoding selects for
# values outside the list.
_ATOM_SYMBOLS = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
                 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
                 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
                 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
_ATOM_DEGREES = list(range(11)) + ['Unknown']
_ATOM_NUM_HS = list(range(9)) + ['Unknown']
_ATOM_IMPLICIT_VALENCES = list(range(9)) + ['Unknown']


def smiles_to_graph(smiles, target):
    """Build a torch_geometric `Data` graph for one molecule.

    Args:
        smiles: SMILES string of the molecule.
        target: regression label; anything `float()` accepts.

    Returns:
        `Data(x, edge_index, y)` where `x` has one row of one-hot atom features
        per atom (symbol, degree, total H count, implicit valence, aromatic
        flag), `edge_index` lists every bond in both directions, and `y` is a
        1-element float tensor. Returns None when the SMILES cannot be parsed,
        the molecule has no atoms, or the target is not a finite number.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would produce a feature tensor with no feature
    # dimension, which cannot be batched with real graphs, so reject it here.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Validate the label before doing any featurization work.
    try:
        y = torch.tensor([float(target)], dtype=torch.float)
    except (TypeError, ValueError):
        return None
    # NaN/inf labels would poison the target scaler and the loss downstream.
    if not torch.isfinite(y).item():
        return None

    atom_features_list = [
        one_hot_encoding(atom.GetSymbol(), _ATOM_SYMBOLS)
        + one_hot_encoding(atom.GetDegree(), _ATOM_DEGREES)
        + one_hot_encoding(atom.GetTotalNumHs(), _ATOM_NUM_HS)
        + one_hot_encoding(atom.GetImplicitValence(), _ATOM_IMPLICIT_VALENCES)
        + [atom.GetIsAromatic()]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(atom_features_list, dtype=torch.float)

    edge_indices = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
    if edge_indices:
        # Store each bond twice (a->b and b->a) so message passing is undirected.
        both_directions = edge_indices + [list(reversed(e)) for e in edge_indices]
        edge_index = torch.tensor(both_directions, dtype=torch.long).t().contiguous()
    else:
        # Molecules without bonds still need a well-formed (2, 0) index tensor.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=y)

# Load dataset
# Targets that do not parse as numbers become NaN; rows missing either the
# SMILES or the target are then dropped.
df = (
    pd.read_csv(dataset_path)
    .assign(**{target_col: lambda frame: pd.to_numeric(frame[target_col], errors="coerce")})
    .dropna(subset=["smiles", target_col])
)

# Convert dataset to graphs, discarding rows for which smiles_to_graph returns None
graph_data = [
    graph
    for graph in (
        smiles_to_graph(smi, value)
        for smi, value in tqdm(zip(df["smiles"], df[target_col]), total=len(df))
    )
    if graph is not None
]

# Normalize target values: fit the scaler on all targets, then write the
# scaled value back onto each graph as a 1-element float tensor.
scaler = StandardScaler()
y_values = np.asarray([graph.y.item() for graph in graph_data]).reshape(-1, 1)
y_normalized = scaler.fit_transform(y_values)
for i, data in enumerate(graph_data):
    data.y = torch.tensor(y_normalized[i], dtype=torch.float)

# Define GNN model
class GNN(nn.Module):
    """Graph-level regressor: three GCNConv layers, global_mean_pool, three linear layers."""

    def __init__(self, num_features):
        super().__init__()
        # Graph convolutions: num_features -> 64 -> 128 -> 256.
        self.conv1 = GCNConv(num_features, 64)
        self.conv2 = GCNConv(64, 128)
        self.conv3 = GCNConv(128, 256)
        # Dense head: 256 -> 128 -> 64 -> 1.
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        """Return a 1-D tensor of predictions computed from `data.x`, `data.edge_index` and `data.batch`."""
        h, edges, membership = data.x, data.edge_index, data.batch
        # The first two convolutions are followed by dropout; the third is not.
        for conv in (self.conv1, self.conv2):
            h = self.dropout(torch.relu(conv(h, edges)))
        h = torch.relu(self.conv3(h, edges))
        h = global_mean_pool(h, membership)
        h = self.dropout(torch.relu(self.fc1(h)))
        h = torch.relu(self.fc2(h))
        return self.fc3(h).view(-1)

# Evaluate model
def evaluate_model(model, data_loader, criterion, device):
    """Run `model` over `data_loader` in eval mode with gradients disabled.

    Returns:
        (loss, mae, r2): `loss` is the sum of the per-batch `criterion` values
        divided by the number of batches, measured on the targets as stored in
        the batches. `mae` and `r2` are computed after predictions and targets
        are passed through the module-level `scaler.inverse_transform`.
    """
    model.eval()
    running_loss = 0.0
    predicted, observed = [], []
    with torch.no_grad():
        for graphs in data_loader:
            graphs = graphs.to(device)
            output = model(graphs)
            running_loss += criterion(output, graphs.y.view(-1)).item()
            predicted.extend(output.cpu().numpy())
            observed.extend(graphs.y.cpu().numpy())

    def restore_units(values):
        # The scaler works on a 2-D column; flatten back to 1-D for the metrics.
        return scaler.inverse_transform(np.array(values).reshape(-1, 1)).flatten()

    predicted = restore_units(predicted)
    observed = restore_units(observed)
    mean_loss = running_loss / len(data_loader)
    return mean_loss, mean_absolute_error(observed, predicted), r2_score(observed, predicted)

# Train model
def train_model(data, epochs=100, batch_size=32, lr=0.001, patience=10,
                checkpoint_path="best_modelM.pth"):
    """Train a GNN on `data` with LR decay on plateau and early stopping.

    The graphs are split 80/20 with a fixed random_state; after every epoch the
    held-out part is scored with evaluate_model, the metrics are printed, and
    the weights with the lowest validation loss so far are saved.

    Args:
        data: sequence of graph objects; `data[0].x.shape[1]` sets the model's
            input width.
        epochs: maximum number of epochs.
        batch_size: graphs per mini-batch for both loaders.
        lr: initial AdamW learning rate.
        patience: consecutive epochs without a new best validation loss that
            trigger early stopping.
        checkpoint_path: file that receives the best `state_dict`. Previously
            hard-coded; the default keeps the original file name.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): the held-out 20% drives LR scheduling, early stopping and
    # checkpoint selection, so the printed metrics are validation scores.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=2)

    model = GNN(data[0].x.shape[1]).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    # Halve the learning rate after 5 epochs without validation improvement.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    criterion = nn.MSELoss()
    best_loss, early_stop_counter = float("inf"), 0

    for epoch in range(epochs):
        model.train()  # evaluate_model switches to eval mode, so re-enable dropout each epoch
        total_loss = 0.0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()  # Clear previous gradients
            pred = model(batch)
            loss = criterion(pred, batch.y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Evaluate on the held-out split
        val_loss, mae, r2 = evaluate_model(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {total_loss / len(train_loader):.4f} - Validation Loss: {val_loss:.4f} - MAE: {mae:.4f} - R2: {r2:.4f}")

        scheduler.step(val_loss)
        if val_loss < best_loss:
            best_loss, early_stop_counter = val_loss, 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered.")
                break

# Train the model on the graphs prepared above; only the dataset is passed, so
# epochs, batch_size, lr and patience keep their defaults.
train_model(graph_data)


100%|██████████| 2516/2516 [00:04<00:00, 567.15it/s]


Epoch 1/100 - Training Loss: 0.8393 - Validation Loss: 1.6081 - MAE: 5124648.5000 - R2: 0.0039
Epoch 2/100 - Training Loss: 0.8138 - Validation Loss: 1.3940 - MAE: 4059339.2500 - R2: 0.1357
Epoch 3/100 - Training Loss: 0.6899 - Validation Loss: 1.0088 - MAE: 3246588.7500 - R2: 0.3745
Epoch 4/100 - Training Loss: 0.5054 - Validation Loss: 1.3508 - MAE: 5521192.5000 - R2: 0.1751
Epoch 5/100 - Training Loss: 0.3254 - Validation Loss: 0.7942 - MAE: 4168315.7500 - R2: 0.5073
Epoch 6/100 - Training Loss: 0.3444 - Validation Loss: 0.8506 - MAE: 3417702.2500 - R2: 0.4725
Epoch 7/100 - Training Loss: 0.3247 - Validation Loss: 0.7899 - MAE: 2736259.7500 - R2: 0.5117
Epoch 8/100 - Training Loss: 0.4041 - Validation Loss: 0.7672 - MAE: 4379133.0000 - R2: 0.5240
Epoch 9/100 - Training Loss: 0.2775 - Validation Loss: 0.9887 - MAE: 4377253.0000 - R2: 0.3872
Epoch 10/100 - Training Loss: 0.3335 - Validation Loss: 0.7825 - MAE: 3549728.0000 - R2: 0.5145
Epoch 11/100 - Training Loss: 0.2608 - Validation

In [None]:
#HLM

import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

# Dataset used by this cell: regression target column and CSV location.
target_col = "in-vitro_HLM_bienta: CLint (Num) (uL/min/mg)"
dataset_path = "/content/finalHLM (1).csv"

# Hide UserWarning messages that originate from torch's LR-scheduler module.
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="torch.optim.lr_scheduler",
)

# Function: One-hot encoding for atoms
def one_hot_encoding(x, allowable_set):
    """Return a list of booleans marking which entry of `allowable_set` equals `x`.

    A value that is not in `allowable_set` is replaced by the last entry of the
    set (the "Unknown" bucket by convention of the callers) before comparing.
    """
    key = x if x in allowable_set else allowable_set[-1]
    return [key == option for option in allowable_set]

# Function: Convert SMILES to graph
# Feature vocabularies, built once instead of once per atom. The trailing
# 'Unknown' entry is the fallback bucket that one_hot_encoding selects for
# values outside the list.
_ATOM_SYMBOLS = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
                 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
                 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
                 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
_ATOM_DEGREES = list(range(11)) + ['Unknown']
_ATOM_NUM_HS = list(range(9)) + ['Unknown']
_ATOM_IMPLICIT_VALENCES = list(range(9)) + ['Unknown']


def smiles_to_graph(smiles, target):
    """Build a torch_geometric `Data` graph for one molecule.

    Args:
        smiles: SMILES string of the molecule.
        target: regression label; anything `float()` accepts.

    Returns:
        `Data(x, edge_index, y)` where `x` has one row of one-hot atom features
        per atom (symbol, degree, total H count, implicit valence, aromatic
        flag), `edge_index` lists every bond in both directions, and `y` is a
        1-element float tensor. Returns None when the SMILES cannot be parsed,
        the molecule has no atoms, or the target is not a finite number.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would produce a feature tensor with no feature
    # dimension, which cannot be batched with real graphs, so reject it here.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Validate the label before doing any featurization work.
    try:
        y = torch.tensor([float(target)], dtype=torch.float)
    except (TypeError, ValueError):
        return None
    # NaN/inf labels would poison the target scaler and the loss downstream.
    if not torch.isfinite(y).item():
        return None

    atom_features_list = [
        one_hot_encoding(atom.GetSymbol(), _ATOM_SYMBOLS)
        + one_hot_encoding(atom.GetDegree(), _ATOM_DEGREES)
        + one_hot_encoding(atom.GetTotalNumHs(), _ATOM_NUM_HS)
        + one_hot_encoding(atom.GetImplicitValence(), _ATOM_IMPLICIT_VALENCES)
        + [atom.GetIsAromatic()]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(atom_features_list, dtype=torch.float)

    edge_indices = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
    if edge_indices:
        # Store each bond twice (a->b and b->a) so message passing is undirected.
        both_directions = edge_indices + [list(reversed(e)) for e in edge_indices]
        edge_index = torch.tensor(both_directions, dtype=torch.long).t().contiguous()
    else:
        # Molecules without bonds still need a well-formed (2, 0) index tensor.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=y)

# Load dataset
# Targets that do not parse as numbers become NaN; rows missing either the
# SMILES or the target are then dropped.
df = (
    pd.read_csv(dataset_path)
    .assign(**{target_col: lambda frame: pd.to_numeric(frame[target_col], errors="coerce")})
    .dropna(subset=["smiles", target_col])
)

# Convert dataset to graphs, discarding rows for which smiles_to_graph returns None
graph_data = [
    graph
    for graph in (
        smiles_to_graph(smi, value)
        for smi, value in tqdm(zip(df["smiles"], df[target_col]), total=len(df))
    )
    if graph is not None
]

# Normalize target values: fit the scaler on all targets, then write the
# scaled value back onto each graph as a 1-element float tensor.
scaler = StandardScaler()
y_values = np.asarray([graph.y.item() for graph in graph_data]).reshape(-1, 1)
y_normalized = scaler.fit_transform(y_values)
for i, data in enumerate(graph_data):
    data.y = torch.tensor(y_normalized[i], dtype=torch.float)

# Define GNN model
class GNN(nn.Module):
    """Graph-level regressor: three GCNConv layers, global_mean_pool, three linear layers."""

    def __init__(self, num_features):
        super().__init__()
        # Graph convolutions: num_features -> 64 -> 128 -> 256.
        self.conv1 = GCNConv(num_features, 64)
        self.conv2 = GCNConv(64, 128)
        self.conv3 = GCNConv(128, 256)
        # Dense head: 256 -> 128 -> 64 -> 1.
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        """Return a 1-D tensor of predictions computed from `data.x`, `data.edge_index` and `data.batch`."""
        h, edges, membership = data.x, data.edge_index, data.batch
        # The first two convolutions are followed by dropout; the third is not.
        for conv in (self.conv1, self.conv2):
            h = self.dropout(torch.relu(conv(h, edges)))
        h = torch.relu(self.conv3(h, edges))
        h = global_mean_pool(h, membership)
        h = self.dropout(torch.relu(self.fc1(h)))
        h = torch.relu(self.fc2(h))
        return self.fc3(h).view(-1)

# Evaluate model
def evaluate_model(model, data_loader, criterion, device):
    """Run `model` over `data_loader` in eval mode with gradients disabled.

    Returns:
        (loss, mae, r2): `loss` is the sum of the per-batch `criterion` values
        divided by the number of batches, measured on the targets as stored in
        the batches. `mae` and `r2` are computed after predictions and targets
        are passed through the module-level `scaler.inverse_transform`.
    """
    model.eval()
    running_loss = 0.0
    predicted, observed = [], []
    with torch.no_grad():
        for graphs in data_loader:
            graphs = graphs.to(device)
            output = model(graphs)
            running_loss += criterion(output, graphs.y.view(-1)).item()
            predicted.extend(output.cpu().numpy())
            observed.extend(graphs.y.cpu().numpy())

    def restore_units(values):
        # The scaler works on a 2-D column; flatten back to 1-D for the metrics.
        return scaler.inverse_transform(np.array(values).reshape(-1, 1)).flatten()

    predicted = restore_units(predicted)
    observed = restore_units(observed)
    mean_loss = running_loss / len(data_loader)
    return mean_loss, mean_absolute_error(observed, predicted), r2_score(observed, predicted)

# Train model
def train_model(data, epochs=100, batch_size=32, lr=0.001, patience=10,
                checkpoint_path="best_model_HLM.pth"):
    """Train a GNN on `data` with LR decay on plateau and early stopping.

    The graphs are split 80/20 with a fixed random_state; after every epoch the
    held-out part is scored with evaluate_model, the metrics are printed, and
    the weights with the lowest validation loss so far are saved.

    Args:
        data: sequence of graph objects; `data[0].x.shape[1]` sets the model's
            input width.
        epochs: maximum number of epochs.
        batch_size: graphs per mini-batch for both loaders.
        lr: initial AdamW learning rate.
        patience: consecutive epochs without a new best validation loss that
            trigger early stopping.
        checkpoint_path: file that receives the best `state_dict`. This cell
            used to write "best_modelM.pth", the file the MLM cell saves to,
            so training HLM silently overwrote the MLM weights; the default is
            now HLM-specific. Pass "best_modelM.pth" explicitly to get the old
            file name back.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): the held-out 20% drives LR scheduling, early stopping and
    # checkpoint selection, so the printed metrics are validation scores.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=2)

    model = GNN(data[0].x.shape[1]).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    # Halve the learning rate after 5 epochs without validation improvement.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    criterion = nn.MSELoss()
    best_loss, early_stop_counter = float("inf"), 0

    for epoch in range(epochs):
        model.train()  # evaluate_model switches to eval mode, so re-enable dropout each epoch
        total_loss = 0.0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()  # Clear previous gradients
            pred = model(batch)
            loss = criterion(pred, batch.y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Evaluate on the held-out split
        val_loss, mae, r2 = evaluate_model(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {total_loss / len(train_loader):.4f} - Validation Loss: {val_loss:.4f} - MAE: {mae:.4f} - R2: {r2:.4f}")

        scheduler.step(val_loss)
        if val_loss < best_loss:
            best_loss, early_stop_counter = val_loss, 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered.")
                break

# Train the model on the graphs prepared above; only the dataset is passed, so
# epochs, batch_size, lr and patience keep their defaults.
train_model(graph_data)


100%|██████████| 8541/8541 [00:16<00:00, 508.09it/s]


Epoch 1/100 - Training Loss: 0.9597 - Validation Loss: 1.0498 - MAE: 46.5230 - R2: 0.0307
Epoch 2/100 - Training Loss: 0.9433 - Validation Loss: 1.0358 - MAE: 48.0920 - R2: 0.0436
Epoch 3/100 - Training Loss: 0.9328 - Validation Loss: 1.0388 - MAE: 51.4141 - R2: 0.0410
Epoch 4/100 - Training Loss: 0.9251 - Validation Loss: 1.0226 - MAE: 45.5121 - R2: 0.0562
Epoch 5/100 - Training Loss: 0.9144 - Validation Loss: 1.0234 - MAE: 43.6187 - R2: 0.0555
Epoch 6/100 - Training Loss: 0.9052 - Validation Loss: 1.0216 - MAE: 42.5978 - R2: 0.0572
Epoch 7/100 - Training Loss: 0.9016 - Validation Loss: 0.9977 - MAE: 44.3972 - R2: 0.0791
Epoch 8/100 - Training Loss: 0.9132 - Validation Loss: 1.0483 - MAE: 40.5269 - R2: 0.0325
Epoch 9/100 - Training Loss: 0.8989 - Validation Loss: 1.0372 - MAE: 41.0352 - R2: 0.0428
Epoch 10/100 - Training Loss: 0.8979 - Validation Loss: 0.9970 - MAE: 43.2471 - R2: 0.0799
Epoch 11/100 - Training Loss: 0.8954 - Validation Loss: 0.9806 - MAE: 44.7650 - R2: 0.0949
Epoch 12

In [None]:
#LOGD

import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

# Dataset used by this cell: regression target column and CSV location.
target_col = "in-vitro_LogD_bienta: LogD (Num)"
dataset_path = "/content/Final_LogD_new_merged.csv"

# Hide UserWarning messages that originate from torch's LR-scheduler module.
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="torch.optim.lr_scheduler",
)

# Function: One-hot encoding for atoms
def one_hot_encoding(x, allowable_set):
    """Return a list of booleans marking which entry of `allowable_set` equals `x`.

    A value that is not in `allowable_set` is replaced by the last entry of the
    set (the "Unknown" bucket by convention of the callers) before comparing.
    """
    key = x if x in allowable_set else allowable_set[-1]
    return [key == option for option in allowable_set]

# Function: Convert SMILES to graph
# Feature vocabularies, built once instead of once per atom. The trailing
# 'Unknown' entry is the fallback bucket that one_hot_encoding selects for
# values outside the list.
_ATOM_SYMBOLS = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
                 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
                 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
                 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
_ATOM_DEGREES = list(range(11)) + ['Unknown']
_ATOM_NUM_HS = list(range(9)) + ['Unknown']
_ATOM_IMPLICIT_VALENCES = list(range(9)) + ['Unknown']


def smiles_to_graph(smiles, target):
    """Build a torch_geometric `Data` graph for one molecule.

    Args:
        smiles: SMILES string of the molecule.
        target: regression label; anything `float()` accepts.

    Returns:
        `Data(x, edge_index, y)` where `x` has one row of one-hot atom features
        per atom (symbol, degree, total H count, implicit valence, aromatic
        flag), `edge_index` lists every bond in both directions, and `y` is a
        1-element float tensor. Returns None when the SMILES cannot be parsed,
        the molecule has no atoms, or the target is not a finite number.
    """
    mol = Chem.MolFromSmiles(smiles)
    # A molecule without atoms would produce a feature tensor with no feature
    # dimension, which cannot be batched with real graphs, so reject it here.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # Validate the label before doing any featurization work.
    try:
        y = torch.tensor([float(target)], dtype=torch.float)
    except (TypeError, ValueError):
        return None
    # NaN/inf labels would poison the target scaler and the loss downstream.
    if not torch.isfinite(y).item():
        return None

    atom_features_list = [
        one_hot_encoding(atom.GetSymbol(), _ATOM_SYMBOLS)
        + one_hot_encoding(atom.GetDegree(), _ATOM_DEGREES)
        + one_hot_encoding(atom.GetTotalNumHs(), _ATOM_NUM_HS)
        + one_hot_encoding(atom.GetImplicitValence(), _ATOM_IMPLICIT_VALENCES)
        + [atom.GetIsAromatic()]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(atom_features_list, dtype=torch.float)

    edge_indices = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
    if edge_indices:
        # Store each bond twice (a->b and b->a) so message passing is undirected.
        both_directions = edge_indices + [list(reversed(e)) for e in edge_indices]
        edge_index = torch.tensor(both_directions, dtype=torch.long).t().contiguous()
    else:
        # Molecules without bonds still need a well-formed (2, 0) index tensor.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=y)

# Load dataset
# Targets that do not parse as numbers become NaN; rows missing either the
# SMILES or the target are then dropped.
df = (
    pd.read_csv(dataset_path)
    .assign(**{target_col: lambda frame: pd.to_numeric(frame[target_col], errors="coerce")})
    .dropna(subset=["smiles", target_col])
)

# Convert dataset to graphs, discarding rows for which smiles_to_graph returns None
graph_data = [
    graph
    for graph in (
        smiles_to_graph(smi, value)
        for smi, value in tqdm(zip(df["smiles"], df[target_col]), total=len(df))
    )
    if graph is not None
]

# Normalize target values: fit the scaler on all targets, then write the
# scaled value back onto each graph as a 1-element float tensor.
scaler = StandardScaler()
y_values = np.asarray([graph.y.item() for graph in graph_data]).reshape(-1, 1)
y_normalized = scaler.fit_transform(y_values)
for i, data in enumerate(graph_data):
    data.y = torch.tensor(y_normalized[i], dtype=torch.float)

# Define GNN model
class GNN(nn.Module):
    """Graph-level regressor: three GCNConv layers, global_mean_pool, three linear layers."""

    def __init__(self, num_features):
        super().__init__()
        # Graph convolutions: num_features -> 64 -> 128 -> 256.
        self.conv1 = GCNConv(num_features, 64)
        self.conv2 = GCNConv(64, 128)
        self.conv3 = GCNConv(128, 256)
        # Dense head: 256 -> 128 -> 64 -> 1.
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        """Return a 1-D tensor of predictions computed from `data.x`, `data.edge_index` and `data.batch`."""
        h, edges, membership = data.x, data.edge_index, data.batch
        # The first two convolutions are followed by dropout; the third is not.
        for conv in (self.conv1, self.conv2):
            h = self.dropout(torch.relu(conv(h, edges)))
        h = torch.relu(self.conv3(h, edges))
        h = global_mean_pool(h, membership)
        h = self.dropout(torch.relu(self.fc1(h)))
        h = torch.relu(self.fc2(h))
        return self.fc3(h).view(-1)

# Evaluate model
def evaluate_model(model, data_loader, criterion, device):
    """Run `model` over `data_loader` in eval mode with gradients disabled.

    Returns:
        (loss, mae, r2): `loss` is the sum of the per-batch `criterion` values
        divided by the number of batches, measured on the targets as stored in
        the batches. `mae` and `r2` are computed after predictions and targets
        are passed through the module-level `scaler.inverse_transform`.
    """
    model.eval()
    running_loss = 0.0
    predicted, observed = [], []
    with torch.no_grad():
        for graphs in data_loader:
            graphs = graphs.to(device)
            output = model(graphs)
            running_loss += criterion(output, graphs.y.view(-1)).item()
            predicted.extend(output.cpu().numpy())
            observed.extend(graphs.y.cpu().numpy())

    def restore_units(values):
        # The scaler works on a 2-D column; flatten back to 1-D for the metrics.
        return scaler.inverse_transform(np.array(values).reshape(-1, 1)).flatten()

    predicted = restore_units(predicted)
    observed = restore_units(observed)
    mean_loss = running_loss / len(data_loader)
    return mean_loss, mean_absolute_error(observed, predicted), r2_score(observed, predicted)

# Train model
def train_model(data, epochs=100, batch_size=32, lr=0.001, patience=10,
                checkpoint_path="best_model_LogD.pth"):
    """Train a GNN on `data` with LR decay on plateau and early stopping.

    The graphs are split 80/20 with a fixed random_state; after every epoch the
    held-out part is scored with evaluate_model, the metrics are printed, and
    the weights with the lowest validation loss so far are saved.

    Args:
        data: sequence of graph objects; `data[0].x.shape[1]` sets the model's
            input width.
        epochs: maximum number of epochs.
        batch_size: graphs per mini-batch for both loaders.
        lr: initial AdamW learning rate.
        patience: consecutive epochs without a new best validation loss that
            trigger early stopping.
        checkpoint_path: file that receives the best `state_dict`. This cell
            used to write "best_modelM.pth", the file the MLM cell saves to,
            so training LogD silently overwrote the MLM weights; the default is
            now LogD-specific. Pass "best_modelM.pth" explicitly to get the old
            file name back.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    # NOTE(review): the held-out 20% drives LR scheduling, early stopping and
    # checkpoint selection, so the printed metrics are validation scores.
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=2)

    model = GNN(data[0].x.shape[1]).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    # Halve the learning rate after 5 epochs without validation improvement.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    criterion = nn.MSELoss()
    best_loss, early_stop_counter = float("inf"), 0

    for epoch in range(epochs):
        model.train()  # evaluate_model switches to eval mode, so re-enable dropout each epoch
        total_loss = 0.0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()  # Clear previous gradients
            pred = model(batch)
            loss = criterion(pred, batch.y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Evaluate on the held-out split
        val_loss, mae, r2 = evaluate_model(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {total_loss / len(train_loader):.4f} - Validation Loss: {val_loss:.4f} - MAE: {mae:.4f} - R2: {r2:.4f}")

        scheduler.step(val_loss)
        if val_loss < best_loss:
            best_loss, early_stop_counter = val_loss, 0
            torch.save(model.state_dict(), checkpoint_path)
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered.")
                break

# Train the model on the graphs prepared above; only the dataset is passed, so
# epochs, batch_size, lr and patience keep their defaults.
train_model(graph_data)


 89%|████████▊ | 15781/17785 [00:26<00:03, 666.82it/s][10:58:16] non-ring atom 0 marked aromatic
[10:58:16] non-ring atom 0 marked aromatic
100%|██████████| 17785/17785 [00:29<00:00, 595.72it/s]


Epoch 1/100 - Training Loss: 0.7042 - Validation Loss: 0.5095 - MAE: 0.8645 - R2: 0.4939
Epoch 2/100 - Training Loss: 0.5244 - Validation Loss: 0.4060 - MAE: 0.7611 - R2: 0.5987
Epoch 3/100 - Training Loss: 0.4521 - Validation Loss: 0.3886 - MAE: 0.7387 - R2: 0.6165
Epoch 4/100 - Training Loss: 0.4284 - Validation Loss: 0.3611 - MAE: 0.7087 - R2: 0.6441
Epoch 5/100 - Training Loss: 0.3996 - Validation Loss: 0.3197 - MAE: 0.6712 - R2: 0.6840
Epoch 6/100 - Training Loss: 0.3888 - Validation Loss: 0.3935 - MAE: 0.7673 - R2: 0.6098
Epoch 7/100 - Training Loss: 0.3730 - Validation Loss: 0.3191 - MAE: 0.6668 - R2: 0.6860
Epoch 8/100 - Training Loss: 0.3603 - Validation Loss: 0.2965 - MAE: 0.6400 - R2: 0.7079
Epoch 9/100 - Training Loss: 0.3568 - Validation Loss: 0.3038 - MAE: 0.6473 - R2: 0.7005
Epoch 10/100 - Training Loss: 0.3475 - Validation Loss: 0.3400 - MAE: 0.6937 - R2: 0.6635
Epoch 11/100 - Training Loss: 0.3370 - Validation Loss: 0.2777 - MAE: 0.6230 - R2: 0.7241
Epoch 12/100 - Trai

In [None]:
#KSOL

import torch
import torch.nn as nn
import torch.optim as optim
import torch_geometric
from torch_geometric.loader import DataLoader
from torch_geometric.nn import GCNConv, global_mean_pool
from rdkit import Chem
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.preprocessing import StandardScaler
import numpy as np
import pandas as pd
from tqdm import tqdm
import warnings

# Dataset used by this cell: regression target column and CSV location.
target_col = "in-vitro_KSOL-PBS_bienta: mean_solubility (Num) (uM)"
dataset_path = "/content/finalKSOL (1).csv"

# Hide UserWarning messages that originate from torch's LR-scheduler module.
warnings.filterwarnings(
    action="ignore",
    category=UserWarning,
    module="torch.optim.lr_scheduler",
)

# Function: One-hot encoding for atoms
def one_hot_encoding(x, allowable_set):
    """Return a list of booleans marking which entry of `allowable_set` equals `x`.

    A value that is not in `allowable_set` is replaced by the last entry of the
    set (the "Unknown" bucket by convention of the callers) before comparing.
    """
    key = x if x in allowable_set else allowable_set[-1]
    return [key == option for option in allowable_set]

# Function: Convert SMILES to graph
# Feature vocabularies, built once instead of once per atom. The last entry of
# each list is the catch-all bucket used by one_hot_encoding for unseen values.
_ATOM_SYMBOLS = ['C', 'N', 'O', 'S', 'F', 'Si', 'P', 'Cl', 'Br', 'Mg', 'Na', 'Ca', 'Fe',
                 'As', 'Al', 'I', 'B', 'V', 'K', 'Tl', 'Yb', 'Sb', 'Sn', 'Ag', 'Pd',
                 'Co', 'Se', 'Ti', 'Zn', 'H', 'Li', 'Ge', 'Cu', 'Au', 'Ni', 'Cd', 'In',
                 'Mn', 'Zr', 'Cr', 'Pt', 'Hg', 'Pb', 'Unknown']
_ATOM_DEGREES = list(range(11)) + ['Unknown']
_ATOM_NUM_HS = list(range(9)) + ['Unknown']
_ATOM_IMPLICIT_VALENCES = list(range(9)) + ['Unknown']


def smiles_to_graph(smiles, target):
    """Convert a SMILES string and its label into a ``torch_geometric`` ``Data`` graph.

    Node features (77 per atom) are the concatenation of one-hot encodings of the
    element symbol (44), degree (12), total hydrogen count (10) and implicit
    valence (10), followed by a single aromaticity flag. Every bond contributes
    two directed edges: all begin->end pairs first, then all end->begin pairs.

    Args:
        smiles: SMILES string describing the molecule.
        target: Regression label; anything ``float()`` can convert.

    Returns:
        A ``Data`` object with ``x`` (num_atoms, 77), ``edge_index`` (2, 2 * num_bonds)
        and ``y`` (1,), or ``None`` when the sample is unusable: RDKit cannot parse
        the SMILES, the molecule has no atoms, or the target is missing,
        non-numeric or non-finite.
    """
    mol = Chem.MolFromSmiles(smiles)
    # An atom-less molecule would produce a 1-D empty feature tensor, which breaks
    # both `x.shape[1]` and batching downstream, so it is rejected like a parse failure.
    if mol is None or mol.GetNumAtoms() == 0:
        return None

    # float(None) raises TypeError rather than ValueError, so both are caught.
    try:
        target_value = float(target)
    except (TypeError, ValueError):
        return None
    # NaN / inf labels would poison target scaling and the training loss.
    if not np.isfinite(target_value):
        return None

    atom_features_list = [
        one_hot_encoding(atom.GetSymbol(), _ATOM_SYMBOLS)
        + one_hot_encoding(atom.GetDegree(), _ATOM_DEGREES)
        + one_hot_encoding(atom.GetTotalNumHs(), _ATOM_NUM_HS)
        + one_hot_encoding(atom.GetImplicitValence(), _ATOM_IMPLICIT_VALENCES)
        + [atom.GetIsAromatic()]
        for atom in mol.GetAtoms()
    ]
    x = torch.tensor(atom_features_list, dtype=torch.float)

    bonds = [[bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()] for bond in mol.GetBonds()]
    if bonds:
        # Message passing here is undirected, so each bond is stored in both directions.
        directed = bonds + [[end, begin] for begin, end in bonds]
        edge_index = torch.tensor(directed, dtype=torch.long).t().contiguous()
    else:
        # Single-atom molecules (no bonds) still need a well-formed (2, 0) index tensor.
        edge_index = torch.empty((2, 0), dtype=torch.long)

    y = torch.tensor([target_value], dtype=torch.float)
    return torch_geometric.data.Data(x=x, edge_index=edge_index, y=y)

# Load the CSV, coerce the target column to numeric (unparseable cells become NaN),
# and keep only rows that have both a SMILES string and a numeric target.
df = pd.read_csv(dataset_path)
df = df.assign(**{target_col: pd.to_numeric(df[target_col], errors="coerce")})
df = df.dropna(subset=["smiles", target_col])

# Featurise every row; smiles_to_graph returns None for rows it rejects, and
# those rows are left out of graph_data.
graph_data = []
for _, record in tqdm(df.iterrows(), total=len(df)):
    graph = smiles_to_graph(record["smiles"], record[target_col])
    if graph is not None:
        graph_data.append(graph)

# Standardise the targets (zero mean, unit variance) and write the scaled value
# back onto each graph. `scaler` is kept so metrics can be reported in raw units.
# NOTE(review): the scaler is fitted on all graphs before train_model splits them,
# so held-out target statistics leak into the scaling.
# NOTE(review): the logged MAE for this cell is on the order of 1e6 in raw target
# units, which suggests a heavy-tailed target; a log transform may be worth evaluating.
scaler = StandardScaler()
y_values = np.array([graph.y.item() for graph in graph_data]).reshape(-1, 1)
y_normalized = scaler.fit_transform(y_values)
for graph, scaled_target in zip(graph_data, y_normalized):
    graph.y = torch.tensor(scaled_target, dtype=torch.float)

# Define GNN model
class GNN(nn.Module):
    """Graph-level regressor: three GCN layers, mean pooling, then a three-layer MLP.

    Channel widths are num_features -> 64 -> 128 -> 256 for the graph convolutions
    and 256 -> 128 -> 64 -> 1 for the fully connected head. Dropout (p=0.2) is
    applied after the first two convolutions and after the first dense layer.
    """

    def __init__(self, num_features):
        super().__init__()
        # Attribute names are part of the state_dict keys used by saved checkpoints.
        self.conv1 = GCNConv(num_features, 64)
        self.conv2 = GCNConv(64, 128)
        self.conv3 = GCNConv(128, 256)
        self.fc1 = nn.Linear(256, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.dropout = nn.Dropout(0.2)

    def forward(self, data):
        """Return one prediction per graph in ``data`` as a flat 1-D tensor."""
        edges = data.edge_index

        # Node-level message passing.
        h = self.dropout(self.conv1(data.x, edges).relu())
        h = self.dropout(self.conv2(h, edges).relu())
        h = self.conv3(h, edges).relu()

        # Collapse node embeddings into one vector per graph.
        h = global_mean_pool(h, data.batch)

        # Graph-level regression head.
        h = self.dropout(self.fc1(h).relu())
        h = self.fc2(h).relu()
        return self.fc3(h).view(-1)

# Evaluate model
def evaluate_model(model, data_loader, criterion, device, target_scaler=None):
    """Evaluate ``model`` on ``data_loader`` and report loss, MAE and R2.

    Args:
        model: Network called as ``model(batch)``; must return one value per graph.
        data_loader: Iterable of batches exposing ``.to(device)`` and ``.y``.
        criterion: Loss applied to (predictions, scaled targets).
        device: Device each batch is moved to before the forward pass.
        target_scaler: Fitted scaler whose ``inverse_transform`` maps scaled targets
            back to raw units. Defaults to the module-level ``scaler`` so existing
            four-argument calls keep working; pass it explicitly to avoid picking
            up a scaler that another cell has since rebound.

    Returns:
        ``(mean_loss, mae, r2)`` where ``mean_loss`` is the unweighted mean of the
        per-batch losses in scaled target space, and ``mae`` / ``r2`` are computed
        after inverse-transforming predictions and targets.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    if target_scaler is None:
        target_scaler = scaler  # fall back to the notebook-level scaler
    if len(data_loader) == 0:
        raise ValueError("evaluate_model received an empty data_loader.")

    model.eval()
    total_loss, preds, actuals = 0.0, [], []
    with torch.no_grad():
        for batch in data_loader:
            batch = batch.to(device)
            targets = batch.y.view(-1)
            pred = model(batch)
            total_loss += criterion(pred, targets).item()
            preds.extend(pred.cpu().numpy())
            actuals.extend(targets.cpu().numpy())

    # Report MAE / R2 in the original target units rather than standardised units.
    preds = target_scaler.inverse_transform(np.array(preds).reshape(-1, 1)).flatten()
    actuals = target_scaler.inverse_transform(np.array(actuals).reshape(-1, 1)).flatten()
    return total_loss / len(data_loader), mean_absolute_error(actuals, preds), r2_score(actuals, preds)

# Train model
def train_model(data, epochs=100, batch_size=32, lr=0.001, patience=10,
                checkpoint_path="best_modelM.pth"):
    """Train a ``GNN`` on ``data`` with LR scheduling, checkpointing and early stopping.

    The graphs are split 80/20 (``random_state=42``). The 20% hold-out drives the
    ``ReduceLROnPlateau`` scheduler, checkpoint selection and early stopping, so the
    metrics printed each epoch are validation metrics, not an unbiased test estimate.

    Args:
        data: Non-empty sequence of graph ``Data`` objects with scaled ``y``.
        epochs: Maximum number of passes over the training split.
        batch_size: Graphs per mini-batch for both loaders.
        lr: Initial AdamW learning rate.
        patience: Consecutive epochs without validation-loss improvement tolerated
            before training stops.
        checkpoint_path: File the best-so-far ``state_dict`` is written to.

    Returns:
        The trained model with the best checkpoint's weights restored (or the
        final-epoch weights if no checkpoint was ever written).

    Raises:
        ValueError: If ``data`` is empty.
    """
    if len(data) == 0:
        raise ValueError("train_model received no graphs; check SMILES parsing and target filtering.")

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    train_loader = DataLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
    test_loader = DataLoader(test_data, batch_size=batch_size, num_workers=2)

    # Input width is taken from the first graph; all graphs share one feature layout.
    model = GNN(data[0].x.shape[1]).to(device)
    optimizer = optim.AdamW(model.parameters(), lr=lr, weight_decay=1e-4)
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.5, patience=5)
    criterion = nn.MSELoss()
    best_loss, early_stop_counter = float("inf"), 0
    checkpoint_written = False

    for epoch in range(epochs):
        model.train()
        total_loss = 0.0
        for batch in train_loader:
            batch = batch.to(device)
            optimizer.zero_grad()  # Clear previous gradients
            pred = model(batch)
            loss = criterion(pred, batch.y.view(-1))
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        # Evaluate on the held-out split
        val_loss, mae, r2 = evaluate_model(model, test_loader, criterion, device)
        print(f"Epoch {epoch+1}/{epochs} - Training Loss: {total_loss / len(train_loader):.4f} - Validation Loss: {val_loss:.4f} - MAE: {mae:.4f} - R2: {r2:.4f}")

        scheduler.step(val_loss)
        if val_loss < best_loss:
            best_loss, early_stop_counter = val_loss, 0
            torch.save(model.state_dict(), checkpoint_path)
            checkpoint_written = True
        else:
            early_stop_counter += 1
            if early_stop_counter >= patience:
                print("Early stopping triggered.")
                break

    # The last epoch is usually not the best one; hand back the best weights
    # instead of discarding the model when the function returns.
    if checkpoint_written:
        model.load_state_dict(torch.load(checkpoint_path, map_location=device))
    return model

# Kick off training on every graph built above, using train_model's default
# hyperparameters (epochs=100, batch_size=32, lr=0.001, patience=10).
train_model(graph_data)


[12:11:18] Explicit valence for atom # 5 N, 4, is greater than permitted
 43%|████▎     | 10927/25174 [00:14<00:26, 535.55it/s][12:11:23] Explicit valence for atom # 5 N, 4, is greater than permitted
[12:11:31] Explicit valence for atom # 5 N, 4, is greater than permitted
 84%|████████▍ | 21249/25174 [00:26<00:07, 554.02it/s][12:11:36] Explicit valence for atom # 5 N, 4, is greater than permitted
100%|██████████| 25174/25174 [00:32<00:00, 786.03it/s]


Epoch 1/100 - Training Loss: 1.2470 - Validation Loss: 0.0006 - MAE: 966373.6875 - R2: 0.1760
Epoch 2/100 - Training Loss: 1.2472 - Validation Loss: 0.0006 - MAE: 707694.6250 - R2: 0.2259
Epoch 3/100 - Training Loss: 1.2448 - Validation Loss: 0.0020 - MAE: 4320383.5000 - R2: -1.6904
Epoch 4/100 - Training Loss: 1.2456 - Validation Loss: 0.0006 - MAE: 880928.4375 - R2: 0.2318
Epoch 5/100 - Training Loss: 1.2420 - Validation Loss: 0.0005 - MAE: 765769.3125 - R2: 0.2687
Epoch 6/100 - Training Loss: 1.2417 - Validation Loss: 0.0007 - MAE: 1670716.1250 - R2: 0.0692
Epoch 7/100 - Training Loss: 1.2429 - Validation Loss: 0.0007 - MAE: 1619854.5000 - R2: 0.0831
Epoch 8/100 - Training Loss: 1.2461 - Validation Loss: 0.0007 - MAE: 1842581.6250 - R2: 0.0081
Epoch 9/100 - Training Loss: 1.2437 - Validation Loss: 0.0008 - MAE: 1648603.6250 - R2: -0.0536
Epoch 10/100 - Training Loss: 1.2460 - Validation Loss: 0.0007 - MAE: 1473221.2500 - R2: 0.0376
Epoch 11/100 - Training Loss: 1.2414 - Validation L