In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from rdkit import Chem
from rdkit.Chem import rdmolops
from torch_geometric.data import Data
from torch_geometric.loader import DataLoader

from model import GCNModel

In [4]:
# Node features (atom features)
def get_atom_features(atom):
    """Return the 12-element numeric feature vector for one RDKit atom.

    The values are read straight from the atom's accessors, in this order:
    mass, atomic number, degree, total degree, total hydrogen count,
    implicit valence, formal charge, hybridization (enum cast to int),
    aromaticity flag (0/1), radical-electron count, implicit hydrogen count,
    explicit hydrogen count.

    Args:
        atom: An object exposing the RDKit ``Atom`` accessor methods used below.

    Returns:
        list: Twelve numbers, in the order listed above.
    """
    # What the atom is.
    identity = [atom.GetMass(), atom.GetAtomicNum()]

    # How it is bonded.
    connectivity = [
        atom.GetDegree(),
        atom.GetTotalDegree(),
        atom.GetTotalNumHs(),
        atom.GetImplicitValence(),
    ]

    # Electronic state; the hybridization enum and the aromatic boolean are
    # both coerced to plain ints so the row is purely numeric.
    electronic = [
        atom.GetFormalCharge(),
        int(atom.GetHybridization()),
        int(atom.GetIsAromatic()),
        atom.GetNumRadicalElectrons(),
    ]

    # Hydrogen bookkeeping, split into implicit and explicit counts.
    hydrogens = [atom.GetNumImplicitHs(), atom.GetNumExplicitHs()]

    return identity + connectivity + electronic + hydrogens

def get_bond_features(bond):
    """Return the 4-element numeric feature vector for one RDKit bond.

    Args:
        bond: An object exposing the RDKit ``Bond`` accessor methods used below.

    Returns:
        list: ``[bond type as a double, in-ring flag, aromatic flag,
        conjugated flag]``, where the three flags are 0/1 ints.
    """
    bond_order = bond.GetBondTypeAsDouble()
    # Boolean descriptors, coerced to ints so the row is purely numeric.
    flags = (bond.IsInRing(), bond.GetIsAromatic(), bond.GetIsConjugated())
    return [bond_order, *map(int, flags)]


In [2]:
# Filesystem layout: the CSV inputs and the submission file live under
# ../data, model checkpoints under ./weights.
data_root = Path("../data")
weights_path = Path("./weights")
train_file_path, test_file_path, output_file_path = (
    data_root / file_name
    for file_name in ("train.csv", "test.csv", "submission.csv")
)

# Prefer the GPU when torch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

train = pd.read_csv(train_file_path)
test = pd.read_csv(test_file_path)

# The submission frame is the test frame without the raw SMILES column;
# `drop` returns a new frame, so `test` itself is left untouched.
submission = test.drop(columns=["smiles"])

In [37]:
def make_graph_data(smiles: str, device='cuda'):
    """Convert a SMILES string into a PyTorch Geometric ``Data`` graph.

    Atoms become nodes (features from ``get_atom_features``) and bonds become
    edges (features from ``get_bond_features``). Each bond is stored once, in
    begin-atom -> end-atom direction.

    Args:
        smiles: SMILES string describing the molecule.
        device: Torch device on which all tensors are created.

    Returns:
        Data: ``x`` holds one row of atom features per atom, ``edge_index``
        has shape ``(2, num_bonds)`` and ``edge_attr`` has shape
        ``(num_bonds, num_bond_features)``. For a molecule without bonds the
        edge tensors are empty but keep that 2-D layout.

    Raises:
        ValueError: If RDKit cannot parse ``smiles``.
    """
    # Convert SMILES to an RDKit molecule; RDKit signals a parse failure by
    # returning None rather than raising.
    molecule = Chem.MolFromSmiles(smiles)
    if molecule is None:
        raise ValueError("Invalid SMILES string")

    # Build node features: one row per atom.
    node_features = torch.tensor(
        [get_atom_features(atom) for atom in molecule.GetAtoms()],
        dtype=torch.float,
        device=device,
    )

    # Edge indices and edge features: one entry per bond.
    # NOTE(review): only one direction per bond is emitted. If the model is
    # meant to treat the molecule as an undirected graph, the reverse edges
    # probably need to be added as well -- confirm against the model.
    edges = []
    edge_features = []
    for bond in molecule.GetBonds():
        edges.append((bond.GetBeginAtomIdx(), bond.GetEndAtomIdx()))
        edge_features.append(get_bond_features(bond))

    if edges:
        edge_index = torch.tensor(edges, dtype=torch.long, device=device).t().contiguous()
        edge_attr = torch.tensor(edge_features, dtype=torch.float, device=device)
    else:
        # A molecule with no bonds (single atom / lone ion) would otherwise
        # yield 1-D tensors of shape (0,), because transposing an empty 1-D
        # tensor is a no-op. Build correctly shaped empties instead.
        # Must stay in sync with the length of get_bond_features' result.
        num_bond_features = 4
        edge_index = torch.empty((2, 0), dtype=torch.long, device=device)
        edge_attr = torch.empty((0, num_bond_features), dtype=torch.float, device=device)

    # Create the PyTorch Geometric data object.
    return Data(x=node_features, edge_index=edge_index, edge_attr=edge_attr)


class MoleculeDataset(torch.utils.data.Dataset):
    """Map-style dataset yielding ``(graph, label)`` pairs from a DataFrame.

    The frame must provide a ``smiles`` column (passed to ``make_graph_data``)
    and an ``activity`` column (returned as a float tensor). Graphs are built
    on demand in ``__getitem__``; nothing is cached.
    """

    def __init__(self, df:pd.DataFrame, device='cuda'):
        """Store the source frame and the device tensors are created on."""
        self.df = df
        self.device = device

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Return the graph and label for the row at *position* ``idx``.

        Rows are looked up positionally with ``.iloc`` because samplers hand
        out positions ``0 .. len(self) - 1``. Label-based lookup would raise
        ``KeyError`` (or pick the wrong row) when the frame's index is not a
        default ``RangeIndex``, e.g. after filtering or a train/validation
        split.
        """
        smiles = self.df['smiles'].iloc[idx]
        activity = self.df['activity'].iloc[idx]
        graph_data = make_graph_data(smiles, device=self.device)
        label = torch.tensor(activity, dtype=torch.float, device=self.device)
        return graph_data, label

In [38]:
# Build the training dataset on the auto-detected device instead of the
# constructor's 'cuda' default, so the notebook also runs on CPU-only hosts.
dataset = MoleculeDataset(train, device=device)

# Use PyTorch Geometric's DataLoader: torch's default collate function cannot
# merge `Data` graphs and raises TypeError on the first batch, whereas the
# PyG loader combines the graphs into a single batch and stacks the labels.
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

In [26]:
# Example model instantiation.
# The input width is read from the first graph's node-feature matrix, so it
# follows whatever make_graph_data produces for each atom.
# The model is moved to `device` so its parameters live on the same device as
# the tensors the dataset creates; otherwise a forward pass on a GPU machine
# fails with a device mismatch.
# NOTE(review): assumes GCNModel is a torch.nn.Module -- confirm in model.py.
model = GCNModel(
    in_channels=dataset[0][0].x.shape[1],
    hidden_channels=32,
    out_channels=1,
).to(device)

NameError: name 'node_features' is not defined

In [74]:
# Smoke-test the forward pass on a single training graph. `data` is bound
# here so this cell does not depend on a variable left over from an earlier
# kernel session (no visible cell defines it).
# NOTE(review): the recorded output below is all zeros with
# grad_fn=<LogSoftmaxBackward0>. A log-softmax taken over a single output
# channel is always 0, so the model head likely needs revisiting for
# out_channels=1 -- confirm in model.py.
data, _ = dataset[0]
model(data)

torch.Size([15, 1])


tensor([[0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.],
        [0.]], grad_fn=<LogSoftmaxBackward0>)