In [38]:
import os
import numpy as np
import pandas as pd
import tensorflow as tf
from rdkit import Chem
from kgcnn.data.utils import ragged_tensor_from_nested_numpy
from kgcnn.literature.CGCNN import make_crystal_model

class MoleculeDataset:
    def __init__(self, energy_csv_file, path_to_position_atoms, augment=False):
        self.df = pd.read_csv(energy_csv_file)
        self.path_position_atoms = path_to_position_atoms
        self.augment = augment
        
    def _read_xyz_file(self, file_path):
        with open(file_path, 'r') as file:
            lines = file.readlines()[2:]
            atoms = []
            positions = []
            for line in lines:
                parts = line.split()
                atom = parts[0]
                x = float(parts[1])
                y = float(parts[2])
                z = float(parts[3])
                atoms.append(atom)
                positions.append([x, y, z])
        return atoms, np.array(positions)

    def _add_noise(self, positions):
        noise = np.random.normal(0, 0.1, positions.shape)
        return positions + noise

    def _atom_features(self, atoms):
        atom_types = list(set(atoms))
        atom_features = [atom_types.index(atom) for atom in atoms]
        return np.array(atom_features).reshape(-1, 1)

    def _bond_features(self, atoms, positions):
        mol = self._create_molecule(atoms, positions)
        edge_index = []
        edge_attr = []
        for bond in mol.GetBonds():
            i = bond.GetBeginAtomIdx()
            j = bond.GetEndAtomIdx()
            edge_index.append([i, j])
            edge_index.append([j, i])
            edge_attr.append([1])
            edge_attr.append([1])
        return np.array(edge_index).T, np.array(edge_attr).reshape(-1, 1)

    def _create_molecule(self, atoms, positions):
        mol = Chem.RWMol()
        conf = Chem.Conformer(len(atoms))
        for i, (atom, pos) in enumerate(zip(atoms, positions)):
            element = atom
            x, y, z = pos
            atom_idx = mol.AddAtom(Chem.Atom(element))
            conf.SetAtomPosition(atom_idx, (x, y, z))
        mol.AddConformer(conf)
        return mol

    def load_data(self):
        node_attributes = []
        edge_indices = []
        edge_attributes = []
        energies = []

        for idx in range(len(self.df)):
            energy = self.df.energy[idx]
            id = self.df.id[idx]
            new_id = f"id_{id}.xyz"
            file_path = os.path.join(self.path_position_atoms, new_id)
            atoms, positions = self._read_xyz_file(file_path)
            
            if self.augment:
                positions = self._add_noise(positions)
            
            atom_features = self._atom_features(atoms)
            edge_index, edge_attr = self._bond_features(atoms, positions)
            
            node_attributes.append(atom_features)
            edge_indices.append(edge_index)
            edge_attributes.append(edge_attr)
            energies.append(energy)
        
        return node_attributes, edge_indices, edge_attributes, energies

# Load and format the dataset
dataset = MoleculeDataset(energy_csv_file='./data/energies/train.csv', path_to_position_atoms='./data/atoms/train/')
node_attributes, edge_indices, edge_attributes, energies = dataset.load_data()

# Convert to ragged tensor format
node_attributes = tf.ragged.constant(node_attributes)
edge_indices = tf.ragged.constant(edge_indices)
edge_attributes = tf.ragged.constant(edge_attributes)
energies = tf.convert_to_tensor(energies, dtype=tf.float32)


In [39]:
# Adjusted model creation
model = make_crystal_model(
    inputs=[
        {"shape": (None,), "name": "node_attributes", "dtype": "float32"},
        {"shape": (None, 2), "name": "edge_indices", "dtype": "int64"},
        {"shape": (None,), "name": "edge_attributes", "dtype": "float32"}
    ],
    input_embedding={"node": {"input_dim": 95, "output_dim": 64}},
    representation="graph",
    make_distances=False,
    expand_distance=False,
    gauss_args={"bins": 50, "distance": 8.0, "offset": 0.0, "sigma": 0.4},
    depth=4,
    conv_layer_args={"units": 64, "activation_s": "kgcnn>leaky_relu", "activation_out": "linear"},
    node_pooling_args={"pooling_method": "mean"},
    verbose=1,
    output_embedding="graph",
    output_mlp={"use_bias": [True, True], "units": [64, 1], "activation": ["kgcnn>leaky_relu", "linear"]}
)

# Compile the model
model.compile(optimizer='adam', loss='mean_squared_error')

# Ensure all tensors are dense tensors
node_attributes_dense = node_attributes.to_tensor()
edge_indices_dense = edge_indices.to_tensor()
edge_attributes_dense = edge_attributes.to_tensor()

# Print tensor shapes for debugging
print(f"Node attributes shape: {node_attributes_dense.shape}")
print(f"Edge indices shape: {edge_indices_dense.shape}")
print(f"Edge attributes shape: {edge_attributes_dense.shape}")

# Train the model
model.fit(
    {
        "node_attributes": node_attributes_dense,
        "edge_indices": edge_indices_dense,
        "edge_attributes": edge_attributes_dense
    },
    energies,
    epochs=50,
    batch_size=32
)

# Save the model
model.save('trained_cgcnn_model.h5')

# Predict energy values for new data
predictions = model.predict(
    {
        "node_attributes": node_attributes_dense,
        "edge_indices": edge_indices_dense,
        "edge_attributes": edge_attributes_dense
    }
)


INFO:kgcnn.utils.models:Updated model kwargs:
INFO:kgcnn.utils.models:{'name': 'CGCNN', 'inputs': [{'shape': (None,), 'name': 'node_attributes', 'dtype': 'float32'}, {'shape': (None, 2), 'name': 'edge_indices', 'dtype': 'int64'}, {'shape': (None,), 'name': 'edge_attributes', 'dtype': 'float32'}], 'input_embedding': {'node': {'input_dim': 95, 'output_dim': 64}}, 'representation': 'graph', 'expand_distance': False, 'make_distances': False, 'gauss_args': {'bins': 50, 'distance': 8.0, 'offset': 0.0, 'sigma': 0.4}, 'depth': 4, 'verbose': 1, 'conv_layer_args': {'units': 64, 'activation_s': 'kgcnn>leaky_relu', 'activation_out': 'linear', 'batch_normalization': True}, 'node_pooling_args': {'pooling_method': 'mean'}, 'output_embedding': 'graph', 'output_mlp': {'use_bias': [True, True], 'units': [64, 1], 'activation': ['kgcnn>leaky_relu', 'linear']}}


ValueError: Exception encountered when calling layer "cgcnn_layer_10" (type CGCNNLayer).

in user code:

    File "/home/smussard/.conda/envs/mysepteo/lib/python3.7/site-packages/kgcnn/layers/message.py", line 91, in call  *
        n_in, n_out = self.lay_gather([nodes, edge_index], **kwargs)
    File "/home/smussard/.conda/envs/mysepteo/lib/python3.7/site-packages/keras/utils/traceback_utils.py", line 67, in error_handler  **
        raise e.with_traceback(filtered_tb) from None
    File "/tmp/__autograph_generated_filecr_gq597.py", line 85, in tf__call
        ag__.if_stmt(ag__.not_(do_return), if_body_2, else_body_2, get_state_2, set_state_2, ('do_return', 'retval_'), 2)
    File "/tmp/__autograph_generated_filecr_gq597.py", line 74, in if_body_2
        out = [ag__.converted_call(ag__.ld(tf).gather, (ag__.ld(inputs)[0], ag__.converted_call(ag__.ld(tf).gather, (ag__.ld(inputs)[1], ag__.ld(i)), dict(axis=ag__.ld(self).axis_indices), fscope)), dict(batch_dims=1, axis=ag__.ld(self).axis), fscope) for i in ag__.ld(self).selection_index]
    File "/tmp/__autograph_generated_filecr_gq597.py", line 74, in <listcomp>
        out = [ag__.converted_call(ag__.ld(tf).gather, (ag__.ld(inputs)[0], ag__.converted_call(ag__.ld(tf).gather, (ag__.ld(inputs)[1], ag__.ld(i)), dict(axis=ag__.ld(self).axis_indices), fscope)), dict(batch_dims=1, axis=ag__.ld(self).axis), fscope) for i in ag__.ld(self).selection_index]

    ValueError: Exception encountered when calling layer "gather_embedding_selection_10" (type GatherEmbeddingSelection).
    
    in user code:
    
        File "/home/smussard/.conda/envs/mysepteo/lib/python3.7/site-packages/kgcnn/layers/gather.py", line 193, in call  *
            out = [tf.gather(inputs[0], tf.gather(inputs[1], i, axis=self.axis_indices), batch_dims=1, axis=self.axis) for i
    
        ValueError: Shape must be at least rank 3 but is rank 2 for '{{node cgcnn_layer_10/gather_embedding_selection_10/GatherV2}} = GatherV2[Taxis=DT_INT32, Tindices=DT_INT32, Tparams=DT_FLOAT, batch_dims=0](Placeholder_2, cgcnn_layer_10/gather_embedding_selection_10/GatherV2/indices, cgcnn_layer_10/gather_embedding_selection_10/GatherV2/axis)' with input shapes: [?,?], [], [] and with computed input tensors: input[2] = <2>.
    
    
    Call arguments received by layer "gather_embedding_selection_10" (type GatherEmbeddingSelection):
      • inputs=['tf.Tensor(shape=(None, None, 64), dtype=float32)', 'tf.Tensor(shape=(None, None), dtype=float32)']
      • kwargs={'training': 'None'}


Call arguments received by layer "cgcnn_layer_10" (type CGCNNLayer):
  • inputs=['tf.Tensor(shape=(None, None, 64), dtype=float32)', 'tf.Tensor(shape=(None, None, 2), dtype=int64)', 'tf.Tensor(shape=(None, None), dtype=float32)']
  • kwargs={'training': 'None'}