# Case 3: Predicting Electron Density with pre-trained DeepDFT

## Introduction

DeepDFT employs a Message-Passing scheme to compute electron density in real 3D space. For that, it requires the construction of a graph that joins atomic nodes and probe nodes. In this small study case, we will employ DeepDFT to compute electron density cube files for different suggar molecules. 



## Libraries and dependencies

In [1]:
import torch
import math
import utils
import densitymodel
import os
import argparse
import json
import ase
import ase.io
import numpy as np
import dataset
from ase.units import Bohr

In [2]:
class LazyMeshGrid():
    def __init__(self, cell, grid_step, origin=None):
        self.cell = cell
        self.scaled_grid_vectors = [np.arange(0, l, grid_step)/l for l in self.cell.lengths()]
        self.shape = np.array([len(g) for g in self.scaled_grid_vectors] + [3])
        if origin is None:
            self.origin = np.zeros(3)
        else:
            self.origin = origin

        self.origin = np.expand_dims(self.origin, 0)

    def __getitem__(self, indices):
        indices = np.array(indices)
        indices_shape = indices.shape
        if not (len(indices_shape) == 2 and indices_shape[0] == 3):
            raise NotImplementedError("Indexing must be a 3xN array-like object")
        gridA = self.scaled_grid_vectors[0][indices[0]]
        gridB = self.scaled_grid_vectors[1][indices[1]]
        gridC = self.scaled_grid_vectors[2][indices[2]]

        grid_pos = np.stack([gridA, gridB, gridC], 1)
        grid_pos = np.dot(grid_pos, self.cell)
        grid_pos += self.origin

        return grid_pos

In [3]:
def load_model(model_dir, device):
    """
    load_model
    ==========
    Parameters
    ----------
    model_dir: str
        Where the model is located
    device: torch.Device    
        Where to store the model (cpu, cuda, etc)
    Returns
    -------
    densitymodel.DensityModel, float
    
    Examples
    --------
    >>> load_model(deepdft_folder / 'qm9_pretrained_model', torch.device('cuda:0'))
    """
    with open(os.path.join(model_dir, "arguments.json"), "r") as f:
        runner_args = argparse.Namespace(**json.load(f))
    model = densitymodel.DensityModel(runner_args.num_interactions, runner_args.node_size, runner_args.cutoff)
    device = torch.device(device)
    model.to(device)
    state_dict = torch.load(os.path.join(model_dir, "best_model.pth"))
    model.load_state_dict(state_dict["model"])
    return model, runner_args.cutoff


def load_molecule(atomspath, vacuum, grid_step):
    """
    load_molecule
    =============
    
    Parameters
    ----------
    atomspath: str
        File. Tested on xyz and mol formats
    vacuum: float
        padding
    grid_step: number of grids
    """
    atoms = ase.io.read(atomspath)
    
    diff = atoms.get_positions().max(0) - atoms.get_positions().min(0)
    
    atoms.center(vacuum=vacuum)

    
    
    a, b, c, ang_bc, ang_ac, ang_ab = atoms.get_cell_lengths_and_angles()
    a, b, c = ceil_float(a, grid_step), ceil_float(b, grid_step), ceil_float(c, grid_step)
    atoms.set_cell([a, b, c, ang_bc, ang_ac, ang_ab])

    origin = np.zeros(3)

    grid_pos = LazyMeshGrid(atoms.get_cell(), grid_step)

    metadata = {"filename": atomspath}
    return {
        # "density": density,
        "atoms": atoms,
        "origin": origin,
        "grid_position": grid_pos,
        "metadata": metadata, # Meta information
        "diff": diff
    }



def ceil_float(x, step_size):
    # Round up to nearest step_size and subtract a small epsilon
    x = math.ceil(x/step_size) * step_size
    # eps = 2*np.finfo(float).eps * x
    return x # - eps

In [4]:
u = ase.io.read('xyz/sugbench_000000.opt.xyz')

In [5]:
u.get_positions().max(0) - u.get_positions().min(0)

array([11.0902, 11.5524,  8.2889])

## Exercises

### Ex1: Save cube files with electron densities

Bellow we have re-adapted DeepDFT script for the evaluation of cube files.
The first step is about loading the model. We will use their pretrained model on GDB9, which is a dataset containing 134k organic molecules with less than 9 heavy atoms.


In [6]:
device = torch.device(torch.device('cuda:0'))
model, cutoff = load_model('/home/bcz/dev/DeepDFT/qm9_pretrained_model/', device)
grid_step = 0.25 # If required, we can make tighter grids by reducing this value
#                  for visualization purposes, it's ok
padding = 1.0 # If required, we can increase it to visualize the density in far regions
#               Yet, if it becomes too large, density calcualtion will be simply clamped
bohr2ang = 0.529177 # DeepDFT works in electron/(angstrom^-3). We will move it to atomic units
bohr2angp3 = bohr2ang ** 3

The protocol employs the Atomic Simulation Environment (ASE). This library accepts .mol files and .xyz files. Our optimized molecules are at .mol2 format, so we will translate them to xyz using babel

In [7]:
%%bash

mkdir xyz
cd ./mol2/ 
ls *.mol2 | grep opt | sed "s/.mol2//g" | xargs -I % -P 10 obabel -i mol2 %.mol2 -o xyz -O ../xyz/%.xyz

mkdir: cannot create directory ‘xyz’: File exists
11 molecule converted
 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
11 molecule molecule converted converted

1 molecule converted
1 molecule converted
1 molecule converted
11 molecule converted1
 molecule converted molecule converted

1 molecule converted
11 molecule converted molecule
 converted
1 molecule1 converted molecule
 converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
11 molecule converted
 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
1 molecule converted
11 molecule converted
 molecule converted1
 molecule converted
1 molecule converted
1 mol

Loading all the molecules in a list of (name,molecule) tuples

In [8]:
with open('./wfn/sugbench.index') as f:
    index = [i.strip().replace('.orca', '') for i in f.readlines()]
    
sugbench = []
for i in index: 
    mol = load_molecule('./xyz/{:s}.xyz'.format(i), vacuum=padding, grid_step=grid_step)
    sugbench.append(
        (
            i, mol
            
        )
    )



DeepDFT works as a message-passing neural network. This means that any computation requires building graphs between atoms (nodes) and probes (probe nodes). DeepDFT uses distance as adjacency criteria, and the distance between nodes as edge data.


*collate_fn* generates such graph for atomic coordinates, so the MPNN can process the molecule information

The calculation of the electron density is performed at two steps:

1. Calculation of molecule representation, through message-passing between atomic nodes. Probing nodes are excluded
2. Calculation of electron density at probing coordinates, through message-passing from atomic nodes to probing nodes.

In [9]:
collate_fn = dataset.CollateFuncAtoms(
    cutoff=cutoff,
    pin_memory=True,
    disable_pbc=True,
)


In [10]:
with torch.no_grad(): # Disabling gradient calculation is important to avoid memory deplition
    for i, density_dict in sugbench: # iterating on the suggar dataset
        print("processing {:s}".format(i), end=" ")
        r = - density_dict["diff"] / 2
        cubewriter = utils.CubeWriter(
            "./cube/{:s}.deepdft.cube".format(i),
            density_dict["atoms"],
            density_dict["grid_position"].shape[0:3],
            density_dict["origin"],
            "predicted by DeepDFT model",
        )
        # Part 1: Message Passing between atomic nodes
        graph_dict = collate_fn([density_dict])
        device_batch = {
            k: v.to(device=device, non_blocking=True) for k, v in graph_dict.items()
        }
        atom_representation = model.atom_model(device_batch)
        
        # Part 2: Message Passing from atomic nodes to probing nodes
        # given that there are many probing nodes, calculations become iterative
        density_iter = dataset.DensityGridIterator(density_dict, True, 1000, cutoff)
        for probe_graph_dict in density_iter:

            probe_dict = dataset.collate_list_of_dicts([probe_graph_dict])
            probe_dict = {
                k: v.to(device=device, non_blocking=True) for k, v in probe_dict.items()
            }
            device_batch["probe_edges"] = probe_dict["probe_edges"]
            device_batch["probe_edges_features"] = probe_dict["probe_edges_features"]
            device_batch["num_probe_edges"] = probe_dict["num_probe_edges"]
            device_batch["num_probes"] = probe_dict["num_probes"]

            cubewriter.write(bohr2angp3 * model.probe_model(device_batch, atom_representation).cpu().detach().numpy().flatten())
        ase.io.write('./cube/{:s}.xyz'.format(i), density_dict['atoms'])
        print("-- DONE")
        break

processing sugbench_000000.opt -- DONE


The results can be visualized using different programs. In this case, we will use ChimeraX.

<img src="movie1.gif" alt="">

## Ex 2: Calculating electron density at custom coordinates



In [114]:
def process_arbitrary_coordinates(atoms, r, cutoff):
    probe_edges, probe_edge_features = dataset.probes_to_graph(atoms, r, cutoff)
    probe_edges = np.concatenate(probe_edges, axis=0)
    probe_edge_features = np.concatenate(probe_edge_features, axis=0)[:, None]
    num_probe_edges = probe_edges.shape[0]
    num_probes = r.shape[0]
    probe_dict = dict(
        probe_edges=torch.tensor(probe_edges, dtype=torch.long), 
        probe_edges_features=torch.tensor(probe_edge_features, dtype=torch.float), 
        num_probe_edges=torch.tensor(num_probe_edges), num_probes=torch.tensor(num_probes)
    )
    return probe_dict

In [139]:
r = density_dict['atoms'].get_positions().mean(0).reshape(1, 3) + np.random.rand(100, 3) * 3.0

with torch.no_grad(): # Disabling gradient calculation is important to avoid memory deplition

    density_dict = sugbench[0][1]
    
    # Part 1: Message Passing between atomic nodes
    graph_dict = collate_fn([density_dict])
    device_batch = {
        k: v.to(device=device, non_blocking=True) for k, v in graph_dict.items()
    }
    atom_representation = model.atom_model(device_batch)

    # Part 2: Message Passing from atomic nodes to probing nodes
    # given that there are many probing nodes, calculations become iterative
    probe_graph_dict = process_arbitrary_coordinates(density_dict['atoms'], r, cutoff)
    probe_dict = dataset.collate_list_of_dicts([probe_graph_dict])
    probe_dict = {
        k: v.to(device=device, non_blocking=True) for k, v in probe_dict.items()
    }
    device_batch["probe_edges"] = probe_dict["probe_edges"]
    device_batch["probe_edges_features"] = probe_dict["probe_edges_features"]
    device_batch["num_probe_edges"] = probe_dict["num_probe_edges"]
    device_batch["num_probes"] = probe_dict["num_probes"]
    
    p = bohr2angp3 * model.probe_model(device_batch, atom_representation).cpu().detach().numpy().flatten()
    print("-- DONE")

-- DONE
