<a href="https://colab.research.google.com/github/ariahosseini/DeepML/blob/main/027_SchPackNet_Proj_TwentySeven.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install --upgrade schnetpack

## Prepare and Load Data

In [None]:
import os
import numpy as np
from ase import Atoms

In [None]:
from schnetpack.data import ASEAtomsData
from schnetpack.datasets import QM9
from schnetpack.transform import ASENeighborList
import schnetpack as spk
import schnetpack.transform as trn

In [None]:
# Configure the QM9 data module: database file, batch size, split sizes,
# the file that stores the split, and the per-sample transforms.
qm9_transforms = [ASENeighborList(cutoff=5.)]  # neighbor list with a cutoff of 5.
qm9data = QM9(
    './qm9.db',
    batch_size=10,
    num_train=110000,
    num_val=10000,
    split_file='./split_qm9.npz',
    transforms=qm9_transforms,
)

# Both calls are made before the dataset and its splits are accessed below.
qm9data.prepare_data()
qm9data.setup()

100%|██████████| 133885/133885 [03:00<00:00, 739.72it/s]


In [None]:
# Report the size of the full dataset and of each split.
qm9_size_report = (
    ('Number of reference calculations:', qm9data.dataset),
    ('Number of train data:', qm9data.train_dataset),
    ('Number of validation data:', qm9data.val_dataset),
    ('Number of test data:', qm9data.test_dataset),
)
for label, subset in qm9_size_report:
    print(label, len(subset))

# List every property name the dataset exposes.
print('Available properties:')
for prop_name in qm9data.dataset.available_properties:
    print('-', prop_name)

In [None]:
# Take the first dataset entry and show the shape of each of its fields.
example = qm9data.dataset[0]
print('Properties:')
for field_name, field_value in example.items():
    field_shape = field_value.shape
    print('-', field_name, ':', field_shape)

In [None]:
# Pull a single batch from the validation dataloader and show which keys it
# contains. The loop stops after the first iteration; `batch` stays bound to
# that first batch and is inspected by the following cells.
for batch in qm9data.val_dataloader():
    print(batch.keys())
    break

In [None]:
# Show the index entries stored in the batch fetched above.
batch_index_fields = (
    ('System index:', '_idx_m'),
    ('Center atom index:', '_idx_i'),
    ('Neighbor atom index:', '_idx_j'),
)
for label, key in batch_index_fields:
    print(label, batch[key])

In [None]:
# Show two target properties of the batch, looked up through the QM9 property-name constants.
batch_property_fields = (
    ('Total energy at 0K:', QM9.U0),
    ('HOMO:', QM9.homo),
)
for label, key in batch_property_fields:
    print(label, batch[key])

In [None]:
if not os.path.exists('./uracil_dft.npz'):
    !wget http://quantum-machine.org/gdml/data/npz/md17_uracil.npz

In [None]:
# Load the uracil trajectory from the npz file and convert every frame
# (all entries of R/E/F, not just a subset) into an ASE Atoms object plus
# a dict holding that frame's energy and forces.
data = np.load('./md17_uracil.npz')
numbers = data["z"]  # the same atomic numbers are used for every frame
atoms_list, property_list = [], []
for positions, energies, forces in zip(data["R"], data["E"], data["F"]):
    atoms_list.append(Atoms(positions=positions, numbers=numbers))
    property_list.append({'energy': energies, 'forces': forces})

print('Properties:', property_list[0])

In [None]:
# Start from a clean database file so this cell can be re-run. The removal is
# guarded so a fresh run (no file yet) produces no error, and it does not
# depend on a POSIX `rm` being available.
if os.path.exists('./new_dataset.db'):
    os.remove('./new_dataset.db')

# Create an empty ASE database with the units of the stored quantities,
# then add one entry per (properties, atoms) pair.
new_dataset = ASEAtomsData.create(
    './new_dataset.db',
    distance_unit='Ang',
    property_unit_dict={'energy':'kcal/mol', 'forces':'kcal/mol/Ang'}
)
new_dataset.add_systems(property_list, atoms_list)

In [None]:
# Single-atom reference values; calculate this at the same level of theory as your data.
# The list is ordered by atomic number, so index 0 is a placeholder (Z=0) and
# index 1 is hydrogen (Z=1) -- the hydrogen value therefore goes in position 1.
atomref = {'energy': [0.0, 314.0, 0.0, 0.0]}  # atomref value for hydrogen: 314.0
# the supplied list is ordered by atomic number, e.g.:
atomref_hydrogen = atomref['energy'][1]
# Illustrative only (kept commented out): how atomrefs would be passed when
# creating a dataset.
# dataset = ASEAtomsData.create(
#     './new_dataset.db',
#     distance_unit='Ang',
#     property_unit_dict={'energy':'kcal/mol'},
#     atomref=atomref
# )

In [None]:
# Summarize the freshly created dataset: entry count and property names.
n_entries = len(new_dataset)
print('Number of reference calculations:', n_entries)
print('Available properties:')

for prop_name in new_dataset.available_properties:
    print('-', prop_name)
print()

# Show the shape of each field stored for the first entry.
example = new_dataset[0]
print('Properties of molecule with id 0:')

for field_name, field_value in example.items():
    field_shape = field_value.shape
    print('-', field_name, ':', field_shape)

In [None]:
# Per-sample transforms, applied in this order: build the neighbor list,
# remove the mean from the energy (atomrefs are left untouched), cast to 32 bit.
custom_transforms = [
    trn.ASENeighborList(cutoff=5.),
    trn.RemoveOffsets("energy", remove_mean=True, remove_atomrefs=False),
    trn.CastTo32(),
]

# Wrap the custom database in a data module with 1000 training and
# 100 validation entries, served in batches of 10.
custom_data = spk.data.AtomsDataModule(
    './new_dataset.db',
    batch_size=10,
    distance_unit='Ang',
    property_units={'energy':'kcal/mol', 'forces':'kcal/mol/Ang'},
    num_train=1000,
    num_val=100,
    transforms=custom_transforms,
    num_workers=1,
    pin_memory=True,  # set to false, when not using a GPU
)

custom_data.prepare_data()
custom_data.setup()

# New Section