In [1]:
import torch
import markov_bridges.data.qm9.utils as qm9utils

from markov_bridges.configs.config_classes.data.molecules_configs import QM9Config
from markov_bridges.data.dataloaders_utils import get_dataloaders
from markov_bridges.configs.experiments_configs.mixed.edmg_experiments import get_edmg_experiment

In [2]:
config = get_edmg_experiment()
config.data = QM9Config(datadir = "qm9_dataset_from_notebook", #lp set this, also added this folder to .gitignore
                        num_pts_train=1000,
                        num_pts_test=200,
                        num_pts_valid=200)

In [3]:
dataloaders = get_dataloaders(config)



Conditioning on ['H_thermo', 'homo']


In [4]:
dataloaders.get_databach_keys()

['num_atoms',
 'charges',
 'positions',
 'index',
 'A',
 'B',
 'C',
 'mu',
 'alpha',
 'homo',
 'lumo',
 'gap',
 'r2',
 'zpve',
 'U0',
 'U',
 'H',
 'G',
 'Cv',
 'omega1',
 'zpve_thermo',
 'U0_thermo',
 'U_thermo',
 'H_thermo',
 'G_thermo',
 'Cv_thermo']

In [5]:
databatch = dataloaders.get_databatch()
#self.dtype = torch.float32


In [6]:
context = qm9utils.prepare_context(config.noising_model.conditioning, 
                                   databatch, 
                                   dataloaders.property_norms)

In [7]:
context.shape

torch.Size([32, 23, 2])

In [8]:
databatch["charges"].shape

torch.Size([32, 23, 1])

In [9]:
dataloaders.dataset_info

{'name': 'qm9',
 'atom_encoder': {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4},
 'atom_decoder': ['H', 'C', 'N', 'O', 'F'],
 'n_nodes': {22: 3393,
  17: 13025,
  23: 4848,
  21: 9970,
  19: 13832,
  20: 9482,
  16: 10644,
  13: 3060,
  15: 7796,
  25: 1506,
  18: 13364,
  12: 1689,
  11: 807,
  24: 539,
  14: 5136,
  26: 48,
  7: 16,
  10: 362,
  8: 49,
  9: 124,
  27: 266,
  4: 4,
  29: 25,
  6: 9,
  5: 5,
  3: 1},
 'max_n_nodes': 29,
 'atom_types': {1: 635559, 2: 101476, 0: 923537, 3: 140202, 4: 2323},
 'distances': [903054,
  307308,
  111994,
  57474,
  40384,
  29170,
  47152,
  414344,
  2202212,
  573726,
  1490786,
  2970978,
  756818,
  969276,
  489242,
  1265402,
  4587994,
  3187130,
  2454868,
  2647422,
  2098884,
  2001974,
  1625206,
  1754172,
  1620830,
  1710042,
  2133746,
  1852492,
  1415318,
  1421064,
  1223156,
  1322256,
  1380656,
  1239244,
  1084358,
  981076,
  896904,
  762008,
  659298,
  604676,
  523580,
  437464,
  413974,
  352372,
  291886,
  271948,
  2

## Lisa test

In [1]:
import torch
import markov_bridges.data.qm9.utils as qm9utils

from markov_bridges.configs.config_classes.data.molecules_configs import QM9Config
from markov_bridges.data.dataloaders_utils import get_dataloaders
from markov_bridges.configs.experiments_configs.mixed.edmg_experiments import get_edmg_experiment

In [2]:
config = get_edmg_experiment()
config.data = QM9Config(datadir = "qm9_dataset_from_notebook", #lp set this, also added this folder to .gitignore
                        num_pts_train=1000,
                        num_pts_test=200,
                        num_pts_valid=200)

In [3]:
dataloaders = get_dataloaders(config)



Conditioning on ['H_thermo', 'homo']


In [4]:
dataloaders.get_databatch()

{'num_atoms': tensor([19, 18, 19, 16, 19, 20, 17, 20, 23, 18, 14, 18, 20, 20, 16, 18, 18, 18,
         18, 19, 19, 16, 19, 20, 17, 13, 16, 21, 19, 17, 21, 15]),
 'charges': tensor([[[6],
          [6],
          [6],
          [6],
          [6],
          [6],
          [8],
          [6],
          [6],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0],
          [0]],
 
         [[6],
          [6],
          [6],
          [6],
          [8],
          [6],
          [6],
          [8],
          [7],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0],
          [0],
          [0]],
 
         [[6],
          [8],
          [6],
          [6],
          [6],
          [6],
          [6],
          [6],
          [6

In [5]:
#take validation set dataloader and take 2 different batches
valid_dataloader = dataloaders.validation()
batch_1 , batch_5 = None, None
for i,x in enumerate(valid_dataloader):
    if i == 1:
        batch_1=x
    elif i==5:
        batch_5=x

In [6]:
print(f"Max number of nodes in batch 1: {max(batch_1["num_atoms"])} ; Max number of nodes in batch 5: {max(batch_5["num_atoms"])}")

Max number of nodes in batch 1: 25 ; Max number of nodes in batch 5: 23


In [7]:
print(f"Positions tensor shape in batch 1: {batch_1["positions"].shape} ; Positions tensor shape in batch 5: {batch_5["positions"].shape}")

Positions tensor shape in batch 1: torch.Size([32, 25, 3]) ; Positions tensor shape in batch 5: torch.Size([32, 23, 3])


In [8]:
print(f"Charges tensor shape in batch 1: {batch_1["charges"].shape} ; Charges tensor shape in batch 5: {batch_5["charges"].shape}")

Charges tensor shape in batch 1: torch.Size([32, 25, 1]) ; Charges tensor shape in batch 5: torch.Size([32, 23, 1])


In [9]:
context_batch_1 = qm9utils.prepare_context(config.noising_model.conditioning, 
                                   batch_1, 
                                   dataloaders.property_norms)
context_batch_5 = qm9utils.prepare_context(config.noising_model.conditioning, 
                                   batch_5, 
                                   dataloaders.property_norms)
print(f"Context tensor shape in batch 1: {context_batch_1.shape} ; Context tensor shape in batch 5: {context_batch_5.shape}")

Context tensor shape in batch 1: torch.Size([32, 25, 2]) ; Context tensor shape in batch 5: torch.Size([32, 23, 2])


In [10]:
batch_1

{'num_atoms': tensor([21, 23, 21, 24, 13, 22, 23, 17, 14, 19, 20, 17, 18, 21, 23, 21, 20, 16,
         23, 15, 16, 16, 21, 17, 17, 21, 16, 17, 18, 16, 21, 18]),
 'charges': tensor([[[6],
          [6],
          [6],
          [6],
          [6],
          [6],
          [8],
          [6],
          [8],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0]],
 
         [[6],
          [6],
          [6],
          [8],
          [6],
          [6],
          [6],
          [6],
          [8],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0]],
 
         [[6],
          [6],
          [6],
          [6],
          [6],
          [6],
          [8

In [11]:
batch_2

{'num_atoms': tensor([13, 18, 13, 12, 12, 18, 21, 18, 16, 14, 16, 19, 19, 17, 18, 13, 23, 15,
         21, 14, 16, 23, 19, 18, 14, 23, 17, 20, 18, 15, 16, 16]),
 'charges': tensor([[[7],
          [6],
          [6],
          [6],
          [7],
          [6],
          [6],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0]],
 
         [[6],
          [6],
          [8],
          [6],
          [6],
          [8],
          [6],
          [7],
          [6],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0],
          [0],
          [0]],
 
         [[7],
          [6],
          [6],
          [8],
          [6],
          [8],
          [6],
          [6],
          [7

In [12]:
max_num_atoms_1 = max(batch_1["num_atoms"])
max_num_atoms_2 = max(batch_2["num_atoms"])
print(max_num_atoms_1,max_num_atoms_2 )

tensor(24) tensor(23)


In [13]:
shape_charges_1 = batch_1["charges"].shape
shape_charges_2 = batch_2["charges"].shape
print(shape_charges_1, shape_charges_2)

torch.Size([32, 24, 1]) torch.Size([32, 23, 1])


In [5]:
dataloaders.get_databach_keys()

['num_atoms',
 'charges',
 'positions',
 'index',
 'A',
 'B',
 'C',
 'mu',
 'alpha',
 'homo',
 'lumo',
 'gap',
 'r2',
 'zpve',
 'U0',
 'U',
 'H',
 'G',
 'Cv',
 'omega1',
 'zpve_thermo',
 'U0_thermo',
 'U_thermo',
 'H_thermo',
 'G_thermo',
 'Cv_thermo']

In [10]:
batch = dataloaders.get_databatch()
batch

{'num_atoms': tensor([16, 18, 16, 22, 19, 19, 13, 17, 17, 18, 21, 22, 17, 19, 18, 21, 17, 18,
         15, 23, 18, 19, 21, 22, 21, 25, 16, 21, 18, 19, 25, 18]),
 'charges': tensor([[[8],
          [6],
          [6],
          [6],
          [8],
          [6],
          [7],
          [6],
          [8],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0]],
 
         [[6],
          [6],
          [6],
          [6],
          [6],
          [6],
          [7],
          [7],
          [7],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0]],
 
         [[6],
          [7],
          [6],
          [7],
          [6

In [11]:
print("num atoms", batch["num_atoms"].shape)
print("charges" , batch["charges"].shape)
print("positions", batch["positions"].shape)
print("index", batch["index"].shape)
print("a", batch["A"].shape)
print("b", batch["B"].shape)
print("c", batch["C"].shape)
print("mu", batch["mu"].shape)
print("one_hot", batch["one_hot"].shape)
print("atom_mask", batch["atom_mask"].shape)
print("edge_mask", batch["edge_mask"].shape)

num atoms torch.Size([32])
charges torch.Size([32, 25, 1])
positions torch.Size([32, 25, 3])
index torch.Size([32])
a torch.Size([32])
b torch.Size([32])
c torch.Size([32])
mu torch.Size([32])
one_hot torch.Size([32, 25, 5])
atom_mask torch.Size([32, 25])
edge_mask torch.Size([20000, 1])


In [12]:
23*23*32

16928

hypotesis:
- 32 molecules
- the biggest molecule in the batch has 23 atoms, so the number of atoms per molecule has been padded to 23
- positions is [N molecules, N atoms per mol, 3]
- index is the number of the molecule in the entire dataset
- one hot is the one hot encoding of each atom
- atom mask tells which atoms for each molecule exist and which are padded atoms
- edge mask has shape [23 * 23 * 32 , 1], which means that each molecule is reprsented as a FC graph, and they mask edges between padded atoms

In [13]:
# What they call "charges" is actually the atomic number -> yes
# Check using the onehot:

num_atoms_0 = batch["num_atoms"][0]
charges_0 = batch["charges"][0]
positions_0 = batch["positions"][0]
one_hot_0 = batch["one_hot"][0]
atom_mask_0 =  batch["atom_mask"][0]
edge_mask_0 = batch["edge_mask"][0]

#atom_vocab =  {'H': 0, 'C': 1, 'N': 2, 'O': 3, 'F': 4}

#from charges retrieve indexes of 0 elements, thse should be padded atoms
#indexes_padded_atoms =  torch.nonzero(charges_0 == 0, as_tuple=True)[0]

In [14]:
one_hot_0

tensor([[False, False, False,  True, False],
        [False,  True, False, False, False],
        [False,  True, False, False, False],
        [False,  True, False, False, False],
        [False, False, False,  True, False],
        [False,  True, False, False, False],
        [False, False,  True, False, False],
        [False,  True, False, False, False],
        [False, False, False,  True, False],
        [ True, False, False, False, False],
        [ True, False, False, False, False],
        [ True, False, False, False, False],
        [ True, False, False, False, False],
        [ True, False, False, False, False],
        [ True, False, False, False, False],
        [ True, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False],
        [False, False, False, False, False],
        [F

In [15]:
charges_0

tensor([[8],
        [6],
        [6],
        [6],
        [8],
        [6],
        [7],
        [6],
        [8],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [1],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0],
        [0]])

In [16]:
atom_mask_0

tensor([ True,  True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True, False, False, False, False,
        False, False, False, False, False])

In [17]:
batch

{'num_atoms': tensor([16, 18, 16, 22, 19, 19, 13, 17, 17, 18, 21, 22, 17, 19, 18, 21, 17, 18,
         15, 23, 18, 19, 21, 22, 21, 25, 16, 21, 18, 19, 25, 18]),
 'charges': tensor([[[8],
          [6],
          [6],
          [6],
          [8],
          [6],
          [7],
          [6],
          [8],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0]],
 
         [[6],
          [6],
          [6],
          [6],
          [6],
          [6],
          [7],
          [7],
          [7],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [1],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0],
          [0]],
 
         [[6],
          [7],
          [6],
          [7],
          [6

confirmed hypotesis:
The .get_databatch() function returns a dictionary for 32 molecules (batch size = 32). 
The biggest molecule in the batch has 23 total atoms (heavy + hydrogens): all molecules have been padded to reach 23 atoms. 
Padded atoms can be recognized by the "charges" tensor where they have 0, by the "one_hot" where they have all False, and by the "atom_mask".

The dictionary has the following relevant keys:
- num_atoms : number of TOTAL atoms (heavy + hydrogens). SHAPE: [32]
- charges : this is indeeed the ATOMIC NUMBER, from which we can retrieve the atom type. 0 is for padded atoms. SHAPE: [32, 23, 1]
- positions : 3d coordinates of each atom. SHAPE: [32, 23, 3] == [N molecules, N atoms per mol (after padding), 3]
- index: maybe is the number (index) of the molecule in the entire dataset
- one_hot: Bool torch tensor built upon what they call "atom_encoder" which is indeed a vocabulary. This is the bool version of the one hot encoding (instead of having [1,0,0] you have [True, False, False] for each atom; padded atom have a tensor full of False). SHAPE: [32, 23, 5] == [N molecules, N atoms per mol (after padding), len(vocabulary)]
- atom mask: Bool torch tensor that tells which atoms for each molecule exist (True) and which are padded atoms (False). SHAPE: [32, 23] 
- edge mask: Bool torch tensor which tells what edges exist. It has shape [23 * 23 * 32 , 1], which means that each molecule is reprsented as a FC graph. The masked edges are self loops and edges that include at least one padded node. 

In [18]:
edge_mask = batch["edge_mask"]
edge_mask

tensor([[False],
        [ True],
        [ True],
        ...,
        [False],
        [False],
        [False]])

In [19]:
edge_mask[:23]

tensor([[False],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False],
        [False]])

In [20]:
edge_mask[23:46]

tensor([[False],
        [False],
        [ True],
        [False],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [ True],
        [False],
        [False],
        [False],
        [False],
        [False]])

In [25]:
train_dataloader = dataloaders.test()

In [26]:
for x in train_dataloader:
    print(x)
    break

{'num_atoms': tensor([23, 18, 15, 20, 18, 21, 18, 20, 16, 20, 21, 16, 18, 17, 16, 20, 17, 17,
        20, 20, 21, 22, 18, 19, 19, 19, 21, 17, 10, 22, 15, 17]), 'charges': tensor([[[6],
         [6],
         [6],
         [6],
         [6],
         [6],
         [6],
         [6],
         [8],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1]],

        [[6],
         [6],
         [6],
         [6],
         [7],
         [7],
         [6],
         [6],
         [7],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [1],
         [0],
         [0],
         [0],
         [0],
         [0]],

        [[8],
         [6],
         [6],
         [6],
         [7],
         [6],
         [6],
         [6],
         [1],
         [1],
         [1],
         [1],
         [1],