In [1]:
import torch_geometric as pyg
import numpy as np
from ogb.lsc import PCQM4Mv2Dataset
root_dir = "data"

DGL backend not selected or invalid.  Assuming PyTorch for now.


Setting the default backend to "pytorch". You can change it in the ~/.dgl/config.json file or export the DGLBACKEND environment variable.  Valid options are: pytorch, mxnet, tensorflow (all lowercase)


In [2]:
def summarize_data(data, print_classes = True):
    """Print a size summary of a graph dataset and, optionally, per-label statistics.

    Args:
        data: Dataset object exposing ``num_classes``, ``print_summary()``,
            ``len()`` and iteration over items whose ``.y.numpy()`` yields the
            label vector of one graph (NaN marks a missing label).
        print_classes: When True, additionally print one line per label with
            the sum of the label values, the number of graphs that carry the
            label, their ratio, and the fraction of the dataset that is
            labelled. For 0/1 labels the sum is the number of positives and the
            ratio is the positive rate.

    Raises:
        ValueError: If a graph's label vector does not contain exactly
            ``data.num_classes`` entries, since the per-label table would be
            meaningless in that case.
    """
    # Read once: the property is reused for sizing and for the loop bounds.
    n_classes = data.num_classes
    print(f"n classes: {n_classes}")
    data.print_summary()

    if not print_classes:
        return

    n_graphs = len(data)
    # Flat accumulators; each label vector is flattened to match, so the result
    # no longer depends on ``y`` happening to have shape (1, n_classes).
    label_sums = np.zeros(n_classes)
    label_counts = np.zeros(n_classes)

    for graph in data:
        labels = np.asarray(graph.y.numpy()).reshape(-1)
        if labels.size != n_classes:
            raise ValueError(
                f"expected {n_classes} label entries per graph, got {labels.size}"
            )
        missing = np.isnan(labels)
        label_sums += np.nan_to_num(labels)  # NaN (missing) contributes 0
        label_counts += ~missing

    nan = float("nan")
    for idx in range(n_classes):
        total = label_sums[idx]
        count = label_counts[idx]
        # Guard the divisions so unlabeled columns / empty datasets print
        # "nan" without emitting a NumPy divide warning.
        true_fraction = total / count if count else nan
        coverage = count / n_graphs if n_graphs else nan
        print(f"{idx:>3}: {int(total)} of {int(count)}. {true_fraction:>4f} are true. "\
            f"{coverage:.4f} of the data have this label.")

# Tox21

In [3]:
# Load the "Tox21" dataset through PyG's MoleculeNet loader, with root_dir as the dataset root.
data = pyg.datasets.MoleculeNet(root_dir, "Tox21")

In [4]:
# Print the node/edge statistics and the per-label table for the dataset currently bound to `data`.
summarize_data(data)

n classes: 12
MoleculeNet (#graphs=7831):
+------------+----------+----------+
|            |   #nodes |   #edges |
|------------+----------+----------|
| mean       |     18.6 |     38.6 |
| std        |     11.3 |     25   |
| min        |      1   |      0   |
| quantile25 |     11   |     22   |
| median     |     16   |     34   |
| quantile75 |     23   |     50   |
| max        |    132   |    290   |
+------------+----------+----------+
  0: 309 of 7265. 0.042533 are true. 0.9277 of the data have this label.
  1: 237 of 6758. 0.035070 are true. 0.8630 of the data have this label.
  2: 768 of 6549. 0.117270 are true. 0.8363 of the data have this label.
  3: 300 of 5821. 0.051538 are true. 0.7433 of the data have this label.
  4: 793 of 6193. 0.128048 are true. 0.7908 of the data have this label.
  5: 350 of 6955. 0.050324 are true. 0.8881 of the data have this label.
  6: 186 of 6450. 0.028837 are true. 0.8236 of the data have this label.
  7: 942 of 5832. 0.161523 are true. 0.7

# PCBA

Note: this dataset takes quite a bit of RAM.

In [6]:
# Load the "PCBA" dataset through the same MoleculeNet loader; this rebinds `data`,
# so later cells summarize PCBA rather than the previously loaded dataset.
data = pyg.datasets.MoleculeNet(root_dir, "PCBA")

In [7]:
# Print the node/edge statistics and the per-label table for the dataset currently bound to `data`.
summarize_data(data)

n classes: 128


100%|██████████| 437929/437929 [00:44<00:00, 9736.92it/s] 


MoleculeNet (#graphs=437929):
+------------+----------+----------+
|            |   #nodes |   #edges |
|------------+----------+----------|
| mean       |       26 |     56.2 |
| std        |        7 |     15.4 |
| min        |        1 |      0   |
| quantile25 |       22 |     46   |
| median     |       26 |     56   |
| quantile75 |       30 |     64   |
| max        |      332 |    672   |
+------------+----------+----------+
  0: 15957 of 161024. 0.099097 are true. 0.3677 of the data have this label.
  1: 561 of 197266. 0.002844 are true. 0.4505 of the data have this label.
  2: 177 of 148863. 0.001189 are true. 0.3399 of the data have this label.
  3: 529 of 124752. 0.004240 are true. 0.2849 of the data have this label.
  4: 720 of 201906. 0.003566 are true. 0.4610 of the data have this label.
  5: 5813 of 195258. 0.029771 are true. 0.4459 of the data have this label.
  6: 5652 of 223674. 0.025269 are true. 0.5108 of the data have this label.
  7: 2299 of 207836. 0.011062 are 

# QM9

In [4]:
# Load QM9 with "<root_dir>/qm9" as its dataset root.
# NOTE(review): the recorded output of this cell shows "Extracting dataqm9/raw/qm9.zip",
# i.e. a path without the "/" separator — the output looks stale relative to this code;
# re-run the cell to refresh it.
data = pyg.datasets.QM9(root_dir+"/qm9")


Downloading https://deepchemdata.s3-us-west-1.amazonaws.com/datasets/molnet_publish/qm9.zip
Extracting dataqm9/raw/qm9.zip
Downloading https://ndownloader.figshare.com/files/3195404
Processing...
100%|██████████| 133885/133885 [01:16<00:00, 1759.99it/s]
Done!


In [5]:
# Print the node/edge statistics and the per-label table for QM9.
# NOTE(review): the recorded values (e.g. negative sums, ratios above 1) indicate the
# targets here are not 0/1 labels, so the "are true" column is really a per-target mean
# and the first number a per-target sum — read the table accordingly.
summarize_data(data)

n classes: 19


100%|██████████| 130831/130831 [00:18<00:00, 6970.26it/s]


QM9 (#graphs=130831):
+------------+----------+----------+
|            |   #nodes |   #edges |
|------------+----------+----------|
| mean       |     18   |     37.3 |
| std        |      2.9 |      6.3 |
| min        |      3   |      4   |
| quantile25 |     16   |     34   |
| median     |     18   |     38   |
| quantile75 |     20   |     42   |
| max        |     29   |     56   |
+------------+----------+----------+
  0: 349705 of 130831. 2.672953 are true. 1.0000 of the data have this label.
  1: 9849112 of 130831. 75.281185 are true. 1.0000 of the data have this label.
  2: -855170 of 130831. -6.536453 are true. 1.0000 of the data have this label.
  3: 42133 of 130831. 0.322044 are true. 1.0000 of the data have this label.
  4: 897303 of 130831. 6.858492 are true. 1.0000 of the data have this label.
  5: 155611783 of 130831. 1189.410643 are true. 1.0000 of the data have this label.
  6: 530773 of 130831. 4.056937 are true. 1.0000 of the data have this label.
  7: -1462555362

# ZINC

In [4]:
# Load ZINC with "<root_dir>/ZINC" as its dataset root.
# NOTE(review): the summary recorded further down reports 220011 graphs, the same count
# as the "train" split in the processing log, so this call presumably returns only the
# train split — confirm against the loader's default arguments.
data = pyg.datasets.ZINC(root_dir+"/ZINC")

Downloading https://www.dropbox.com/s/feo9qle74kg48gy/molecules.zip?dl=1
Extracting data/ZINC/molecules.zip
Downloading https://raw.githubusercontent.com/graphdeeplearning/benchmarking-gnns/master/data/molecules/train.index
Downloading https://raw.githubusercontent.com/graphdeeplearning/benchmarking-gnns/master/data/molecules/val.index
Downloading https://raw.githubusercontent.com/graphdeeplearning/benchmarking-gnns/master/data/molecules/test.index
Processing...
Processing train dataset: 100%|██████████| 220011/220011 [00:15<00:00, 13755.51it/s]
Processing val dataset: 100%|██████████| 24445/24445 [00:02<00:00, 10434.23it/s]
Processing test dataset: 100%|██████████| 5000/5000 [00:00<00:00, 12238.82it/s]
Done!


In [5]:
# Only print the size summary: `num_classes` is reported as 218362 for this dataset
# (see the recorded output), so the per-label table is skipped here.
# NOTE(review): that count presumably comes from a continuous target being treated as
# class values — confirm before relying on `num_classes` for ZINC.
summarize_data(data, print_classes=False)

n classes: 218362


100%|██████████| 220011/220011 [00:21<00:00, 10339.91it/s]


ZINC (#graphs=220011):
+------------+----------+----------+
|            |   #nodes |   #edges |
|------------+----------+----------|
| mean       |     23.2 |     49.8 |
| std        |      4.5 |     10.6 |
| min        |      6   |     10   |
| quantile25 |     20   |     42   |
| median     |     23   |     50   |
| quantile75 |     26   |     56   |
| max        |     38   |     88   |
+------------+----------+----------+


KeyboardInterrupt: 

# pcqm4m-v2

```
wget http://ogb-data.stanford.edu/data/lsc/pcqm4m-v2-train.sdf.tar.gz
md5sum pcqm4m-v2-train.sdf.tar.gz # fd72bce606e7ddf36c2a832badeec6ab
tar -xf pcqm4m-v2-train.sdf.tar.gz # extracted pcqm4m-v2-train.sdf
```

In [1]:
# NOTE(review): this import sits mid-notebook; moving it to the first import cell would
# keep all dependencies visible in one place.
from rdkit import Chem

# Open the training SDF with RDKit's SDMolSupplier (presumably the file extracted by the
# wget/tar commands in the preceding markdown cell).
# NOTE(review): the path is hard-coded and does not use `root_dir` from the first cell.
suppl = Chem.SDMolSupplier('data/pcqm4mv2__/raw/pcqm4m-v2-train.sdf')

In [18]:
# Take the first record from the supplier and display the positions stored in its
# conformer; the recorded output is an 18 x 3 array (one row per atom, presumably x/y/z).
mol = suppl[0]
mol.GetConformer().GetPositions()

array([[ 4.9919, -5.2514,  4.0126],
       [ 6.1051, -3.0257,  3.52  ],
       [ 4.5521, -3.9001,  1.914 ],
       [ 6.3372, -1.9217,  2.7029],
       [ 4.7751, -2.7953,  1.0929],
       [ 2.8586,  1.2252, -1.7853],
       [ 2.8118,  0.8707, -3.0956],
       [ 5.789 , -0.835 , -0.8455],
       [ 4.6658, -0.476 , -3.0127],
       [ 5.215 , -4.0391,  3.1392],
       [ 5.677 , -1.7955,  1.4745],
       [ 4.8499, -0.2104, -1.5946],
       [ 5.9121, -0.5519,  0.613 ],
       [ 3.9134,  0.7241, -0.934 ],
       [ 5.0405,  0.6404,  1.1008],
       [ 3.716 ,  0.0207, -3.7371],
       [ 3.9796,  1.1019,  0.3172],
       [ 5.2985,  1.1457,  2.1772]])