# generate fake dataset

In [1]:
import os
# import shutil
import numpy
# import pandas
# import random
# import ruamel.yaml as yaml
import pickle
# import torch
# from torch_geometric.data import Data
from tqdm import tqdm
# import matplotlib.pyplot as plt

from typing import Literal, Union


### file structure:

```bash
agdb_null_test
|---raw  # basic file dir
|   |---[name].pkl
|---extra  # basic file dir
|   |---[name].attr1.pkl
|   |---[name].attr2.pkl
|---dataset_attributes.pkl  # basic file
|---raw.tar  # tarfile of dir raw
|---attr1.tar  # tarfile of extra/*.attr1.pkl
|---attr2.tar  # tarfile of extra/*.attr2.pkl
|---gen_data.ipynb  # for generating data
```

## null_test dataset


In [2]:
# Every path used below is resolved relative to the dataset directory itself.
root_dir = './'
# Refuse to continue unless the notebook is running from inside `agdb_null_test`.
cwd_name = os.path.basename(os.getcwd())
assert cwd_name == 'agdb_null_test'

In [3]:
# Dataset-level metadata, stored next to the data as `dataset_attributes.pkl`.
dataset_attributes = dict(
    xc="m062x",
    basis="def2tzvp",
    unit=dict(energy="eV", length="Bohr"),
    doc="""This is a test dataset for building agdb modules""",
)

# Serialize the metadata into the dataset root.
attributes_path = os.path.join(root_dir, 'dataset_attributes.pkl')
with open(attributes_path, 'wb') as attributes_file:
    pickle.dump(dataset_attributes, attributes_file)

In [4]:
rand_data_cnt = int(2)  # number of data to generate
rand_seed = 0  # fixed seed so that re-running the notebook regenerates identical files

# Seed the global NumPy RNG used by every numpy.random.* call below.
numpy.random.seed(rand_seed)

# The output directories are not guaranteed to exist (e.g. on a fresh checkout);
# without them the open(..., 'wb') calls below raise FileNotFoundError.
raw_dir = os.path.join(root_dir, "raw")
extra_dir = os.path.join(root_dir, "extra")
os.makedirs(raw_dir, exist_ok=True)
os.makedirs(extra_dir, exist_ok=True)

for i in range(rand_data_cnt):
    natm = numpy.random.randint(4,7)  # 4, 5 or 6 (upper bound is exclusive)
    name = f'{i+1:03d}'  # 1-based, zero-padded sample id: "001", "002", ...
    fname_raw = f"{name}.pkl"
    # Build the attribute file names directly from the sample id instead of
    # string-replacing ".pkl" inside the raw file name.
    fname_attr1 = f"{name}.attr1.pkl"
    fname_attr2 = f"{name}.attr2.pkl"
    # Main record: one entry per "atom" along the first axis of `x` and `pos`.
    data_raw = {
        "x": numpy.random.randint(1,10,(natm,)),  # integers in [1, 10); presumably atom types -- TODO confirm
        "pos": numpy.random.randn(natm,3),  # (natm, 3) standard-normal values
        "name": name,
        "etol": numpy.random.randn(1)[0],  # single scalar drawn from a standard normal
    }
    # Extra attributes: fixed shapes (5,) and (5, 3), independent of natm.
    data_attr1 = {
        "attr1_1": numpy.random.randn(5,),
        "attr1_2": numpy.random.randn(5,3),
    }
    data_attr2 = {
        "attr2_1": numpy.random.randn(5,),
        "attr2_2": numpy.random.randn(5,3),
    }
    # One pickle per sample in raw/, plus one pickle per attribute group in extra/.
    with open(os.path.join(raw_dir, fname_raw), 'wb') as f:
        pickle.dump(data_raw, f)
    with open(os.path.join(extra_dir, fname_attr1), 'wb') as f:
        pickle.dump(data_attr1, f)
    with open(os.path.join(extra_dir, fname_attr2), 'wb') as f:
        pickle.dump(data_attr2, f)
    print(f"{data_raw=},\n{data_attr1=},\n{data_attr2=}")

data_raw={'x': array([1, 3, 5, 6]), 'pos': array([[-1.63331094,  0.75480048,  0.9601171 ],
       [-0.47974295,  1.01928488, -1.20201389],
       [ 0.50422725,  2.32178196, -0.99859271],
       [-0.15990044, -1.33673388, -1.677831  ]]), 'name': '001', 'etol': -0.08012218238361365},
data_attr1={'attr1_1': array([-1.66051672, -0.45761608,  0.5147151 , -1.85360247, -0.0466721 ]), 'attr1_2': array([[-0.11453922, -0.09535394,  1.54399523],
       [-0.63859771,  1.11285543, -0.56919963],
       [-0.08672203,  1.29176386,  1.15865253],
       [-0.44347449, -0.98350267,  1.64920487],
       [-0.36067238, -0.17904819,  0.65143063]])},
data_attr2={'attr2_1': array([0.98006123, 0.54315023, 3.39872933, 2.20801463, 0.76020792]), 'attr2_2': array([[-0.48417219,  0.37551309, -1.66056492],
       [-0.10938012, -0.63680362, -0.25223974],
       [ 0.80915984, -0.8739465 ,  1.09154243],
       [-0.21891837, -0.02398193, -0.53424223],
       [-0.52542046,  0.3889833 ,  1.09186296]])}
data_raw={'x': array(

In [1]:
# Print the current working directory; the tar commands in the next cell use paths relative to it.
!pwd

/nvme/louzekun/atom-graph-database/datatools/ceph_datasets/agdb_null_test


In [2]:
# tar files
# Pack the whole `raw` directory into raw.tar.
!tar -cvf raw.tar raw
# Pack each attribute group into its own archive; the shell glob selects the
# files in `extra` by their `.attrN.pkl` suffix.
!tar -cvf attr1.tar extra/*.attr1.pkl
!tar -cvf attr2.tar extra/*.attr2.pkl
# List the directory (including hidden files) to check that the archives were written.
!ls -alh

raw/
raw/001.pkl
raw/002.pkl
extra/001.attr1.pkl
extra/002.attr1.pkl
extra/001.attr2.pkl
extra/002.attr2.pkl
total 68K
drwxrwxr-x. 5 louzekun louzekun 4.0K Jan 11 20:08 .
drwxrwxr-x. 4 louzekun louzekun  130 Jan 11 19:02 ..
-rw-rw-r--. 1 louzekun louzekun  314 Jan 11 19:45 anonymous_example.json
-rw-rw-r--. 1 louzekun louzekun  316 Jan 11 19:43 anonymous.json
-rw-rw-r--. 1 louzekun louzekun  10K Jan 11 20:08 attr1.tar
-rw-rw-r--. 1 louzekun louzekun  10K Jan 11 20:08 attr2.tar
-rw-rw-r--. 1 louzekun louzekun  146 Jan 11 19:38 dataset_attributes.pkl
drwxrwxr-x. 2 louzekun louzekun  110 Jan 11 18:43 extra
drwxrwxr-x. 2 louzekun louzekun    6 Jan 11 20:08 .ipynb_checkpoints
-rw-rw-r--. 1 louzekun louzekun 9.6K Jan 11 19:46 proc_data.ipynb
drwxrwxr-x. 2 louzekun louzekun   48 Jan 11 18:43 raw
-rw-rw-r--. 1 louzekun louzekun  10K Jan 11 20:08 raw.tar
-rw-rw-r--. 1 louzekun louzekun  713 Jan 11 20:08 README.md
-rw-r--r--. 1 louzekun louzekun    0 Jan 11 19:46 syncRecord.txt


In [4]:
# upload to ceph
# NOTICE: the first line is dryrun!
!sensesync --dryrun cp ./ s3://SJBRX22W1ND46X2GCRR9:DyWgIBe68vTOuBeLlpnwbvk1I6x6FnHsXkQXJkza@agdb_null_test.10.140.2.204:80/
# !sensesync cp ./ s3://SJBRX22W1ND46X2GCRR9:DyWgIBe68vTOuBeLlpnwbvk1I6x6FnHsXkQXJkza@agdb_null_test.10.140.2.204:80/

2023/01/11 20:08:20.544552 sensesync[127134] <INFO>: Found: 15, copied: 15, deleted: 0, failed: 0, Size: 43.193 KiB


In [11]:
# Apply the policy stored in ./anonymous.json to the agdb_null_test bucket.
# You may not have permission to change the bucket policy; in that case, contact the admins of your ceph group.
# The error then looks like this: An error occurred (AccessDenied) when calling the PutBucketPolicy operation: Unknown
!aws --endpoint-url=http://10.140.2.204:80 s3api put-bucket-policy --bucket agdb_null_test --policy file://./anonymous.json


An error occurred (AccessDenied) when calling the PutBucketPolicy operation: Unknown
