## Imports

In [2]:
# import lmdb
# import torch
from DataClasses import lmdb_dataset
import time
import pandas as pd
import matplotlib.pyplot as plt
# import seaborn as sns
import pickle
from tqdm import tqdm
import os

## Open datasets

In [21]:
# All splits live below root_dir, each in its own sub-directory, and every
# sub-directory holds an LMDB file with the same name.
root_dir = "../../ocp_datasets/data/is2re/"
file = 'data_mod.lmdbz'

# split name -> sub-directory relative to root_dir
datasets = dict(
    # training subsets of increasing size
    train_10k='10k/train/',
    train_100k='100k/train/',
    train_all='all/train/',
    # validation splits
    val_id='all/val_id/',
    val_ood_ads='all/val_ood_ads/',
    val_ood_cat='all/val_ood_cat/',
    val_ood_both='all/val_ood_both/',
    # test splits
    test_id='all/test_id/',
    test_ood_ads='all/test_ood_ads/',
    test_ood_cat='all/test_ood_cat/',
    test_ood_both='all/test_ood_both/',
)

# Rebind `datasets` to: split name -> full path of that split's LMDB file.
datasets = {split: f"{root_dir}{subdir}{file}" for split, subdir in datasets.items()}

In [None]:
# Open the 'train_10k' entry of `datasets` with the project's lmdb_dataset wrapper.
train_10k = lmdb_dataset(datasets['train_10k'])

In [23]:
# Open the 'train_all' entry of `datasets` with the project's lmdb_dataset wrapper.
train_all = lmdb_dataset(datasets['train_all'])

In [3]:
# Open the test-challenge LMDB file. Unlike the splits listed in `datasets`,
# this path is a hard-coded absolute location on the machine the notebook ran on.
test_challenge = lmdb_dataset('/share/catalyst/ocp_datasets/is2re_test_challenge_2021/data_mod1_0_50039.lmdbz')

In [4]:
# Print the total entry count plus the type and shape of every key of item 3228.
test_challenge.describe(3228)

total entries: 50040
info for item: 3228
edge_index:...............<class 'torch.Tensor'>..... [2, 2302]
pos:......................<class 'torch.Tensor'>.....   [67, 3]
cell:.....................<class 'torch.Tensor'>..... [1, 3, 3]
atomic_numbers:...........<class 'torch.Tensor'>.....      [67]
natoms:...................       <class 'int'>.....        67
cell_offsets:.............<class 'torch.Tensor'>..... [2302, 3]
distances:................<class 'torch.Tensor'>.....    [2302]
fixed:....................<class 'torch.Tensor'>.....      [67]
sid:......................       <class 'int'>.....      3241
tags:.....................<class 'torch.Tensor'>.....      [67]
voronoi_volumes:..........<class 'torch.Tensor'>.....      [67]
voronoi_surface_areas:....<class 'torch.Tensor'>.....      [67]
spherical_domain_radii:...<class 'torch.Tensor'>.....      [67]
cell_offsets_new:.........<class 'torch.Tensor'>.....  [912, 3]
distances_new:............<class 'torch.Tensor'>.....     [912]
con

In [6]:
# Check which Python type a single dataset item has.
type(test_challenge[3219])

torch_geometric.data.data.Data

#### Calculate neighbours for each edge

In [110]:
def t2str(tensor, index):
    """Index the transposed ``tensor`` and return the selection as NumPy.

    Despite the name, nothing is converted to a string: the function
    evaluates ``tensor.T[index]`` and converts the result with ``.numpy()``.

    Parameters
    ----------
    tensor : torch.Tensor
        Tensor that is transposed before indexing.
    index
        Anything accepted by tensor indexing (an int, a list of ints, ...).

    Returns
    -------
    numpy.ndarray
        The selected entries of the transposed tensor.
    """
    transposed = tensor.T
    selected = transposed[index]
    return selected.numpy()

# Take the first item of the 10k training set and pull out its edge index and
# the first entry of its 'edge_angles' field (a table indexed by edge id).
system = train_10k[0]
edge_index = system['edge_index_new']
edge_angles = system['edge_angles'][0]

# For every edge id in the table's index, select the matching column of
# edge_index and store it as a tuple in a new 'nb' column
# (presumably the two atom indices that the edge connects — confirm).
edge_pairs = t2str(edge_index, edge_angles.index)
edge_angles['nb'] = [tuple(pair) for pair in edge_pairs]
edge_angles

Unnamed: 0_level_0,edge_theta,edge_to_z,edge_phi,nb
edge_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,1.627043,0.095988,-3.064711,"(0, 39)"
4,1.229412,1.653501,1.529088,"(0, 81)"
6,1.132122,1.285322,1.928316,"(0, 45)"
8,1.619575,3.08094,0.0,"(0, 74)"
10,2.704089,1.73027,-1.213277,"(0, 56)"
12,2.151265,1.647651,1.563712,"(0, 28)"
14,1.71236,1.886601,-1.197399,"(0, 37)"
16,2.358017,0.846375,2.991061,"(0, 35)"
18,2.867928,1.389703,-2.443446,"(0, 69)"
20,2.727654,1.481655,1.944193,"(0, 43)"


## Analyzing dataset

#### Describe-dataset function (now available as the `.describe()` method of `lmdb_dataset`)

#### Create a DataFrame of `.natoms` counts

In [None]:
# Build a DataFrame that counts how many structures have each `.natoms` value
def natom_hist(dataset):
    """Count how many items of ``dataset`` have each ``natoms`` value.

    The dataset is iterated once; the elapsed wall-clock time is printed
    when the pass is finished.

    Parameters
    ----------
    dataset : iterable
        Items must expose a hashable ``natoms`` attribute.

    Returns
    -------
    pandas.DataFrame
        One row per distinct ``natoms`` value (used as the index, sorted
        ascending) with the number of items in the single column ``'N'``.
    """
    from collections import Counter
    import pandas as pd

    started = time.time()
    tally = Counter(item.natoms for item in dataset)
    print(f'done for: {time.time() - started:.2f} s')

    # from_dict(orient='index') expects one list of column values per row.
    rows = {natoms: [count] for natoms, count in tally.items()}
    table = pd.DataFrame.from_dict(rows, orient='index', columns=['N'])
    return table.sort_index()


# TODO: write a multiprocessing version of natom_hist (the draft below is still the serial code)
# from multiprocessing import Pool
# def natom_hist(dataset):
#     from collections import defaultdict
#     import pandas as pd
#     sec = time.time()
#     dic = defaultdict(int)
#     for struct in dataset:
#         dic[struct.natoms] += 1
#     print(time.time() - sec)

#     dic_mod = {k:[v] for k,v in dict(dic).items()}
#     return pd.DataFrame.from_dict(dic_mod, orient='index', columns=['N']).sort_index()

In [54]:
# NOTE: the earlier "long time to run (~20 min)" remark does not match this call —
# the recorded output below shows a few seconds for train_10k. Presumably the
# estimate referred to a larger split (e.g. the full training set) — confirm.
train_10k_df = natom_hist(train_10k)

done for: 3.75 s


#### Sparse distribution

In [55]:
def df_distr(dic, max_natoms=300):
    """Expand a count table onto the dense index ``1..max_natoms``.

    Only the first column of ``dic`` is used. Every label in
    ``1..max_natoms`` that is missing from that column is filled with 0, so
    results built from different inputs share the same labels.

    Parameters
    ----------
    dic : pandas.DataFrame
        Count table; despite the parameter name, an object with a
        DataFrame-style ``to_dict()`` (column -> {label: value}) is expected.
    max_natoms : int, default 300
        Largest label that is always present in the result. The default
        reproduces the previously hard-coded range ``1..300``.

    Returns
    -------
    pandas.DataFrame
        Single-column frame that keeps the label of the first column of
        ``dic``. Labels of ``dic`` outside ``1..max_natoms`` are not dropped;
        they are kept in addition to the dense range.

    Raises
    ------
    IndexError
        If ``dic`` has no columns.
    """
    columns = dic.to_dict()
    key = list(columns)[0]

    # Start from an all-zero column, then overlay the observed counts.
    dense = dict.fromkeys(range(1, max_natoms + 1), 0)
    dense.update(columns[key])
    return pd.DataFrame.from_dict({key: dense})

In [56]:
# Histogram frames to compare; only the 10k training frame is included here.
frames = (train_10k_df,)
# Placeholder list with one slot per frame; the slots are filled in the next cell.
frames_distr = [0]*len(frames)

In [57]:
# Replace every placeholder in frames_distr with the dense distribution of the
# frame at the same position.
frames_distr[:] = [df_distr(frame) for frame in frames]

In [None]:
# Put all dense distributions side by side, one column per entry of frames_distr.
df_all = pd.concat(frames_distr, axis=1)
# NOTE(review): four labels are assigned here, but `frames` as defined earlier in
# this notebook holds a single frame — with that state the assignment should fail
# with a length mismatch. The labels look left over from an earlier run that
# compared four datasets; confirm and align them with `frames`.
df_all.columns = ['df_train_all', 'dataset_test_id_df', 'dataset_train_10k_df', 'dataset_val_ood_both_df']
# Replace missing values with 0, then scale every column by its own maximum so
# that each curve peaks at 1 before plotting.
df_all_norm = df_all.fillna(0)
df_all_norm = df_all_norm.apply(lambda x: x / x.max())
df_all_norm.plot()

In [None]:
# Collect the `.keys` of the first item of three datasets as sets.
# NOTE(review): dataset_train_10k, dataset_val_ood_both and dataset_test_id are not
# defined in the visible part of this notebook (the datasets opened above are named
# train_10k, train_all and test_challenge) — presumably leftovers from an earlier
# session; confirm before running on a fresh kernel.
dataset_train_keys_10k_keys = set(dataset_train_10k[0].keys)
dataset_val_ood_both_keys = set(dataset_val_ood_both[0].keys)
dataset_test_keys_id_keys = set(dataset_test_id[0].keys)

In [None]:
# Show the three key sets, one per line.
print(dataset_train_keys_10k_keys, dataset_val_ood_both_keys, dataset_test_keys_id_keys, sep='\n')
# Keys present in the train item but absent from the test item (set difference).
print(dataset_train_keys_10k_keys - dataset_test_keys_id_keys)

In [None]:
# Print the first item itself, then every attribute named in
# dataset_train_keys_10k_keys together with its value.
# (A variant that printed only `.shape` for non-scalar values was tried earlier.)
first_item = dataset_train_10k[0]
print(first_item)

for key in dataset_train_keys_10k_keys:
    print(key, getattr(first_item, key))

In [None]:
# Look up the 'y_relaxed' entry of the first item
# (presumably the relaxed-state target value — confirm).
dataset_train_10k[0]['y_relaxed']

In [None]:
from collections import Counter

# Tally how often each atomic number occurs in the first item.
first_numbers = dataset_train_10k[0].atomic_numbers
first_numbers[0].item()  # not displayed: only the cell's last expression is shown
Counter(first_numbers.tolist())
# compare = [("\n".join((getattr(dataset_train_10k[0], i), getattr(dataset_test[0], i)))) for i in dataset_test[0].keys]

In [None]:
# Shape of the edge_index tensor of the first item.
dataset_train_10k[0].edge_index.shape

In [None]:
# Value of `natoms` stored on the first item.
dataset_train_10k[0].natoms

In [None]:
# Scratch arithmetic: half of 86 squared
# (presumably a rough pair count for an 86-atom structure — confirm).
86**2/2

In [None]:
# Print the first item once more (same call as in an earlier cell).
print(dataset_train_10k[0])

In [None]:
import pandas as pd

# Location of the OC20 data-mapping pickle. The default is the original author's
# local path; set the OC20_MAPPING_PATH environment variable to point at your own
# copy instead of editing this cell. The file can be downloaded from
# https://dl.fbaipublicfiles.com/opencatalystproject/data/oc20_data_mapping.pkl
MAPPING_PATH = os.environ.get(
    "OC20_MAPPING_PATH",
    "/Users/korovin/Documents/GitHub/ocp_datasets/oc20_data_mapping.pkl",
)

# NOTE: read_pickle unpickles the file, which can execute arbitrary code —
# only load a mapping file obtained from a trusted source.
dic = pd.read_pickle(MAPPING_PATH)

# One row per top-level key of the mapping (presumably one column per field of
# the per-key records — confirm against the file).
df = pd.DataFrame.from_dict(dic, orient='index')

In [None]:
# Preview the first 10 rows of the mapping table.
df.head(10)

In [None]:
# Look up the mapping row stored under the index label 'random2472718'.
df.loc['random2472718']

## Export a dataset structure to a VASP POSCAR file

In [31]:
import numpy as np
from mendeleev import element

def print_var_name(variable):
    """Print every name in the caller's scope that is bound to ``variable``.

    The caller's local and global namespaces are searched. Looking at this
    function's own ``locals()`` instead could only ever find the parameter
    name ``variable`` itself.

    Names are matched by identity (``is``), not equality, so that objects
    with element-wise ``==`` (arrays, tensors) work and objects that are
    merely equal are not reported. In an interactive session this can also
    list aliases the shell keeps for the same object.

    Parameters
    ----------
    variable : object
        The object whose name(s) should be printed.

    Returns
    -------
    None
        The names are printed, one per line.
    """
    import inspect

    frame = inspect.currentframe()
    caller = frame.f_back if frame is not None else None
    if caller is None:
        # No frame introspection available on this interpreter.
        return
    try:
        # Globals first, then locals on top, so each name is reported once
        # with the value the caller actually sees.
        namespace = dict(caller.f_globals)
        namespace.update(caller.f_locals)
        for name, value in namespace.items():
            if value is variable:
                print(name)
    finally:
        # Drop the frame references to avoid reference cycles.
        del frame, caller

def getAtomSequence(sequence):
    """Count how often each element occurs in ``sequence``.

    Keys appear in order of first occurrence. For input in which equal
    elements are contiguous this equals a run-length encoding. When an
    element occurs in several separate runs, its counts are added up rather
    than the last run overwriting the earlier ones, so the values always sum
    to ``len(sequence)``. A dict cannot hold one entry per separate run:
    callers that need the exact run structure must compute it themselves.

    Parameters
    ----------
    sequence : iterable
        Hashable values, e.g. ints or NumPy integer scalars.

    Returns
    -------
    dict
        element -> number of occurrences; ``{}`` for an empty sequence.
    """
    counts = {}
    for atom in sequence:
        counts[atom] = counts.get(atom, 0) + 1
    return counts

def structureToVASP(structure, file='POSCAR', str_name='structure', relaxed=False):
    """Write ``structure`` to a POSCAR-style text file with Cartesian positions.

    The output path is ``f'{str_name}_POSCAR'`` with ``'_relaxed'`` appended
    when ``relaxed`` is true. The file contains: a title line (``str_name``),
    the scale factor 1.0, the three rows of ``structure.cell[0]``, one
    species line, one count line, ``Cartesian`` and one line per position.

    Species and counts are written as one entry per *contiguous run* of equal
    atomic numbers, in the same order as the positions. Collapsing the runs
    into one entry per element would make the counts disagree with the
    position lines whenever the same element occurs in separate runs.

    Parameters
    ----------
    structure
        Object exposing ``cell``, ``atomic_numbers``, ``pos`` and, when
        ``relaxed`` is true, ``pos_relaxed``; each must be convertible with
        ``np.array``.
    file : str
        Unused; kept so existing calls keep working. The output name is
        derived from ``str_name`` only.
    str_name : str
        Title line of the file and prefix of the output path (may include a
        directory, which must already exist).
    relaxed : bool
        Write ``structure.pos_relaxed`` instead of ``structure.pos``.

    Returns
    -------
    None
    """
    from itertools import groupby

    # Read all inputs before opening the file, so a missing attribute does
    # not leave a half-written file behind.
    numbers = np.array(structure.atomic_numbers, dtype=int).tolist()
    positions = np.array(structure.pos_relaxed if relaxed else structure.pos)
    cell_rows = np.array(structure.cell[0])

    # (atomic number, run length) for every contiguous run, in atom order.
    runs = [(number, sum(1 for _ in group)) for number, group in groupby(numbers)]
    # Resolve every distinct atomic number to its symbol only once.
    symbols = {number: element(number).symbol for number in dict.fromkeys(numbers)}

    suffix = "_relaxed" if relaxed else ""
    with open(f'{str_name}_POSCAR{suffix}', 'w') as f:
        f.write(str_name + '\n')
        f.write(str(1.0) + '\n')
        for axis in cell_rows:
            f.write('   '.join(str(component) for component in axis[:3]) + '\n')
        f.write(''.join('   ' + symbols[number] for number, _ in runs) + '\n')
        f.write(''.join('   ' + str(count) for _, count in runs) + '\n')
        f.write('Cartesian\n')
        for position in positions:
            f.write('   '.join(str(coordinate) for coordinate in position[:3]) + '\n')
    return None

i = 3219
PATH = 'POSCAR/'

# structureToVASP opens its output with open(..., 'w'), which does not create
# missing directories, so make sure the target folder exists first.
os.makedirs(PATH, exist_ok=True)

# Export the same structure twice: relaxed and initial positions.
structure = train_all[i]
for relaxed in [True, False]:
    structureToVASP(structure, relaxed=relaxed, str_name=f'{PATH}train_all[{i}]')

## PyG training

In [None]:
from torch_geometric.datasets import Planetoid

# Load the 'Cora' Planetoid dataset, using /tmp/Cora as its root directory.
dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [None]:
import time
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Reference timestamp for the elapsed-time prints in the training loop below.
sec = time.time()

class Net(torch.nn.Module):
    """Two stacked GCNConv layers producing per-node log-probabilities.

    Input and output widths are read from the module-level ``dataset``
    (``num_node_features`` and ``num_classes``); the hidden width is 16.
    """

    def __init__(self):
        super().__init__()
        hidden_channels = 16
        self.conv1 = GCNConv(dataset.num_node_features, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, dataset.num_classes)

    def forward(self, data):
        """Return log-softmax scores (over dim 1) for ``data``.

        ``data`` must provide ``x`` and ``edge_index``. Dropout is applied
        between the two layers and is only active in training mode.
        """
        hidden = self.conv1(data.x, data.edge_index)
        hidden = F.dropout(F.relu(hidden), training=self.training)
        logits = self.conv2(hidden, data.edge_index)
        return F.log_softmax(logits, dim=1)

model = Net()
data = dataset[0]  # the first (index 0) item of the dataset is used for training and evaluation
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Number of optimisation steps and how often to report the elapsed time
# (ten reports over the run; never less than every step).
n_epochs = 200
log_every = max(1, n_epochs // 10)

model.train()
for epoch in range(n_epochs):
    optimizer.zero_grad()
    out = model(data)
    # The loss is computed on the nodes selected by train_mask only.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % log_every == 0:
        print(epoch,':', time.time() - sec)

print('Total:', time.time() - sec)

# Evaluation: switch off dropout and skip gradient tracking, which is not
# needed for computing predictions.
model.eval()
with torch.no_grad():
    _, pred = model(data).max(dim=1)
correct = int(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / int(data.test_mask.sum())
print('Accuracy: {:.4f}'.format(acc))