## Imports

In [None]:
import os
import time

# import lmdb
import seaborn as sns
import pickle
import torch

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from tqdm import tqdm
from dataloader import lmdb_dataset

## Open datasets

In [None]:
# Common prefix of every IS2RE split directory and the LMDB file name that
# each split directory is expected to contain.
root_dir = "../../ocp_datasets/data/is2re/"
file_name = 'data_mod2.lmdb'

# Split name -> full path of its LMDB file, built as
# root_dir + <split sub-directory> + file_name.
datasets = {
    split: "".join((root_dir, sub_dir, file_name))
    for split, sub_dir in (
        # training subsets
        ('train_10k',     '10k/train/'),
        ('train_100k',    '100k/train/'),
        ('train_all',     'all/train/'),
        # validation splits
        ('val_id',        'all/val_id/'),
        ('val_ood_ads',   'all/val_ood_ads/'),
        ('val_ood_cat',   'all/val_ood_cat/'),
        ('val_ood_both',  'all/val_ood_both/'),
        # test splits
        ('test_id',       'all/test_id/'),
        ('test_ood_ads',  'all/test_ood_ads/'),
        ('test_ood_cat',  'all/test_ood_cat/'),
        ('test_ood_both', 'all/test_ood_both/'),
    )
}

In [None]:
# Build the lmdb_dataset object for the 10k training split; the path is taken
# from the `datasets` table defined above.
train_10k = lmdb_dataset(datasets['train_10k'])

## Analyzing dataset

#### Describe dataset function (already used as method .describe() in lmdb_dataset)

#### Create a DataFrame of `.natoms` counts

In [None]:
# create DataFrame of .natom keys
def natom_hist(dataset):
    """Count how many items of ``dataset`` share each ``natoms`` value.

    Every item of ``dataset`` is visited once and its ``natoms`` attribute is
    tallied. The wall-clock duration of the scan is printed.

    Returns:
        A one-column DataFrame (column ``'N'``) whose index holds the distinct
        ``natoms`` values, sorted ascending, and whose values are the counts.
    """
    from collections import Counter
    import pandas as pd

    started = time.time()
    tally = Counter(item.natoms for item in dataset)
    print(f'done for: {time.time() - started:.2f} s')

    # One single-element row per natoms value -> a single 'N' column.
    rows = {n_atoms: [count] for n_atoms, count in tally.items()}
    histogram = pd.DataFrame.from_dict(rows, orient='index', columns=['N'])
    return histogram.sort_index()


# TODO multiprocessing edition of function
# from multiprocessing import Pool
# def natom_hist(dataset):
#     from collections import defaultdict
#     import pandas as pd
#     sec = time.time()
#     dic = defaultdict(int)
#     for struct in dataset:
#         dic[struct.natoms] += 1
#     print(time.time() - sec)

#     dic_mod = {k:[v] for k,v in dict(dic).items()}
#     return pd.DataFrame.from_dict(dic_mod, orient='index', columns=['N']).sort_index()

In [None]:
# long time to run (~20 min)
# natom_hist iterates over every item of the dataset, so the cost grows with
# the size of the split.
train_10k_df = natom_hist(train_10k)

#### Sparse distribution

In [None]:
def df_distr(dic, max_natoms=300):
    """Expand a histogram DataFrame to one row per size in ``1..max_natoms``.

    Only the first column of ``dic`` is used. Sizes that do not occur in it
    get a count of 0, so histograms of different datasets share one index and
    can be concatenated side by side.

    Args:
        dic: DataFrame whose index holds sizes and whose first column holds
            the counts (for example the output of ``natom_hist``).
        max_natoms: Largest size that is always present in the result.
            Defaults to 300, the value that used to be hard-coded.

    Returns:
        A one-column DataFrame named like the first column of ``dic``.
        Index values of ``dic`` outside ``1..max_natoms`` are kept and appear
        after the regular rows.

    Raises:
        IndexError: If ``dic`` has no columns.
    """
    columns = dic.to_dict()
    key = list(columns)[0]

    # Start from an all-zero distribution, then overlay the observed counts.
    dense = dict.fromkeys(range(1, max_natoms + 1), 0)
    dense.update(columns[key])
    return pd.DataFrame.from_dict({key: dense})

In [None]:
# Histograms to compare; at the moment only the 10k training split.
frames = (train_10k_df,)
# Placeholder list with one slot per entry of `frames`.
frames_distr = [0]*len(frames)

In [None]:
# Overwrite every placeholder slot, in place, with the dense distribution of
# the frame at the same position.
frames_distr[:] = map(df_distr, frames)

In [None]:
# One label per entry of `frames_distr`, in the same order. `frames` holds a
# single histogram (the 10k training split); assigning four labels to a
# one-column frame raised "Length mismatch" in DataFrame.columns.
column_names = ['dataset_train_10k_df']
if len(column_names) != len(frames_distr):
    raise ValueError(
        f'expected {len(frames_distr)} column name(s), got {len(column_names)}'
    )

df_all = pd.concat(frames_distr, axis=1)
df_all.columns = column_names
# A missing count means "no structure of that size".
df_all_norm = df_all.fillna(0)
# Scale every column by its own maximum so the curves share a 0..1 range.
df_all_norm = df_all_norm.apply(lambda x: x / x.max())
df_all_norm.plot()

In [None]:
# The `dataset_*` names used from here on are not created by any earlier cell,
# so a fresh "Restart & Run All" stopped here with NameError. Bind them
# explicitly.
# NOTE(review): assumes they refer to the splits of the same names in the
# `datasets` path table - confirm against the original session.
dataset_train_10k = train_10k
dataset_val_ood_both = lmdb_dataset(datasets['val_ood_both'])
dataset_test_id = lmdb_dataset(datasets['test_id'])

# Attribute names (`.keys`) of the first sample of each split.
dataset_train_keys_10k_keys = set(dataset_train_10k[0].keys)
dataset_val_ood_both_keys = set(dataset_val_ood_both[0].keys)
dataset_test_keys_id_keys = set(dataset_test_id[0].keys)

In [None]:
# Print the three key sets, one per line (train 10k, val ood both, test id).
print(dataset_train_keys_10k_keys, dataset_val_ood_both_keys, dataset_test_keys_id_keys, sep='\n')
# Keys that the training sample has and the test sample does not.
print(dataset_train_keys_10k_keys - dataset_test_keys_id_keys)

In [None]:
# Print the first training sample as a whole ...
print(dataset_train_10k[0])
# for i in dataset_train_keys_10k_keys:
#         print((i, getattr(dataset_train_10k[0], i)))

# ... and then every attribute listed in its key set, one "name value" pair
# per print call. The sample is looked up again on each iteration.
for i in dataset_train_keys_10k_keys:
        temp = getattr(dataset_train_10k[0], i)
        print(i, temp)
#         if type(temp) not in [float, int]:
#             print(temp.shape)
#         else: print(temp)

In [None]:
# Display the 'y_relaxed' entry of the first training sample.
dataset_train_10k[0]['y_relaxed']

In [None]:
from collections import Counter

# Tally how often each atomic number occurs in the first training sample.
# The sample is fetched once (the old comprehension looked it up twice per
# atom) and converted to plain Python numbers in a single call; flatten()
# keeps this working if the values carry a trailing singleton dimension.
first_sample_numbers = dataset_train_10k[0].atomic_numbers
Counter(first_sample_numbers.flatten().tolist())

In [None]:
# Display the shape of the first training sample's edge_index attribute.
dataset_train_10k[0].edge_index.shape

In [None]:
# Display the natoms attribute of the first training sample.
dataset_train_10k[0].natoms

In [None]:
# NOTE(review): 86 is presumably the natoms value of the first training
# sample, making this natoms**2 / 2 - confirm and replace the literal.
86**2/2

In [None]:
# Print the first training sample again for reference.
print(dataset_train_10k[0])

In [None]:
import os

import pandas as pd

# Location of the OC20 mapping pickle. Set the OC20_DATA_MAPPING environment
# variable to point at your own copy instead of editing the notebook; the
# fallback is the path that used to be hard-coded here.
# or use online https://dl.fbaipublicfiles.com/opencatalystproject/data/oc20_data_mapping.pkl
mapping_path = os.environ.get(
    "OC20_DATA_MAPPING",
    "/Users/korovin/Documents/GitHub/ocp_datasets/oc20_data_mapping.pkl",
)

# NOTE: unpickling can execute arbitrary code - only load this file from a
# source you trust.
dic = pd.read_pickle(mapping_path)

# One row per top-level key of the mapping.
df = pd.DataFrame.from_dict(dic, orient='index')

In [None]:
# Preview the first ten rows of the mapping table.
df.head(10)

In [None]:
# Display the mapping row stored under the index label 'random2472718'.
df.loc['random2472718']

### Calculate thetas

In [None]:
# Take the 'edge_angles' entry of the first 10k-training sample as input for
# the binning experiment.
test = train_10k[0]['edge_angles']

In [None]:
# n_steps + 1 equally spaced values i * pi / n_steps for i = 0 .. n_steps.
n_steps = 10
pi_step = [i*np.pi/n_steps for i in range(n_steps+1)]

In [None]:
def to_bins_torch(array_of_dfs, pi_step=None):
    """Histogram the ``'edge_theta'`` values of several tables.

    Args:
        array_of_dfs: Iterable of objects that support ``df['edge_theta'].values``
            (for example pandas DataFrames).
        pi_step: Optional sequence of bin edges. ``torch.histc`` only supports
            equal-width bins, so just the number of edges and the first and
            last edge are used. When omitted, 10 bins over ``[0, pi]`` are
            used, which is what this function always did before (the argument
            was previously accepted but ignored).

    Returns:
        Tensor of shape ``(len(array_of_dfs), n_bins)`` with one histogram per
        input table; an empty ``(0, n_bins)`` tensor when there are no tables.

    Raises:
        ValueError: If ``pi_step`` is given with fewer than two edges.
    """
    if pi_step is None:
        n_bins, lower, upper = 10, 0, np.pi
    else:
        edges = list(pi_step)
        if len(edges) < 2:
            raise ValueError('pi_step needs at least two bin edges')
        n_bins, lower, upper = len(edges) - 1, float(edges[0]), float(edges[-1])

    histograms = [
        torch.histc(
            torch.tensor(df['edge_theta'].values),
            bins=n_bins, min=lower, max=upper,
        )
        for df in array_of_dfs
    ]

    # torch.stack/cat reject an empty list, so handle "no tables" explicitly.
    if not histograms:
        return torch.empty((0, n_bins))
    return torch.stack(histograms, 0)

In [None]:
%%time
# NOTE(review): to_bins_torch iterates over its argument and reads
# item['edge_theta'].values from every element - confirm that `test` (the
# 'edge_angles' entry of a single sample) really has that structure.
to_bins_torch(test)

## Structure of element

In [None]:
import numpy as np
from mendeleev import element

def print_var_name(variable):
    """Print every name in the caller's scope that is bound to ``variable``.

    The lookup covers the caller's local and global variables and compares by
    identity, so it also works for objects whose ``==`` does not return a
    plain bool (arrays, tensors). Names starting with an underscore are
    skipped so that interpreter bookkeeping such as IPython's ``_`` output
    cache is not reported.

    The previous version scanned this function's own ``locals()``, which only
    ever contains the parameter itself, so it always printed ``variable``.
    """
    import inspect

    caller = inspect.currentframe().f_back
    namespace = {**caller.f_globals, **caller.f_locals}
    for name, value in namespace.items():
        if value is variable and not name.startswith('_'):
            print(name)

def getAtomSequence(sequence):
    """Count the occurrences of every value in ``sequence``.

    Args:
        sequence: Iterable of hashable values (e.g. a 1-D integer array of
            atomic numbers).

    Returns:
        dict mapping each distinct value to its number of occurrences, with
        keys in order of first appearance. An empty input gives ``{}``.

    For input in which equal values are contiguous this is exactly the old
    result. The old run-length version raised IndexError on empty input and,
    when a value appeared in several separate runs, kept only the length of
    the last run, so the counts no longer added up to ``len(sequence)``.
    """
    counts = {}
    for atom in sequence:
        counts[atom] = counts.get(atom, 0) + 1
    return counts

def structureToVASP(structure, file='POSCAR', str_name='structure', relaxed=False):
    """Write ``structure`` to a POSCAR-style text file.

    The file is named ``{str_name}_POSCAR`` (``{str_name}_POSCAR_relaxed``
    when ``relaxed`` is true) and is created in the current working directory.

    Args:
        structure: Object exposing ``cell`` (``cell[0]`` is iterated as the
            three lattice vectors), ``atomic_numbers`` and ``pos`` /
            ``pos_relaxed`` (one row of at least three coordinates per atom).
        file: Unused; kept so existing calls keep working.
        str_name: Title line and file-name prefix. Must be a ``str``.
        relaxed: Write ``pos_relaxed`` instead of ``pos``.

    Returns:
        None.
    """
    numbers = np.array(structure.atomic_numbers, dtype=int).ravel()
    positions = np.array(structure.pos_relaxed if relaxed else structure.pos)

    # The file carries one count per species, and the coordinate rows must
    # follow in that same grouped order. Group the atoms by species, keeping
    # species in order of first appearance and atoms in their original
    # relative order (stable sort). Input that is already grouped is written
    # exactly as before; interleaved input used to produce counts that did not
    # match the coordinate rows.
    first_seen = {}
    for number in numbers.tolist():
        first_seen.setdefault(number, len(first_seen))
    ranks = np.array([first_seen[number] for number in numbers.tolist()], dtype=int)
    order = np.argsort(ranks, kind='stable')
    numbers = numbers[order]
    positions = positions[order]

    atoms = getAtomSequence(numbers)
    suffix = "_relaxed" if relaxed else ""

    with open(f'{str_name}_POSCAR{suffix}', 'w') as f:
        f.write(str_name + '\n')
        # Universal scaling factor.
        f.write(str(1.0) + '\n')
        for axis in np.array(structure.cell[0]):
            f.write(''.join(str(axis[i]) + '   ' for i in range(3)) + '\n')
        # Species symbols, then the number of atoms of each species.
        f.write(''.join('   ' + element(round(k)).symbol for k in atoms.keys()) + '\n')
        f.write(''.join('   ' + str(round(v)) for v in atoms.values()) + '\n')
        f.write('Cartesian\n')
        for position in positions:
            f.write(''.join(str(position[i]) + '   ' for i in range(3)) + '\n')
    return None

# Export the first training sample twice: relaxed and initial positions.
# str_name is written as the title line and used as the file-name prefix, so
# it has to be a string; passing the sample object itself made
# `str_name + '\n'` inside structureToVASP raise TypeError.
first_sample = dataset_train_10k[0]
for relaxed in [True, False]:
    structureToVASP(first_sample, relaxed=relaxed, str_name='train_10k_0')

## PyG training

In [None]:
from torch_geometric.datasets import Planetoid

# Planetoid dataset named 'Cora', rooted at /tmp/Cora.
dataset = Planetoid(root='/tmp/Cora', name='Cora')

In [None]:
import time
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv

# Reference time for the elapsed-time prints of the training loop below.
sec = time.time()

class Net(torch.nn.Module):
    """Two GCNConv layers with ReLU and dropout in between.

    Args:
        in_channels: Input feature size of the first layer. Defaults to
            ``dataset.num_node_features`` of the notebook-level ``dataset``.
        hidden_channels: Output size of the first layer / input size of the
            second. Defaults to 16.
        out_channels: Output size of the second layer. Defaults to
            ``dataset.num_classes`` of the notebook-level ``dataset``.

    ``Net()`` without arguments builds the same layers as before; the
    arguments only remove the hard dependency on the global ``dataset``.
    """

    def __init__(self, in_channels=None, hidden_channels=16, out_channels=None):
        super().__init__()
        if in_channels is None:
            in_channels = dataset.num_node_features
        if out_channels is None:
            out_channels = dataset.num_classes
        self.conv1 = GCNConv(in_channels, hidden_channels)
        self.conv2 = GCNConv(hidden_channels, out_channels)

    def forward(self, data):
        """Return log-softmax scores (over dim 1) for ``data.x`` / ``data.edge_index``."""
        x, edge_index = data.x, data.edge_index

        x = self.conv1(x, edge_index)
        x = F.relu(x)
        # Dropout is only active while the module is in training mode.
        x = F.dropout(x, training=self.training)
        x = self.conv2(x, edge_index)

        return F.log_softmax(x, dim=1)

model = Net()
data = dataset[0]
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=5e-4)

# Single source of truth for the schedule (previously 200 and int(200/10)
# were written out separately).
n_epochs = 200
log_every = n_epochs // 10

model.train()
for epoch in range(n_epochs):
    optimizer.zero_grad()
    out = model(data)
    # The loss is computed on the nodes selected by train_mask only.
    loss = F.nll_loss(out[data.train_mask], data.y[data.train_mask])
    loss.backward()
    optimizer.step()
    if epoch % log_every == 0:
        print(epoch,':', time.time() - sec)

print('Total:', time.time() - sec)

model.eval()
# No gradients are needed for evaluation; skip building the autograd graph.
with torch.no_grad():
    pred = model(data).max(dim=1)[1]
# Accuracy over the nodes selected by test_mask.
correct = int(pred[data.test_mask].eq(data.y[data.test_mask]).sum().item())
acc = correct / int(data.test_mask.sum())
print('Accuracy: {:.4f}'.format(acc))