In [1]:
import os

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
import torch.optim as optim


from datetime import datetime
from torch import nn
from torch_geometric.data import DataLoader
from torch_geometric.nn import MessagePassing
from torch_scatter import scatter
from torch.utils.tensorboard import SummaryWriter

from DataClasses import lmdb_dataset, Dataset
from ModelFunctions import train, evaluate, inference

In [2]:
from torch_geometric.data import Data

In [3]:
def my_reshape(tensor):
    """Return ``tensor`` reshaped into a column of shape ``(tensor.shape[0], 1)``."""
    n_rows = tensor.shape[0]
    return tensor.reshape(n_rows, 1)

In [4]:
def convert_angles(array):
    """Mirror the angle columns of ``array`` in place and return the same object.

    Column 1 is replaced by ``pi - value`` and column 3 by ``-value``;
    all other columns are left untouched.
    """
    theta_col = array[:, 1]
    phi_col = array[:, 3]
    array[:, 1] = np.pi - theta_col
    array[:, 3] = -phi_col
    return array

def restore_edge_angles(list_of_arrays):
    """Return a list twice as long as the input, with mirrored copies interleaved.

    Each input array is followed directly by a copy of itself passed through
    ``convert_angles``; the input arrays themselves are not modified.
    """
    doubled = []
    for original in list_of_arrays:
        mirrored = convert_angles(original.copy())
        doubled.extend((original, mirrored))
    return doubled

In [5]:
def make_df(npndarray):
    """Wrap a four-column array in a DataFrame with the edge-angle column names."""
    column_names = ('edge_id', 'edge_theta', 'z', 'phi')
    return pd.DataFrame(npndarray, columns=column_names)

In [6]:
def _select_edges(system, edge_arrays, edge_ids, mask, keys_to_mask):
    """Apply a boolean per-edge ``mask`` to the edge-level data of ``system`` (in place).

    Masks every entry of ``system`` named in ``keys_to_mask`` and the columns of
    ``system['edge_index_new']``. ``system['direct_neighbor']`` is left to the caller.

    Returns:
        The filtered ``edge_arrays`` list and the filtered ``edge_ids`` array.
    """
    # Plain Python booleans: cheap to iterate and valid for both torch and numpy masks.
    keep = mask.tolist()
    for key in keys_to_mask:
        system[key] = system[key][mask]
    system['edge_index_new'] = ((system['edge_index_new'].T)[mask]).T
    edge_arrays = [arr for arr, flag in zip(edge_arrays, keep) if flag]
    return edge_arrays, edge_ids[np.asarray(keep, dtype=bool)]


def to_bins_torch_constrains(system,
                            direct_neighbors_only=True, dist_threshold=5,
                            n_bins=10):
    """Filter the edges of ``system`` and attach a theta histogram to every kept edge.

    ``system`` is modified in place and also returned: the entries
    ``'cell_offsets_new'``, ``'contact_solid_angles'``, ``'distances_new'``,
    ``'edge_index_new'`` and ``'direct_neighbor'`` are reduced to the kept edges,
    and ``system['thetas']`` is added.

    Args:
        system: mapping holding at least ``'edge_angles'``, ``'edge_index_new'``,
            ``'cell_offsets_new'``, ``'contact_solid_angles'``, ``'distances_new'``
            and ``'direct_neighbor'``.
        direct_neighbors_only: if true, keep only edges whose ``'direct_neighbor'``
            value equals 1.
        dist_threshold: if not ``None``, keep only edges whose ``'distances_new'``
            value is strictly below this threshold.
        n_bins: number of histogram bins over ``[0, pi]`` for ``edge_theta``.

    Returns:
        The same ``system`` object, with ``system['thetas']`` set to a float tensor
        of shape ``(number of kept edges, n_bins)``.
    """
    keys_to_mask = ['cell_offsets_new', 'contact_solid_angles', 'distances_new']

    edge_arrays = restore_edge_angles(system['edge_angles'])
    # Captured before any filtering: the row labels used in the loop below
    # index these unfiltered arrays.
    end_points = system['edge_index_new'][1]
    original_offsets = system['cell_offsets_new']
    filtered_id = np.arange(len(edge_arrays))

    if direct_neighbors_only:
        mask = (system['direct_neighbor'] == 1)
        edge_arrays, filtered_id = _select_edges(
            system, edge_arrays, filtered_id, mask, keys_to_mask)
        # Every surviving edge is a direct neighbour by construction.
        system['direct_neighbor'] = torch.ones(int(mask.sum()))

    if dist_threshold is not None:
        mask = (system['distances_new'] < dist_threshold)
        edge_arrays, filtered_id = _select_edges(
            system, edge_arrays, filtered_id, mask, keys_to_mask)
        system['direct_neighbor'] = system['direct_neighbor'][mask]

    histograms = []
    for edge_array in edge_arrays:
        df = make_df(edge_array)
        # NOTE(review): this filters on the positional row index of the frame, not on
        # the 'edge_id' column — confirm that row position is the intended edge id.
        df = df.loc[np.isin(df.index, filtered_id)].copy()
        df['end_point'] = end_points[df.index]
        # NOTE(review): if 'cell_offsets_new' is a torch tensor these tuples hold 0-dim
        # tensors, which hash by identity, so the de-duplication below may never match
        # on offsets — confirm the container type.
        df['cell_offsets'] = list(map(tuple, original_offsets[df.index]))
        df = df.drop_duplicates(subset=['end_point', 'cell_offsets'])
        theta = torch.tensor(df['edge_theta'].values)
        theta = torch.histc(theta, bins=n_bins, min=0, max=np.pi)
        histograms.append(torch.reshape(theta, (1, theta.shape[0])))

    if histograms:
        thetas = torch.cat(histograms, 0).float()
    else:
        # torch.cat rejects an empty list; keep the feature width well defined.
        thetas = torch.zeros((0, n_bins))

    system['thetas'] = thetas

    return system
    

In [7]:
# Called every time the dataset yields an element (a system).
# Turns the raw data into a matrix of atom vectors, an edge list (edge_index) and a matrix of edge vectors; every network needs its own version of this function.
def preprocessing(system):
    """Convert one raw ``system`` into a ``Data`` graph placed on ``device``.

    Node features are the concatenation of a 3-class one-hot of ``'tags'``,
    a 100-class one-hot of ``'atomic_numbers'`` and the ``'voronoi_volumes'`` column.
    Edge features are the ``'distances_new'`` column followed by the ``'thetas'``
    histograms produced by ``to_bins_torch_constrains``.
    ``'contact_solid_angles'`` is currently not used as an edge feature.
    """
    # Must run first: it filters the edge-level entries read below and adds 'thetas'.
    system = to_bins_torch_constrains(system)

    # --- node features -------------------------------------------------
    node_parts = (
        F.one_hot(system['tags'].long(), num_classes=3),
        F.one_hot(system['atomic_numbers'].long(), num_classes=100),
        my_reshape(system['voronoi_volumes'].float()),
    )
    node_matrix = torch.cat(node_parts, 1)

    # --- edge connectivity and features --------------------------------
    connectivity = system['edge_index_new'].long()
    edge_parts = (
        my_reshape(system['distances_new'].float()),
        system['thetas'],
    )
    edge_matrix = torch.cat(edge_parts, 1)

    return Data(
        x=node_matrix.to(device),
        edge_index=connectivity.to(device),
        edge_attr=edge_matrix.to(device),
    )

$$
\mathbf{x}_i^{(k)} = \gamma^{(k)} \left( \mathbf{x}_i^{(k-1)}, \square_{j \in \mathcal{N}(i)} \, \phi^{(k)}\left(\mathbf{x}_i^{(k-1)}, \mathbf{x}_j^{(k-1)},\mathbf{e}_{j,i}\right) \right)
$$

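Функция $\gamma$ реализуется в методе `update`, операция агрегации $\square$ задаётся параметром `aggr`, а функция $\phi$ — в методе `message`; в этом примере $\square$ — суммирование (`aggr='add'`).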

In [8]:
class GConv(MessagePassing):
    """Single message-passing layer with sum aggregation.

    The message function applies a bias-free linear map followed by a sigmoid to
    the concatenation of target-node, source-node and edge features; the update
    function does the same with the node features and the aggregated messages.
    """

    def __init__(self, dim_atom=104, dim_edge=11, out_channels=2):
        """Create the layer.

        Args:
            dim_atom: width of a node feature vector.
            dim_edge: width of an edge feature vector.
            out_channels: width of the updated node vector.
        """
        super().__init__(aggr='add')  # messages are summed per node
        self.phi_output = 3  # width of a single message
        message_in = 2 * dim_atom + dim_edge
        self.lin_phi = torch.nn.Linear(message_in, self.phi_output, bias=False)
        update_in = dim_atom + self.phi_output
        self.lin_gamma = torch.nn.Linear(update_in, out_channels, bias=False)
        self.nonlin = nn.Sigmoid()

    def forward(self, batch):
        """Run message passing on ``batch`` (indexed by 'x', 'edge_index', 'edge_attr').

        x has shape [N -- number of atoms in the system (batch), in_channels -- width of an atom vector];
        edge_index has shape [2, E] -- every edge is given by a pair of vertices.
        """
        node_features = batch['x']
        connectivity = batch['edge_index']
        edge_features = batch['edge_attr']

        # Start propagating messages.
        return self.propagate(connectivity, x=node_features, edge_attr=edge_features, size=None)

    def message(self, x, x_i, x_j, edge_attr):
        """Compute one message per edge from both endpoint vectors and the edge vector."""
        # Parameter names are kept as-is: presumably the framework fills them by
        # name when `propagate` runs — TODO confirm.
        per_edge_input = torch.cat((x_i, x_j, edge_attr), 1)
        return self.nonlin(self.lin_phi(per_edge_input))

    def update(self, aggr_out, x, edge_attr, edge_index):
        """Combine each node vector with its aggregated messages.

        Returns a ``Data`` object carrying the new node vectors together with the
        unchanged ``edge_attr`` and ``edge_index``.
        """
        per_node_input = torch.cat((x, aggr_out), 1)
        new_nodes = self.nonlin(self.lin_gamma(per_node_input))

        return Data(x=new_nodes, edge_attr=edge_attr, edge_index=edge_index)

In [9]:
class ConvNN(nn.Module):
    """One ``GConv`` layer, a per-graph sum over nodes and a linear read-out."""

    def __init__(self, dim_atom=104, dim_edge=1):
        """Create the model.

        Args:
            dim_atom: width of a node feature vector, forwarded to ``GConv``.
            dim_edge: width of an edge feature vector, forwarded to ``GConv``.
        """
        super().__init__()
        self.conv_last = GConv(dim_atom=dim_atom, dim_edge=dim_edge, out_channels=2)
        self.lin = torch.nn.Linear(2, 1, bias=True)

    def forward(self, batch):
        """Return one value per graph of ``batch``.

        ``batch['batch']`` is used as the scatter index that assigns every node
        row to its graph.
        """
        node_out = self.conv_last(batch)['x']
        per_graph = scatter(node_out, batch['batch'], dim=0, reduce='sum')
        return self.lin(per_graph)

In [10]:
#config
# Data loading: samples per batch and number of loader worker processes.
batch_size = 50
num_workers = 0

# Names of the input fields handed to the Dataset constructor.
features_cols = ['atomic_numbers', 'edge_index_new', 'distances_new', 
                 'contact_solid_angles', 'tags', 'voronoi_volumes', 'edge_angles'] # original note: "it is not needed" — presumably the Dataset does not rely on this list; TODO confirm

# Name of the regression target field.
target_col = 'y_relaxed'
# Optimizer learning rate and number of training epochs.
lr = 0.001
epochs = 20

In [11]:
# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cpu


In [12]:
# Initialise the training dataset (the training iterator is built from it in a later cell).
# The path is relative, so it is resolved against the notebook's working directory.
train_dataset_file_path= os.path.expanduser("../../ocp_datasets/data/is2re/10k/train/data_mod2.lmdbz")

# `preprocessing` is passed as a callback; see its definition for the graph it builds.
training_set = Dataset(train_dataset_file_path, features_cols, target_col, preprocessing=preprocessing)

In [13]:
%%time
# Sanity check: fetch the first sample and display it; the cell is timed to show the cost of one sample.
training_set[0]

CPU times: user 1.9 s, sys: 7.22 ms, total: 1.91 s
Wall time: 1.91 s


(Data(edge_attr=[912, 11], edge_index=[2, 912], x=[86, 104]),
 -0.025550085000020317)

In [14]:
# Batch iterator over the training set.
# NOTE(review): no `shuffle` argument is passed — confirm that iterating the training data in a fixed order is intended.
training_generator = DataLoader(training_set, batch_size=batch_size, num_workers=num_workers)

In [15]:
# Initialise the validation dataset and the validation iterator.
# NOTE(review): this path is identical to train_dataset_file_path, so validation runs on the training data — confirm whether a separate validation file was intended.
val_dataset_file_path = os.path.expanduser("../../ocp_datasets/data/is2re/10k/train/data_mod2.lmdbz")

valid_set = Dataset(val_dataset_file_path, features_cols, target_col, preprocessing=preprocessing)
valid_generator = DataLoader(valid_set, batch_size=batch_size, num_workers=num_workers)

In [16]:
# Print a description of the training dataset. This is informational only, so a
# failure is reported instead of aborting the notebook — but it is no longer
# swallowed silently, and KeyboardInterrupt/SystemExit are not intercepted.
try:
    lmdb_dataset(train_dataset_file_path).describe()
except Exception as exc:
    print(f"Could not describe dataset {train_dataset_file_path!r}: {exc!r}")

item: 0
atomic_numbers:...........      [86]
cell:..................... [1, 3, 3]
cell_offsets:............. [2964, 3]
cell_offsets_new:......... [1214, 3]
contact_solid_angles:.....    [1214]
direct_neighbor:..........    [1214]
distances:................    [2964]
distances_new:............    [1214]
edge_angles:..............       607
edge_index:............... [2, 2964]
edge_index_new:........... [2, 1214]
fixed:....................      [86]
force:....................   [86, 3]
natoms:...................        86
pos:......................   [86, 3]
pos_relaxed:..............   [86, 3]
sid:......................   2472718
spherical_domain_radii:...      [86]
tags:.....................      [86]
voronoi_surface_areas:....      [86]
voronoi_volumes:..........      [86]
y_init:...................    6.2825
y_relaxed:................   -0.0256


In [17]:
#model
# Read both feature widths from a single preprocessed sample: building a sample
# is slow, so the dataset is indexed once rather than once per dimension.
sample_graph = training_set[0][0]
model = ConvNN(dim_atom=sample_graph['x'].shape[1], dim_edge=sample_graph['edge_attr'].shape[1])

#optimizer and loss
optimizer = optim.AdamW(model.parameters(), lr=lr)
criterion = nn.L1Loss()

# move to CUDA if it is available
model = model.to(device)
criterion = criterion.to(device)

In [18]:
# Run identifier: the start time formatted as year-month-day-hour-minute-second.
run_started_at = datetime.now()
timestamp = run_started_at.strftime("%Y-%m-%d-%H-%M-%S")

print(timestamp)

2021-09-16-16-32-08


In [19]:
# TensorBoard writer. The log folder is created on demand, so it no longer has
# to be made by hand before the first run.

# server
#log_folder_path = "../../ocp_results/logs/tensorboard/out_base_model"

# colab
# log_folder_path = "/content/drive/MyDrive/ocp_results/logs/tensorboard/out_base_model"

# user_specific 
log_file_path = os.path.expanduser("~/Documents/ocp_datasets_hd/logs/tensorboard_airi")
os.makedirs(log_file_path, exist_ok=True)

# Each run writes into its own sub-directory named after the run timestamp.
writer = SummaryWriter(os.path.join(log_file_path, timestamp))

In [20]:
%%time
# Data paths and hyper-parameters of this run; despite the name this is a dict,
# and it is converted to a string only when it is written to TensorBoard below.
logfile_str = {
    "train_dataset_file_path": train_dataset_file_path,
    "val_dataset_file_path": val_dataset_file_path,
    "features_cols": features_cols,
    "target_col": target_col,
    "batch_size": batch_size,
    "num_workers": num_workers,
    "epochs": epochs,
    "lr": lr
}

# model graph
# Take element 0 of the first training batch and turn it into a plain dict,
# which is then used as the example input for tracing the model graph.
trace_system = dict(list(next(iter(training_generator))[0]))
writer.add_graph(model, trace_system)
# Store the run configuration as text under the run timestamp.
writer.add_text(timestamp, str(logfile_str))

CPU times: user 1min 31s, sys: 380 ms, total: 1min 31s
Wall time: 1min 31s


## Training

In [None]:
%%time
# Values returned by train() and evaluate(), one entry per epoch.
loss = []
loss_eval = []

print(timestamp)
print(f'Start training model {str(model)}')
# One training pass followed by one evaluation pass per epoch; both receive the
# TensorBoard writer and the epoch number.
for i in range(epochs):
    loss.append(train(model, training_generator, optimizer, criterion, epoch=i, writer=writer, device=device))
    loss_eval.append(evaluate(model, valid_generator, criterion, epoch=i, writer=writer, device=device))

2021-09-16-16-32-08
Start training model ConvNN(
  (conv_last): GConv(
    (lin_phi): Linear(in_features=219, out_features=3, bias=False)
    (lin_gamma): Linear(in_features=107, out_features=2, bias=False)
    (nonlin): Sigmoid()
  )
  (lin): Linear(in_features=2, out_features=1, bias=True)
)
epoch 0


In [None]:
# Display the per-epoch values collected from evaluate() in the training cell.
loss_eval