In [1]:
import os

import numpy as np
import pandas as pd

import torch
import torch.nn.functional as F
import torch.optim as optim


from datetime import datetime
from torch import nn
from torch_geometric.data import Data, DataLoader
from torch_geometric.nn import MessagePassing, DataParallel
from torch_scatter import scatter
from torch.utils.tensorboard import SummaryWriter

from DataClasses import lmdb_dataset, Dataset, DataListLoader
from ModelFunctions import train, evaluate, inference

In [2]:
import sys
sys.path.append(os.path.expanduser('../ocpmodels/models'))
sys.path.append(os.path.expanduser('../../ocp-airi'))

from spinconv_with_embeds_single import spinconv

In [3]:
# Called every time the dataset yields an element (a system).
# Original author's note: the function is meant to turn the raw data into atom
# feature vectors, an edge list (edge_index) and edge feature vectors, and every
# network needs its own version. This variant forwards only the four fields below.
def preprocessing(system):
    """Wrap the fields of one system that are needed here into a ``Data`` object.

    Only ``'pos'``, ``'atomic_numbers'``, ``'cell'`` and ``'natoms'`` are read;
    every other entry of ``system`` is left out of the result.

    Args:
        system: object indexable by the four keys listed above.

    Returns:
        A ``Data`` instance constructed with those four fields as keyword
        arguments.
    """
    wanted = ('pos', 'atomic_numbers', 'cell', 'natoms')
    return Data(**{name: system[name] for name in wanted})

In [4]:
# Run configuration

# Data loading
batch_size = 70
num_workers = 0

# Dataset columns
features_cols = ['feature_1']
target_col = 'y_relaxed'

# Optimisation
epochs = 30
lr = 1e-3

In [5]:
# # make newly created tensors live on CUDA by default
# if torch.cuda.is_available():
#     torch.set_default_tensor_type('torch.cuda.FloatTensor')
#     print('cuda')

In [6]:
# Select the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [7]:
# Initialise the training dataset and the training iterator.
train_dataset_file_path = os.path.expanduser("../../ocp_datasets/data/is2re/all/train/data.lmdb")

training_set = Dataset(train_dataset_file_path, features_cols, target_col, preprocessing=preprocessing)
# Pass num_workers explicitly so the training loader honours the configured
# value: the setting is recorded in the run metadata and is already applied to
# the validation loader, so leaving it out here made the two loaders disagree.
training_generator = DataListLoader(training_set, batch_size=batch_size, num_workers=num_workers)

In [8]:
# Initialise the validation dataset and the validation iterator.
val_dataset_file_path = os.path.expanduser(
    "../../ocp_datasets/data/is2re/all/val_ood_both/data.lmdb"
)

# Same feature/target columns and preprocessing hook as the training set.
valid_set = Dataset(
    val_dataset_file_path,
    features_cols,
    target_col,
    preprocessing=preprocessing,
)
valid_generator = DataListLoader(
    valid_set,
    batch_size=batch_size,
    num_workers=num_workers,
)

In [9]:
# Best-effort summary of the training LMDB file: show what it contains, but do
# not stop the notebook when the description cannot be produced.
try:
    lmdb_dataset(train_dataset_file_path).describe()
except Exception as exc:
    # Report the reason instead of discarding it. Unlike a bare `except`, this
    # does not intercept KeyboardInterrupt / SystemExit.
    print(f'dataset description unavailable: {exc!r}')

total entries: 460328
info for item: 0
edge_index:...............<class 'torch.Tensor'>..... [2, 2964]
pos:......................<class 'torch.Tensor'>.....   [86, 3]
cell:.....................<class 'torch.Tensor'>..... [1, 3, 3]
atomic_numbers:...........<class 'torch.Tensor'>.....      [86]
natoms:...................       <class 'int'>.....        86
cell_offsets:.............<class 'torch.Tensor'>..... [2964, 3]
force:....................<class 'torch.Tensor'>.....   [86, 3]
distances:................<class 'torch.Tensor'>.....    [2964]
fixed:....................<class 'torch.Tensor'>.....      [86]
sid:......................       <class 'int'>.....   2472718
tags:.....................<class 'torch.Tensor'>.....      [86]
y_init:...................     <class 'float'>.....    6.2825
y_relaxed:................     <class 'float'>.....   -0.0256
pos_relaxed:..............<class 'torch.Tensor'>.....   [86, 3]


In [10]:

# #model
# model = spinconv(None, None, 1, otf_graph=True, regress_forces=False, 0)
# model = DataParallel(model)
# model = model.to(device)

# #optimizer and loss
# optimizer = optim.AdamW(model.parameters(), lr=lr)
# criterion = nn.L1Loss()

# # move the loss to CUDA if it is available
# criterion = criterion.to(device)

In [11]:
# Run identifier built from the current local time (year-month-day-hour-minute-second).
timestamp = f"{datetime.now():%Y-%m-%d-%H-%M-%S}"
print(timestamp)

2021-10-05-14-40-50


In [12]:
# TensorBoard writer.
# Original author's note: on the first run the log folder has to be created by hand.

# Alternative log locations (disabled):
#   server
#log_folder_path = "../../ocp_results/logs/tensorboard/out_base_model"
#   colab
# log_folder_path = "/content/drive/MyDrive/ocp_results/logs/tensorboard/out_base_model"

# User-specific location
log_file_path = "../../logs/tensorboard_airi"

# Every run writes into its own sub-directory named after the run timestamp.
writer = SummaryWriter(f"{log_file_path}/{timestamp}")

In [13]:
%%time
# Run metadata that is written to TensorBoard as text under the run timestamp.
logfile_str = {
    "train_dataset_file_path": train_dataset_file_path,
    "val_dataset_file_path": val_dataset_file_path,
    "features_cols": features_cols,
    "target_col": target_col,
    "batch_size": batch_size,
    "num_workers": num_workers,
    "epochs": epochs,
    "lr": lr,
    "type":'id'
}

# Model graph (best effort).
# NOTE(review): in the cells visible above, the model construction is commented
# out and the `trace_system` assignment below is disabled, so on a fresh kernel
# this call fails and the fallback branch runs.
try:
    #trace_system = dict(list(next(iter(training_generator))[0]))
    writer.add_graph(model, trace_system)
except Exception as exc:
    # Keep the run going, but say why the graph was skipped. Catching Exception
    # rather than using a bare `except` leaves KeyboardInterrupt / SystemExit alone.
    print('no graph:', repr(exc))
writer.add_text(timestamp, str(logfile_str))

no graph
CPU times: user 250 µs, sys: 56 µs, total: 306 µs
Wall time: 250 µs


In [14]:
# List the existing TensorBoard run directories.
# NOTE(review): this path starts with "../", while the writer created earlier
# uses "../../logs/tensorboard_airi" — confirm both refer to the intended folder.
%ls ../logs/tensorboard_airi

[0m[01;34m2021-09-22-19-37-27[0m/  [01;34m2021-09-23-14-29-22[0m/  [01;34m2021-09-24-16-50-37[0m/
[01;34m2021-09-22-19-38-48[0m/  [01;34m2021-09-23-14-29-49[0m/  [01;34m2021-09-24-16-51-56[0m/
[01;34m2021-09-22-19-42-58[0m/  [01;34m2021-09-23-14-41-34[0m/  [01;34m2021-09-24-16-55-06[0m/
[01;34m2021-09-22-19-44-54[0m/  [01;34m2021-09-23-14-50-43[0m/  [01;34m2021-09-24-16-56-28[0m/
[01;34m2021-09-22-19-45-56[0m/  [01;34m2021-09-23-15-32-41[0m/  [01;34m2021-09-24-16-57-11[0m/
[01;34m2021-09-22-20-33-00[0m/  [01;34m2021-09-23-19-37-50[0m/  [01;34m2021-09-24-17-07-45[0m/
[01;34m2021-09-22-21-05-04[0m/  [01;34m2021-09-23-20-00-39[0m/  [01;34m2021-09-24-17-12-50[0m/
[01;34m2021-09-23-08-30-01[0m/  [01;34m2021-09-24-16-10-40[0m/  [01;34m2021-09-24-17-13-20[0m/
[01;34m2021-09-23-08-31-39[0m/  [01;34m2021-09-24-16-16-35[0m/  [01;34m2021-09-24-17-14-59[0m/
[01;34m2021-09-23-08-41-24[0m/  [01;34m2021-09-24-16-17-49[0m/  [01;34m20

## Training

In [15]:
# %%time
# loss = []
# loss_eval = []

# print(timestamp)
# print(f'Start training model {str(model)}')
# for i in range(epochs):
#     loss.append(train(model, training_generator, optimizer, criterion, epoch=i, writer=writer, device=device))
#     loss_eval.append(evaluate(model, valid_generator, criterion, epoch=i, writer=writer, device=device))

In [None]:

# Index layout of the custom per-element embedding, by descriptor name.
# NOTE(review): indices 18, 27 and 32 are not covered by any entry — confirm
# whether the gaps are intentional or the ranges are off by one.
# NOTE(review): this mapping is not referenced anywhere else in this cell.
embed_discr = {"group_onehot" : list(range(18)),
               "period_onehot": list(range(19, 27)),
               "block_onehot" : list(range(28, 32)),
               "electronegativity" : 33,
               "radius" : 34,
               "valence" : 35,
               "ionization" : 36,
               "affinity" : 37,
               "volume": 38
              }


# Model
# NOTE(review): the run is logged as "group+period", yet range(27) also selects
# index 18, which embed_discr assigns to neither group nor period — confirm.
model = spinconv(None, None, 1, otf_graph=True, regress_forces=False, custom_embedding_value=torch.tensor(list(range(27))))
model = DataParallel(model)
model = model.to(device)

# Optimizer and loss
optimizer = optim.AdamW(model.parameters(), lr=lr)
criterion = nn.L1Loss()

# Move the loss to the selected device (CUDA when available)
criterion = criterion.to(device)

# Fresh run identifier; strftime already returns a str, so no extra conversion is needed.
timestamp = datetime.now().strftime("%Y-%m-%d-%H-%M-%S")

print(timestamp)

log_file_path = "../logs/tensorboard_airi"

# Every run writes into its own sub-directory named after the run timestamp.
writer = SummaryWriter(log_file_path + '/' + timestamp)

# Run metadata that is written to TensorBoard as text under the run timestamp.
logfile_str = {
    "train_dataset_file_path": train_dataset_file_path,
    "val_dataset_file_path": val_dataset_file_path,
    "features_cols": features_cols,
    "target_col": target_col,
    "batch_size": batch_size,
    "num_workers": num_workers,
    "epochs": epochs,
    "lr": lr,
    "type":'id',
    "custom_embedding_type": "group+period"
}

# Model graph (best effort).
# NOTE(review): the `trace_system` assignment below is disabled, so unless the
# name is defined elsewhere this call fails and the fallback branch runs.
try:
    #trace_system = dict(list(next(iter(training_generator))[0]))
    writer.add_graph(model, trace_system)
except Exception as exc:
    # Keep the run going, but say why the graph was skipped. Catching Exception
    # rather than using a bare `except` leaves KeyboardInterrupt / SystemExit
    # alone, so the cell can still be interrupted at this point.
    print('no graph:', repr(exc))
writer.add_text(timestamp, str(logfile_str))

# Per-epoch results: whatever train() / evaluate() return is collected here.
loss = []
loss_eval = []

print(timestamp)
print(f'Start training model {str(model)}')
for i in range(epochs):
    loss.append(train(model, training_generator, optimizer, criterion, epoch=i, writer=writer, device=device))
    loss_eval.append(evaluate(model, valid_generator, criterion, epoch=i, writer=writer, device=device))
    # Checkpoint after every epoch, written relative to the working directory.
    # NOTE(review): the join yields "<timestamp>_epoch_<i>_.pickle" (underscore
    # before the extension); kept unchanged so existing consumers of these
    # files keep working — confirm whether the extra underscore is wanted.
    # NOTE(review): this pickles the whole DataParallel-wrapped module rather
    # than a state_dict — confirm this is the intended checkpoint format.
    path = '_'.join((timestamp, 'epoch', str(i), '.pickle'))
    torch.save(model, path)

tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26])
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26])
message tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26])
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26])
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26])
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26])
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 25, 26])
tensor([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
        18, 19, 20, 21, 22, 23, 24, 2

ml-test-server-0:24777:24798 [0] NCCL INFO Channel 08 : 0[57000] -> 1[bc000] via P2P/direct pointer
ml-test-server-0:24777:24799 [1] NCCL INFO Channel 08 : 1[bc000] -> 0[57000] via P2P/direct pointer
ml-test-server-0:24777:24799 [1] NCCL INFO Channel 09 : 1[bc000] -> 0[57000] via P2P/direct pointer
ml-test-server-0:24777:24798 [0] NCCL INFO Channel 09 : 0[57000] -> 1[bc000] via P2P/direct pointer
ml-test-server-0:24777:24799 [1] NCCL INFO Channel 10 : 1[bc000] -> 0[57000] via P2P/direct pointer
ml-test-server-0:24777:24798 [0] NCCL INFO Channel 10 : 0[57000] -> 1[bc000] via P2P/direct pointer
ml-test-server-0:24777:24799 [1] NCCL INFO Channel 11 : 1[bc000] -> 0[57000] via P2P/direct pointer
ml-test-server-0:24777:24798 [0] NCCL INFO Channel 11 : 0[57000] -> 1[bc000] via P2P/direct pointer
ml-test-server-0:24777:24799 [1] NCCL INFO 12 coll channels, 16 p2p channels, 16 p2p channels per peer
ml-test-server-0:24777:24798 [0] NCCL INFO 12 coll channels, 16 p2p channels, 16 p2p channels per

In [None]:
model, valid_generator, criterion, epoch=i, writer=writer, device=device)