# Colab only

## Prerequisites

### Google colab

This notebook can be used in Colab (**this is the fastest way to run the calculations on an unconfigured system**):

In google colab https://colab.research.google.com/ go to File | Open notebook | GitHub - 
insert the path to the current notebook and open it: https://github.com/alexnkorovin/ocp-airi/blob/dev/airi_utils/our_base_model.ipynb

Before you start:

1. Put this shared folder with datasets in your Google Drive root folder /drive/MyDrive/

This folder is available via the **sharing** link below:

*   ocp_datasets [[ share link to drive](https://drive.google.com/drive/folders/1Nn9t-zTJiRP1-34rdAugv6aY_2-BSQfN?usp=sharing)]<br>

```
Note:
if this folder is saved by sharing link it should contain the following files

ocp-datasets/data/is2re/train/all/val_ood_both/data.lmdb
ocp-datasets/data/is2re/train/all/test_ood_both/data.lmdb
ocp-datasets/data/is2re/train/all/test_ood_both/structures.pkl

 ```
2. Enable GPU support in Edit/Notebook Settings

### on local pc

Download the specified data files from this [link](https://drive.google.com/drive/folders/1Nn9t-zTJiRP1-34rdAugv6aY_2-BSQfN?usp=sharing) into a local folder.


### Use the cell below to mount your Google Drive with the datasets
 - follow the link
 - log in to your Google account
 - copy the token key
 - paste it into the input line in this notebook

## Environment installation

### on local pc
```
$ conda install pytorch-geometric -c rusty1s -c conda-forge
```
or via pip wheels

```
$ python -c "import torch; print(torch.__version__)"
>>> 1.9.0 - > {TORCH}=1.9.0
python -c "import torch; print(torch.version.cuda)"
>>> 11.1 - > {CUDA}=cu111
```

Substitute {TORCH} and {CUDA} in the commands below with the values appropriate for your system:
```
pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
pip install torch-sparse -f https://pytorch-geometric.com/whl/torch-{TORCH}+{CUDA}.html
pip install torch-geometric
```

#### on Colab and also on a local PC (although conda is the preferred way locally)

In [None]:
# # This might take about 10 min in Colab (needed only in Colab)
# !pip install -q torch-scatter -f https://pytorch-geometric.com/whl/torch-1.4.0+cu101.html
# !pip install -q torch-sparse -f https://pytorch-geometric.com/whl/torch-1.4.0+cu101.html
# !pip install -q torch-geometric

## Import and definitions

In [None]:
import os
import pickle

import numpy as np
import pandas as pd
import torch

import torch.nn.functional as F
import torch.optim as optim

from datetime import datetime
from torch import nn
from torch_geometric.data import Dataset, DataLoader
from torch.utils.tensorboard import SummaryWriter

from dataloader import lmdb_dataset

# Input

In [None]:
# DATASETS: paths of the .lmdb files used for training and validation
train_dataset_file_path = os.path.expanduser("../../ocp_datasets_ssd/data/is2re/100k/train/data_mod.lmdb")
val_dataset_file_path = os.path.expanduser("../../ocp_datasets_ssd/data/is2re/all/val_ood_both/data_mod.lmdb")

In [None]:
# PARAMETERS
batch_size = 81  # passed to both DataLoaders as batch_size
num_workers = 0  # passed to both DataLoaders as num_workers
MAX_LEN = 300  # per-system feature rows are truncated / padded to this length in simple_preprocessing
epochs = 20  # number of train + validation passes in the training loop

In [None]:
# FEATURES: record fields used as model inputs and the field used as regression target.
# NOTE: simple_preprocessing reads these keys by name itself and does not consult
# features_cols; the list is forwarded to it and recorded in the run log.
features_cols = ['pos', 'atomic_numbers', 'tags', 'voronoi_volumes', 'voronoi_surface_areas', 'spherical_domain_radii']
target_col = 'y_relaxed'

In [None]:
# Root folder for TensorBoard logs; each run writes to a "<log_file_path>/<timestamp>" sub-folder
log_file_path = "../logs/tensorboard_airi"

In [None]:
# Run configuration; its string form is written to TensorBoard as text
logfile_str = dict(
    train_dataset_file_path=train_dataset_file_path,
    val_dataset_file_path=val_dataset_file_path,
    features_cols=features_cols,
    target_col=target_col,
    batch_size=batch_size,
    num_workers=num_workers,
    epochs=epochs,
    MAX_LEN=MAX_LEN,
)

# All Model

## Functions

In [None]:
def my_reshape(tensor):
    """Return ``tensor`` reshaped into a column of shape ``(tensor.shape[0], 1)``."""
    n_rows = tensor.shape[0]
    return tensor.reshape(n_rows, 1)

In [None]:
def simple_preprocessing(system, features_fields):
    """Turn one system record into a fixed-length per-atom feature matrix.

    Per atom, the features are concatenated in this order: one-hot ``tags``
    (3 classes), one-hot ``atomic_numbers`` (100 classes), ``pos``, and the
    scalar columns ``voronoi_volumes``, ``voronoi_surface_areas`` and
    ``spherical_domain_radii``. Every feature is truncated to the first
    ``MAX_LEN`` atoms and the result is padded with rows of ``-10000`` up to
    ``MAX_LEN`` rows.

    Args:
        system: mapping-like record exposing the keys listed above as tensors
            whose first dimension indexes atoms (presumably ``pos`` is
            ``(n_atoms, 3)`` -- confirm against the dataset).
        features_fields: accepted for interface compatibility; not consulted,
            the keys are read by name.

    Returns:
        ``(atom_embeds, padding_mask)`` where ``atom_embeds`` has ``MAX_LEN``
        rows and ``padding_mask`` is a boolean vector of length ``MAX_LEN``
        that is ``True`` on the padded rows.
    """

    def _scalar_column(key):
        # Truncate like the other features: previously these three columns kept
        # all atoms, so torch.cat failed for systems with more than MAX_LEN atoms.
        return my_reshape(system[key].float().to(device)[:MAX_LEN])

    tags = F.one_hot(system['tags'].long().to(device)[:MAX_LEN], num_classes=3)
    atom_numbers = F.one_hot(system['atomic_numbers'].long().to(device)[:MAX_LEN], num_classes=100)
    pos = system['pos'].to(device)[:MAX_LEN]

    atom_features = (
        tags,
        atom_numbers,
        pos,
        _scalar_column('voronoi_volumes'),
        _scalar_column('voronoi_surface_areas'),
        _scalar_column('spherical_domain_radii'),
    )
    atom_embeds = torch.cat(atom_features, 1)

    # Padding: build the filler and the mask with the dtype/device of the real
    # rows instead of relying on the process-wide default tensor type.
    n_atoms, n_features = atom_embeds.shape
    pad_value = -10000
    pads = torch.full(
        (MAX_LEN - n_atoms, n_features),
        pad_value,
        dtype=atom_embeds.dtype,
        device=atom_embeds.device,
    )
    padding_mask = torch.zeros(MAX_LEN, dtype=torch.bool, device=atom_embeds.device)
    padding_mask[n_atoms:] = True
    atom_embeds = torch.cat((atom_embeds, pads))

    return (atom_embeds, padding_mask)

In [None]:
# Dataset that can return an element and its own length
class Dataset(Dataset):
    """LMDB-backed dataset yielding preprocessed systems (and targets when training).

    Args:
        data: path handed to ``lmdb_dataset`` as its ``"src"`` entry.
        features_fields: forwarded unchanged to ``preprocessing``.
        target_field: key of the target value inside each record.
        type_: ``'train'`` makes ``__getitem__`` return ``(system, target)``;
            any other value makes it return only ``system``.
        preprocessing: callable ``(record, features_fields) -> system``.
    """

    def __init__(self, data, features_fields, target_field, type_='train', preprocessing=simple_preprocessing):

        self.data = lmdb_dataset({"src": data})
        self.length = len(self.data)
        self.type_ = type_
        self.preprocessing = preprocessing
        self.features_fields = features_fields
        self.target = target_field

    def __len__(self):
        return self.length

    def __getitem__(self, index):
        # Fetch the record once; it used to be read from the store twice per item.
        record = self.data[index]
        system = self.preprocessing(record, self.features_fields)

        if self.type_ == 'train':
            return system, record[self.target]

        # Non-train mode used to fall through and return None implicitly,
        # which cannot be batched; return the features alone instead.
        return system

In [None]:
# The neural network itself
class NN(nn.Module):
    """Transformer encoder over per-atom features that predicts one scalar per system.

    Pipeline: linear projection to 32 hidden channels -> 4 transformer encoder
    layers (1 head) -> linear to 8 channels per atom -> sum over real atoms ->
    linear to a single value.

    Args:
        dim_atom: width of the per-atom input feature vector.
    """

    def __init__(self, dim_atom=106):

        super().__init__()

        dim_hidden = 32
        self.lin1 = nn.Linear(dim_atom, dim_hidden)
        # NOTE(review): nn.TransformerEncoder clones the layer it is given, so this
        # attribute presumably only acts as a template whose own weights are not
        # used in forward() -- confirm. Kept so parameter names stay unchanged.
        self.encoder_layer = nn.TransformerEncoderLayer(d_model=dim_hidden, nhead=1)
        self.transformer_encoder = nn.TransformerEncoder(self.encoder_layer, num_layers=4)
        self.lin2 = nn.Linear(dim_hidden, dim_hidden//4, bias=True)
        self.lin3 = nn.Linear(dim_hidden//4, 1, bias=True)

    def forward(self, batch):
        """Predict one value per system.

        Args:
            batch: pair ``(features, padding_mask)``; ``features`` is indexed as
                ``(batch, sequence, feature)`` and ``padding_mask`` is
                ``(batch, sequence)`` with ``True`` on padded rows.

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        padded, src_key_padding_mask = batch[0], batch[1]

        hidden = self.lin1(padded)

        # The encoder is built without batch_first, so it expects (sequence, batch, feature).
        hidden = hidden.permute((1, 0, 2))
        embeds = self.transformer_encoder(hidden, src_key_padding_mask=src_key_padding_mask)
        embeds = embeds.permute((1, 0, 2))

        per_atom = self.lin2(embeds)

        # The key-padding mask only stops other atoms from attending to padded
        # rows; the padded rows still produce outputs. Zero them so the pooled
        # value depends on real atoms only and not on the amount of padding.
        is_pad = src_key_padding_mask.unsqueeze(-1).bool()
        per_atom = per_atom.masked_fill(is_pad, 0.0)

        summed = torch.sum(per_atom, 1)

        energy = self.lin3(summed)

        return energy

In [None]:
def send_scalars(lr, loss, writer, step=-1, epoch=-1, type_='train'):
    """Write scalar metrics to ``writer``.

    For ``type_ == 'train'`` the learning rate and the loss are logged at
    ``step``; for ``type_ == 'val'`` the loss is logged at ``epoch``. Any other
    ``type_`` logs nothing.
    """
    if type_ == 'train':
        per_step = (
            ('lr per step on train', lr),
            ('loss per step on train', loss),
        )
        for tag, value in per_step:
            writer.add_scalar(tag, value, step)
    elif type_ == 'val':
        writer.add_scalar('loss per epoch on val', loss, epoch)

In [None]:
def send_hist(model, writer, step):
    """Log a histogram of every named parameter of ``model`` at ``step``.

    Logging is best effort: a failure for one parameter is reported as a
    warning and the remaining parameters are still logged. Only ``Exception``
    subclasses are caught, so ``KeyboardInterrupt`` / ``SystemExit`` propagate
    and a running cell can still be interrupted.
    """
    import warnings

    for name, weight in model.named_parameters():
        try:
            writer.add_histogram(name, weight, step)
        except Exception as err:
            warnings.warn(f"histogram for {name!r} not logged at step {step}: {err}")

In [None]:
# train -- iterate over the batches, zero the gradients, predict y, compute the
# loss, back-propagate, take an optimizer step and record the loss
def train(model, iterator, optimizer, criterion, print_every=10, epoch=0, writer=None):
    """Run one training epoch and return the mean batch loss.

    Args:
        model: module called as ``model(systems)``.
        iterator: sized iterable yielding ``(systems, ys)`` batches.
        optimizer: optimizer stepped once per batch.
        criterion: loss called as ``criterion(predictions, targets)``.
        print_every: print progress every ``print_every`` batches.
        epoch: zero-based epoch index, used to compute the global step.
        writer: optional TensorBoard writer; ``None`` disables logging.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.

    Raises:
        ZeroDivisionError: if ``iterator`` has length 0.
    """
    epoch_loss = 0
    n_batches = len(iterator)

    model.train()

    for i, (systems, ys) in enumerate(iterator):

        optimizer.zero_grad()
        # Drop only the trailing axis: a bare squeeze() would also collapse a
        # batch of size 1 into a 0-d tensor.
        predictions = model(systems).squeeze(-1)

        loss = criterion(predictions.float(), ys.to(device).float())
        loss.backward()

        optimizer.step()

        batch_loss = loss.item()
        epoch_loss += batch_loss

        if writer is not None:

            lr = optimizer.param_groups[0]['lr']

            step = i + epoch * n_batches

            # Histograms use the global step too; with the per-epoch index
            # every epoch wrote over the same step range.
            send_hist(model, writer, step)
            send_scalars(lr, batch_loss, writer, step=step, epoch=epoch, type_='train')

        if not (i+1) % print_every:
            print(f'step {i} from {n_batches} at epoch {epoch}')
            print(f'Loss: {batch_loss}')

    return epoch_loss / n_batches

In [None]:
def evaluate(model, iterator, criterion, epoch=0, writer=False):
    """Compute the mean batch loss over ``iterator`` without gradient tracking.

    Args:
        model: module called as ``model(systems)``; switched to eval mode.
        iterator: sized iterable yielding ``(systems, ys)`` batches.
        criterion: loss called as ``criterion(predictions, targets)``.
        epoch: epoch index used as the x-value of the logged validation loss.
        writer: optional TensorBoard writer; ``False`` (the default) or
            ``None`` disables logging.

    Returns:
        Sum of the batch losses divided by ``len(iterator)``.

    Raises:
        ZeroDivisionError: if ``iterator`` has length 0.
    """
    epoch_loss = 0

    model.eval()

    with torch.no_grad():
        for systems, ys in iterator:

            # Drop only the trailing axis so a batch of size 1 stays 1-d.
            predictions = model(systems).squeeze(-1)
            loss = criterion(predictions.float(), ys.to(device).float())

            epoch_loss += loss.item()

    overall_loss = epoch_loss / len(iterator)

    # The default is False, which the former `writer != None` test let through
    # and then failed on `False.add_scalar`; treat both sentinels as "no writer".
    if writer is not None and writer is not False:
        send_scalars(None, overall_loss, writer, step=None, epoch=epoch, type_='val')

    print(f'epoch loss {overall_loss}')

    return overall_loss

In [None]:
def inferens(model, iterator):
    """Run ``model`` over every batch of ``iterator`` and return all predictions.

    The model is switched to eval mode and called without gradient tracking.
    Only the trailing axis of each batch output is dropped, so a batch holding
    a single system still contributes a 1-d tensor.

    Args:
        model: module called as ``model(systems)``.
        iterator: iterable of input batches.

    Returns:
        1-d tensor with the predictions of all batches in iteration order; an
        empty tensor when ``iterator`` yields nothing.
    """
    model.eval()

    chunks = []
    with torch.no_grad():
        for systems in iterator:
            chunks.append(model(systems).squeeze(-1))

    if not chunks:
        return torch.tensor([])

    # Concatenate once instead of growing the result batch by batch.
    return torch.cat(chunks)

## DATA

In [None]:
# Initialise the training dataset and the training iterator
training_set = Dataset(train_dataset_file_path, features_cols, target_col)
training_generator = DataLoader(training_set, batch_size=batch_size, num_workers=num_workers)

In [None]:
# Call describe() on the underlying lmdb_dataset (presumably prints or returns a
# summary of the stored data -- see dataloader.lmdb_dataset)
training_set.data.describe()

In [None]:
# Initialise the validation dataset and the validation iterator
valid_set = Dataset(val_dataset_file_path, features_cols, target_col)
valid_generator = DataLoader(valid_set, batch_size=batch_size, num_workers=num_workers)

## MODEL CORE

In [None]:
# Make newly created tensors default to CUDA float tensors when a GPU is available
if torch.cuda.is_available():
    torch.set_default_tensor_type('torch.cuda.FloatTensor')
    print('cuda')

In [None]:
# Set the device used by the preprocessing and the train/evaluate functions
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

In [None]:
# Model: the input width is read from dimension 2 of the feature tensor of the
# first training batch (presumably shaped (batch, MAX_LEN, n_features) -- confirm)
model = NN(dim_atom=next(iter(training_generator))[0][0].shape[2])

# Optimizer and loss
optimizer = optim.Adam(model.parameters(), lr=0.001)
criterion = nn.L1Loss()

# Move the model and the loss to CUDA if it is available
model = model.to(device)
criterion = criterion.to(device)

In [None]:
# Run identifier: names the TensorBoard sub-folder and tags the logged run configuration
timestamp = f"{datetime.now():%Y-%m-%d-%H-%M-%S}"

print(timestamp)

In [None]:
# TensorBoard writer. Original author's note: on the first run the folder for
# the logs has to be created by hand.

# server
# log_folder_path = "../../ocp_results/logs/tensorboard/out_base_model"

# colab
# log_folder_path = "/content/drive/MyDrive/ocp_results/logs/tensorboard/out_base_model"

# user-specific: events are written to "<log_file_path>/<timestamp>"
writer = SummaryWriter(log_file_path + '/' + timestamp)

In [None]:
# Model graph: trace the model on the inputs of the first training batch and
# store the run configuration as text under the run's timestamp
trace_system = next(iter(training_generator))[0]
writer.add_graph(model, (trace_system,))
writer.add_text(timestamp, str(logfile_str))

# Training

In [None]:
%%time
loss = []
loss_eval = []

print(timestamp)
#print(f'Start training model {str(model)}')
for i in range(epochs):
    print(f'epoch {i}')
    loss.append(train(model, training_generator, optimizer, criterion, epoch=i, writer=writer))
    print(f'validation on epoch {i} starts')
    loss_eval.append(evaluate(model, valid_generator, criterion, epoch=i, writer=writer))
    print('=========================================================================================================')

In [None]:
# Display the list of per-epoch validation losses
loss_eval