Data Preprocessing

In [1]:
from data import prepare_data
# Build the train/validation datasets from the raw CSV. The call returns the
# two dataset paths that the later cells pass to LMDBDataset.
prepare_kwargs = dict(
    file_path='./example/ads/raw_data/adsorption_energy.csv',
    save_path='./example/ads/data',
    _col_name='smiles',
    test_size=0.2,
)
train_dataset_path, val_dataset_path = prepare_data(**prepare_kwargs)

2024-12-13 02:52:01.380000: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-12-13 02:52:01.433847: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-12-13 02:52:01.433901: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-12-13 02:52:01.435002: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-13 02:52:01.441960: I tensorflow/core/platform/cpu_feature_guar

fused_multi_tensor is not installed corrected
fused_layer_norm is not installed corrected
fused_softmax is not installed corrected
Preparing data...
Converting SMILES to XYZ...
Data already exists, skipping...
Converting data to LMDB...
Done


Train

In [2]:
# dataset & hyperparameters
from utils import LMDBDataset,batch_collate_fn
from torch.utils.data import DataLoader, Dataset
# Optimisation hyperparameters.
num_epochs, batch_size = 200, 128
learning_rate = 1e-4
# NOTE(review): the training cell counts non-improving epochs against this
# threshold; with num_epochs = 200 a value of 2000 can never be exceeded, so
# early stopping is effectively disabled. Confirm this is intended.
patience = 2000

# Datasets are opened from the paths returned by prepare_data().
train_dataset, val_dataset = (
    LMDBDataset(train_dataset_path),
    LMDBDataset(val_dataset_path),
)

# Both loaders share the batch size and collate function; only the training
# loader shuffles.
loader_kwargs = dict(batch_size=batch_size, collate_fn=batch_collate_fn)
train_loader = DataLoader(train_dataset, shuffle=True, **loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **loader_kwargs)

In [None]:
# model
import torch
import torch.nn as nn
from model import *
from data import *

# Converter used by the training loop to turn (coord, atype) batches into the
# keyword inputs expected by UniMolModel.
mol2input = Mol2Input()

# Pick the first GPU when available, otherwise the CPU. Both branches are plain
# strings so torch.device() always receives the same argument type.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# The fitting head's output width is taken from the first training target.
# NOTE(review): the inference cell hard-codes output_dim=1 — the two must agree
# for the saved state dict to load.
fitting_model = FittingNet(output_dim=train_dataset[0][2].shape[0])
unimol_model = UniMolModel(output_dim=1, data_type='molecule', remove_hs=False)

# Move both networks to the target device and put them in training mode.
for _module in (fitting_model, unimol_model):
    _module.to(device)
    _module.train()

# A single optimizer updates the backbone and the fitting head jointly.
trainable_params = list(fitting_model.parameters()) + list(unimol_model.parameters())
optimizer = torch.optim.Adam(trainable_params, lr=learning_rate)
criterion = nn.MSELoss()

In [None]:
# train
# Explicit imports: this cell previously relied on `np` / `os` leaking in
# through the wildcard imports of earlier cells.
import os

import numpy as np


def _predict_batch(coord, atype):
    """Run UniMol plus the fitting head on one batch.

    Each entry of the returned ``atomic_reprs`` is passed through
    ``fitting_model`` and summed to a single scalar; the scalars are stacked
    into a column tensor of shape (n_entries, 1).
    """
    input_dict = mol2input.coord2unimol_inputs(coord, atype)
    input_dict = {key: tensor.to(device) for key, tensor in input_dict.items()}
    atomic_reprs = unimol_model(return_repr=True, **input_dict)['atomic_reprs']
    per_entry = [torch.sum(fitting_model(atom_repr)) for atom_repr in atomic_reprs]
    return torch.stack(per_entry).reshape(-1, 1)


train_loss = []  # mean training loss of each epoch
val_loss = []    # mean validation loss of each epoch
# Start from +inf so the first finite validation loss always produces a
# checkpoint, whatever the scale of the targets.
MIN_LOSS = float('inf')
_patience = 0
# Not used by this cell; kept as a plain string (the original trailing comma
# silently turned it into a 1-tuple).
save_path = '/vepfs/fs_users/ycjin/Delta-ML-Framework/tasks/demo/example/ads/data'
pth_save_path = './example/ads/pth/'

for epoch in range(num_epochs):
    # ---- training pass ----
    fitting_model.train()
    unimol_model.train()
    epoch_train_losses = []
    for coord, atype, target in train_loader:
        pred = _predict_batch(coord, atype)
        loss = criterion(pred, target.to(device))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        epoch_train_losses.append(loss.item())

    # Average over THIS epoch only; accumulating across epochs would report a
    # running mean that hides the current state of the model.
    epoch_train_loss = float(np.mean(epoch_train_losses))
    train_loss.append(epoch_train_loss)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}')

    # ---- validation pass ----
    # eval() switches off train-only behaviour (e.g. dropout) so the validation
    # loss is deterministic; train() is restored at the top of the next epoch.
    fitting_model.eval()
    unimol_model.eval()
    epoch_val_losses = []
    with torch.no_grad():
        for coord, atype, target in val_loader:
            pred = _predict_batch(coord, atype)
            loss = criterion(pred, target.to(device))
            epoch_val_losses.append(loss.item())

    epoch_val_loss = float(np.mean(epoch_val_losses))
    val_loss.append(epoch_val_loss)
    print(f'Epoch {epoch+1}/{num_epochs}, Train Loss: {epoch_train_loss:.4f}, Val Loss: {epoch_val_loss:.4f}')

    # ---- checkpointing / early stopping on the per-epoch validation loss ----
    if epoch_val_loss < MIN_LOSS:
        MIN_LOSS = epoch_val_loss
        os.makedirs(pth_save_path, exist_ok=True)
        torch.save(unimol_model.state_dict(), os.path.join(pth_save_path, 'atomic_model_nh.pth'))
        torch.save(fitting_model.state_dict(), os.path.join(pth_save_path, 'atomic_fit_nh.pth'))
        _patience = 0
    else:
        _patience += 1
        if _patience > patience:
            print('Early stopping')
            break


Inference

In [17]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from model import *
from data import *
from utils import *

# Directory the training cell writes its best checkpoints to.
pth_save_path = './example/ads/pth/'

mol2input = Mol2Input()
# First GPU when available, otherwise CPU (both branches are plain strings).
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# NOTE(review): output_dim is hard-coded to 1 here, while the training cell
# derives it from the dataset; they must match for load_state_dict to succeed.
fitting_model = FittingNet(output_dim=1)
unimol_model = UniMolModel(output_dim=1, data_type='molecule', remove_hs=False)

# map_location remaps the stored tensors onto the selected device, so a
# checkpoint saved on a GPU can still be loaded on a CPU-only machine.
fitting_model.load_state_dict(
    torch.load(pth_save_path + 'atomic_fit_nh.pth', map_location=device)
)
unimol_model.load_state_dict(
    torch.load(pth_save_path + 'atomic_model_nh.pth', map_location=device)
)

# Inference only: move to the device and switch to evaluation mode.
for _module in (fitting_model, unimol_model):
    _module.to(device)
    _module.eval()
print('Load model successfully!')

Load model successfully!


In [10]:
import numpy as np  # explicit: previously only available via wildcard imports
import py3Dmol
from ase import Atoms
from ase.io import read, write

input_file = '/vepfs/fs_users/ycjin/ads_predict_tools/example/pub25/data/xyz/400.xyz' # input molecule file
atom = read(input_file)

# Drop hydrogens: keep only the atoms whose chemical symbol is not 'H'.
sym = np.array(atom.get_chemical_symbols())
heavy_mask = sym != 'H'
coord = [torch.tensor(atom.get_positions())[heavy_mask]]
atype = [sym[heavy_mask]]
atom = atom[heavy_mask]

# Swap the coordinates of heavy atoms 0 and 4.
# NOTE(review): only `coord` is swapped — `atype` and `atom` keep the original
# ordering, so the model sees the two positions exchanged while the element
# labels stay put. Confirm this is a deliberate experiment.
t = coord[0][0].clone()
coord[0][0] = coord[0][4]
coord[0][4] = t

input_dict = mol2input.coord2unimol_inputs(coord,atype)
for k in input_dict.keys(): input_dict[k] = input_dict[k].to(device)

# Pure inference: no autograd graph is needed, and the fitting head is
# evaluated once per entry (the sum reuses that result).
with torch.no_grad():
    atomic_reprs = unimol_model(return_repr=True,**input_dict)['atomic_reprs']
    for atom_repr in atomic_reprs:
        atomic_p = fitting_model(atom_repr)  # per-atom outputs, reused by the plotting cell
        p = torch.sum(atomic_p)              # summed prediction for the molecule

print(p)

tensor(2.4000, device='cuda:0', grad_fn=<SumBackward0>)


In [4]:
# Plot the per-atom adsorption-energy contribution map.
# Flatten the per-atom outputs and map them linearly to grey levels
# (scale 1800, offset 255), then truncate to 16-bit integers.
per_atom = np.array(atomic_p.detach().cpu()).reshape(-1)
value = (per_atom * 1800 + 255).astype(np.int16)

# Sphere scale per element symbol; every supported element uses 0.3.
setting = {
    element: [0.3]
    for element in ('H', 'C', 'O', 'N', 'S', 'F', 'Cl', 'Br', 'I')
}

def self_hex(n):
    """Return ``n`` as a two-digit lowercase hex string for one colour channel.

    Values are clamped to the valid channel range 0..255 first. Without the
    clamp a negative value produced a string such as ``'x5'`` and a value
    above 255 produced three digits, both of which are invalid in ``#rrggbb``.
    """
    channel = max(0, min(255, int(n)))
    return format(channel, '02x')

# Write the hydrogen-free structure to disk and read it back once as text; the
# same XYZ block feeds both viewers. The context manager closes the file.
write('md_n.xyz', atom)
with open('md_n.xyz') as xyz_file:
    xyz_block = xyz_file.read()

# View 1: shade each atom with a grey level derived from its value.
view = py3Dmol.view(width=300, height=300)
view.addModel(xyz_block, format='xyz')
for i, element in enumerate(atype[0]):
    shade = self_hex(value[i])
    view.setStyle(
        {'index': i},
        {'sphere': {'scale': setting[element][0], 'color': '#' + shade * 3}},
    )
view.zoomTo(animate=True)
view.show()

# View 2: the same structure with uniform spheres, for reference.
# The calls below previously targeted `view`, which overwrote the per-atom
# colouring above and left `view_2` unstyled and never shown.
view_2 = py3Dmol.view(width=300, height=300)
view_2.addModel(xyz_block, format='xyz')
view_2.setStyle({'sphere': {'scale': 0.3}})
view_2.zoomTo(animate=True)
view_2.show()