In [None]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm, trange
import torch

import matplotlib.pyplot as plt

from dGbyG.utils.custom_tools import rapid_process_result, rapid_linear_reg
from dGbyG.network.Dataset import Train_Dataset
from dGbyG.network.GNNetwork import MP_network
from dGbyG.train.trainer import Model
from dGbyG.config import train_data_path, inference_model_path

Preparing the training data and network

In [None]:
# Load the training table. The columns read below are 'std', 'n', 'SEM',
# 'reaction' and 'standard_dg_prime'.
TrainingData_df = pd.read_csv(train_data_path)

# Column mean of 'std'; used as the fallback noise scale for rows without an SEM.
mean_std = TrainingData_df['std'].mean()

# Per-row noise scale:
#   * SEM missing (NaN) -> mean_std
#   * otherwise         -> sqrt(SEM**2 + mean_std**2 / n)
Scale = np.array([
    mean_std if np.isnan(sem) else (sem**2 + mean_std**2/n)**0.5
    for n, sem in zip(TrainingData_df['n'], TrainingData_df['SEM'])
])

# Reaction equations and their target values, kept as pandas Series.
equation = TrainingData_df['reaction']
standard_dG_prime = TrainingData_df['standard_dg_prime']

# Inverse-scale weights, additionally divided by the median scale.
# NOTE(review): an earlier variant of this line was
# (1/(SEM+1))/np.median((1/(SEM+1))), i.e. normalised by the median of the
# *weights*; here the divisor is the median of Scale itself - confirm that
# this is the intended normalisation.
weight = 1 / Scale / np.median(Scale)

10-fold cross-validation

In [None]:
# 100 repeats of 10-fold cross-validation. Each repeat trains on targets
# perturbed with zero-mean normal noise scaled per row by `Scale`, and passes
# the per-row `weight` array to the data set.
results_dir = '../data/results_data/cross_validation_results/10_fold_cross_validation/'
# Make sure the output directory exists *before* the long computation, so a
# missing folder cannot make the save step fail after a repeat has finished.
os.makedirs(results_dir, exist_ok=True)
for n in range(100):
    name = os.path.join(results_dir, str(n))
    # Resume support: a repeat whose CSV is already on disk is skipped.
    if os.path.exists(name+'.csv'):
        continue
    print(n)
    dG = standard_dG_prime + np.random.randn(standard_dG_prime.shape[0]) * Scale
    TrainSet = Train_Dataset(equations=equation, dGs=dG, weights=weight)

    # Input dimensions are read from the first sample of the data set.
    network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
    model = Model()
    model.network = network
    Loss, Result_df = model.cross_validation(TrainSet, mode=10, epochs=9000, lr=1e-4, weight_decay=1e-6)
    # Prepend the unperturbed targets as the first column of the result table.
    Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

    np.save(name+'.npy', Loss)
    # The CSV is written last because it is the marker the skip check looks for.
    Result_df.to_csv(name+'.csv')

10-fold cross-validation without sample weights (unweighted)

In [None]:
# 100 repeats of 10-fold cross-validation on noise-perturbed targets, with no
# per-row weights handed to the data set (weights=None).
results_dir = '../data/results_data/cross_validation_results/10_fold_cross_validation_unweighing/'
# Make sure the output directory exists *before* the long computation, so a
# missing folder cannot make the save step fail after a repeat has finished.
os.makedirs(results_dir, exist_ok=True)
for n in range(100):
    name = os.path.join(results_dir, str(n))
    # Resume support: a repeat whose CSV is already on disk is skipped.
    if os.path.exists(name+'.csv'):
        continue
    print(n)
    dG = standard_dG_prime + np.random.randn(standard_dG_prime.shape[0]) * Scale
    TrainSet = Train_Dataset(equations=equation, dGs=dG, weights=None)

    # Input dimensions are read from the first sample of the data set.
    network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
    model = Model()
    model.network = network
    Loss, Result_df = model.cross_validation(TrainSet, mode=10, epochs=9000, lr=1e-4, weight_decay=1e-6)
    # Prepend the unperturbed targets as the first column of the result table.
    Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

    np.save(name+'.npy', Loss)
    # The CSV is written last because it is the marker the skip check looks for.
    Result_df.to_csv(name+'.csv')

10-fold cross-validation without random error added to the dG values

In [None]:
# 100 repeats of 10-fold cross-validation on the unperturbed targets
# (no random noise added), with the per-row `weight` array passed to the data set.
results_dir = '../data/results_data/cross_validation_results/10_fold_cross_validation_without_random_dG/'
# Make sure the output directory exists *before* the long computation, so a
# missing folder cannot make the save step fail after a repeat has finished.
os.makedirs(results_dir, exist_ok=True)
for n in range(100):
    name = os.path.join(results_dir, str(n))
    # Resume support: a repeat whose CSV is already on disk is skipped.
    if os.path.exists(name+'.csv'):
        continue
    print(n)
    TrainSet = Train_Dataset(equations=equation, dGs=standard_dG_prime, weights=weight)

    # Input dimensions are read from the first sample of the data set.
    network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
    model = Model()
    model.network = network
    Loss, Result_df = model.cross_validation(TrainSet, mode=10, epochs=9000, lr=1e-4, weight_decay=1e-6)
    # Prepend the targets as the first column of the result table.
    Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

    np.save(name+'.npy', Loss)
    # The CSV is written last because it is the marker the skip check looks for.
    Result_df.to_csv(name+'.csv')

10-fold cross-validation without sample weights and without random error

In [None]:
# 100 repeats of 10-fold cross-validation on the unperturbed targets, with no
# per-row weights handed to the data set (weights=None).
results_dir = '../data/results_data/cross_validation_results/10_fold_cross_validation_unweighing_without_random_dG'
# Make sure the output directory exists *before* the long computation, so a
# missing folder cannot make the save step fail after a repeat has finished.
os.makedirs(results_dir, exist_ok=True)
for n in range(100):
    name = os.path.join(results_dir, str(n))
    # Resume support: a repeat whose CSV is already on disk is skipped.
    if os.path.exists(name+'.csv'):
        continue
    print(n)
    TrainSet = Train_Dataset(equations=equation, dGs=standard_dG_prime, weights=None)

    # Input dimensions are read from the first sample of the data set.
    network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
    model = Model()
    model.network = network
    Loss, Result_df = model.cross_validation(TrainSet, mode=10, epochs=9000, lr=1e-4, weight_decay=1e-6)
    # Prepend the targets as the first column of the result table.
    Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

    np.save(name+'.npy', Loss)
    # The CSV is written last because it is the marker the skip check looks for.
    Result_df.to_csv(name+'.csv')

5-fold cross-validation

In [None]:
# 20 repeats of 5-fold cross-validation. Each repeat trains on targets
# perturbed with zero-mean normal noise scaled per row by `Scale`, and passes
# the per-row `weight` array to the data set.
results_dir = '../data/results_data/cross_validation_results/5_fold_cross_validation/'
# Make sure the output directory exists *before* the long computation, so a
# missing folder cannot make the save step fail after a repeat has finished.
os.makedirs(results_dir, exist_ok=True)
for n in range(20):
    name = os.path.join(results_dir, str(n))
    # Resume support: a repeat whose CSV is already on disk is skipped.
    if os.path.exists(name+'.csv'):
        continue
    print(n)
    dG = standard_dG_prime + np.random.randn(standard_dG_prime.shape[0]) * Scale
    TrainSet = Train_Dataset(equations=equation, dGs=dG, weights=weight)

    # Input dimensions are read from the first sample of the data set.
    network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
    model = Model()
    model.network = network
    Loss, Result_df = model.cross_validation(TrainSet, mode=5, epochs=9000, lr=1e-4, weight_decay=1e-6)
    # Prepend the unperturbed targets as the first column of the result table.
    Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

    np.save(name+'.npy', Loss)
    # The CSV is written last because it is the marker the skip check looks for.
    Result_df.to_csv(name+'.csv')

2-fold cross-validation

In [None]:
# 20 repeats of 2-fold cross-validation. Each repeat trains on targets
# perturbed with zero-mean normal noise scaled per row by `Scale`, and passes
# the per-row `weight` array to the data set.
results_dir = '../data/results_data/cross_validation_results/2_fold_cross_validation/'
# Make sure the output directory exists *before* the long computation, so a
# missing folder cannot make the save step fail after a repeat has finished.
os.makedirs(results_dir, exist_ok=True)
for n in range(20):
    name = os.path.join(results_dir, str(n))
    # Resume support: a repeat whose CSV is already on disk is skipped.
    if os.path.exists(name+'.csv'):
        continue
    print(n)
    dG = standard_dG_prime + np.random.randn(standard_dG_prime.shape[0]) * Scale
    TrainSet = Train_Dataset(equations=equation, dGs=dG, weights=weight)

    # Input dimensions are read from the first sample of the data set.
    network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
    model = Model()
    model.network = network
    Loss, Result_df = model.cross_validation(TrainSet, mode=2, epochs=9000, lr=1e-4, weight_decay=1e-6)
    # Prepend the unperturbed targets as the first column of the result table.
    Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

    np.save(name+'.npy', Loss)
    # The CSV is written last because it is the marker the skip check looks for.
    Result_df.to_csv(name+'.csv')

Train the network

In [None]:
# Train 100 networks on the whole data set, each on its own noise-perturbed
# copy of the targets, and store every network's parameters as <n>.pt.
model_dir = '../network/best_model_params/'
# Make sure the destination exists before training starts, so a missing folder
# cannot make torch.save fail after a model has already been trained.
os.makedirs(model_dir, exist_ok=True)
for n in range(100):
    dG = standard_dG_prime + np.random.randn(standard_dG_prime.shape[0]) * Scale
    TrainSet = Train_Dataset(equations=equation, dGs=dG, weights=weight)

    # Input dimensions are read from the first sample of the data set.
    network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
    model = Model()
    model.network = network

    # NOTE(review): the positional arguments presumably are epochs, lr and
    # weight_decay (same values as the cross-validation cells) - confirm
    # against the signature of Model.train.
    loss_history, Result_df, i = model.train(TrainSet, 9000, 1e-4, 1e-6)
    torch.save(model.network.state_dict(), model_dir+str(n)+'.pt')
