In [1]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
from tqdm import tqdm, trange

from dGbyG.utils.custom_tools import rapid_process_result, rapid_linear_reg
from dGbyG.network.Dataset import Train_Dataset
from dGbyG.network.GNNetwork import MP_network
from dGbyG.train.trainer import Model
from dGbyG.config import train_data_path, inference_model_path

Preparing the training data and network

In [2]:
# Load the training table and take the mean of its 'std' column as the
# fallback uncertainty for rows that carry no SEM.
TrainingData_df = pd.read_csv(train_data_path)
mean_std = TrainingData_df['std'].mean()

# Per-row noise scale: mean_std when SEM is NaN, otherwise
# sqrt(SEM**2 + mean_std**2 / n).
Scale = np.array([
    mean_std if np.isnan(sem) else (sem**2 + mean_std**2/n)**0.5
    for n, sem in zip(TrainingData_df['n'], TrainingData_df['SEM'])
])

equation = TrainingData_df['reaction']
standard_dG_prime = TrainingData_df['standard_dg_prime']
# NOTE(review): this evaluates to 1 / (Scale * median(Scale)), not
# median(Scale) / Scale -- confirm this is the intended weight normalisation.
weight = 1 / Scale / np.median(Scale)

In [3]:
def classify_by_ec(ec_startswith, ec_column=None):
    """Flag which rows belong entirely inside / entirely outside an EC class.

    Each EC annotation is a string holding a Python literal list of EC
    numbers. An entry containing '&' is split into its individual EC numbers
    before classification.

    Parameters
    ----------
    ec_startswith : str
        Prefix an EC number must start with to count as a member of the class.
    ec_column : iterable, optional
        EC annotations, one per row. Defaults to ``TrainingData_df['EC']``.

    Returns
    -------
    children : list of bool
        True where every EC number of the row starts with ``ec_startswith``.
    not_children : list of bool
        True where no EC number of the row starts with ``ec_startswith``.
    all_class : set of str
        Every individual EC number seen across the classified rows.

    Rows whose annotation is NaN or an empty collection are False in both
    lists; rows mixing matching and non-matching EC numbers are too.
    """
    if ec_column is None:
        ec_column = TrainingData_df.loc[:, 'EC']

    children, not_children = [], []
    all_class = set()
    for raw_ec in ec_column:
        if pd.isna(raw_ec):
            children.append(False)
            not_children.append(False)
            continue

        # literal_eval only accepts Python literals, so a crafted cell in the
        # CSV cannot execute arbitrary code the way eval() would.
        ec_numbers = set()
        for ec in ast.literal_eval(raw_ec):
            # 'a&b' entries contribute their parts; plain entries split to themselves.
            ec_numbers.update(ec.split('&'))

        if not ec_numbers:
            # Without this guard an empty annotation would be flagged True in
            # BOTH lists and end up in the training and validation splits.
            children.append(False)
            not_children.append(False)
            continue

        in_class = [ec.startswith(ec_startswith) for ec in ec_numbers]
        children.append(all(in_class))
        not_children.append(not any(in_class))
        all_class |= ec_numbers
    return children, not_children, all_class

EC cross validation

In [4]:
# For each EC prefix '1'..'6', record how many rows fall entirely inside the
# class, entirely outside it, and the sum of the two.
ec_classify_num = []
for n in range(1, 7):
    children, not_children, all_class = classify_by_ec(str(n))
    n_children = sum(children)
    n_not_children = sum(not_children)
    ec_classify_num.append([n_children, n_not_children, n_children + n_not_children])
ec_classify_num

[[116, 333, 449],
 [146, 302, 448],
 [49, 401, 450],
 [58, 392, 450],
 [65, 385, 450],
 [10, 443, 453]]

In [6]:
# Leave-one-EC-class-out cross validation: for each EC prefix '1'..'6' the rows
# entirely inside the class are validated on, while the 'formation dg' rows plus
# the rows entirely outside the class are trained on.
formation_idx = np.where(TrainingData_df.loc[:, 'type'] == 'formation dg')[0]
Mode = 'manual'
for ec in range(1,7):
    children, not_children, all_class = classify_by_ec(str(ec))
    # NOTE(review): train_idx is a plain concatenation; if a 'formation dg' row
    # also carries an EC annotation it could appear twice here or also in
    # val_idx -- confirm against the data.
    train_idx = list(formation_idx) + list(np.where(not_children)[0])
    val_idx = np.where(children)[0]

    results_dir = '../data/results_data/cross_validation_results/EC{0}_cross_validation/'.format(ec)
    # makedirs creates missing parent directories too; os.mkdir raised
    # FileNotFoundError when cross_validation_results/ did not exist yet.
    os.makedirs(results_dir, exist_ok=True)
    for n in range(20):
        name = os.path.join(results_dir, str(n))
        if os.path.exists(name+'.csv'):
            # This replicate was already written by an earlier run.
            continue

        print(n)
        # Perturb the targets with Gaussian noise scaled per row by Scale.
        dG = standard_dG_prime + np.random.randn(standard_dG_prime.shape[0]) * Scale
        TrainSet = Train_Dataset(equations=equation, dGs=dG, weights=weight)

        model = Model()
        model.network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
        Loss, Result_df = model.cross_validation(TrainSet, mode=Mode, train_idx=train_idx, val_idx=val_idx, epochs=9000, lr=1e-4, weight_decay=1e-6)
        # Put the unperturbed targets next to the returned results.
        Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

        # The .csv is written last because its presence marks the replicate as done.
        np.save(name+'.npy', Loss)
        Result_df.to_csv(name+'.csv')

In [None]:

# NOTE(review): this cell has no execution count and looks like a leftover copy
# of the cross-validation loop. It reads `ec`, which is only assigned as the loop
# variable of the EC cross-validation cell, and `reaction_idx`, which is first
# assigned in a later cell. As written it trains on formation_idx and validates
# on reaction_idx but saves under the EC{ec}_cross_validation directory --
# confirm the intent (or remove the cell) before running it.
results_dir = '../data/results_data/cross_validation_results/EC{0}_cross_validation/'.format(ec)
if not os.path.isdir(results_dir):
	os.mkdir(results_dir)
for n in range(20):
    name = os.path.join(results_dir, str(n))
    # Replicates whose .csv already exists are skipped.
    if not os.path.exists(name+'.csv'):
        print(n)
        # Perturb the targets with Gaussian noise scaled per row by Scale.
        dG = standard_dG_prime + np.random.randn(standard_dG_prime.shape[0]) * Scale
        TrainSet = Train_Dataset(equations=equation, dGs=dG, weights=weight)

        network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
        model = Model()
        model.network = network
        Loss, Result_df = model.cross_validation(TrainSet, mode=Mode, train_idx=formation_idx, val_idx=reaction_idx, epochs=9000, lr=1e-4, weight_decay=1e-6)
        # Put the unperturbed targets next to the returned results.
        Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

        np.save(name+'.npy', Loss), Result_df.to_csv(name+'.csv')

Formation-reaction cross validation 

In [31]:
# Integer row positions of the two measurement types in the training table.
type_column = TrainingData_df['type']
formation_idx = np.flatnonzero((type_column == 'formation dg').to_numpy())
reaction_idx = np.flatnonzero((type_column == 'reaction dg').to_numpy())

In [9]:
# Train on the 'formation dg' rows and validate on the 'reaction dg' rows,
# repeated for 20 noise-perturbed replicates.
Mode = 'manual'
results_dir = '../data/results_data/cross_validation_results/formation2reaction_cross_validation/'
# makedirs creates missing parent directories too; os.mkdir raised
# FileNotFoundError when cross_validation_results/ did not exist yet.
os.makedirs(results_dir, exist_ok=True)
for n in range(20):
    name = os.path.join(results_dir, str(n))
    if os.path.exists(name+'.csv'):
        # This replicate was already written by an earlier run.
        continue

    print(n)
    # Perturb the targets with Gaussian noise scaled per row by Scale.
    dG = standard_dG_prime + np.random.randn(standard_dG_prime.shape[0]) * Scale
    TrainSet = Train_Dataset(equations=equation, dGs=dG, weights=weight)

    network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
    model = Model()
    model.network = network
    Loss, Result_df = model.cross_validation(TrainSet, mode=Mode, train_idx=formation_idx, val_idx=reaction_idx, epochs=9000, lr=1e-4, weight_decay=1e-6)
    # Put the unperturbed targets next to the returned results.
    Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

    # The .csv is written last because its presence marks the replicate as done.
    np.save(name+'.npy', Loss)
    Result_df.to_csv(name+'.csv')

In [11]:
# Train on the 'reaction dg' rows and validate on the 'formation dg' rows,
# repeated for 20 noise-perturbed replicates.
Mode = 'manual'
results_dir = '../data/results_data/cross_validation_results/reaction2formation_cross_validation/'
# makedirs creates missing parent directories too; os.mkdir raised
# FileNotFoundError when cross_validation_results/ did not exist yet.
os.makedirs(results_dir, exist_ok=True)
for n in range(20):
    name = os.path.join(results_dir, str(n))
    if os.path.exists(name+'.csv'):
        # This replicate was already written by an earlier run.
        continue

    print(n)
    # Perturb the targets with Gaussian noise scaled per row by Scale.
    dG = standard_dG_prime + np.random.randn(standard_dG_prime.shape[0]) * Scale
    TrainSet = Train_Dataset(equations=equation, dGs=dG, weights=weight)

    network = MP_network(atom_dim=TrainSet[0].x.size(1), bond_dim=TrainSet[0].edge_attr.size(1), emb_dim=300, num_layer=2)
    model = Model()
    model.network = network
    Loss, Result_df = model.cross_validation(TrainSet, mode=Mode, train_idx=reaction_idx, val_idx=formation_idx, epochs=9000, lr=1e-4, weight_decay=1e-6)
    # Put the unperturbed targets next to the returned results.
    Result_df = pd.concat([pd.Series(standard_dG_prime), Result_df], axis=1)

    # The .csv is written last because its presence marks the replicate as done.
    np.save(name+'.npy', Loss)
    Result_df.to_csv(name+'.csv')