In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [5]:
import os
# Display the process id of the running kernel.
# NOTE(review): presumably a debugging aid (e.g. to watch the process from a shell) — confirm, or drop before sharing.
os.getpid()

6627

In [2]:
import os
import pandas as pd
import numpy as np

from ase import Atoms
from ase.db import connect
import schnetpack
import pandas as pd

import torch
import torch.nn.functional as F
from torch.optim import Adam

import schnetpack as spk
import schnetpack.atomistic as atm
import schnetpack.representation as rep
from schnetpack.datasets import *
from schnetpack.data import Structure

In [3]:
def reduce_mem_usage(df, verbose=True):
    """Downcast the numeric columns of ``df`` in place to narrower dtypes.

    Only columns whose dtype is one of int16/int32/int64/float16/float32/float64
    are considered; every other column is left untouched.  Integer columns are
    cast to the first of int8/int16/int32/int64 whose range contains the column's
    min and max (bounds inclusive).  Float columns are cast to float16 or float32
    when the min and max fit that type's finite range, else float64.

    Note that the float check only looks at the value range: casting to
    float16/float32 can lose precision.

    Args:
        df: DataFrame to shrink.  It is modified in place.
        verbose: When true, print the resulting memory usage and the reduction.

    Returns:
        The same ``df`` object, after downcasting.
    """
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float_candidates = (np.float16, np.float32)

    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type) not in numerics:
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if str(col_type)[:3] == 'int':
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype limit still fits.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            # No candidate matched (e.g. min/max are NaN for an empty column):
            # the column keeps its current dtype.
        else:
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out-of-range, infinite or NaN extremes fall back to float64.
                df[col] = df[col].astype(np.float64)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        # Guard the ratio so a zero starting size cannot raise/produce NaN.
        reduction = 100 * (start_mem - end_mem) / start_mem if start_mem else 0.0
        print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, reduction))
    return df

In [4]:
# Directory holding the input files, relative to the notebook's working directory.
file_folder =  '../../data/input'
# List the directory contents to confirm which files are available.
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [5]:
# Load each CSV table from `file_folder` into its own module-level DataFrame.
# Later cells read `train`, `magnetic_shielding_tensors` and `structures.csv` directly.
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [6]:
def gen_magnetic_shielding_parameters(magnetic_shielding_tensors, path='magnetic_shielding_parameters.pkl'):
    """Derive three scalar parameters per row from 3x3 shielding tensors.

    The first two columns of ``magnetic_shielding_tensors`` are carried over as
    identifiers; the remaining columns are reshaped to one 3x3 matrix per row
    (so nine tensor columns are expected).  Each matrix is symmetrised as
    ``(T + T^T) / 2`` and its eigenvalues ``w0 <= w1 <= w2`` give:

    * ``sigma_iso`` = (w0 + w1 + w2) / 3
    * ``omega``     = w2 - w0
    * ``kappa``     = 3 * (sigma_iso - w1) / omega, defined as 0 when omega == 0

    The result is cached: if ``path`` already exists it is loaded and returned
    without looking at ``magnetic_shielding_tensors``; otherwise the freshly
    computed frame is pickled to ``path``.

    Args:
        magnetic_shielding_tensors: DataFrame with two identifier columns
            followed by the tensor components.  It is not modified.
        path: Location of the pickle cache.

    Returns:
        DataFrame with the two identifier columns plus ``sigma_iso``,
        ``omega`` and ``kappa``.
    """
    # NOTE(review): the cache is keyed on the path only, so a stale file silently
    # wins over new input; delete the pickle after changing the source data.
    # pd.read_pickle executes pickle code - only point `path` at trusted files.
    if os.path.exists(path):
        return pd.read_pickle(path)

    columns = magnetic_shielding_tensors.columns.values
    tensors = magnetic_shielding_tensors[columns[2:]].values.reshape(-1, 3, 3)
    # Symmetrise so that a symmetric eigen-solver applies; this also makes the
    # result independent of row- vs column-major ordering of the components.
    tensors = 0.5 * (tensors + np.transpose(tensors, (0, 2, 1)))
    # Only eigenvalues are needed (ascending order per matrix).
    eigenvalues = np.linalg.eigvalsh(tensors)

    sigma_iso = np.sum(eigenvalues, axis=1) / 3
    omega = eigenvalues[:, 2] - eigenvalues[:, 0]
    # For an isotropic tensor omega is 0 and the ratio is undefined; use 0 there
    # instead of producing NaN/inf targets and a divide-by-zero warning.
    kappa = np.divide(
        3 * (sigma_iso - eigenvalues[:, 1]),
        omega,
        out=np.zeros_like(omega),
        where=omega != 0,
    )

    # Explicit copy: the new columns must never be written onto a view/slice of the input.
    magnetic_shielding_parameters = magnetic_shielding_tensors[columns[:2]].copy()
    magnetic_shielding_parameters["sigma_iso"] = sigma_iso
    magnetic_shielding_parameters["omega"] = omega
    magnetic_shielding_parameters["kappa"] = kappa
    magnetic_shielding_parameters.to_pickle(path)
    return magnetic_shielding_parameters


def load_dataset(dataset_molecule_names, champs_path='CHAMPS_train.db'):
    """Return a schnetpack ``AtomsData`` over the ASE database at ``champs_path``.

    The database is only written when no file exists at ``champs_path``; an
    existing file is reused as-is and ``dataset_molecule_names`` is then ignored.

    Relies on two module-level groupby objects keyed by molecule name:
    ``molecules`` (columns ``atom``, ``x``, ``y``, ``z``) and ``msp``
    (columns ``sigma_iso``, ``omega``, ``kappa``).

    Args:
        dataset_molecule_names: Iterable of molecule names to store.
        champs_path: Path of the ASE database file.

    Returns:
        ``schnetpack.data.AtomsData`` exposing the three shielding properties.
    """
    target_keys = ('sigma_iso', 'omega', 'kappa')

    def shielding_targets(name):
        # Column vectors of per-atom targets; every entry is None when the
        # lookup raises KeyError (e.g. the molecule has no shielding rows).
        try:
            per_atom = msp.get_group(name)
            return {key: per_atom[key].values.reshape(-1, 1) for key in target_keys}
        except KeyError:
            return dict.fromkeys(target_keys)

    def create_db(db_path, molecule_names):
        # One database row per molecule: geometry plus its target arrays.
        with connect(db_path) as db:
            for name in molecule_names:
                mol = molecules.get_group(name)
                coordinates = list(zip(mol.x, mol.y, mol.z))
                atoms = Atoms(symbols=mol.atom.values, positions=coordinates)
                db.write(atoms, name=name, data=shielding_targets(name))

    database_missing = not os.path.exists(champs_path)
    if database_missing:
        create_db(db_path=champs_path, molecule_names=dataset_molecule_names)
    return schnetpack.data.AtomsData(champs_path, properties=list(target_keys))

In [7]:
# Per-atom shielding parameters (computed once, then served from the pickle cache).
magnetic_shielding_parameters = gen_magnetic_shielding_parameters(magnetic_shielding_tensors)
train_molecule_names = train.molecule_name.unique()
# Reuse the `structures` table already loaded from `file_folder` instead of
# re-reading structures.csv through a second hard-coded path.
molecules = structures.groupby('molecule_name')
msp = magnetic_shielding_parameters.groupby('molecule_name')
# load_dataset reads the module-level `molecules` and `msp` groupbys defined above.
dataset = load_dataset(dataset_molecule_names=train_molecule_names)

In [8]:
# Shape of the array of distinct molecule names that have shielding parameters.
magnetic_shielding_parameters.molecule_name.unique().shape

(85003,)

In [9]:
# import sys
# !{sys.executable} -m pip install schnetpack

In [10]:
class MagneticShielding(atm.Atomwise):
    """Atomwise output module that exposes masked, flattened targets and predictions.

    On every forward pass it
    * writes ``inputs[property + '_true']``: the target tensor stored under
      ``property`` in the batch, restricted to entries selected by the atom mask;
    * adds ``result[property + '_pred']``: the ``'yi'`` entry of the parent
      output, restricted the same way.

    Both tensors are 1-D (the result of ``torch.masked_select``).
    NOTE(review): ``squeeze(dim=2)`` suggests both sources are shaped
    (batch, atoms, 1) - confirm against the schnetpack version in use.
    """

    def __init__(self, property):
        # return_contributions=True asks the parent to include 'yi' in its output.
        super().__init__(return_contributions=True)
        self.property = property

    def forward(self, inputs):
        outputs = super().forward(inputs)
        valid_atoms = inputs[Structure.atom_mask].byte()

        def keep_valid(per_atom):
            # Drop the third axis, then keep only the mask-selected entries.
            return torch.masked_select(per_atom.squeeze(dim=2), valid_atoms)

        # The batch dict is mutated on purpose: the loss and metrics read the
        # '<property>_true' key from it.
        inputs[self.property + '_true'] = keep_valid(inputs[self.property])
        outputs[self.property + '_pred'] = keep_valid(outputs['yi'])
        return outputs

In [11]:
def schnet_model(property):
    """Build a SchNet model with a ``MagneticShielding`` head for ``property``.

    The representation uses six interaction blocks.  The assembled model is
    moved to the module-level ``device`` before being returned.
    """
    network = atm.AtomisticModel(
        rep.SchNet(n_interactions=6),
        MagneticShielding(property=property),
    )
    return network.to(device)

In [27]:
def train_model(property, max_epochs=500, batch_size=64, lr=1e-4):
    """Train a SchNet model on one shielding property and report its metrics.

    Uses the module-level ``dataset`` and ``device``.  The dataset is split
    80/10/10 into train/validation/test.  Logs are written to
    ``<property>/log`` and model files to ``<property>/output``; after
    training, the weights stored at ``<property>/output/best_model`` are loaded
    and MAE/RMSE are displayed for all three splits.

    NOTE(review): when ``<property>/output`` already exists the trainer appears
    to continue from it rather than start over - confirm, and remove the
    directory for a clean run.

    Args:
        property: Name of the target ('sigma_iso', 'omega' or 'kappa').
        max_epochs: Epoch limit passed to ``MaxEpochHook``.
        batch_size: Batch size of the training and validation loaders.
        lr: Learning rate of the Adam optimiser.

    Returns:
        The held-out test subset produced by the split.
    """
    # split in train, val and test
    n_dataset = len(dataset)
    print(n_dataset)
    n_val = n_dataset // 10
    train_data, val_data, test_data = dataset.create_splits(n_dataset - n_val * 2, n_val)
    # Reshuffle the training set every epoch; without shuffle=True each epoch
    # would replay the identical sequence of batches.
    train_loader = spk.data.AtomsLoader(train_data, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = spk.data.AtomsLoader(val_data, batch_size=batch_size, num_workers=2)

    # create model
    model = schnet_model(property)

    # create trainer
    target_key = property + '_true'   # written into the batch by MagneticShielding.forward
    output_key = property + '_pred'   # written into the result by MagneticShielding.forward
    opt = Adam(model.parameters(), lr=lr)

    def loss(batch, result):
        return F.mse_loss(result[output_key], batch[target_key])

    metrics = [
        spk.metrics.MeanAbsoluteError(target_key, output_key, name='MAE_' + property),
        spk.metrics.RootMeanSquaredError(target_key, output_key, name='RMSE_' + property),
    ]
    hooks = [
        spk.train.MaxEpochHook(max_epochs),
        spk.train.CSVHook(property + '/log', metrics, every_n_epochs=1),
    ]
    trainer = spk.train.Trainer(property + '/output', model, loss,
                                opt, train_loader, val_loader, hooks=hooks)

    # start training
    trainer.train(device)

    # evaluation
    # map_location keeps the load working when the weights were saved on another device.
    model.load_state_dict(torch.load(property + '/output/best_model', map_location=device))
    test_loader = spk.data.AtomsLoader(test_data, batch_size=256, num_workers=2)
    model.eval()

    df = pd.DataFrame()
    # Row labels follow the order of `metrics` above.
    df['metric'] = ['MAE', 'RMSE']
    df['training'] = evaluate_dataset(metrics, model, train_loader, device)
    df['validation'] = evaluate_dataset(metrics, model, val_loader, device)
    df['test'] = evaluate_dataset(metrics, model, test_loader, device)
    display(df)

    return test_data

In [28]:
# Adapted from the schnetpack QM9 example script:
# https://github.com/atomistic-machine-learning/schnetpack/blob/v0.2.1/src/scripts/schnetpack_qm9.py
def evaluate_dataset(metrics, model, loader, device):
    """Run ``model`` over ``loader`` and return one aggregated value per metric.

    Every metric is reset first, then fed each (batch, result) pair with
    gradients disabled, and finally aggregated.  The model's train/eval mode is
    not changed here.

    Args:
        metrics: Objects providing ``reset``, ``add_batch`` and ``aggregate``.
        model: Callable taking a batch dict and returning a result.
        loader: Iterable of batch dicts whose values support ``.to(device)``.
        device: Target device for the batch tensors.

    Returns:
        List of aggregated metric values, in the order of ``metrics``.
    """
    for tracker in metrics:
        tracker.reset()

    with torch.no_grad():
        for raw_batch in loader:
            on_device = dict(
                (key, tensor.to(device)) for key, tensor in raw_batch.items()
            )
            outputs = model(on_device)
            for tracker in metrics:
                tracker.add_batch(on_device, outputs)

    scores = []
    for tracker in metrics:
        scores.append(tracker.aggregate())
    return scores

In [29]:
def show_history(property):
    """Display the last rows of ``<property>/log/log.csv`` and plot its MAE/RMSE columns.

    Returns nothing; the table and the plot are the only outputs.
    """
    history = pd.read_csv(property+'/log/log.csv')
    display(history.tail())
    metric_columns = ['MAE_'+property, 'RMSE_'+property]
    # Lower y-limit pinned at 0; the upper limit is None, i.e. left to matplotlib.
    history[metric_columns].plot(ylim=(0, None))

In [30]:
def test_prediction(dataset, property):
    """Predict ``property`` for every atom in ``dataset`` with the saved best model.

    A fresh model is built with ``schnet_model`` and its weights are loaded from
    ``<property>/output/best_model``.  Uses the module-level ``device``.

    Returns:
        ``(targets, predictions)``: two flat Python lists with one entry per
        mask-selected atom, accumulated over all batches in loader order.
    """
    # create model and load best parameters
    model = schnet_model(property)
    model.load_state_dict(torch.load(property+'/output/best_model'))
    loader = spk.data.AtomsLoader(dataset, batch_size=256, num_workers=2)
    model.eval()

    true_key = property+'_true'
    pred_key = property+'_pred'

    # predict shielding parameters
    targets, predictions = [], []
    with torch.no_grad():
        for raw_batch in loader:
            batch = {}
            for key, tensor in raw_batch.items():
                batch[key] = tensor.to(device)
            result = model(batch)
            # The forward pass stores the masked targets in the batch under true_key.
            targets.extend(batch[true_key].tolist())
            predictions.extend(result[pred_key].tolist())
    return targets, predictions

In [31]:
def show_predictions(dataset, property):
    """Scatter-plot predicted against target values of ``property`` for ``dataset``.

    Returns:
        DataFrame with columns ``Target`` and ``Prediction``, one row per atom.
    """
    targets, predictions = test_prediction(dataset, property)
    comparison = pd.DataFrame().assign(Target=targets, Prediction=predictions)
    comparison.plot.scatter(x='Target', y='Prediction', title=property)
    return comparison

In [32]:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
used_test_data = dict()   # property name -> held-out test subset returned by train_model
df_pred_list = []         # one Target/Prediction DataFrame per property
for p in ['sigma_iso', 'omega', 'kappa']:
    print(p)
    used_test_data[p] = train_model(p, max_epochs=50)
    # show_history only displays the log and returns None, so it cannot
    # provide the prediction table.
    show_history(p)
    # Predictions on the held-out subset of this property's split.
    df_pred_i = show_predictions(used_test_data[p], p)
    df_pred_list.append(df_pred_i)

sigma_iso
85003


Unnamed: 0,metric,training,validation,test
0,MAE,5.692817,5.739599,5.685256
1,RMSE,11.10609,11.905205,11.062726


Unnamed: 0,Time,Learning rate,Train loss,Validation loss,MAE_sigma_iso,RMSE_sigma_iso
45,1565250000.0,0.0001,8492.307022,90.058972,4.644823,9.467138
46,1565250000.0,0.0001,8282.059177,87.795876,4.594982,9.347694
47,1565250000.0,0.0001,8077.765785,85.35535,4.546069,9.21709
48,1565250000.0,0.0001,7881.010014,83.092205,4.497587,9.094334
49,1565251000.0,0.0001,7694.477837,80.85554,4.447851,8.971342


omega
85003


Unnamed: 0,metric,training,validation,test
0,MAE,16.957989,16.92157,16.962412
1,RMSE,29.822474,29.470088,29.884574


Unnamed: 0,Time,Learning rate,Train loss,Validation loss,MAE_omega,RMSE_omega
51,508.329687,0.0001,46186.849564,562.323407,14.426946,23.700291
52,517.915929,0.0001,45068.589722,542.2953,14.456377,23.274223
53,527.623453,0.0001,43936.273132,533.066784,14.520654,23.075304
54,537.403065,0.0001,42988.169006,516.231778,14.341161,22.707861
55,547.176671,0.0001,42007.520981,512.26618,14.391035,22.620689


kappa
85003


Unnamed: 0,metric,training,validation,test
0,MAE,0.16871,0.168654,0.16866
1,RMSE,0.234153,0.234253,0.234286


Unnamed: 0,Time,Learning rate,Train loss,Validation loss,MAE_kappa,RMSE_kappa
45,454.475311,0.0001,4.163489,0.037333,0.137273,0.193252
46,464.304072,0.0001,4.106265,0.036978,0.136538,0.192334
47,474.013097,0.0001,4.050663,0.036636,0.135836,0.191446
48,483.6913,0.0001,3.996587,0.036309,0.135158,0.19059
49,493.515197,0.0001,3.943953,0.035996,0.134503,0.189769


In [2]:
set(['tertiary_distance_2', 'dist_C_0_y', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_diff', 'molecule_atom_index_0_dist_max_div', 'adC3', 'dist_C_3_y', 'tertiary_angle_1', 'yukawa_H.y', 'cos_f0_f1', 'dist_C_1_y', 'dist_to_type_1_mean', 'dist_O_0_y', 'cos_c1', 'adC2', 'dist_C_0_x', 'molecule_atom_index_0_dist_min_div', 'dist_to_type_std', 'adC1', 'tertiary_distance_1', 'dist_H_0_y', 'molecule_dist_min', 'max_distance_y', 'inv_distPE', 'dist_xyz', 'eem_0', 'dist_O_0_x', 'dist_to_type_mean', 'cos_c0_c1', 'cos_c0', 'adN1', 'tertiary_angle_0', 'tertiary_distance_4', 'dist_H_0_x', 'dist_C_1_x', 'inv_distP', 'molecule_atom_index_0_dist_mean_diff', 'tertiary_atom_1', 'tertiary_angle_2', 'mean_molecule_atom_0_dist_xyz', 'dist_C_2_y', 'dist_H_1_y', 'dist_C_3_x', 'dist_H_2_y', 'dist_H_3_y', 'link0', 'yukawa_H.x', 'dist_C_2_x', 'dist_N_0_y', 'dist_to_type_0_mean', 'dist_N_0_x', 'eem_1', 'tertiary_angle_3', 'distance_c1', 'dist_H_3_x', 'tertiary_distance_3', 'cos_f0', 'cos_f1', 'tertiary_atom_2'])-set(['tertiary_distance_2', 'dist_C_0_y', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_diff', 'molecule_atom_index_0_dist_max_div', 'adC3', 'dist_C_3_y', 'tertiary_angle_1', 'yukawa_H.y', 'cos_f0_f1', 'dist_C_1_y', 'dist_to_type_1_mean', 'dist_O_0_y', 'cos_c1', 'adC2', 'dist_C_0_x', 'molecule_atom_index_0_dist_min_div', 'dist_to_type_std', 'adC1', 'tertiary_distance_1', 'dist_H_0_y', 'molecule_dist_min', 'max_distance_y', 'inv_distPE', 'dist_xyz', 'eem_0', 'dist_O_0_x', 'dist_to_type_mean', 'cos_c0_c1', 'cos_c0', 'adN1', 'tertiary_angle_0', 'tertiary_distance_4', 'dist_H_0_x', 'dist_C_1_x', 'inv_distP', 'molecule_atom_index_0_dist_mean_diff', 'tertiary_atom_1', 'tertiary_angle_2', 'mean_molecule_atom_0_dist_xyz', 'dist_C_2_y', 'dist_H_1_y', 'dist_C_3_x', 'dist_H_2_y', 'dist_H_3_y', 'link0', 'yukawa_H.x', 'dist_C_2_x', 'dist_N_0_y', 'dist_to_type_0_mean', 'dist_N_0_x', 'eem_1', 'tertiary_angle_3', 'distance_c1', 'dist_H_3_x', 'tertiary_distance_3', 'cos_f0', 'cos_f1', 
'tertiary_atom_2'])
set(['tertiary_distance_2', 'dist_C_0_y', 'dist_H_1_x', 'molecule_atom_index_0_dist_min_diff', 'molecule_atom_index_0_dist_max_div', 'adC3', 'dist_C_3_y', 'tertiary_angle_1', 'yukawa_H.y', 'cos_f0_f1', 'dist_C_1_y', 'dist_to_type_1_mean', 'dist_O_0_y', 'cos_c1', 'adC2', 'dist_C_0_x', 'molecule_atom_index_0_dist_min_div', 'dist_to_type_std', 'adC1', 'tertiary_distance_1', 'dist_H_0_y', 'molecule_dist_min', 'max_distance_y', 'inv_distPE', 'dist_xyz', 'eem_0', 'dist_O_0_x', 'dist_to_type_mean', 'cos_c0_c1', 'cos_c0', 'adN1', 'tertiary_angle_0', 'tertiary_distance_4', 'dist_H_0_x', 'dist_C_1_x', 'inv_distP', 'molecule_atom_index_0_dist_mean_diff', 'tertiary_atom_1', 'tertiary_angle_2', 'mean_molecule_atom_0_dist_xyz', 'dist_C_2_y', 'dist_H_1_y', 'dist_C_3_x', 'dist_H_2_y', 'dist_H_3_y', 'link0', 'yukawa_H.x', 'dist_C_2_x', 'dist_N_0_y', 'dist_to_type_0_mean', 'dist_N_0_x', 'eem_1', 'tertiary_angle_3', 'distance_c1', 'dist_H_3_x', 'tertiary_distance_3', 'cos_f0', 'cos_f1', 'tertiary_atom_2'])

set()

In [34]:
df_pred_i