In [1]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import os
import pandas as pd
import numpy as np

from ase import Atoms
from ase.db import connect
import schnetpack
import pandas as pd

import torch
import torch.nn.functional as F
from torch.optim import Adam

import schnetpack as spk
import schnetpack.atomistic as atm
import schnetpack.representation as rep
from schnetpack.datasets import *
from schnetpack.data import Structure

In [2]:
import os
# Display the process id of the current (kernel) process as the cell output.
os.getpid()

14937

In [3]:
# Directory holding the input CSV files; every later read_csv call is relative to it.
file_folder =  '../../data/input'
# List the directory contents as the cell output.
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [4]:
import numpy as np
import pandas as pd
# Read the per-atom tables and group them by molecule name so that a single
# molecule's rows can be fetched with .get_group(name).
molecules = pd.read_csv(f'{file_folder}/structures.csv').groupby('molecule_name')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv').groupby('molecule_name')

In [5]:
# Show the charge rows of one example molecule.
mulliken_charges.get_group('dsgdb9nsd_000003')

Unnamed: 0,molecule_name,atom_index,mulliken_charge
9,dsgdb9nsd_000003,0,-0.589706
10,dsgdb9nsd_000003,1,0.294853
11,dsgdb9nsd_000003,2,0.294853


In [6]:
# Sum the per-atom charges of the same example molecule.
mulliken_charges.get_group('dsgdb9nsd_000003').mulliken_charge.sum()

0.0

In [7]:
# Summary statistics of the charge column over the whole file.
# Note: this re-reads the CSV because `mulliken_charges` now holds the groupby object.
pd.read_csv(f'{file_folder}/mulliken_charges.csv').mulliken_charge.describe()

count    1.533537e+06
mean    -2.256222e-10
std      2.254392e-01
min     -7.334500e-01
25%     -1.915330e-01
50%      9.867800e-02
75%      1.273960e-01
max      7.289810e-01
Name: mulliken_charge, dtype: float64

In [8]:
# Histogram (50 bins) of the charge column over the whole file; the CSV is read again here.
pd.read_csv(f'{file_folder}/mulliken_charges.csv').mulliken_charge.hist(bins=50)

<matplotlib.axes._subplots.AxesSubplot at 0x7fceb38bb630>

In [9]:
from ase import Atoms
from ase.db import connect

def create_db(db_path, molecule_names):
    """Write one ASE database row per molecule, with its Mulliken charges attached.

    The molecule geometry is looked up in the module-level ``molecules``
    groupby and the charges in the module-level ``mulliken_charges`` groupby.

    Args:
        db_path: Path handed to ``ase.db.connect``.
        molecule_names: Iterable of molecule names to write, in order.

    Each row stores ``data['mulliken_charges']`` (charges reshaped to a single
    column) and ``data['total_charge']`` (the constant 0.0, not computed).
    When a molecule has no charge group (``KeyError``), both entries are None.
    """
    with connect(db_path) as db:
        for name in molecule_names:
            mol = molecules.get_group(name)
            coordinates = list(zip(mol.x, mol.y, mol.z))
            atoms = Atoms(symbols=mol.atom.values, positions=coordinates)

            try:
                charge_rows = mulliken_charges.get_group(name)
            except KeyError:
                # No charge rows for this molecule: store placeholders.
                charges, total_charge = None, None
            else:
                charges = charge_rows.mulliken_charge.values.reshape(-1, 1)
                total_charge = 0.0

            payload = {'mulliken_charges': charges, 'total_charge': total_charge}
            db.write(atoms, name=name, data=payload)

In [10]:
# Unique molecule names that appear in the training table.
train = pd.read_csv(f'{file_folder}/train.csv')
train_molecule_names = train.molecule_name.unique()

# Write the selected training molecules into an ASE database file.
champs_path = 'CHAMPS_train_mulliken_charges.db'
# All training molecules are used; the trailing 12000 looks like a smaller
# size used while experimenting — TODO confirm.
dataset_size =  len(train_molecule_names) # 12000
dataset_molecule_names = train_molecule_names[:dataset_size]
create_db(db_path=champs_path, molecule_names=dataset_molecule_names)

In [11]:
# Unique molecule names that appear in the test table.
test = pd.read_csv(f'{file_folder}/test.csv')
test_molecule_names = test.molecule_name.unique()

In [12]:
# Write the test molecules into their own ASE database file.
# create_db stores None for both data entries when a molecule has no charge rows.
champs_test_path = 'CHAMPS_test_mulliken_charges.db'
create_db(db_path=champs_test_path, molecule_names=test_molecule_names)

In [13]:
import schnetpack
# Open both database files as SchNetPack datasets, requesting the two entries written by create_db.
# NOTE(review): rows written through create_db's KeyError branch hold None for both
# properties — confirm AtomsData can load such rows before iterating database_test.
database = schnetpack.data.AtomsData(champs_path, properties=['mulliken_charges', 'total_charge'])
database_test = schnetpack.data.AtomsData(champs_test_path, properties=['mulliken_charges', 'total_charge'])

In [14]:
import pandas as pd

import torch
import torch.nn.functional as F
from torch.optim import Adam

import schnetpack as spk
import schnetpack.atomistic as atm
import schnetpack.representation as rep
from schnetpack.datasets import *

# Device used for the model and for every batch. Prefer the GPU, but fall back
# to the CPU so the notebook still runs on machines without CUDA.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [15]:
# This function comes from the following script:
# https://github.com/atomistic-machine-learning/schnetpack/blob/v0.2.1/src/scripts/schnetpack_qm9.py
def evaluate_dataset(metrics, model, loader, device):
    """Run ``model`` over every batch of ``loader`` and aggregate ``metrics``.

    Args:
        metrics: Objects providing ``reset()``, ``add_batch(batch, result)``
            and ``aggregate()``.
        model: Callable taking a batch dict and returning the model result.
        loader: Iterable yielding dicts whose values support ``.to(device)``.
        device: Device every batch value is moved to before the forward pass.

    Returns:
        A list with one aggregated value per metric, in the order of ``metrics``.
    """
    # Start from a clean state so earlier evaluations do not leak in.
    for tracker in metrics:
        tracker.reset()

    # Gradients are not needed for evaluation.
    with torch.no_grad():
        for raw_batch in loader:
            moved = {}
            for key, value in raw_batch.items():
                moved[key] = value.to(device)

            output = model(moved)

            for tracker in metrics:
                tracker.add_batch(moved, output)

    aggregated = []
    for tracker in metrics:
        aggregated.append(tracker.aggregate())
    return aggregated

In [16]:
from schnetpack.data import Structure

class Mulliken(atm.Atomwise):
    """Atomwise output module that exposes flattened per-atom charge tensors.

    On top of the parent's result, ``forward`` stores two 1-D selections made
    with the atom mask: the reference charges under ``inputs['mulliken_true']``
    and the per-atom outputs under ``result['mulliken_pred']``.
    """

    def __init__(self):
        # Per-atom contributions are requested from the parent; forward() reads result['yi'].
        super().__init__(return_contributions=True)

    def forward(self, inputs):
        """Run the parent forward pass and add the masked true/predicted charges.

        Note that ``inputs`` is modified in place: the key ``'mulliken_true'``
        is added to it. ``inputs`` must contain ``'mulliken_charges'``.
        """
        result = super().forward(inputs)

        # Presumably marks real (non-padding) atoms — confirm against SchNetPack's Structure.atom_mask.
        mask = inputs[Structure.atom_mask].byte()

        def _select_masked(per_atom):
            # Drop dimension 2, then keep only the entries where the mask is set.
            return torch.masked_select(per_atom.squeeze(dim=2), mask)

        inputs['mulliken_true'] = _select_masked(inputs['mulliken_charges'])
        result['mulliken_pred'] = _select_masked(result['yi'])

        return result

In [17]:
def schnet_model():
    """Build a SchNet representation (6 interactions) with a Mulliken output head.

    Returns:
        The ``AtomisticModel`` after it has been moved to the module-level ``device``.
    """
    representation = rep.SchNet(n_interactions=6)
    head = Mulliken()
    return atm.AtomisticModel(representation, head).to(device)

In [18]:
# Split the training database: one tenth each for validation and test, the rest for training.
# train_model() builds its own split, so these globals are only inspected in the next cell.
n_dataset = len(database)
n_val = n_dataset // 10
train_data, val_data, test_data = database.create_splits(n_dataset - 2*n_val, n_val)

In [19]:
# Show the split sizes and confirm a split has the same type as the test dataset.
len(val_data),len(train_data), len(test_data), type(test_data), type(database_test)

(25500, 204009, 25500, schnetpack.data.AtomsData, schnetpack.data.AtomsData)

In [25]:
def train_model(max_epochs=500, batch_size=64):
    """Train a SchNet Mulliken-charge model and display its error metrics.

    A new train/validation/test split of the module-level ``database`` is
    created on every call. After training, the weights stored at
    ``output/best_model`` are loaded and MAE / RMSE are computed on all three
    splits and displayed as a table.

    Args:
        max_epochs: Epoch limit passed to ``spk.train.MaxEpochHook``.
        batch_size: Batch size of the training and validation loaders
            (default 64, the value that used to be hard-coded).

    Returns:
        The test split created by this call.
    """
    # split in train, val and test
    n_dataset = len(database)
    n_val = n_dataset // 10
    train_data, val_data, test_data = database.create_splits(n_dataset - 2*n_val, n_val)
    train_loader = spk.data.AtomsLoader(train_data, batch_size=batch_size, num_workers=2)
    val_loader = spk.data.AtomsLoader(val_data, batch_size=batch_size, num_workers=2)

    # create model
    model = schnet_model()

    # create trainer
    # Both keys are filled in by Mulliken.forward: the target is written into
    # the batch dict, the prediction into the result dict.
    true_key = 'mulliken_true'
    pred_key = 'mulliken_pred'
    opt = Adam(model.parameters(), lr=1e-4)
    loss = lambda b, p: F.mse_loss(p[pred_key], b[true_key])
    metrics = [
        spk.metrics.MeanAbsoluteError(true_key, pred_key, name='MAE_mulliken'),
        spk.metrics.RootMeanSquaredError(true_key, pred_key, name='RMSE_mulliken'),
    ]
    hooks = [
        spk.train.MaxEpochHook(max_epochs),
        spk.train.CSVHook('log', metrics, every_n_epochs=1),
    ]
    trainer = spk.train.Trainer('output', model, loss,
                            opt, train_loader, val_loader, hooks=hooks)

    # start training
    trainer.train(device)

    # evaluation
    model.load_state_dict(torch.load('output/best_model'))
    # Use the test split created above; the undefined name `test_data2` used
    # here before raised a NameError once training had finished.
    test_loader = spk.data.AtomsLoader(test_data, batch_size=256, num_workers=2)
    model.eval()

    df = pd.DataFrame()
    df['metric'] = ['MAE_mulliken', 'RMSE_mulliken']
    df['training'] = evaluate_dataset(metrics, model, train_loader, device)
    df['validation'] = evaluate_dataset(metrics, model, val_loader, device)
    df['test'] = evaluate_dataset(metrics, model, test_loader, device)
    display(df)

    return test_data

In [26]:
def show_history():
    """Display the last rows of ``log/log.csv`` and plot the MAE and RMSE columns."""
    history = pd.read_csv('log/log.csv')
    display(history.tail())

    # The upper y-limit is left open; a cap such as
    # history['RMSE_mulliken'].min()*5 could be used instead.
    y_upper = None
    history[['MAE_mulliken', 'RMSE_mulliken']].plot(ylim=(0, y_upper))

In [27]:
def test_prediction(dataset, model_path):
    """Predict Mulliken charges for ``dataset`` with weights loaded from ``model_path``.

    Args:
        dataset: Dataset handed to ``spk.data.AtomsLoader``.
        model_path: Path of the state dict loaded with ``torch.load``.

    Returns:
        ``(targets, predictions)``: two flat Python lists collected over all batches.
    """
    # Build a fresh network and restore the saved parameters.
    net = schnet_model()
    net.load_state_dict(torch.load(model_path))
    batches = spk.data.AtomsLoader(dataset, batch_size=256, num_workers=2)
    net.eval()

    targets, predictions = [], []
    with torch.no_grad():
        for raw_batch in batches:
            moved = {}
            for key, value in raw_batch.items():
                moved[key] = value.to(device)

            output = net(moved)

            # 'mulliken_true' is written into the batch dict by Mulliken.forward.
            targets.extend(moved['mulliken_true'].tolist())
            predictions.extend(output['mulliken_pred'].tolist())
    return targets, predictions

In [28]:
# Short 2-epoch training run, then plot the logged metrics.
# NOTE(review): the recorded output of this cell is a CUDA out-of-memory error,
# so this run did not complete.
used_test_data = train_model(max_epochs=2)
show_history()

RuntimeError: CUDA out of memory. Tried to allocate 2.00 MiB (GPU 0; 7.44 GiB total capacity; 6.48 GiB already allocated; 1024.00 KiB free; 490.56 MiB cached)

In [22]:
# targets, predictions = test_prediction(dataset, '../../data/temp/mpnn_keras/best_model_charges')
# df_pred = pd.DataFrame()
# df_pred['Target'] = targets
# df_pred['Prediction'] = predictions

In [20]:
# def show_predictions(dataset):
#     targets, predictions = test_prediction(dataset)
#     df_pred = pd.DataFrame()
#     df_pred['Target'] = targets
#     df_pred['Prediction'] = predictions
#     df_pred.plot.scatter(x='Target', y='Prediction')

In [21]:
# used_test_data = train_model(max_epochs=200)
# show_history()