In [3]:
%%HTML
<style>
   div#notebook-container    { width: 95%; }
   div#menubar-container     { width: 65%; }
   div#maintoolbar-container { width: 99%; }
</style>

In [1]:
import os
os.getpid()

7364

In [47]:
import os
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold

from ase import Atoms
from ase.db import connect
import schnetpack
import pandas as pd

import torch
import torch.nn.functional as F
from torch.optim import Adam

import schnetpack as spk
import schnetpack.atomistic as atm
import schnetpack.representation as rep
from schnetpack.datasets import *
from schnetpack.data import Structure

In [32]:
def reduce_mem_usage(df, verbose=True):
    numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    start_mem = df.memory_usage().sum() / 1024**2    
    for col in df.columns:
        col_type = df[col].dtypes
        if col_type in numerics:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)    
    end_mem = df.memory_usage().sum() / 1024**2
    if verbose: print('Mem. usage decreased to {:5.2f} Mb ({:.1f}% reduction)'.format(end_mem, 100 * (start_mem - end_mem) / start_mem))
    return df

In [33]:
file_folder =  '../../data/input'
os.listdir(file_folder)

['sample_submission.csv',
 'magnetic_shielding_tensors.csv',
 'potential_energy.csv',
 'scalar_coupling_contributions.csv',
 'dipole_moments.csv',
 'mulliken_charges.csv',
 'train.csv',
 'test.csv',
 'structures.csv',
 'structures']

In [34]:
train = pd.read_csv(f'{file_folder}/train.csv')
test = pd.read_csv(f'{file_folder}/test.csv')
magnetic_shielding_tensors = pd.read_csv(f'{file_folder}/magnetic_shielding_tensors.csv')
dipole_moments = pd.read_csv(f'{file_folder}/dipole_moments.csv')
mulliken_charges = pd.read_csv(f'{file_folder}/mulliken_charges.csv')
potential_energy = pd.read_csv(f'{file_folder}/potential_energy.csv')
scalar_coupling_contributions = pd.read_csv(f'{file_folder}/scalar_coupling_contributions.csv')
structures = pd.read_csv(f'{file_folder}/structures.csv')

In [35]:
device = torch.device("cuda")

In [36]:
def gen_magnetic_shielding_parameters(magnetic_shielding_tensors, path='magnetic_shielding_parameters.pkl'):

    if os.path.exists(path):
        magnetic_shielding_parameters = pd.read_pickle(path)
        return magnetic_shielding_parameters

    x = magnetic_shielding_tensors.columns.values[2:]
    x = magnetic_shielding_tensors[x].values
    x = x.reshape(-1,3,3)
    x = x + np.transpose(x,(0,2,1))
    x = 0.5 * x
    w, v = np.linalg.eigh(x)

    sigma_iso = np.sum(w, axis=1)/3 
    omega = w[:,2] - w[:,0]
    kappa = 3 * (sigma_iso - w[:,1])/omega

    magnetic_shielding_parameters = magnetic_shielding_tensors[magnetic_shielding_tensors.columns.values[:2]]
    magnetic_shielding_parameters = pd.DataFrame(magnetic_shielding_parameters)
    magnetic_shielding_parameters["sigma_iso"] = sigma_iso
    magnetic_shielding_parameters["omega"] = omega
    magnetic_shielding_parameters["kappa"] = kappa
    magnetic_shielding_parameters.to_pickle(path)
    return magnetic_shielding_parameters


def load_dataset(dataset_molecule_names, champs_path='CHAMPS_train.db'):
    
    def create_db(db_path, molecule_names):
        with connect(db_path) as db:
            for name in molecule_names:
                mol = molecules.get_group(name)
                atoms = Atoms(symbols=mol.atom.values, positions=[(row.x,row.y,row.z) for row in mol.itertuples()])
                try:
                    mol_msp = msp.get_group(name)
                    sigma_iso = mol_msp['sigma_iso'].values.reshape(-1,1)
                    omega = mol_msp['omega'].values.reshape(-1,1)
                    kappa = mol_msp['kappa'].values.reshape(-1,1)
                except KeyError:
                    sigma_iso, omega, kappa = [None] * 3
                    
                db.write(atoms, name=name, data=dict(sigma_iso=sigma_iso, omega=omega, kappa=kappa))

    if not os.path.exists(champs_path):
        create_db(db_path=champs_path, molecule_names=dataset_molecule_names)    
    dataset = schnetpack.data.AtomsData(champs_path, properties=['sigma_iso', 'omega', 'kappa'])
    return dataset

In [37]:
magnetic_shielding_parameters = gen_magnetic_shielding_parameters(magnetic_shielding_tensors)
train_molecule_names = train.molecule_name.unique()
molecules = pd.read_csv('../../data/input/structures.csv')
molecules = molecules.groupby('molecule_name')
msp = magnetic_shielding_parameters.groupby('molecule_name')
# dataset = load_dataset(dataset_molecule_names=train_molecule_names)

In [51]:
len(molecules)

130775

In [38]:
test_molecule_names = test.molecule_name.unique()

In [65]:
class MagneticShielding(atm.Atomwise):
    def __init__(self, property):
        super(MagneticShielding, self).__init__(return_contributions=True)
        self.property = property
        
    def forward(self, inputs):
        result = super().forward(inputs)
        
        atom_mask = inputs[Structure.atom_mask].byte()
        
        yi = inputs[self.property]
        yi = torch.masked_select(yi.squeeze(dim=2), atom_mask)
        inputs[self.property+'_true'] = yi
        
        yi = result['yi']
        yi = torch.masked_select(yi.squeeze(dim=2), atom_mask)
        result[self.property+'_pred'] = yi
        
        return result

In [66]:
def schnet_model(property):
    reps = rep.SchNet(n_interactions=6)
    output = MagneticShielding(property=property)
    model = atm.AtomisticModel(reps, output)
    model = model.to(device)
    return model

In [67]:
def test_prediction(dataset, property):
    # create model
    model = schnet_model(property)
    
    # load best parameters
    model.load_state_dict(torch.load(property+'/output/best_model'))
    loader = spk.data.AtomsLoader(dataset, batch_size=256, num_workers=2)
    model.eval()
    
    # predict shielding parameters
    targets = []
    predictions = []
    with torch.no_grad():
        for batch in loader:
            batch = {
                k: v.to(device)
                for k, v in batch.items()
            }
            
            result = model(batch)
            targets += batch[property+'_true'].tolist()
            predictions += result[property+'_pred'].tolist()
    return targets, predictions

In [68]:
# This function comes from the following script:
# https://github.com/atomistic-machine-learning/schnetpack/blob/v0.2.1/src/scripts/schnetpack_qm9.py
def evaluate_dataset(property, metrics, model, loader, device):
    for metric in metrics:
        metric.reset()
        
    targets = []
    predictions = []
    with torch.no_grad():
        for batch in loader:
            batch = {
                k: v.to(device)
                for k, v in batch.items()
            }
            
            result = model(batch)
            targets += batch[property+'_true'].tolist()
            predictions += result[property+'_pred'].tolist()

            for metric in metrics:
                metric.add_batch(batch, result)

    
    for metric in metrics:
        print(metric.n_entries)
    results = [
        metric.aggregate() for metric in metrics
    ]
    return results, targets, predictions

In [73]:
tst_dataset = load_dataset(dataset_molecule_names=test_molecule_names, champs_path=f'CHAMPS_test.db')
# tst_loader = spk.data.AtomsLoader(tst_dataset, batch_size=32, num_workers=2)

In [74]:
def train_model(property, max_epochs=500, ):
    
    list_ = []
    kfold = KFold(n_splits=5, shuffle=True, random_state=42)
    for fold, (train_index, val_index) in enumerate(kfold.split(train_molecule_names)):

        trn_dataset = load_dataset(dataset_molecule_names=train_molecule_names[train_index], champs_path=f'kfold{fold}_CHAMPS_train.db')
        val_dataset = load_dataset(dataset_molecule_names=train_molecule_names[val_index], champs_path=f'kfold{fold}_CHAMPS_valid.db')
        
        
        print(len(trn_dataset), len(val_dataset))
        
        trn_loader = spk.data.AtomsLoader(trn_dataset, batch_size=32, num_workers=2)
        val_loader = spk.data.AtomsLoader(val_dataset, batch_size=32, num_workers=2)

        # create model
        model = schnet_model(property)

        # create trainer
        target_key = property+'_true'
        output_key = property+'_pred'
        opt = Adam(model.parameters(), lr=1e-4)
        loss = lambda b, p: F.mse_loss(p[output_key], b[target_key])
        metrics = [
            spk.metrics.MeanAbsoluteError(target_key, output_key, name='MAE_'+property),
            spk.metrics.RootMeanSquaredError(target_key, output_key, name='RMSE_'+property),
        ]
        hooks = [
            spk.train.MaxEpochHook(max_epochs),
            spk.train.CSVHook(property+'/log', metrics, every_n_epochs=1),
        ]
        trainer = spk.train.Trainer(property+'/output', model, loss, opt, trn_loader, val_loader, hooks=hooks)

        # start training
        trainer.train(device)

        # evaluation
        model.load_state_dict(torch.load(property+'/output/best_model'))
        tst_loader = spk.data.AtomsLoader(tst_dataset, batch_size=256, num_workers=2)
        model.eval()

        # predict
#         val_targets, val_predictions = test_prediction(val_dataset, property)
#         test_targets, test_predictions = test_prediction(tst_dataset, property)

        trn_metric, trn_true, trn_pred = evaluate_dataset(property, metrics, model, trn_loader, device)
        val_metric, val_true, val_pred = evaluate_dataset(property, metrics, model, val_loader, device)
        tst_metric, tst_true, tst_pred = evaluate_dataset(property, metrics, model, tst_loader, device)

        d_ = {}
        d_['property'] = property
        d_['kfold'] = fold
        d_['training'] = trn_metric
        d_['validation'] = val_metric
        d_['val_targets'] = val_true
        d_['val_predictions'] = val_pred
        
        d_['test'] = tst_metric
        df['test_targets'] = tst_true
        df['test_predictions'] = tst_pred
        list_.append(d_)
        print(d_['training'], d_['validation'])
        
        
    return list_

In [75]:
# def show_history(property):
#     df = pd.read_csv(property+'/log/log.csv')
#     display(df.tail())
#     max_value = None # df['RMSE_'+property].min()*5
#     _ = df[['MAE_'+property,'RMSE_'+property]].plot(ylim=(0,max_value))

In [76]:
device = torch.device("cuda")
df_his = pd.DataFrame()
for p in ['sigma_iso', 'omega', 'kappa']:
    print(p)
    list_i = train_model(p, max_epochs=50)
    df_i = pd.DataFrame(list_i)
    df_his = pd.concat([df_his, df_i], axis=0)
    break

sigma_iso
68002 17001
1226505.0
1226505.0
307032.0
307032.0


IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)

In [23]:
def show_predictions(dataset, property):
    targets, predictions = test_prediction(dataset, property)
    df_pred = pd.DataFrame()
    df_pred['Target'] = targets
    df_pred['Prediction'] = predictions
    return df_pred
    

In [None]:
# trn_dataset = load_dataset(dataset_molecule_names=train_molecule_names, champs_path=f'CHAMPS_train.db')

In [27]:
df_pred = show_predictions(tst_dataset, 'sigma_iso')

IndexError: Dimension out of range (expected to be in range of [-2, 1], but got 2)