In [2]:
from ase.io import read, write
import numpy as np
import pandas as pd
import os
from ase.units import Hartree, kcal, mol
# Unit helpers built from the ase.units constants imported above.
# kcal_mol: one kcal/mol expressed in the ase.units unit system
# (presumably eV, ASE's base energy unit -- TODO confirm against the ASE docs).
kcal_mol = kcal/mol
# Hart_to_kcalmol: how many kcal/mol make up one Hartree; dividing an energy
# given in kcal/mol by this factor converts it to Hartree.
Hart_to_kcalmol = Hartree/kcal_mol

# [W4-11 Dataset](http://www.thch.uni-bonn.de/tc.old/downloads/GMTKN/GMTKN55/W4-11.html)

Prepare trajectory with reference atomization energies and CCSD(T) total energies.

In [18]:
# Build a trajectory of W4-11 structures annotated with the reference
# atomization energy and the CCSD(T) total energy of each system.
#
# w4p  : directory holding one sub-directory per system (each with a
#        struc.xyz) plus the reference table ref.dat.
# w4cp : directory holding the CCSD(T) results table ('progress').
w4p = '/home/awills/Documents/Research/datasets/W4-11'
w4cp = '/home/awills/Documents/Research/swxcd/aegis/w411'

# Reference table: tab-separated, first row skipped, no header. Column 1 is
# the system name and column 11 the reference energy.
w4ref = pd.read_csv(os.path.join(w4p, 'ref.dat'), delimiter='\t', skiprows=1, index_col=False, header=None)[[1, 11]]
w4ref.columns = ['name', 'en'] #energy in kcal/mol
# Sign-flipped reference energy converted from kcal/mol to Hartree.
w4ref['en_H'] = -w4ref['en'].values/Hart_to_kcalmol
# Names are whitespace-padded in the file; strip so they match directory names.
w4ref['name'] = w4ref['name'].str.strip()

# CCSD(T) results table; header cells and symbol strings are whitespace-padded.
w4cc = pd.read_csv(os.path.join(w4cp, 'progress'), delimiter='\t')
w4cc.columns = [i.strip() for i in w4cc.columns]
w4cc['atoms.symbols'] = w4cc['atoms.symbols'].str.strip()

# One sub-directory per system, processed in sorted order so the trajectory
# ordering is deterministic.
sdirs = sorted([i for i in os.listdir(w4p) if os.path.isdir(os.path.join(w4p, i))])
atoms = []
for idir in sdirs:
    # First frame of the structure file.
    at = read(os.path.join(w4p,idir,'struc.xyz'), ':')[0]
    # The CCSD(T) table is keyed by the chemical-symbols string, the
    # reference table by directory name. If several rows match, the first
    # one is used.
    sym = str(at.symbols)
    cce = w4cc[w4cc['atoms.symbols'] == sym]['etot  (Har)'].values
    eref = w4ref[w4ref['name'] == idir]['en_H'].values
    print(idir, at, eref, cce)
    if len(eref) == 0 or len(cce) == 0:
        # Energy not in one of the dataframes, so this is not one of the
        # atomization-energy systems (probably a single atom). Skip it
        # explicitly instead of hiding every possible error behind a bare
        # `except:`, so genuine failures still surface.
        continue
    at.info['atomization'] = eref[0]
    at.info['energy'] = cce[0]
    atoms.append(at)
write(os.path.join(w4p, 'w411.traj'), atoms)

acetaldehyde Atoms(symbols='COHCH3', pbc=False) [-1.08024505] [-153.65854659]
acetic Atoms(symbols='C2O2H4', pbc=False) [-1.28128265] [-228.85096309]
al Atoms(symbols='Al', pbc=False) [] []
alcl Atoms(symbols='AlCl', pbc=False) [-0.19540422] [-702.02772785]
alcl3 Atoms(symbols='AlCl3', pbc=False) [-0.49824108] [-1621.83173868]
alf Atoms(symbols='AlF', pbc=False) [-0.26100004] [-341.97854242]
alf3 Atoms(symbols='AlF3', pbc=False) [-0.68678963] [-541.67300648]
alh Atoms(symbols='AlH', pbc=False) [-0.11724126] [-242.69949789]
alh3 Atoms(symbols='AlH3', pbc=False) [-0.33970642] [-243.91895368]
allene Atoms(symbols='C3H4', pbc=False) [-1.12205477] [-116.503411]
b Atoms(symbols='B', pbc=False) [] []
b2 Atoms(symbols='B2', pbc=False) [-0.10750276] [-49.30775175]
b2h6 Atoms(symbols='B2H6', pbc=False) [-0.96735273] [-53.18736169]
be Atoms(symbols='Be', pbc=False) [] []
be2 Atoms(symbols='Be2', pbc=False) [-0.00425332] [-29.2706554]
becl2 Atoms(symbols='BeCl2', pbc=False) [-0.35899697] [-934.498

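The polarized case: the same procedure as above, but the CCSD(T) total energies are read from the `w411_p` results directory and the resulting trajectory is written there as `calc_ref_results.traj`.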

In [3]:
# Build a trajectory of W4-11 structures annotated with the reference
# atomization energy and the CCSD(T) total energy taken from the `w411_p`
# results directory.
#
# w4p  : directory holding one sub-directory per system (each with a
#        struc.xyz) plus the reference table ref.dat.
# w4cp : directory holding the CCSD(T) results table ('progress'); the
#        output trajectory is written here as well.
w4p = '/home/awills/Documents/Research/datasets/W4-11'
w4cp = '/home/awills/Documents/Research/swxcd/aegis/w411_p'

# Reference table: tab-separated, first row skipped, no header. Column 1 is
# the system name and column 11 the reference energy.
w4ref = pd.read_csv(os.path.join(w4p, 'ref.dat'), delimiter='\t', skiprows=1, index_col=False, header=None)[[1, 11]]
w4ref.columns = ['name', 'en'] #energy in kcal/mol
# Sign-flipped reference energy converted from kcal/mol to Hartree.
w4ref['en_H'] = -w4ref['en'].values/Hart_to_kcalmol
# Names are whitespace-padded in the file; strip so they match directory names.
w4ref['name'] = w4ref['name'].str.strip()

# CCSD(T) results table; header cells and symbol strings are whitespace-padded.
w4cc = pd.read_csv(os.path.join(w4cp, 'progress'), delimiter='\t')
w4cc.columns = [i.strip() for i in w4cc.columns]
w4cc['atoms.symbols'] = w4cc['atoms.symbols'].str.strip()

# One sub-directory per system, processed in sorted order so the trajectory
# ordering is deterministic.
sdirs = sorted([i for i in os.listdir(w4p) if os.path.isdir(os.path.join(w4p, i))])
atoms = []
for idir in sdirs:
    # First frame of the structure file.
    at = read(os.path.join(w4p,idir,'struc.xyz'), ':')[0]
    # The CCSD(T) table is keyed by the chemical-symbols string, the
    # reference table by directory name. If several rows match, the first
    # one is used.
    sym = str(at.symbols)
    cce = w4cc[w4cc['atoms.symbols'] == sym]['etot  (Har)'].values
    eref = w4ref[w4ref['name'] == idir]['en_H'].values
    print(idir, at, eref, cce)
    if len(eref) == 0 or len(cce) == 0:
        # Energy not in one of the dataframes, so this is not one of the
        # atomization-energy systems (probably a single atom). Skip it
        # explicitly instead of hiding every possible error behind a bare
        # `except:`, so genuine failures still surface.
        continue
    at.info['atomization'] = eref[0]
    at.info['energy'] = cce[0]
    atoms.append(at)
write(os.path.join(w4cp, 'calc_ref_results.traj'), atoms)

acetaldehyde Atoms(symbols='COHCH3', pbc=False) [-1.08024505] [-153.65854663]
acetic Atoms(symbols='C2O2H4', pbc=False) [-1.28128265] [-228.85096313]
al Atoms(symbols='Al', pbc=False) [] []
alcl Atoms(symbols='AlCl', pbc=False) [-0.19540422] [-702.02772785]
alcl3 Atoms(symbols='AlCl3', pbc=False) [-0.49824108] [-1621.83173863]
alf Atoms(symbols='AlF', pbc=False) [-0.26100004] [-341.97854249]
alf3 Atoms(symbols='AlF3', pbc=False) [-0.68678963] [-541.67300646]
alh Atoms(symbols='AlH', pbc=False) [-0.11724126] [-242.6994979]
alh3 Atoms(symbols='AlH3', pbc=False) [-0.33970642] [-243.91895368]
allene Atoms(symbols='C3H4', pbc=False) [-1.12205477] [-116.50341088]
b Atoms(symbols='B', pbc=False) [] []
b2 Atoms(symbols='B2', pbc=False) [-0.10750276] [-49.32538976]
b2h6 Atoms(symbols='B2H6', pbc=False) [-0.96735273] [-53.18736149]
be Atoms(symbols='Be', pbc=False) [] []
be2 Atoms(symbols='Be2', pbc=False) [-0.00425332] [-29.2706554]
becl2 Atoms(symbols='BeCl2', pbc=False) [-0.35899697] [-934.49

In [4]:
# Sanity check: list every structure kept in `atoms` together with its
# position in the list and the info dict attached to it.
for idx, at in enumerate(atoms):
    print(f"{idx} {at} {at.info}")

0 Atoms(symbols='COHCH3', pbc=False) {'': True, 'atomization': -1.0802450452229493, 'energy': -153.65854662583965}
1 Atoms(symbols='C2O2H4', pbc=False) {'': True, 'atomization': -1.2812826474411092, 'energy': -228.8509631250595}
2 Atoms(symbols='AlCl', pbc=False) {'': True, 'atomization': -0.19540422113454556, 'energy': -702.0277278479019}
3 Atoms(symbols='AlCl3', pbc=False) {'': True, 'atomization': -0.498241083217283, 'energy': -1621.8317386292113}
4 Atoms(symbols='AlF', pbc=False) {'': True, 'atomization': -0.261000043528812, 'energy': -341.9785424909245}
5 Atoms(symbols='AlF3', pbc=False) {'': True, 'atomization': -0.6867896309652065, 'energy': -541.6730064592647}
6 Atoms(symbols='AlH', pbc=False) {'': True, 'atomization': -0.11724125779957686, 'energy': -242.699497896186}
7 Atoms(symbols='AlH3', pbc=False) {'': True, 'atomization': -0.33970642495416614, 'energy': -243.91895367998052}
8 Atoms(symbols='C3H4', pbc=False) {'': True, 'atomization': -1.1220547725524272, 'energy': -116.5