In [2]:
from ase.io import read, write
import numpy as np
import pandas as pd
import os
from ase.units import Hartree, kcal, mol
# Unit-conversion helpers built from ase.units.
# NOTE(review): assumes ase.units expresses Hartree and kcal in the same energy unit
# and that `mol` is Avogadro's number -- confirm against the ASE units documentation.
# kcal_mol: one kcal/mol expressed in ase's energy unit.
kcal_mol = kcal/mol
# Hart_to_kcalmol: number of kcal/mol per Hartree; dividing a kcal/mol value by it yields Hartree.
Hart_to_kcalmol = Hartree/kcal_mol

# [W4-11 Dataset](http://www.thch.uni-bonn.de/tc.old/downloads/GMTKN/GMTKN55/W4-11.html)

Prepare trajectory with reference atomization energies and CCSD(T) total energies.

In [3]:
# Build the W4-11 trajectory: one Atoms object per molecule, annotated with the
# reference atomization energy and the CCSD(T) total energy.
# NOTE: machine-specific absolute paths; adjust before re-running elsewhere.
w4p = '/home/awills/Documents/Research/datasets/W4-11'
w4cp = '/home/awills/Documents/Research/swxcd/aegis/w411'

# Reference data: column 1 is the system name, column 11 the reference energy.
w4ref = pd.read_csv(os.path.join(w4p, 'ref.dat'), delimiter='\t', skiprows=1, index_col=False, header=None)[[1, 11]]
w4ref.columns = ['name', 'en'] #energy in kcal/mol
# Sign-flipped reference energy, converted from kcal/mol to Hartree.
w4ref['en_H'] = -w4ref['en'].values/Hart_to_kcalmol
w4ref['name'] = w4ref['name'].str.strip()

# CCSD(T) results table; header names and formula strings carry padding whitespace.
w4cc = pd.read_csv(os.path.join(w4cp, 'progress'), delimiter='\t')
w4cc.columns = [i.strip() for i in w4cc.columns]
w4cc['atoms.symbols'] = w4cc['atoms.symbols'].str.strip()

#charges and multiplicities. mult is 2S+1, pyscf expects just 2S, so save separately.
# Parsed as a headerless, space-delimited table (same layout the polarized cell uses);
# not consumed further in this cell.
w4cm = pd.read_csv(os.path.join(w4p, 'chargemult.dat'), delimiter=' ', index_col=False, header=None)
w4cm.columns = ['name', 'charge', 'mult']

# Every sub-directory of the dataset folder holds one structure in 'struc.xyz'.
sdirs = sorted([i for i in os.listdir(w4p) if os.path.isdir(os.path.join(w4p, i))])
atoms = []
for idir in sdirs:
    at = read(os.path.join(w4p,idir,'struc.xyz'), ':')[0]
    sym = str(at.symbols)
    # NOTE(review): CCSD(T) rows are matched on the formula string only; if two systems
    # share the same string, the first row is used for both -- confirm this is intended.
    cce = w4cc.loc[w4cc['atoms.symbols'] == sym, 'etot  (Har)'].values
    eref = w4ref.loc[w4ref['name'] == idir, 'en_H'].values
    print(idir, at, eref, cce)
    # Skip systems lacking either energy (not one of the AEs, probably a single atom).
    # An explicit check replaces the former bare `except`, so genuine errors still surface.
    if eref.size == 0 or cce.size == 0:
        continue
    at.info['atomization'] = eref[0]
    at.info['energy'] = cce[0]
    atoms.append(at)
write(os.path.join(w4p, 'w411.traj'), atoms)

acetaldehyde Atoms(symbols='COHCH3', pbc=False) [-1.08024505] [-153.65854659]
acetic Atoms(symbols='C2O2H4', pbc=False) [-1.28128265] [-228.85096309]
al Atoms(symbols='Al', pbc=False) [] []
alcl Atoms(symbols='AlCl', pbc=False) [-0.19540422] [-702.02772785]
alcl3 Atoms(symbols='AlCl3', pbc=False) [-0.49824108] [-1621.83173868]
alf Atoms(symbols='AlF', pbc=False) [-0.26100004] [-341.97854242]
alf3 Atoms(symbols='AlF3', pbc=False) [-0.68678963] [-541.67300648]
alh Atoms(symbols='AlH', pbc=False) [-0.11724126] [-242.69949789]
alh3 Atoms(symbols='AlH3', pbc=False) [-0.33970642] [-243.91895368]
allene Atoms(symbols='C3H4', pbc=False) [-1.12205477] [-116.503411]
b Atoms(symbols='B', pbc=False) [] []
b2 Atoms(symbols='B2', pbc=False) [-0.10750276] [-49.30775175]
b2h6 Atoms(symbols='B2H6', pbc=False) [-0.96735273] [-53.18736169]
be Atoms(symbols='Be', pbc=False) [] []
be2 Atoms(symbols='Be2', pbc=False) [-0.00425332] [-29.2706554]
becl2 Atoms(symbols='BeCl2', pbc=False) [-0.35899697] [-934.498

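The polarized case (`w411_p` results): same preparation as above, but each structure also stores its charge, multiplicity and spin, and systems without reference/CCSD(T) energies (e.g. single atoms) are written to a separate trajectory.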

In [25]:
# Build the polarized W4-11 trajectories. Systems with both a reference atomization
# energy and a CCSD(T) total energy go to 'calc_ref_results_s.traj'; the rest
# (probably single atoms) go to 'single_ats.traj' with energies set to None.
# NOTE: machine-specific absolute paths; adjust before re-running elsewhere.
w4p = '/home/awills/Documents/Research/datasets/W4-11'
w4cp = '/home/awills/Documents/Research/swxcd/aegis/w411_p'

# Reference data: column 1 is the system name, column 11 the reference energy.
w4ref = pd.read_csv(os.path.join(w4p, 'ref.dat'), delimiter='\t', skiprows=1, index_col=False, header=None)[[1, 11]]
w4ref.columns = ['name', 'en'] #energy in kcal/mol
# Sign-flipped reference energy, converted from kcal/mol to Hartree.
w4ref['en_H'] = -w4ref['en'].values/Hart_to_kcalmol
w4ref['name'] = w4ref['name'].str.strip()

# CCSD(T) results table; header names and formula strings carry padding whitespace.
w4cc = pd.read_csv(os.path.join(w4cp, 'progress'), delimiter='\t')
w4cc.columns = [i.strip() for i in w4cc.columns]
w4cc['atoms.symbols'] = w4cc['atoms.symbols'].str.strip()

#charges and multiplicities. mult is 2S+1, pyscf expects just 2S, so save separately.
w4cm = pd.read_csv(os.path.join(w4p, 'chargemult.dat'), delimiter=' ', index_col=False, header=None)
w4cm.columns = ['name', 'charge', 'mult']
# Strip padding so names compare cleanly against directory names (as the BH76/MB16-43 cells do).
w4cm['name'] = w4cm['name'].str.strip()

# Every sub-directory of the dataset folder holds one structure in 'struc.xyz'.
sdirs = sorted([i for i in os.listdir(w4p) if os.path.isdir(os.path.join(w4p, i))])
atoms = []
singles = []
for idir in sdirs:
    at = read(os.path.join(w4p,idir,'struc.xyz'), ':')[0]
    sym = str(at.symbols)
    # NOTE(review): CCSD(T) rows are matched on the formula string only; if two systems
    # share the same string, the first row is used for both -- confirm this is intended.
    cce = w4cc.loc[w4cc['atoms.symbols'] == sym, 'etot  (Har)'].values
    eref = w4ref.loc[w4ref['name'] == idir, 'en_H'].values
    # Look the charge/multiplicity row up once instead of filtering twice.
    cm_row = w4cm.loc[w4cm['name'] == idir]
    charge = cm_row['charge'].values[0]
    mult = cm_row['mult'].values[0]
    print(idir, at, eref, cce)
    # An explicit check replaces the former bare `except`, so unrelated errors are
    # no longer silently routed into the "single atom" branch.
    has_energies = eref.size > 0 and cce.size > 0
    #energy not in dataframe -> not one of the AEs, probably single atom
    at.info['atomization'] = eref[0] if has_energies else None
    at.info['energy'] = cce[0] if has_energies else None
    at.info['charge'] = charge
    at.info['multiplicity'] = mult
    at.info['spin'] = mult - 1
    if has_energies:
        atoms.append(at)
    else:
        singles.append(at)
write(os.path.join(w4cp, 'calc_ref_results_s.traj'), atoms)
write(os.path.join(w4cp, 'single_ats.traj'), singles)

acetaldehyde Atoms(symbols='COHCH3', pbc=False) [-1.08024505] [-153.65854663]
acetic Atoms(symbols='C2O2H4', pbc=False) [-1.28128265] [-228.85096313]
al Atoms(symbols='Al', pbc=False) [] []
alcl Atoms(symbols='AlCl', pbc=False) [-0.19540422] [-702.02772785]
alcl3 Atoms(symbols='AlCl3', pbc=False) [-0.49824108] [-1621.83173863]
alf Atoms(symbols='AlF', pbc=False) [-0.26100004] [-341.97854249]
alf3 Atoms(symbols='AlF3', pbc=False) [-0.68678963] [-541.67300646]
alh Atoms(symbols='AlH', pbc=False) [-0.11724126] [-242.6994979]
alh3 Atoms(symbols='AlH3', pbc=False) [-0.33970642] [-243.91895368]
allene Atoms(symbols='C3H4', pbc=False) [-1.12205477] [-116.50341088]
b Atoms(symbols='B', pbc=False) [] []
b2 Atoms(symbols='B2', pbc=False) [-0.10750276] [-49.32538976]
b2h6 Atoms(symbols='B2H6', pbc=False) [-0.96735273] [-53.18736149]
be Atoms(symbols='Be', pbc=False) [] []
be2 Atoms(symbols='Be2', pbc=False) [-0.00425332] [-29.2706554]
becl2 Atoms(symbols='BeCl2', pbc=False) [-0.35899697] [-934.49

# BH76
The intent here is to have a trajectory of all the structures individually, which is post-processed after an evaluation across the trajectory to yield deviation values. This is slightly different from the AE datasets.

In [37]:
# Build the BH76 structure trajectory: every structure is stored individually with its
# name, charge, multiplicity and spin. The reference table is loaded into `bhref`,
# which this cell does not use when building the trajectory.
# NOTE: machine-specific absolute path; adjust before re-running elsewhere.
bhp = '/home/awills/Documents/Research/datasets/BH76'
bhref = pd.read_csv(os.path.join(bhp, 'ref.dat'), delimiter='\t', skiprows=1, index_col=False, header=None)
#staggered columns for multiple stoichiometric arrangements, but always either -1 -1 1 or -1 1
bhref.columns = ['num', 'name1', 'name2', 'name3', 'sto1', 'sto2', 'sto3', 'eref'] #energy in kcal/mol
# Reference energy converted from kcal/mol to Hartree.
bhref['eref_H'] = bhref['eref'].values/Hart_to_kcalmol

#strip whitespace in name columns
for col in ('name1', 'name2', 'name3'):
    bhref[col] = bhref[col].str.strip()

# Charge/multiplicity table: headerless, space-delimited (name, charge, mult).
cm = pd.read_csv(os.path.join(bhp, 'chargemult.dat'), delimiter=' ', index_col=False, header=None)
cm.columns = ['name', 'charge', 'mult']
cm['name'] = cm['name'].str.strip()

# Every sub-directory of the dataset folder holds one structure in 'struc.xyz'.
sdirs = sorted([i for i in os.listdir(bhp) if os.path.isdir(os.path.join(bhp, i))])
atoms = []
for idir in sdirs:
    at = read(os.path.join(bhp,idir,'struc.xyz'), ':')[0]
    # Single lookup per structure; fail with a message naming the structure rather
    # than an anonymous IndexError when chargemult.dat has no matching row.
    cm_row = cm.loc[cm['name'] == idir]
    if cm_row.empty:
        raise KeyError("no charge/multiplicity entry for '{}' in chargemult.dat".format(idir))
    charge = cm_row['charge'].values[0]
    mult = cm_row['mult'].values[0]
    at.info['name'] = idir
    at.info['dirname'] = idir
    at.info['dataset'] = 'bh76'
    at.info['charge'] = charge
    at.info['multiplicity'] = mult
    # mult is 2S+1; spin stores 2S.
    at.info['spin'] = mult - 1
    atoms.append(at)

write(os.path.join(bhp, 'strucs_cm.traj'), atoms)
# Read the trajectory back to check that the info dictionaries survived the round trip.
tst = read(os.path.join(bhp, 'strucs_cm.traj'), ':')
for idx, at in enumerate(tst):
    print(idx, at.info)

0 {'': True, 'name': 'C2H5', 'dirname': 'C2H5', 'dataset': 'bh76', 'charge': 0, 'multiplicity': 2, 'spin': 1}
1 {'': True, 'name': 'C2H6', 'dirname': 'C2H6', 'dataset': 'bh76', 'charge': 0, 'multiplicity': 1, 'spin': 0}
2 {'': True, 'name': 'C5H8', 'dirname': 'C5H8', 'dataset': 'bh76', 'charge': 0, 'multiplicity': 1, 'spin': 0}
3 {'': True, 'name': 'CH2OH', 'dirname': 'CH2OH', 'dataset': 'bh76', 'charge': 0, 'multiplicity': 2, 'spin': 1}
4 {'': True, 'name': 'CH4', 'dirname': 'CH4', 'dataset': 'bh76', 'charge': 0, 'multiplicity': 1, 'spin': 0}
5 {'': True, 'name': 'H2', 'dirname': 'H2', 'dataset': 'bh76', 'charge': 0, 'multiplicity': 1, 'spin': 0}
6 {'': True, 'name': 'H2O', 'dirname': 'H2O', 'dataset': 'bh76', 'charge': 0, 'multiplicity': 1, 'spin': 0}
7 {'': True, 'name': 'H2S', 'dirname': 'H2S', 'dataset': 'bh76', 'charge': 0, 'multiplicity': 1, 'spin': 0}
8 {'': True, 'name': 'HS', 'dirname': 'HS', 'dataset': 'bh76', 'charge': 0, 'multiplicity': 2, 'spin': 1}
9 {'': True, 'name': '

# MB16-43
As above, the intent is to have a trajectory of the individual structures, to be post-processed separately using the stoichiometry from the reference data.

In [36]:
# Build the MB16-43 structure trajectory: every structure is stored individually with
# its name, charge, multiplicity and spin. The reference table is loaded into `mbref`,
# which this cell does not use when building the trajectory.
# NOTE: machine-specific absolute path; adjust before re-running elsewhere.
mbp = '/home/awills/Documents/Research/datasets/MB16-43'
mbref = pd.read_csv(os.path.join(mbp, 'ref.dat'), delimiter='\t', skiprows=1, index_col=False, header=None)
#staggered columns for multiple stoichiometric arrangements, but always either -1 -1 1 or -1 1
#sysname is name of folder with structure, is included in stoich calc
mbref.columns = ['num', 'sysname', 'name1', 'name2', 'name3', 'name4', 'name5', 'name6', 'name7', 'name8', 'name9', 'name10',
                'sto1', 'sto2', 'sto3', 'sto4', 'sto5', 'sto6', 'sto7', 'sto8', 'sto9', 'sto10', 'sto11', 'eref']
# Reference energy converted from kcal/mol to Hartree.
mbref['eref_H'] = mbref['eref'].values/Hart_to_kcalmol
namecols = ['sysname'] + ['name{}'.format(i) for i in range(1, 11)]
#strip whitespace in name columns
# Only string cells are stripped; non-string cells (NaN, numbers from staggered rows)
# are passed through untouched, so this is safe on columns where `.str.strip()` is not.
for n in namecols:
    mbref[n] = mbref[n].map(lambda v: v.strip() if isinstance(v, str) else v)

# Charge/multiplicity table: headerless, space-delimited (name, charge, mult).
cm = pd.read_csv(os.path.join(mbp, 'chargemult.dat'), delimiter=' ', index_col=False, header=None)
cm.columns = ['name', 'charge', 'mult']
cm['name'] = cm['name'].str.strip()

# Every sub-directory of the dataset folder holds one structure in 'struc.xyz'.
sdirs = sorted([i for i in os.listdir(mbp) if os.path.isdir(os.path.join(mbp, i))])
atoms = []
for idir in sdirs:
    at = read(os.path.join(mbp,idir,'struc.xyz'), ':')[0]
    # Single lookup per structure; fail with a message naming the structure rather
    # than an anonymous IndexError when chargemult.dat has no matching row.
    cm_row = cm.loc[cm['name'] == idir]
    if cm_row.empty:
        raise KeyError("no charge/multiplicity entry for '{}' in chargemult.dat".format(idir))
    charge = cm_row['charge'].values[0]
    mult = cm_row['mult'].values[0]
    at.info['name'] = idir
    at.info['dirname'] = idir
    at.info['dataset'] = 'mb16-43'
    at.info['charge'] = charge
    at.info['multiplicity'] = mult
    # mult is 2S+1; spin stores 2S.
    at.info['spin'] = mult - 1
    atoms.append(at)

write(os.path.join(mbp, 'strucs_cm.traj'), atoms)
# Read the trajectory back to check that the info dictionaries survived the round trip.
tst = read(os.path.join(mbp, 'strucs_cm.traj'), ':')
for idx, at in enumerate(tst):
    print(idx, at.info)

0 {'': True, 'name': '01', 'dirname': '01', 'dataset': 'mb16-43', 'charge': 0, 'multiplicity': 1, 'spin': 0}
1 {'': True, 'name': '02', 'dirname': '02', 'dataset': 'mb16-43', 'charge': 0, 'multiplicity': 2, 'spin': 1}
2 {'': True, 'name': '03', 'dirname': '03', 'dataset': 'mb16-43', 'charge': 0, 'multiplicity': 1, 'spin': 0}
3 {'': True, 'name': '04', 'dirname': '04', 'dataset': 'mb16-43', 'charge': 0, 'multiplicity': 1, 'spin': 0}
4 {'': True, 'name': '05', 'dirname': '05', 'dataset': 'mb16-43', 'charge': 0, 'multiplicity': 2, 'spin': 1}
5 {'': True, 'name': '06', 'dirname': '06', 'dataset': 'mb16-43', 'charge': 0, 'multiplicity': 2, 'spin': 1}
6 {'': True, 'name': '07', 'dirname': '07', 'dataset': 'mb16-43', 'charge': 0, 'multiplicity': 2, 'spin': 1}
7 {'': True, 'name': '08', 'dirname': '08', 'dataset': 'mb16-43', 'charge': 0, 'multiplicity': 2, 'spin': 1}
8 {'': True, 'name': '09', 'dirname': '09', 'dataset': 'mb16-43', 'charge': 0, 'multiplicity': 1, 'spin': 0}
9 {'': True, 'name'

  a = np.array(obj)
