## HH132 数据集的排除过程

In [1]:
import numpy as np
import pandas as pd
from util import (BOHR, read_mat, read_comp, get_iso, get_aniso, read_by_prompt, get_df_err, get_rmsre_3comp, get_relrmsd_3comp)
import itertools

# Notebook-wide display configuration.
# NumPy: 8 significant digits, no scientific notation for small values, wide lines.
np.set_printoptions(precision=8, suppress=True, linewidth=150)
# pandas: never truncate rows; show floats with three decimals.
# Full option names are used so that the lookup does not rely on pandas'
# abbreviated-name matching, which can become ambiguous if new options are added.
pd.set_option("display.max_rows", None)
pd.set_option("display.precision", 3)
pd.set_option("display.float_format", "{:.3f}".format)

In [2]:
import matplotlib.pyplot as plt
from matplotlib_inline.backend_inline import set_matplotlib_formats
# Select the inline matplotlib backend so figures are embedded in cell output.
%matplotlib inline

# Ask the inline backend to emit figures as SVG.
set_matplotlib_formats('svg')

## 定义与数据读入

In [3]:
# Reference table: first CSV column is the row index, the first two rows form a
# two-level column header.
df_hh132_ref = pd.read_csv("raw_data/HH132-hait_ref.csv", index_col=[0], header=[0, 1])
# All molecule labels of the reference table, alphabetically sorted.
mol_hh132 = sorted(df_hh132_ref.index)
# "Spin Polarization" is selected through the top header level, so it comes back
# as a frame; flatten it to a 1-D array of labels before comparing.
spin_label = np.asarray(df_hh132_ref["Spin Polarization"]).flatten()
# Molecules labelled "NSP" / "SP". Sorted so that the list order is reproducible
# across kernel restarts (a plain list(set(...)) order depends on string hashing).
mol_nsp = sorted(set(mol_hh132).intersection(df_hh132_ref.index[spin_label == "NSP"]))
mol_sp = sorted(set(mol_hh132).intersection(df_hh132_ref.index[spin_label == "SP"]))

In [4]:
# HH132 aCVTZ table: row labels from the first column, two-level column header.
df_hh132_acvtz = pd.read_csv(
    "raw_data/HH132-hait_aCVTZ.csv",
    header=[0, 1],
    index_col=[0],
)

In [5]:
# g16 aCVTZ table: row labels from the first column, two-level column header.
df_g16_acvtz = pd.read_csv(
    "raw_data/HH132-g16_aCVTZ.csv",
    header=[0, 1],
    index_col=[0],
)

In [6]:
# dh aCVTZ table: row labels from the first column, two-level column header.
df_dh_acvtz = pd.read_csv(
    "raw_data/HH132-dh_aCVTZ.csv",
    header=[0, 1],
    index_col=[0],
)

## 排除分子具体过程

### 对称性破缺 (tab.6.supp.symm-broken)

In [49]:
# Molecules excluded in the symmetry-breaking step.
# NOTE(review): the table rendered below this cell lists only the first ten
# entries (Be ... SH), and this cell's execution count is later than the table's,
# so the table output looks stale with respect to PS, NCO and CH3O. Re-run to confirm.
mol_exclude_symm = [
    "Be", "Li2", "BN", "NO", "OCl", "OF", "OH",
    "SCl", "SF", "SH", "PS", "NCO", "CH3O",
]

In [8]:
# Side-by-side view for the symmetry-excluded molecules: spin label, the g16 MP2
# xx/yy/zz components ("Analytical") and the HH132 MP2 values ("HH132 Original").
pd.concat(
    [
        df_hh132_ref["Spin Polarization"].loc[mol_exclude_symm],
        df_g16_acvtz["MP2"].loc[mol_exclude_symm, ["xx", "yy", "zz"]],
        df_hh132_acvtz["MP2"].loc[mol_exclude_symm],
    ],
    axis="columns",
).set_axis(
    pd.MultiIndex.from_tuples(
        [("Spin", "Spin"), *itertools.product(["Analytical", "HH132 Original"], ["xx", "yy", "zz"])]
    ),
    axis=1,
)

Unnamed: 0_level_0,Spin,Analytical,Analytical,Analytical,HH132 Original,HH132 Original,HH132 Original
Unnamed: 0_level_1,Spin,xx,yy,zz,xx,yy,zz
Be,SP,7.019,7.019,7.667,7.074,7.074,7.074
Li2,SP,26.837,22.98,39.702,22.589,22.589,39.719
BN,SP,3.427,2.521,2.156,3.446,3.446,2.161
NO,SP,1.444,1.24,0.434,1.445,1.445,0.557
OCl,SP,2.475,2.39,4.289,2.391,2.391,4.296
OF,SP,1.057,1.081,1.814,1.057,1.057,1.816
OH,SP,1.071,0.879,1.244,1.072,1.072,1.245
SCl,SP,4.065,4.387,7.137,4.393,4.393,7.147
SF,SP,3.17,2.786,3.592,3.178,3.178,3.594
SH,SP,2.875,3.449,3.446,3.458,3.458,3.45


In [9]:
# Number of rows whose spin label equals "NSP" (one count per sub-column).
df_hh132_ref["Spin Polarization"].eq("NSP").sum()

Unnamed: 1_level_1    75
dtype: int64

In [10]:
# Total number of labelled molecules, computed from the NSP/SP lists instead of
# hard-coded literals so the check actually tracks the loaded data.
len(mol_nsp) + len(mol_sp)

132

### MP2 极化率复现问题 (tab.6.supp.mp2-hait-g16)

In [11]:
# Working set for this step: every HH132 molecule except the symmetry-excluded
# ones and the "H" entry, in alphabetical order.
mol_current = sorted(set(mol_hh132).difference(mol_exclude_symm, ["H"]))

In [12]:
# Relative deviation (in percent) of the HH132 MP2 values from the g16 MP2
# xx/yy/zz components, restricted to the current working set.
# The HH132 side is used without an explicit column selection, as in the original cell.
df_mp2_relerr = (
    df_hh132_acvtz["MP2"]
    .div(df_g16_acvtz["MP2"].loc[:, ["xx", "yy", "zz"]])
    .sub(1)
    .mul(100)
    .loc[mol_current]
)

In [13]:
# Flag molecules where any component deviates by more than 2 %.
mask = df_mp2_relerr.abs().gt(2).any(axis=1)

In [14]:
# Flagged molecules: spin label, g16 MP2 components and the relative deviation.
pd.concat(
    [
        df_hh132_ref["Spin Polarization"].loc[mask[mask].index],
        df_g16_acvtz["MP2"].loc[mask[mask].index, ["xx", "yy", "zz"]],
        df_mp2_relerr.loc[mask[mask].index],
    ],
    axis="columns",
).set_axis(
    pd.MultiIndex.from_tuples(
        [("Spin", "Spin"), *itertools.product(["Analytical", "Relative Error / %"], ["xx", "yy", "zz"])]
    ),
    axis=1,
)

Unnamed: 0_level_0,Spin,Analytical,Analytical,Analytical,Relative Error / %,Relative Error / %,Relative Error / %
Unnamed: 0_level_1,Spin,xx,yy,zz,xx,yy,zz
CH2NH,SP,3.299,2.713,6.678,0.129,0.191,-115.065
HOF,SP,1.443,1.254,4.023,0.028,0.061,-40.274
NOCl,SP,5.814,6.487,3.452,-67.82,-8.42,0.075
Na2,SP,30.066,30.066,26.148,0.069,0.069,2.452
NaLi,SP,26.443,26.443,-11.852,0.1,0.1,-18.332


In [15]:
# Molecules excluded in the MP2 step; these are the rows of the table rendered
# above (component deviation above 2 %).
mol_exclude_mp2 = [
    "CH2NH",
    "HOF",
    "NOCl",
    "Na2",
    "NaLi",
]

### MP2 极化率复现问题 (tab.6.supp.mp2-dh-g16)

In [16]:
# Working set for this step: HH132 without the symmetry-excluded molecules and
# without "H", alphabetically sorted.
mol_current = sorted(set(mol_hh132).difference(mol_exclude_symm, ["H"]))

In [17]:
# Relative deviation (in percent) of the dh MP2 xx/yy/zz components from the
# g16 MP2 components, restricted to the current working set.
df_mp2_relerr = (
    df_dh_acvtz["MP2"].loc[:, ["xx", "yy", "zz"]]
    .div(df_g16_acvtz["MP2"].loc[:, ["xx", "yy", "zz"]])
    .sub(1)
    .mul(100)
    .loc[mol_current]
)

In [18]:
# Flag molecules where any component deviates by more than 0.5 %.
mask = df_mp2_relerr.abs().gt(0.5).any(axis=1)

In [19]:
# Flagged molecules: g16 MP2 components next to the relative deviation.
pd.concat(
    [
        df_g16_acvtz["MP2"].loc[mask[mask].index, ["xx", "yy", "zz"]],
        df_mp2_relerr.loc[mask[mask].index],
    ],
    axis="columns",
).set_axis(
    pd.MultiIndex.from_tuples(
        itertools.product(["Analytical (Gaussian)", "Relative Error / %"], ["xx", "yy", "zz"])
    ),
    axis=1,
)

Unnamed: 0_level_0,Analytical (Gaussian),Analytical (Gaussian),Analytical (Gaussian),Relative Error / %,Relative Error / %,Relative Error / %
Unnamed: 0_level_1,xx,yy,zz,xx,yy,zz
CH2NH,3.299,2.713,6.678,0.16,0.073,22.152
NOCl,5.814,6.487,3.452,4.268,4.564,0.017
NaLi,26.443,26.443,-11.852,0.225,0.225,-9.846


## 其他密度泛函复现问题

### B2PLYP

**B2PLYP HH132 original**

In [20]:
# Working set: HH132 minus the symmetry- and MP2-excluded molecules and "H".
mol_current = sorted(set(mol_hh132).difference(mol_exclude_symm, mol_exclude_mp2, ["H"]))

In [21]:
# Relative deviation (in percent) of the HH132 B2PLYP xx/yy/zz components from
# the g16 B2PLYP components, for the current working set.
df_relerr = (
    df_hh132_acvtz["B2PLYP"].loc[:, ["xx", "yy", "zz"]]
    .div(df_g16_acvtz["B2PLYP"].loc[:, ["xx", "yy", "zz"]])
    .sub(1)
    .mul(100)
    .loc[mol_current]
)

In [22]:
# Flag molecules where any component deviates by more than 2 %.
mask = df_relerr.abs().gt(2).any(axis=1)

In [23]:
# Flagged molecules: spin label, g16 B2PLYP components and the relative deviation.
pd.concat(
    [
        df_hh132_ref["Spin Polarization"].loc[mask[mask].index],
        df_g16_acvtz["B2PLYP"].loc[mask[mask].index, ["xx", "yy", "zz"]],
        df_relerr.loc[mask[mask].index],
    ],
    axis="columns",
).set_axis(
    pd.MultiIndex.from_tuples(
        [("Spin", "Spin"), *itertools.product(["Analytical", "Relative Error / %"], ["xx", "yy", "zz"])]
    ),
    axis=1,
)

Unnamed: 0_level_0,Spin,Analytical,Analytical,Analytical,Relative Error / %,Relative Error / %,Relative Error / %
Unnamed: 0_level_1,Spin,xx,yy,zz,xx,yy,zz
C2H,SP,3.543,3.543,4.02,8.564,8.564,-0.127
CN,SP,3.192,3.192,4.518,-18.839,-18.839,-2.985
HNS,SP,5.796,3.959,3.031,-1.502,3.246,10.542
NaCl,NSP,4.334,4.334,5.471,0.698,0.698,2.058
O3,SP,1.713,4.58,2.124,-0.0,-4.345,-1.471


**B2PLYP DH**

In [24]:
# Working set (recomputed for this section): HH132 minus the symmetry- and
# MP2-excluded molecules and "H".
mol_current = sorted(set(mol_hh132).difference(mol_exclude_symm, mol_exclude_mp2, ["H"]))

In [25]:
# Relative deviation (in percent) of the dh B2PLYP xx/yy/zz components from the
# g16 B2PLYP components, for the current working set.
df_relerr = (
    df_dh_acvtz["B2PLYP"].loc[:, ["xx", "yy", "zz"]]
    .div(df_g16_acvtz["B2PLYP"].loc[:, ["xx", "yy", "zz"]])
    .sub(1)
    .mul(100)
    .loc[mol_current]
)

In [26]:
# Flag molecules where any component deviates by more than 0.1 %.
mask = df_relerr.abs().gt(0.1).any(axis=1)

In [27]:
# Flagged molecules: spin label, g16 B2PLYP components and the relative deviation.
pd.concat(
    [
        df_hh132_ref["Spin Polarization"].loc[mask[mask].index],
        df_g16_acvtz["B2PLYP"].loc[mask[mask].index, ["xx", "yy", "zz"]],
        df_relerr.loc[mask[mask].index],
    ],
    axis="columns",
).set_axis(
    pd.MultiIndex.from_tuples(
        [("Spin", "Spin"), *itertools.product(["Analytical", "Relative Error / %"], ["xx", "yy", "zz"])]
    ),
    axis=1,
)

Unnamed: 0_level_0,Spin,Analytical,Analytical,Analytical,Relative Error / %,Relative Error / %,Relative Error / %
Unnamed: 0_level_1,Spin,xx,yy,zz,xx,yy,zz
LiCl,NSP,3.894,3.894,4.194,0.051,0.051,0.245
LiH,NSP,4.313,4.313,3.949,-0.101,-0.101,-0.012


### B2GPPLYP

**B2GPPLYP HH132 original**

In [28]:
# Working set (recomputed for this section): HH132 minus the symmetry- and
# MP2-excluded molecules and "H".
mol_current = sorted(set(mol_hh132).difference(mol_exclude_symm, mol_exclude_mp2, ["H"]))

In [29]:
# Relative deviation (in percent) of the HH132 B2GPPLYP xx/yy/zz components from
# the g16 B2GPPLYP components, for the current working set.
df_relerr = (
    df_hh132_acvtz["B2GPPLYP"].loc[:, ["xx", "yy", "zz"]]
    .div(df_g16_acvtz["B2GPPLYP"].loc[:, ["xx", "yy", "zz"]])
    .sub(1)
    .mul(100)
    .loc[mol_current]
)

In [30]:
# Flag molecules where any component deviates by more than 2 %.
mask = df_relerr.abs().gt(2).any(axis=1)

In [31]:
# Flagged molecules: spin label, g16 B2GPPLYP components and the relative deviation.
pd.concat(
    [
        df_hh132_ref["Spin Polarization"].loc[mask[mask].index],
        df_g16_acvtz["B2GPPLYP"].loc[mask[mask].index, ["xx", "yy", "zz"]],
        df_relerr.loc[mask[mask].index],
    ],
    axis="columns",
).set_axis(
    pd.MultiIndex.from_tuples(
        [("Spin", "Spin"), *itertools.product(["Analytical", "Relative Error / %"], ["xx", "yy", "zz"])]
    ),
    axis=1,
)

Unnamed: 0_level_0,Spin,Analytical,Analytical,Analytical,Relative Error / %,Relative Error / %,Relative Error / %
Unnamed: 0_level_1,Spin,xx,yy,zz,xx,yy,zz
C2H,SP,3.395,3.395,4.009,7.009,7.009,-0.751
CN,SP,3.304,3.304,4.243,-18.696,-18.696,-1.834
HNO,SP,1.49,2.289,2.719,5.253,0.786,1.883
HNS,SP,6.974,3.971,4.786,-18.989,1.673,-30.387
NP,SP,3.357,3.357,6.635,6.48,6.48,-18.12
O2,SP,1.174,1.174,2.193,0.372,0.372,-3.047
O3,SP,1.696,4.518,2.095,0.226,-4.981,-1.451


**B2GPPLYP DH**

In [32]:
# Working set (recomputed for this section): HH132 minus the symmetry- and
# MP2-excluded molecules and "H".
mol_current = sorted(set(mol_hh132).difference(mol_exclude_symm, mol_exclude_mp2, ["H"]))

In [33]:
# Relative deviation (in percent) of the dh B2GPPLYP xx/yy/zz components from
# the g16 B2GPPLYP components, for the current working set.
df_relerr = (
    df_dh_acvtz["B2GPPLYP"].loc[:, ["xx", "yy", "zz"]]
    .div(df_g16_acvtz["B2GPPLYP"].loc[:, ["xx", "yy", "zz"]])
    .sub(1)
    .mul(100)
    .loc[mol_current]
)

In [34]:
# Flag molecules where any component deviates by more than 0.1 %.
mask = df_relerr.abs().gt(0.1).any(axis=1)

In [35]:
# Flagged molecules: spin label, g16 B2GPPLYP components and the relative deviation.
pd.concat(
    [
        df_hh132_ref["Spin Polarization"].loc[mask[mask].index],
        df_g16_acvtz["B2GPPLYP"].loc[mask[mask].index, ["xx", "yy", "zz"]],
        df_relerr.loc[mask[mask].index],
    ],
    axis="columns",
).set_axis(
    pd.MultiIndex.from_tuples(
        [("Spin", "Spin"), *itertools.product(["Analytical", "Relative Error / %"], ["xx", "yy", "zz"])]
    ),
    axis=1,
)

Unnamed: 0_level_0,Spin,Analytical,Analytical,Analytical,Relative Error / %,Relative Error / %,Relative Error / %
Unnamed: 0_level_1,Spin,xx,yy,zz,xx,yy,zz
LiH,NSP,4.175,4.175,3.762,-0.129,-0.129,-0.02
NP,SP,3.357,3.357,6.635,0.0,0.0,0.812
NaH,NSP,5.326,5.326,7.562,-0.515,-0.515,-0.041


### DSD-PBEPBE-D3

**DSD-PBEPBE-D3 HH132 original**

In [36]:
# Working set (recomputed for this section): HH132 minus the symmetry- and
# MP2-excluded molecules and "H".
mol_current = sorted(set(mol_hh132).difference(mol_exclude_symm, mol_exclude_mp2, ["H"]))

In [37]:
# Relative deviation (in percent) of the HH132 DSD-PBEPBE-D3 xx/yy/zz components
# from the g16 DSD-PBEPBE-D3 components, for the current working set.
df_relerr = (
    df_hh132_acvtz["DSD-PBEPBE-D3"].loc[:, ["xx", "yy", "zz"]]
    .div(df_g16_acvtz["DSD-PBEPBE-D3"].loc[:, ["xx", "yy", "zz"]])
    .sub(1)
    .mul(100)
    .loc[mol_current]
)

In [38]:
# Flag molecules where any component deviates by more than 2 %.
mask = df_relerr.abs().gt(2).any(axis=1)

In [39]:
# Flagged molecules: spin label, g16 DSD-PBEPBE-D3 components and the relative deviation.
pd.concat(
    [
        df_hh132_ref["Spin Polarization"].loc[mask[mask].index],
        df_g16_acvtz["DSD-PBEPBE-D3"].loc[mask[mask].index, ["xx", "yy", "zz"]],
        df_relerr.loc[mask[mask].index],
    ],
    axis="columns",
).set_axis(
    pd.MultiIndex.from_tuples(
        [("Spin", "Spin"), *itertools.product(["Analytical", "Relative Error / %"], ["xx", "yy", "zz"])]
    ),
    axis=1,
)

Unnamed: 0_level_0,Spin,Analytical,Analytical,Analytical,Relative Error / %,Relative Error / %,Relative Error / %
Unnamed: 0_level_1,Spin,xx,yy,zz,xx,yy,zz
BO,SP,2.308,2.308,2.79,0.141,0.141,3.479
BS,SP,4.437,4.437,6.279,-0.39,-0.39,2.786
C2H,SP,3.275,3.275,4.028,5.621,5.621,-2.152
C2H3,SP,3.458,5.224,3.253,-0.041,-2.868,-1.886
CH2PH,SP,9.288,5.237,5.774,-18.14,-4.902,-1.666
CN,SP,3.168,3.168,3.985,-14.95,-14.95,0.268
F2,SP,0.886,0.886,2.649,2.027,2.027,-32.081
HNO,SP,1.373,2.288,2.662,13.702,0.703,3.008
HNS,SP,6.957,3.972,3.554,-19.78,1.397,-6.832
NP,SP,3.32,3.32,6.949,7.205,7.205,-22.109


**DSD-PBEPBE-D3 DH**

In [40]:
# Working set (recomputed for this section): HH132 minus the symmetry- and
# MP2-excluded molecules and "H".
mol_current = sorted(set(mol_hh132).difference(mol_exclude_symm, mol_exclude_mp2, ["H"]))

In [41]:
# Relative deviation (in percent) of the dh DSD-PBEPBE-D3 xx/yy/zz components
# from the g16 DSD-PBEPBE-D3 components, for the current working set.
df_relerr = (
    df_dh_acvtz["DSD-PBEPBE-D3"].loc[:, ["xx", "yy", "zz"]]
    .div(df_g16_acvtz["DSD-PBEPBE-D3"].loc[:, ["xx", "yy", "zz"]])
    .sub(1)
    .mul(100)
    .loc[mol_current]
)

In [42]:
# Flag molecules where any component deviates by more than 0.1 %.
mask = df_relerr.abs().gt(0.1).any(axis=1)

In [43]:
# Flagged molecules: spin label, g16 DSD-PBEPBE-D3 components and the relative deviation.
pd.concat(
    [
        df_hh132_ref["Spin Polarization"].loc[mask[mask].index],
        df_g16_acvtz["DSD-PBEPBE-D3"].loc[mask[mask].index, ["xx", "yy", "zz"]],
        df_relerr.loc[mask[mask].index],
    ],
    axis="columns",
).set_axis(
    pd.MultiIndex.from_tuples(
        [("Spin", "Spin"), *itertools.product(["Analytical", "Relative Error / %"], ["xx", "yy", "zz"])]
    ),
    axis=1,
)

Unnamed: 0_level_0,Spin,Analytical,Analytical,Analytical,Relative Error / %,Relative Error / %,Relative Error / %
Unnamed: 0_level_1,Spin,xx,yy,zz,xx,yy,zz
BeH2,NSP,2.931,2.931,3.007,-0.117,-0.117,-0.023
LiH,NSP,4.282,4.282,3.767,-0.203,-0.203,-0.055


## 最终输出的分子

In [48]:
# Molecules excluded in the density-functional reproduction step.
# NOTE(review): "P2" does not appear in any of the tables rendered in this
# notebook, and "NaCl" (flagged in the B2PLYP HH132-original table) is not
# listed here. Confirm both are intentional.
mol_exclude_dh = [
    "BO", "BS", "C2H", "C2H3", "CH2PH", "CN", "F2",
    "HNO", "HNS", "NP", "O2", "O3", "P2",
]

In [52]:
# Union of all three exclusion lists.
mol_excluded_all = set(mol_exclude_symm) | set(mol_exclude_mp2) | set(mol_exclude_dh)
# Set subtraction silently ignores names that are not in mol_hh132, so a typo in
# an exclusion list would leave that molecule in the final set. Fail loudly instead.
mol_excluded_unknown = sorted(mol_excluded_all - set(mol_hh132))
assert not mol_excluded_unknown, f"excluded molecules not found in HH132: {mol_excluded_unknown}"
# Final retained set, alphabetically sorted. Unlike mol_current, "H" is NOT
# removed here.
mol_hh101 = sorted(set(mol_hh132) - mol_excluded_all)

In [53]:
# Number of molecules left after all three exclusion lists have been applied.
len(mol_hh101)

101

In [72]:
# Boolean Series over all HH132 molecules: True where the spin label is "SP".
# The label block is a frame (two-level header), so take its first column.
hh132_spin = df_hh132_ref["Spin Polarization"].iloc[:, 0].eq("SP")

In [74]:
# Retained molecules labelled "SP": print the count, then one name per line.
mol_sp_retained = sorted(set(hh132_spin[hh132_spin].index) & set(mol_hh101))
print(len(mol_sp_retained))
for mol_name in mol_sp_retained:
    print(mol_name)

26
BH2
BeH
CH2-t
CH2F
CH3
FCO
FH-OH
H2CN
H2O-Li
HCHS
HCO
HCP
HO2
Li
N
N2H2
NH
NH2
Na
OF2
P
PH
PH2
S2
SO-trip
SiH3


In [75]:
# Retained molecules not labelled "SP": print the count, then one name per line.
mol_nsp_retained = sorted(set(hh132_spin[~hh132_spin].index) & set(mol_hh101))
print(len(mol_nsp_retained))
for mol_name in mol_nsp_retained:
    print(mol_name)

75
AlF
Ar
BF
BH2Cl
BH2F
BH3
BHF2
BeH2
C2H2
C2H4
CH2BH
CH3BH2
CH3Cl
CH3F
CH3NH2
CH3OH
CH3SH
CH4
CO
CO2
CS
CSO
Cl2
ClCN
ClF
FCN
FNO
H
H2
H2O
HBO
HBS
HCCCl
HCCF
HCHO
HCN
HCONH2
HCOOH
HCl
HF
HNC
HOCl
HOOH
He
LiBH4
LiCN
LiCl
LiH
Mg
Mg2
N2
N2H4
NH2Cl
NH2F
NH2OH
NH3
NH3O
NaCN
NaCl
NaH
Ne
OCl2
P2H4
PH2OH
PH3
PH3O
S2H2
SCl2
SF2
SH2
SO2
SiH3Cl
SiH3F
SiH4
SiO


In [80]:
# Emit the final molecule list as a Python list literal (copy-paste friendly).
print(mol_hh101)

['AlF', 'Ar', 'BF', 'BH2', 'BH2Cl', 'BH2F', 'BH3', 'BHF2', 'BeH', 'BeH2', 'C2H2', 'C2H4', 'CH2-t', 'CH2BH', 'CH2F', 'CH3', 'CH3BH2', 'CH3Cl', 'CH3F', 'CH3NH2', 'CH3OH', 'CH3SH', 'CH4', 'CO', 'CO2', 'CS', 'CSO', 'Cl2', 'ClCN', 'ClF', 'FCN', 'FCO', 'FH-OH', 'FNO', 'H', 'H2', 'H2CN', 'H2O', 'H2O-Li', 'HBO', 'HBS', 'HCCCl', 'HCCF', 'HCHO', 'HCHS', 'HCN', 'HCO', 'HCONH2', 'HCOOH', 'HCP', 'HCl', 'HF', 'HNC', 'HO2', 'HOCl', 'HOOH', 'He', 'Li', 'LiBH4', 'LiCN', 'LiCl', 'LiH', 'Mg', 'Mg2', 'N', 'N2', 'N2H2', 'N2H4', 'NH', 'NH2', 'NH2Cl', 'NH2F', 'NH2OH', 'NH3', 'NH3O', 'Na', 'NaCN', 'NaCl', 'NaH', 'Ne', 'OCl2', 'OF2', 'P', 'P2H4', 'PH', 'PH2', 'PH2OH', 'PH3', 'PH3O', 'S2', 'S2H2', 'SCl2', 'SF2', 'SH2', 'SO-trip', 'SO2', 'SiH3', 'SiH3Cl', 'SiH3F', 'SiH4', 'SiO']
