In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn import linear_model
from sklearn.metrics import root_mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import KFold, train_test_split
from sklearn.feature_selection import SequentialFeatureSelector
import pandas as pd
from collections import Counter
import seaborn as sns
from sklearn.preprocessing import StandardScaler

### Read and extract from Excel file

In [None]:
#Load the 'all_pbed3' sheet of the compiled GC-DFT workbook and preview its first rows
excel=pd.read_excel(io='GCDFT_compile.xlsx', sheet_name='all_pbed3')
excel.head(n=5)

In [None]:
#Extract: convert the sheet to a plain array once and slice it
table=excel.to_numpy()
#Rows 0 and 1 hold the per-acid valence and pKa (column 0 is skipped)
valences=table[0,1:].astype('float')
pKas=table[1,1:].astype('float')
GpKas=pKas*2.303*8.314*298/1000/96.49 #eV
names=np.array(excel.columns[1:]).astype('str')
#In order of d filling and 3, 4, 5d
metals=['Co','Ni','Cu','Rh','Pd','Ag','Ir','Pt','Au']
#Each metal occupies a 5-row stride: row 3+5k is the Ef row, rows 4+5k..6+5k are the adsorption rows
#ads ends up indexed as [metal, acid, potential]
ads=np.array([table[4+5*k:7+5*k,1:].T.astype('float') for k in range(len(metals))])
#Extract Ef (the full row is kept, including column 0)
efs=np.array([table[3+5*k,:] for k in range(len(metals))]).astype('float')
#Column 0 is treated as the metal reference, the remaining columns as the per-acid values
#NOTE(review): the 27.2114 factor looks like a Hartree -> eV conversion - confirm
metalefs=efs[:,0]*27.2114
pzcs=efs*-27.2114 - 4.66 #SHE
metalpzcs=pzcs[:,0]
adpzcs=pzcs[:,1:]

In [None]:
#Straight-line fit of every (metal, acid) adsorption series against the applied potential
pots=np.array([0.8,0.4,0.0]) #SHE
fits=[]
for metal_ads in ads:
    fits.append([stats.linregress(pots,series) for series in metal_ads])
#Slope of each fit
gam=np.array([[fit.slope for fit in row] for row in fits])
#Potential at which each fitted line crosses zero (-intercept/slope)
Uo=np.array([[-fit.intercept/fit.slope for fit in row] for row in fits])

In [None]:
#Get goodness of fit
#Keep only series without any missing point: a single NaN makes linregress return NaN
#for that series, which would turn every summary statistic below into NaN
adsclean=ads[~np.isnan(ads).any(axis=2)]
fitsclean=[stats.linregress(pots,i) for i in adsclean]
#R2: linregress reports the Pearson correlation r, so it has to be squared
r2s=np.array([i.rvalue for i in fitsclean])**2
print(np.mean(r2s), min(r2s), max(r2s), np.std(r2s))
#Slope err (standard error of the fitted slope)
gamerr=np.array([i.stderr for i in fitsclean])
print(np.mean(gamerr), min(gamerr), max(gamerr),  np.std(gamerr))
#intercept err (standard error of the fitted intercept)
yinterr=np.array([i.intercept_stderr for i in fitsclean])
#NOTE(review): this adds the intercept and slope errors in quadrature, which is not the
#propagated uncertainty of Uo=-intercept/slope (the two terms carry different units).
#Left unchanged pending confirmation of the intended estimate.
uoerr=np.sqrt(yinterr**2 + gamerr**2)
print(np.mean(uoerr), min(uoerr), max(uoerr),  np.std(uoerr))

### WF or Fermi change

In [None]:
#Shift of every per-acid pzcs value relative to its metal reference (rows: metals, columns: acids)
ef_diff=adpzcs-metalpzcs[:,np.newaxis]
#Compare the shift with gamma - z wherever the shift is defined
flat_diff=ef_diff.ravel()
flat_ref=(gam-valences).ravel()
mask=np.isnan(flat_diff)
print('MAE:',mean_absolute_error(flat_diff[~mask],flat_ref[~mask]))
print('RMSE:',root_mean_squared_error(flat_diff[~mask],flat_ref[~mask]))

In [None]:
fig, axes=plt.subplots(1,2,figsize=(7,3),dpi=200)
#plt.gca().set_aspect('equal')
#One face colour per valence
vcolor=['b','g','r']
for m in range(9): #each metal
    for v, colour in zip([-1,-2,-3], vcolor):
        mask=valences==v
        marker_kw=dict(ls='',marker='o',mec='k',mfc=colour,markersize=8,alpha=0.6)
        #Left panel: gamma itself; right panel: gamma - z
        axes[0].plot(ef_diff[m][mask],gam[m][mask],**marker_kw)
        axes[1].plot(ef_diff[m][mask],gam[m][mask]-valences[mask],**marker_kw)

#Axis ranges, plus the y=x guide on the right panel
for panel in axes:
    panel.set_xlim(-1,2)
axes[0].set_ylim(-2.8,0)
axes[1].set_ylim(-1,2)
axes[1].plot([-1,2], [-1,2],ls=':',c='k',lw=2)

#Ticks and labels
for panel in axes:
    panel.tick_params(axis='both',labelsize=15)
    panel.set_xlabel(r'$\Delta \epsilon_F$ (eV)',fontsize=13)
axes[0].set_ylabel(r'$\gamma$ (e)',fontsize=13)
axes[1].set_ylabel(r'$\gamma - z$  (e)',fontsize=13,va='top')
plt.subplots_adjust(wspace=0.4)
plt.show()

### Compile data frames

In [None]:
#Metal properties
#Every array below has nine entries; they are later repeated per metal in the order of `metals`
#(Co, Ni, Cu, Rh, Pd, Ag, Ir, Pt, Au) - NOTE(review): confirm each table follows that order.
#Vacuum WF/fermi
#NOTE(review): unlike metalefs these values are not multiplied by 27.2114, so they
#presumably stay in the original (Hartree?) units - confirm this is intended.
wf=-np.array([0.071418084, 0.064151821, 0.055983892, 0.108732746, 0.08235435, 0.093749664,
            0.140126819, 0.115625054, 0.049829702])
#Restore band descriptors saved with %store in another session/notebook.
#This cell fails on a fresh machine unless those variables were stored beforehand.
%store -r dcens dwidths spcens spwidths spareas
#d-band center from Norskov book Fig. 8.5
#(m_ds is not used when the data frame is built; the stored dcens is used instead)
m_ds=np.array([-1.17,-1.73,-2.11,-1.29,-1.83,-2.25,-2.67,-4.3,-3.56]) #eV
#NOTE(review): no source or unit is given for Vad2 - presumably the adsorbate-metal
#coupling matrix element squared from the same reference; confirm.
Vad2=np.array([1.34,3.32,4.45,1.16,2.78,3.90,1,2.26,3.35])
#Lattice constant from PBE-D3
alat=np.array([3.49, 3.48, 3.57, 3.80, 3.90, 4.08, 3.84, 3.92, 4.10])
#Alternative van der Waals radii / polarizabilities (kept for reference, not used):
#J. Chem. Theory Comput. 2024, 20, 7469−7478
#rVDW=np.array([4.52,4.45,4.42,4.6,4.07,4.52,4.51,4.44,4.26])
#pol=np.array([55,49,47,66,26,55,54,48,36])
#CRC handbook 105th
rVDW=np.array([2.00, 1.97, 1.96, 2.10, 2.10, 2.11, 2.13, 2.13, 2.14]) #Angstrom
#The cube root is what gets stored; the unit in the trailing comment refers to the tabulated values before the root
pol=np.array([8.55, 7.57, 6.03, 8.6, 3.874, 5.44, 7.6, 6.5, 4.13])**(1/3) #E-24 cm3
#Metal electron affinity (name suggests; later stored in the E^m_EA column)
EAm=np.array([0.662256, 1.15716, 1.235775, 1.14289, 0.56214, 1.304475, 1.564057, 2.1251, 2.30863]) #eV

In [None]:
#Acid properties from Excel file ('acids' sheet); column 0 is skipped
excel=pd.read_excel('GCDFT_compile.xlsx', sheet_name='acids')
acid_rows=excel.to_numpy()[:,1:]
#Row 3 is used as-is; the remaining rows are scaled by 27.2114
#NOTE(review): the factor looks like a Hartree -> eV conversion - confirm
dipol=acid_rows[3].astype('float')
eHOMO=acid_rows[9].astype('float')*27.2114
eLUMO=acid_rows[10].astype('float')*27.2114
eHOMO1=acid_rows[11].astype('float')*27.2114
eHOMOz=acid_rows[12].astype('float')*27.2114
EA1=acid_rows[13].astype('float')*27.2114
EAz=acid_rows[14].astype('float')*27.2114
homolytic=acid_rows[16].astype('float')*27.2114
eHOMOH=acid_rows[17].astype('float')*27.2114
#Number of acids read
print(len(dipol))

In [None]:
#Create data frame: one row per (metal, acid) pair in metal-major order,
#i.e. the same order as np.hstack over the [metal, acid] arrays.
n_acid=len(names)   #number of acids read from the sheet
n_metal=len(metals) #number of metals

def _per_metal(values):
    """Repeat each per-metal value once for every acid."""
    return [v for v in values for _ in range(n_acid)]

def _per_acid(values):
    """Repeat the whole per-acid sequence once for every metal."""
    return list(values)*n_metal

#Columns in insertion order. Each one is inserted at position 0, so the final frame
#lists them in REVERSE order: acid, metal, z, ..., U^o, gamma. Later cells rely on
#that layout (e.g. the first 11 feature columns being the acid descriptors).
#Labels are plain strings: DataFrame.insert expects a hashable label, and a
#one-element list only worked by accident when duplicate checking was disabled.
columns_in_insert_order=[
    #Targets
    (r'$U^o$', np.hstack(Uo)),
    (r'$\Delta\epsilon_F$', np.hstack(ef_diff)),
    #Metal feature
    (r'$\varepsilon_{sp}$', _per_metal(spcens)),
    (r'$W_{sp}$', _per_metal(spwidths)),
    (r'$\varepsilon_d$', _per_metal(dcens)),
    (r'$W_d$', _per_metal(dwidths)),
    (r'$\varepsilon_F$', _per_metal(metalefs)),
    (r'$\varepsilon^{v}_F$', _per_metal(wf)),
    (r'$a_{fcc}$', _per_metal(alat)),
    (r'$V^2_{ad}$', _per_metal(Vad2)),
    (r'$E^m_{EA}$', _per_metal(EAm)),
    (r'$r_{vdw}$', _per_metal(rVDW)),
    (r'$\alpha^{1/3}_m$', _per_metal(pol)),
    #Acid feature
    (r'$\mu_z$', _per_acid(dipol)),
    (r'$\Delta E^{dis}_H$', _per_acid(homolytic)),
    (r'$E^{a1}_{EA}$', _per_acid(EA1)),
    (r'$EA^{az}_{EA}$', _per_acid(EAz)),
    (r'$E^{a0}_{LUMO}$', _per_acid(eLUMO)),
    (r'$E^{a1}_{HOMO}$', _per_acid(eHOMO1)),
    (r'$E^{az}_{HOMO}$', _per_acid(eHOMOz)),
    (r'$E^{a0}_{HOMO}$', _per_acid(eHOMO)),
    (r'$E^{aH}_{HOMO}$', _per_acid(eHOMOH)),
    (r'$\Delta G_{pKa}$', _per_acid(GpKas)),
    (r'$z$', _per_acid(valences)),
    #Tracking
    ('metal', _per_metal(metals)),
    ('acid', _per_acid(names)),
]
df=pd.DataFrame(np.hstack(gam),columns=[r'$\gamma$'])
for label, values in columns_in_insert_order:
    df.insert(0,label,values)

#dfraw keeps the rows with missing values (they are filled in later); only H+ is removed
dfraw=df[df['acid'] != 'H+'].copy()
#df is the training table: complete rows only, H+ removed
df=df.dropna(axis=0)
df=df[df['acid'] != 'H+'].copy()
print(len(df))
df.head()

### Down select features based on cross-correlation

In [None]:
#Feature matrix: drop the two identifier columns and the three computed quantities
non_features=['metal','acid',r'$\gamma$',r'$U^o$',r'$\Delta\epsilon_F$']
X=df.drop(columns=non_features)
#The first 11 columns are treated as acid features, the rest as metal features
n_acid_feas=11
Xacid=X.iloc[:,:n_acid_feas]
Xmetal=X.iloc[:,n_acid_feas:]
#Correlation matrix within each group
corr_acid, corr_metal=Xacid.corr(), Xmetal.corr()

In [None]:
fig,axes=plt.subplots(2,1,figsize=(7, 12),dpi=300)
cbar_ax = fig.add_axes([0.92, 0.15, 0.02, 0.7])
#Both maps share the colour scale and annotation format; only the acid map draws the colour bar
heat_kw=dict(vmin=-1,vmax=1, annot=True, annot_kws={"fontsize":12}, cmap="coolwarm", fmt=".2f")
sns.heatmap(corr_acid, ax=axes[0], cbar_ax=cbar_ax, **heat_kw)
sns.heatmap(corr_metal, ax=axes[1], cbar=False, **heat_kw)
#Hatch every cell with 0.8 < |r| < 1 (values of exactly 1, e.g. the diagonal, are skipped)
for panel, corr in zip(axes, (corr_acid, corr_metal)):
    strength=np.abs(corr.to_numpy())
    for i, j in zip(*np.where((strength > 0.8) & (strength < 1))):
        panel.add_patch(plt.Rectangle((j, i), 1, 1, fill=False, edgecolor='k', linewidth=2,hatch='///'))

#Titles and tick formatting
for panel, title in zip(axes, ('Anion features', 'Metal features')):
    panel.tick_params(axis='both',labelrotation=45, labelsize=12)
    panel.set_title(title,fontsize=14)
cbar_ax.tick_params(labelsize=12,tickdir='inout')
plt.subplots_adjust(hspace=0.25)
plt.show()

In [None]:
#Down-selected feature lists (original note: chosen against the 0.8 correlation threshold)
acid_keep=[r'$E^{a0}_{HOMO}$',r'$\Delta E^{dis}_H$',r'$z$',r'$E^{a1}_{EA}$',r'$\Delta G_{pKa}$',r'$\mu_z$']
metal_keep=[r'$E^m_{EA}$',r'$\alpha^{1/3}_m$',r'$r_{vdw}$',r'$\varepsilon_F$',r'$\varepsilon_d$',r'$\varepsilon_{sp}$']
Xacid=X[acid_keep]
Xmetal=X[metal_keep]
#Acid columns first, then metal columns
Xdown=pd.concat([Xacid, Xmetal], axis=1)
Xdown.shape

### $\gamma$ (e)

#### Fit all features

In [None]:
#Label
ygam=df[r'$\gamma$']
#80-20 split with all features (downed); the seed is fixed so the printed
#metrics and the plot are reproducible on Restart & Run All
X_train, X_test, gam_train, gam_test = train_test_split(Xdown, ygam, test_size=0.2, random_state=0)
modelgam=linear_model.LinearRegression().fit(X_train,gam_train)
#Predict once and reuse for both the metrics and the plot
gam_fit_train=modelgam.predict(X_train)
gam_fit_test=modelgam.predict(X_test)

#Error
print('Train R2:',r2_score(gam_train,gam_fit_train))
print('Train RMSE:',root_mean_squared_error(gam_train,gam_fit_train))
print('Test R2:',r2_score(gam_test,gam_fit_test))
print('Test RMSE:',root_mean_squared_error(gam_test,gam_fit_test))

#Parity plot of this split (blue: train, red: test)
plt.figure(figsize=(4,3.5),dpi=100)
plt.gca().set_aspect('equal')
plt.plot(gam_train,gam_fit_train,'o', markersize=8,mec='k',mfc='b',alpha=0.6)
plt.plot(gam_test,gam_fit_test,'o', markersize=8,mec='k',mfc='r',alpha=0.6)
plt.tick_params(axis='both',labelsize=15)
low=-3
high=0
plt.xlim(low,high)
plt.ylim(low,high)
plt.plot((low,high),(low,high),ls='--',c='k')
ticks=np.arange(low,high+1,1)
plt.xticks(ticks)
plt.yticks(ticks)
plt.xlabel(r'GC-DFT $\gamma$ (e)',fontsize=15)
plt.ylabel(r'MLR $\gamma$ (e)',fontsize=15)
plt.show()

#### Sequential Forward Selection

In [None]:
#Repeat SFS on nSFS different random 80-20 splits
nSFS=50
#Feature counts to scan: 1 .. n_features-1 (the upper bound of the range excludes selecting all features)
nfeas=np.arange(1,Xdown.shape[1],1)
gam_feas, gam_r2, gam_rmse=[],[],[]
for rep in range(nSFS):
    #Re-initialize model
    modelgam=linear_model.LinearRegression()
    #Random 80-20 split; seeding with the repetition index keeps the splits
    #different from each other but reproducible between runs
    X_train, X_test, y_train, y_test = train_test_split(Xdown, ygam, test_size=0.2, random_state=rep)
    #Sequential Forward Selection with varying number of features
    selected_feas=[]
    r2_feas=[]
    rmse_feas=[]
    for n in nfeas:
        #Selection uses 5-fold CV on the training part only
        sfs=SequentialFeatureSelector(modelgam, n_features_to_select=n, direction='forward', cv=5, scoring='neg_root_mean_squared_error')
        sfs.fit(X_train, y_train)
        X_train_sfs=sfs.transform(X_train)
        X_test_sfs=sfs.transform(X_test)
        #Refit on the selected columns and score on both parts of the split
        modelgam.fit(X_train_sfs, y_train)
        fit_train=modelgam.predict(X_train_sfs)
        fit_test=modelgam.predict(X_test_sfs)
        #Metrics
        r2train=r2_score(y_train, fit_train)
        rmsetrain=root_mean_squared_error(y_train, fit_train)
        r2test=r2_score(y_test, fit_test)
        rmsetest=root_mean_squared_error(y_test, fit_test)
        #Extract
        selected_feas.append(sfs.get_feature_names_out())
        r2_feas.append([r2train, r2test])
        rmse_feas.append([rmsetrain, rmsetest])
    #Save: entries are indexed as [repetition][feature count - 1]
    gam_feas.append(selected_feas)
    gam_r2.append(r2_feas)
    gam_rmse.append(rmse_feas)

In [None]:
#Tally how often each feature appears in the 4-feature subset (list index 3) over all repetitions
gam_topfeas=np.array([gam_feas[rep][3] for rep in range(nSFS)]).astype('str')
counts=Counter(gam_topfeas.flatten())
#Most frequent feature first
ranked=sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
counts_sorted=dict(ranked)
label_gam, count_gam=zip(*counts_sorted.items())

#### Standardized coefficients and LASSO

In [None]:
# Standardize the features so the coefficient magnitudes can be compared
scaler = StandardScaler()
Xscaled_gam = scaler.fit_transform(Xdown)
#LASSO fitted on the full data set (no hold-out is used in this cell)
modelgam=linear_model.Lasso(alpha=0.01).fit(Xscaled_gam,ygam)
#Print abs coef.
coefs=pd.DataFrame(modelgam.coef_, index=Xdown.columns, columns=['Coef.'])
coefs['Abs. Coef.'] = coefs['Coef.'].abs()
# Rank features by the absolute value of their coefficients
ranked_features = coefs.sort_values('Abs. Coef.', ascending=False)
print(ranked_features)

#Error (in-sample, since the model was fitted on the same rows)
gam_pred=modelgam.predict(Xscaled_gam) #predict
print('R2:',r2_score(ygam,gam_pred))
print('RMSE:',root_mean_squared_error(ygam,gam_pred))

#### Final cross-validation of selected features

In [None]:
#Final feature subset for gamma
Xdown_gam=Xdown[[r'$z$',r'$\mu_z$',r'$\varepsilon_F$',r'$\varepsilon_d$']]
#Shuffled 5-fold cross-validation with a fixed seed
kf=KFold(n_splits=5, shuffle=True,random_state=85)
modelgam=linear_model.LinearRegression()
cv_rmse_gam, cv_r2_gam=[],[]
for train_index, test_index in kf.split(Xdown_gam):
    X_train=Xdown_gam.iloc[train_index]
    X_test=Xdown_gam.iloc[test_index]
    y_train=ygam.iloc[train_index]
    y_test=ygam.iloc[test_index]

    modelgam.fit(X_train, y_train)
    fit_train=modelgam.predict(X_train)
    fit_test=modelgam.predict(X_test)
    #Metrics, stored as [train, test]
    r2train, r2test=r2_score(y_train, fit_train), r2_score(y_test, fit_test)
    rmsetrain, rmsetest=root_mean_squared_error(y_train, fit_train), root_mean_squared_error(y_test, fit_test)
    cv_rmse_gam.append([rmsetrain, rmsetest])
    cv_r2_gam.append([r2train, r2test])

#Mean over folds and standard error (std / sqrt(number of folds))
nfold=len(cv_rmse_gam)
print("RMSE [train, test]:", np.mean(cv_rmse_gam,axis=0), np.std(cv_rmse_gam,axis=0)/np.sqrt(nfold))
print("R2 [train, test]:", np.mean(cv_r2_gam, axis=0), np.std(cv_r2_gam, axis=0)/np.sqrt(nfold))
#Coefficients of a fit on all rows (modelgam itself keeps the last fold's fit)
print("Coef.:", linear_model.LinearRegression().fit(Xdown_gam, ygam).coef_)

In [None]:
fig=plt.figure(figsize=(8,2.5),dpi=200)
#Define axes ([left, bottom, width, height] in figure fractions)
ax1 = fig.add_axes([0.1, 0.5, 0.25, 0.4])  # upper-left: RMSE
ax2 = fig.add_axes([0.1, 0.1, 0.25, 0.4])  # lower-left: R2
ax3 = fig.add_axes([0.45, 0.1, 0.25, 0.8]) # middle: feature counts
ax4 = fig.add_axes([0.82, 0.1, 0.25, 0.8]) # right: parity plot
#Mean and standard error over the SFS repetitions; column 0 = train, column 1 = test
rmse_mean=np.mean(gam_rmse,axis=0)
rmse_sem=np.std(gam_rmse,axis=0)/np.sqrt(nSFS)
r2_mean=np.mean(gam_r2,axis=0)
r2_sem=np.std(gam_r2,axis=0)/np.sqrt(nSFS)
train_kw=dict(capsize=3, fmt='o',ls='-',c='b')
test_kw=dict(capsize=3, fmt='o',ls='--',c='r',mfc='w')
#RMSE
ax1.errorbar(nfeas,rmse_mean[:,0],yerr=rmse_sem[:,0],**train_kw)
ax1.errorbar(nfeas,rmse_mean[:,1],yerr=rmse_sem[:,1],**test_kw)
ax1.axvline(x=4,c='k',ls=':')
ax1.set_ylabel(r'RMSE (e)',fontsize=12)
ax1.set_title('Sequential forward selection')
ax1.set_ylim(0.06,0.22)
ax1.set_yticks([0.1,0.2])
#R2
ax2.errorbar(nfeas,r2_mean[:,0],yerr=r2_sem[:,0],**train_kw)
ax2.errorbar(nfeas,r2_mean[:,1],yerr=r2_sem[:,1],**test_kw)
ax2.axvline(x=4,c='k',ls=':')
ax2.set_xlabel('No. of features',fontsize=12)
ax2.set_ylabel(r'R$^2$',fontsize=12)
for panel in (ax1,ax2):
    panel.set_xticks(range(1,12,2))
ax2.set_ylim(0.66,0.98)

#Feature hist: the six most frequent features
ax3.bar(label_gam[:6],count_gam[:6], fc='k', ec='w',width=1)
ax3.set_ylim(0,50)
ax3.tick_params(axis='x',labelsize=12,labelrotation=90) #vertical labels for readability
ax3.tick_params(axis='y',labelsize=12)
ax3.set_title('Top occurring features')
#Final performance
#Uses whatever modelgam, X_train/X_test and y_train/y_test currently hold in the kernel
ax4.plot(y_train,modelgam.predict(X_train),'o', markersize=8,mec='b',mfc='b',alpha=0.6,label='Train')
ax4.plot(y_test,modelgam.predict(X_test),'o', markersize=8,mec='r',mfc='w',alpha=0.6,label='Test')
low=-3.
high=0.
ax4.set_xlim(low,high)
ax4.set_ylim(low,high)
ax4.plot((low,high),(low,high),ls='--',c='k')
ax4.tick_params(axis='both',labelsize=12)
ax4.set_title('Final model performance')
ax4.set_xlabel(r'GC-DFT $\gamma$ (e)',fontsize=12)
ax4.set_ylabel(r'MLR $\gamma$ (e)',fontsize=12)
plt.show()

In [None]:
from sklearn.inspection import PartialDependenceDisplay

In [None]:
#One partial-dependence figure per feature of the gamma model (4 features)
for feature_idx in range(4):
    PartialDependenceDisplay.from_estimator(modelgam, Xdown_gam, [feature_idx])
plt.show()

#### Data heat map $\gamma$

In [None]:
# Function to convert formulas to LaTeX-style strings
def format_formula(formula):
    r"""Convert a plain formula such as 'SO4--' into a mathtext string.

    Every '-' counts towards the charge and is emitted as a trailing
    superscript ('^-' for one, '^{n-}' for several, nothing for none).
    Each run of digits becomes one subscript ('_4', or '_{10}' for more than
    one digit, since '_1_0' would be an illegal double subscript). Any other
    character is wrapped in \text{...}.

    Parameters
    ----------
    formula : str
        Formula written with trailing '-' signs for the charge.

    Returns
    -------
    str
        The formula enclosed in '$...$'.
    """
    def subscript(run):
        #Single digits keep the short form; longer runs need braces
        return f'_{run}' if len(run) == 1 else f'_{{{run}}}'

    formatted = ''
    digit_run = ''
    minuses = 0
    for char in formula:
        if char.isdigit():
            #Collect consecutive digits so they end up in one subscript
            digit_run += char
            continue
        if digit_run:
            formatted += subscript(digit_run)
            digit_run = ''
        if char == '-':
            minuses += 1
        else:
            formatted += '\\text{' + char + '}'
    #Formula ended on a digit
    if digit_run:
        formatted += subscript(digit_run)
    if minuses == 0:
        #No charge: do not print a misleading '0-' superscript
        return '$' + formatted + '$'
    if minuses == 1:
        return '$' + formatted + '^-$'
    return '$' + formatted + f'^{{{minuses}-}}$'

In [None]:
#Train on all available data
modelgam=linear_model.LinearRegression().fit(Xdown_gam, ygam)
#Make copy of raw df to be filled
dffill=dfraw.copy()
#Fill in blanks: predict gamma for every row whose computed value is missing.
#The column is addressed by name, and the features are taken in the training order.
gam_missing=dfraw[r'$\gamma$'].isna()
if gam_missing.any(): #predict() rejects an empty input
    dffill.loc[gam_missing, r'$\gamma$']=modelgam.predict(dfraw.loc[gam_missing, Xdown_gam.columns])

In [None]:
#Sort rows by gamma (ascending); this fixes the order of appearance used by the pivot below
sortgamkey=[r'$\gamma$']
sortorder=[True] #True for ascending, False for descending
dfraw_sortgam=dfraw.sort_values(by=sortgamkey, axis='index', ascending=sortorder)
dffill_sortgam=dffill.sort_values(by=sortgamkey, axis='index', ascending=sortorder)

#Pivot to metal (rows) x acid (columns) tables of gamma without re-sorting the labels
pivot_kw=dict(index='metal', columns='acid', values=r'$\gamma$', sort=False)
gamraw=pd.pivot_table(dfraw_sortgam, **pivot_kw)
gamfill=pd.pivot_table(dffill_sortgam, **pivot_kw)

#Give the raw table exactly the layout of the filled one
gamraw=gamraw.reindex(index=gamfill.index, columns=gamfill.columns)
print(gamraw.columns.equals(gamfill.columns))
print(gamraw.index.equals(gamfill.index))

In [None]:
#Horizontal heat map of the filled gamma table with the colour bar on top
fig=plt.figure(figsize=(12,5),dpi=300)
ax=plt.gca()
ax.set_aspect(1)
cbar_ax = fig.add_axes([0.125, 0.85, 0.775, 0.02])
sns.heatmap(gamfill, vmin=-2.5, vmax=-0.5, cmap="jet", ax=ax, cbar_ax=cbar_ax, cbar_kws={"orientation":"horizontal"}, linecolor='w',linewidths=1)
#Hatch the cells that are NaN in the raw table
nan_mask=gamraw.isna()
nan_rows, nan_cols=np.where(nan_mask)
for i, j in zip(nan_rows, nan_cols):
    ax.add_patch(plt.Rectangle((j, i), 1, 1, fill=False, linewidth=0, hatch='///', color='w',alpha=0.5))
#Format
ax.set_yticklabels(gamfill.index, rotation=0)
xlabels=[format_formula(i) for i in gamfill.columns] #format chemical formula
ax.set_xticklabels(xlabels,rotation=90)
ax.tick_params(axis='both',labelsize=14, bottom=False, left=False)
ax.set_xlabel('', fontsize=19)
ax.set_ylabel('', fontsize=19)
cbar_ax.tick_params(axis='both',labelsize=15, bottom=False, labelbottom=False, top=True, labeltop=True, width=1.5, size=5, direction='inout')
#Frame both the map and the colour bar
for spine in ax.spines.values():
    spine.set_visible(True)
    spine.set_linewidth(1)
for spine in cbar_ax.spines.values():
    spine.set_visible(True)
    spine.set_linewidth(0.5)
plt.show()

#### Charge retained $z-\gamma$

In [None]:
#Uses the modelgam currently in the kernel; z comes from the gamma feature table
zval = Xdown_gam[r'$z$'].to_numpy()
gam_mlr = modelgam.predict(Xdown_gam)

plt.figure(figsize=(4,3.5),dpi=200)
#One colour per valency value
vcolor=['b','g','r']
for v, colour in zip([-1,-2,-3], vcolor):
    mask=zval==v
    plt.plot(zval[mask]-ygam[mask],zval[mask]-gam_mlr[mask],ls='',marker='o',mec='k',mfc=colour,markersize=8,alpha=0.6)
plt.gca().set_aspect('equal')
plt.tick_params(axis='both',labelsize=15)
#Square axes with the y=x guide
low=-2.
high=1.
plt.xlim(low,high)
plt.ylim(low,high)
plt.plot((low,high),(low,high),ls='--',c='k')
ticks=np.arange(low,high+1,1)
plt.xticks(ticks)
plt.yticks(ticks)
plt.xlabel(r'$z-\gamma_{\text{GC-DFT}}$ (e)',fontsize=15)
plt.ylabel(r'$z-\gamma_{\text{MLR}}$ (e)',fontsize=15)
plt.show()

### $U_o$ (V SHE)

#### Fit all features

In [None]:
#Label
yUo=df[r'$U^o$']
#80-20 split with all features; the seed is fixed so the printed metrics
#and the plot are reproducible on Restart & Run All
X_train, X_test, Uo_train, Uo_test = train_test_split(Xdown, yUo, test_size=0.2, random_state=0)
modelUo=linear_model.LinearRegression().fit(X_train,Uo_train)
#Predict once and reuse for both the metrics and the plot
Uo_fit_train=modelUo.predict(X_train)
Uo_fit_test=modelUo.predict(X_test)

#Error
print('Train R2:',r2_score(Uo_train,Uo_fit_train))
print('Train RMSE:',root_mean_squared_error(Uo_train,Uo_fit_train))
print('Test R2:',r2_score(Uo_test,Uo_fit_test))
print('Test RMSE:',root_mean_squared_error(Uo_test,Uo_fit_test))

#Parity plot of this split (filled blue: train, open red: test)
plt.figure(figsize=(4.5,4),dpi=300)
plt.gca().set_aspect('equal')
plt.plot(Uo_train,Uo_fit_train,'o', markersize=8,mec='b',mfc='b',alpha=0.6)
plt.plot(Uo_test,Uo_fit_test, 'o', markersize=8,mec='r',mfc='w',alpha=0.6)

plt.tick_params(axis='both',labelsize=14)
low=-1.2
high=1.8
plt.xlim(low,high)
plt.ylim(low,high)
plt.plot((low,high),(low,high),ls='--',c='k')
ticks=np.arange(low+1,high+1,1)
plt.xticks(ticks)
plt.yticks(ticks)
plt.xlabel(r'GC-DFT $U^0$ (V)',fontsize=14)
plt.ylabel(r'MLR $U^0$ (V)',fontsize=14)
plt.show()

#### Sequential Forward Selection

In [None]:
#Repeat SFS on nSFS different random 80-20 splits
nSFS=50
#Feature counts to scan: 1 .. n_features-1 (the upper bound of the range excludes selecting all features)
nfeas=np.arange(1,Xdown.shape[1],1)
Uo_feas, Uo_r2, Uo_rmse=[],[],[]
for rep in range(nSFS):
    #Re-initialize model
    modelUo=linear_model.LinearRegression()
    #Random 80-20 split; seeding with the repetition index keeps the splits
    #different from each other but reproducible between runs
    X_train, X_test, y_train, y_test = train_test_split(Xdown, yUo, test_size=0.2, random_state=rep)
    #Sequential Forward Selection with varying number of features
    selected_feas=[]
    r2_feas=[]
    rmse_feas=[]
    for n in nfeas:
        #Selection uses 5-fold CV on the training part only
        sfs=SequentialFeatureSelector(modelUo, n_features_to_select=n, direction='forward', cv=5, scoring='neg_root_mean_squared_error')
        sfs.fit(X_train, y_train)
        X_train_sfs=sfs.transform(X_train)
        X_test_sfs=sfs.transform(X_test)
        #Refit on the selected columns and score on both parts of the split
        modelUo.fit(X_train_sfs, y_train)
        fit_train=modelUo.predict(X_train_sfs)
        fit_test=modelUo.predict(X_test_sfs)
        #Metrics
        r2train=r2_score(y_train, fit_train)
        rmsetrain=root_mean_squared_error(y_train, fit_train)
        r2test=r2_score(y_test, fit_test)
        rmsetest=root_mean_squared_error(y_test, fit_test)
        #Extract
        selected_feas.append(sfs.get_feature_names_out())
        r2_feas.append([r2train, r2test])
        rmse_feas.append([rmsetrain, rmsetest])
    #Save: entries are indexed as [repetition][feature count - 1]
    Uo_feas.append(selected_feas)
    Uo_r2.append(r2_feas)
    Uo_rmse.append(rmse_feas)

In [None]:
#Tally how often each feature appears in the 6-feature subset (list index 5) over all repetitions
Uo_topfeas=np.array([Uo_feas[rep][5] for rep in range(nSFS)]).astype('str')

counts=Counter(Uo_topfeas.flatten())
#Most frequent feature first
ranked=sorted(counts.items(), key=lambda pair: pair[1], reverse=True)
counts_sorted=dict(ranked)
label_Uo, count_Uo=zip(*counts_sorted.items())

#### Standardized coefficients and LASSO

In [None]:
#Standardize the features so the coefficient magnitudes can be compared
scaler = StandardScaler()
Xscaled_Uo = scaler.fit_transform(Xdown)
#LASSO fitted on the full data set (no hold-out is used in this cell)
modelUo=linear_model.Lasso(alpha=0.04).fit(Xscaled_Uo,yUo)
#Print
coefs=pd.DataFrame(modelUo.coef_, index=Xdown.columns, columns=['Coef.'])
coefs['Abs. Coef.'] = coefs['Coef.'].abs()
# Rank features by the absolute value of their coefficients
ranked_features = coefs.sort_values('Abs. Coef.', ascending=False)
print(ranked_features)

#Error (in-sample, since the model was fitted on the same rows)
Uo_pred=modelUo.predict(Xscaled_Uo) #predict
print('R2:',r2_score(yUo,Uo_pred))
print('RMSE:',root_mean_squared_error(yUo,Uo_pred))

#### Final cross-validation of selected features

In [None]:
#Final feature subset for Uo (from SFS)
Xdown_Uo=Xdown[[r'$E^m_{EA}$',r'$\alpha^{1/3}_m$',r'$r_{vdw}$',r'$\Delta E^{dis}_H$',r'$E^{a0}_{HOMO}$',r'$E^{a1}_{EA}$']]
#Shuffled 5-fold cross-validation with a fixed seed
kf=KFold(n_splits=5, shuffle=True, random_state=500)
modelUo=linear_model.LinearRegression()
cv_rmse_Uo, cv_r2_Uo=[],[]
for train_index, test_index in kf.split(Xdown_Uo):
    X_train=Xdown_Uo.iloc[train_index]
    X_test=Xdown_Uo.iloc[test_index]
    y_train=yUo.iloc[train_index]
    y_test=yUo.iloc[test_index]

    modelUo.fit(X_train, y_train)
    fit_train=modelUo.predict(X_train)
    fit_test=modelUo.predict(X_test)
    #Metrics, stored as [train, test]
    r2train, r2test=r2_score(y_train, fit_train), r2_score(y_test, fit_test)
    rmsetrain, rmsetest=root_mean_squared_error(y_train, fit_train), root_mean_squared_error(y_test, fit_test)
    cv_rmse_Uo.append([rmsetrain, rmsetest])
    cv_r2_Uo.append([r2train, r2test])

#Mean over folds and standard error (std / sqrt(number of folds))
nfold=len(cv_rmse_Uo)
print("RMSE [train, test]:", np.mean(cv_rmse_Uo,axis=0), np.std(cv_rmse_Uo,axis=0)/np.sqrt(nfold))
print("R2 [train, test]:", np.mean(cv_r2_Uo, axis=0), np.std(cv_r2_Uo, axis=0)/np.sqrt(nfold))
#Coefficients of a fit on all rows (modelUo itself keeps the last fold's fit)
print("Coef.:", linear_model.LinearRegression().fit(Xdown_Uo, yUo).coef_)

In [None]:
fig=plt.figure(figsize=(8,2.5),dpi=200)
#Define axes ([left, bottom, width, height] in figure fractions)
ax1 = fig.add_axes([0.1, 0.5, 0.25, 0.4])  # upper-left: RMSE
ax2 = fig.add_axes([0.1, 0.1, 0.25, 0.4])  # lower-left: R2
ax3 = fig.add_axes([0.45, 0.1, 0.25, 0.8]) # middle: feature counts
ax4 = fig.add_axes([0.82, 0.1, 0.25, 0.8]) # right: parity plot

#Mean and standard error over the SFS repetitions; column 0 = train, column 1 = test
rmse_mean=np.mean(Uo_rmse,axis=0)
rmse_sem=np.std(Uo_rmse,axis=0)/np.sqrt(nSFS)
r2_mean=np.mean(Uo_r2,axis=0)
r2_sem=np.std(Uo_r2,axis=0)/np.sqrt(nSFS)

#RMSE (Uo is a potential, so the unit is V)
ax1.errorbar(nfeas,rmse_mean[:,0],yerr=rmse_sem[:,0], capsize=3, fmt='s',ls='-',c='b') #train
ax1.errorbar(nfeas,rmse_mean[:,1],yerr=rmse_sem[:,1], capsize=3, fmt='s',ls='--',c='r',mfc='w') #test
ax1.axvline(x=6,c='k',ls=':') #chosen number of features
ax1.set_ylim(0.15,0.42)
ax1.set_yticks([0.2,0.3,0.4])
ax1.set_ylabel(r'RMSE (V)',fontsize=12)
ax1.set_title('Sequential forward selection')

#R2
ax2.errorbar(nfeas,r2_mean[:,0],yerr=r2_sem[:,0], capsize=3, fmt='s',ls='-',c='b') #train
ax2.errorbar(nfeas,r2_mean[:,1],yerr=r2_sem[:,1], capsize=3, fmt='s',ls='--',c='r',mfc='w') #test
ax2.axvline(x=6,c='k',ls=':')
ax2.set_xlabel('No. of features',fontsize=12)
ax2.set_ylabel(r'R$^2$',fontsize=12)
ax2.set_ylim(0.17,0.95)
ax2.set_yticks([0.3,0.6,0.9])
for panel in (ax1,ax2):
    panel.set_xticks(range(1,12,2))

#Feature hist: the seven most frequent features
ax3.bar(label_Uo[:7],count_Uo[:7], fc='k', ec='w',width=1)
ax3.set_ylim(0,50)
ax3.tick_params(axis='x',labelsize=12,labelrotation=90) #vertical labels for readability
ax3.tick_params(axis='y',labelsize=12)
ax3.set_title('Top occurring features')

#Final performance
#Uses whatever modelUo, X_train/X_test and y_train/y_test currently hold in the kernel
#ax4.set_aspect('equal')
ax4.plot(y_train,modelUo.predict(X_train),'s', markersize=8,mec='b',mfc='b',alpha=0.6,label='Train')
ax4.plot(y_test,modelUo.predict(X_test),'s', markersize=8,mec='r',mfc='w',alpha=0.6,label='Test')
low=-1.2
high=1.8
ax4.set_xlim(low,high)
ax4.set_ylim(low,high)
ax4.plot((low,high),(low,high),ls='--',c='k')
ax4.tick_params(axis='both',labelsize=12)
ax4.set_title('Final model performance')
ax4.set_xlabel(r'GC-DFT $U^0$ (V)',fontsize=12)
ax4.set_ylabel(r'MLR $U^0$ (V)',fontsize=12)

plt.show()

In [None]:
#One partial-dependence figure per feature of the Uo model (6 features)
for feature_idx in range(6):
    PartialDependenceDisplay.from_estimator(modelUo, Xdown_Uo, [feature_idx])
plt.show()

#### Data heat map $U_o$

In [None]:
#Train on all available data
modelUo=linear_model.LinearRegression().fit(Xdown_Uo, yUo)
#Fill in blanks: predict Uo for every row whose computed value is missing and write it
#into dffill (the filled copy of dfraw that must already exist in the kernel).
#The column is addressed by name, and the features are taken in the training order.
Uo_missing=dfraw[r'$U^o$'].isna()
if Uo_missing.any(): #predict() rejects an empty input
    dffill.loc[Uo_missing, r'$U^o$']=modelUo.predict(dfraw.loc[Uo_missing, Xdown_Uo.columns])

In [None]:
#Sort rows by r_vdw, then by E^{a0}_HOMO (both descending); this fixes the order used by the pivot
sortUokey=[r'$r_{vdw}$',r'$E^{a0}_{HOMO}$']
sortorder=[False, False] #True for ascending, False for descending
dfraw_sortUo=dfraw.sort_values(by=sortUokey, axis='index', ascending=sortorder)
dffill_sortUo=dffill.sort_values(by=sortUokey, axis='index', ascending=sortorder)
#Pivot to metal (rows) x acid (columns) tables of Uo without re-sorting the labels
pivot_kw=dict(index='metal', columns='acid', values=r'$U^o$', sort=False)
Uoraw=pd.pivot_table(dfraw_sortUo, **pivot_kw)
Uofill=pd.pivot_table(dffill_sortUo, **pivot_kw)
#Give the raw table exactly the layout of the filled one
Uoraw=Uoraw.reindex(index=Uofill.index, columns=Uofill.columns)

print(Uoraw.columns.equals(Uofill.columns))
print(Uoraw.index.equals(Uofill.index))

In [None]:
#Horizontal heat map of the filled Uo table with the colour bar on top
fig=plt.figure(figsize=(12,5),dpi=300)
ax=plt.gca()
ax.set_aspect(1)
cbar_ax = fig.add_axes([0.125, 0.85, 0.775, 0.02])
sns.heatmap(Uofill, vmin=-1.2, vmax=1.6, cmap="jet", ax=ax, cbar_ax=cbar_ax, cbar_kws={"orientation":"horizontal"}, linecolor='w',linewidths=1)
#Hatch the cells that are NaN in the raw table
nan_mask=Uoraw.isna()
nan_rows, nan_cols=np.where(nan_mask)
for i, j in zip(nan_rows, nan_cols):
    ax.add_patch(plt.Rectangle((j, i), 1, 1, fill=False, linewidth=0, hatch='///', color='w',alpha=0.5))
#Format
ax.set_yticklabels(Uofill.index, rotation=0)
xlabels=[format_formula(i) for i in Uofill.columns] #format chemical formula
ax.set_xticklabels(xlabels,rotation=90)
ax.tick_params(axis='both',labelsize=14, bottom=False, left=False)
ax.set_xlabel('', fontsize=19)
ax.set_ylabel('', fontsize=19)
cbar_ax.tick_params(axis='both',labelsize=15, bottom=False, labelbottom=False, top=True, labeltop=True, width=1.5, size=5, direction='inout')
#Frame both the map and the colour bar
for spine in ax.spines.values():
    spine.set_visible(True)
    spine.set_linewidth(1)
for spine in cbar_ax.spines.values():
    spine.set_visible(True)
    spine.set_linewidth(0.5)
plt.show()