In [1]:
import numpy as np
import pandas as pd
import os
from sklearn.model_selection import StratifiedKFold

from CrabNet_spacegroup.kingcrab import CrabNet
from CrabNet_spacegroup.model import Model
from CrabNet_spacegroup.get_compute_device import get_compute_device

from sklearn.metrics import mean_absolute_error as mae, \
                            r2_score as r2, \
                            mean_squared_error as mse

# Obtain the compute device from the project helper. The value is passed to
# CrabNet(compute_device=...) and to .to(...) where the models are built in
# the cross-validation cell.
compute_device = get_compute_device()

In [2]:
from pymatgen.core import Composition

# Notebook-wide display and plotting configuration.

# No warnings about setting value on copy of slice
pd.options.mode.chained_assignment = None

# Display up to 60 columns of a dataframe
pd.set_option('display.max_columns', 60)

# Matplotlib visualization
import matplotlib.pyplot as plt
from matplotlib import rcParams

# Internal ipython tool for setting figure size
from IPython.core.pylabtools import figsize

# Seaborn for visualization
import seaborn as sns

import warnings
# NOTE(review): this silences every warning category for the whole session,
# including deprecation and numerical warnings; consider narrowing the filter.
warnings.filterwarnings('ignore')

# Font and text rendering settings.
config = {
    "mathtext.fontset": 'stix',
    "font.family": 'serif',
    "font.serif": ['Times New Roman'],
    "font.size": 24,
    'axes.unicode_minus': False,
}
# `rcParams` and `plt.rcParams` are the same object, so a single update is
# enough to set 'axes.unicode_minus'; no separate assignment is needed.
rcParams.update(config)

# Font sizes shared by the figure settings below.
large = 22; med = 20; small = 12
# The original literal listed 'axes.titlesize' twice (large, then med). In a
# dict literal the last occurrence wins, so `med` is the value that was
# actually in effect and is the one kept here.
params = {'legend.fontsize': med,
          'figure.figsize': (8, 6),
          'axes.labelsize': med,
          'axes.titlesize': med,
          'xtick.labelsize': med,
          'ytick.labelsize': med,
          'figure.titlesize': large}
plt.rcParams.update(params)
plt.rcParams['figure.dpi'] = 500

In [3]:
# Read the CSV (first column is the index), keep only the three columns used
# in this notebook, and give them the short names the later cells refer to.
column_names = {
    'formula_pretty': 'formula',
    'symmetry': 'spacegroup',
    'energy_above_hull': 'target',
}
raw_table = pd.read_csv('./data/mp_data_new.csv', index_col=0)
df = raw_table[list(column_names)].rename(columns=column_names)

In [4]:
# Remove rows whose formula string cannot be parsed by pymatgen's Composition.
# Only the 'formula' column is read, so iterate over that column directly
# instead of building a full row Series per record with iterrows().
wrong_indexes = []
for i, formula in df['formula'].items():
    try:
        Composition(formula)
    except Exception:
        # Any parsing failure marks the row for removal. `Exception` (rather
        # than a bare `except:`) lets KeyboardInterrupt / SystemExit through,
        # so interrupting the kernel mid-loop is not recorded as a bad formula.
        wrong_indexes.append(i)
df = df.drop(index=wrong_indexes).reset_index(drop=True)
df

Unnamed: 0,formula,spacegroup,target
0,LiCaPb,187,0.000000
1,Li2Mn3Cr3O12,10,0.074943
2,FeB4,58,0.006845
3,Mg149Tc,187,0.000000
4,Li7Mn5O12,2,0.048052
...,...,...,...
154710,ScPaOs2,225,0.000000
154711,CeSe2,10,0.521610
154712,WN2,61,0.149157
154713,Al(CuO2)2,88,0.203042


In [5]:
from pymatgen.core import Composition
# Parse each formula once into a Composition object, count its distinct
# elements, and keep only entries with more than one element.
df['composition'] = df['formula'].map(Composition)
# The 'composition' values are already Composition objects (built on the line
# above), so read `.elements` directly instead of constructing a second
# Composition for every row.
df['n_elements'] = df['composition'].map(lambda comp: len(comp.elements))
df = df[df['n_elements']>1].reset_index(drop=True)

In [6]:
# Keep one row per formula: within each formula group, select the row whose
# 'target' value is the smallest, then renumber the rows from zero.
target_by_formula = df.groupby('formula')['target']
idx = target_by_formula.idxmin()
df = df.loc[idx]
df = df.reset_index(drop=True)
df

Unnamed: 0,formula,spacegroup,target,composition,n_elements
0,Ac2AgIr,225,0.000000,"(Ac, Ag, Ir)",3
1,Ac2AgPb,225,0.000000,"(Ac, Ag, Pb)",3
2,Ac2Br2O,123,1.242844,"(Ac, Br, O)",3
3,Ac2CdGa,225,0.000000,"(Ac, Cd, Ga)",3
4,Ac2CdGe,225,0.000000,"(Ac, Cd, Ge)",3
...,...,...,...,...,...
105488,ZrZnNi4,216,0.000000,"(Zr, Zn, Ni)",3
105489,ZrZnO3,221,0.757972,"(Zr, Zn, O)",3
105490,ZrZnPd2,225,0.000000,"(Zr, Zn, Pd)",3
105491,ZrZnPt2,225,0.026347,"(Zr, Zn, Pt)",3


In [7]:
# Binary label: 1 where target <= 0.2, otherwise 0 (a missing target compares
# as False and therefore also gets 0). Then show how many rows fall in each class.
below_threshold = df['target'] <= 0.2
df['is_stable'] = below_threshold.astype('int64')
df['is_stable'].value_counts()

is_stable
1    88735
0    16758
Name: count, dtype: int64

In [8]:
# 5-fold cross-validation, stratified on the binary 'is_stable' label.
# For each fold, cached predictions are reused when their CSV already exists;
# otherwise a CrabNet model is trained, its best validation results are
# written to CSV and the network is saved. Metrics are printed either way.

def _print_fold_metrics(fold_name, results):
    """Print MAE, R2 and RMSE of `results['pred']` against `results['real']`."""
    print(fold_name)
    print(f"MAE: {mae(results['real'], results['pred'])}")
    print(f"R2: {r2(results['real'], results['pred'])}")
    print(f"RMSE: {np.sqrt(mse(results['real'], results['pred']))}")


kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Directory holding the per-fold prediction tables. It is created up front so
# that a missing folder cannot make `to_csv` fail after a full training run.
predictions_dir = './result/predictions_crabnet_spacegroup'
os.makedirs(predictions_dir, exist_ok=True)

cv_indices = kf.split(df["formula"], df["is_stable"])
for i, (train_index, val_index) in enumerate(cv_indices):
    # split() yields positional indices, so select rows by position. This
    # matches the previous label-based lookup as long as the frame has the
    # default 0..n-1 index, and stays correct if it does not.
    train, val = df.iloc[train_index], df.iloc[val_index]
    name = f"fold_{i}_Eh"
    predictions_path = f'{predictions_dir}/{name}.csv'
    if os.path.exists(predictions_path):
        # Cached fold: reuse the stored predictions instead of retraining.
        res_df = pd.read_csv(predictions_path)
        _print_fold_metrics(name, res_df)
    else:
        model = Model(CrabNet(compute_device=compute_device).to(compute_device),
                      model_name=name, verbose=True, classification=False)
        model.load_data(train, train=True)
        model.load_data(val)
        model.fit(epochs=500, losscurve=True)
        # best_results is indexed as: 0 = real, 1 = pred, 2 = composition, 3 = uncert.
        res = model.best_results
        res_df = pd.DataFrame({"composition": res[2], "real": res[0], "pred": res[1], 'uncert':res[3]})
        _print_fold_metrics(name, res_df)
        res_df.to_csv(predictions_path, index=False)
        model.save_network(f'{name}_spacegroup.pth')

fold_0_Eh
MAE: 0.050012208310368414
R2: 0.8328688204401518
RMSE: 0.16284428968386247
fold_1_Eh
MAE: 0.05090474862935503
R2: 0.8630418839491772
RMSE: 0.15786213782376318
fold_2_Eh
MAE: 0.06999617281087708
R2: 0.5350129699417493
RMSE: 0.2778928112760103
fold_3_Eh
MAE: 0.07293513199828551
R2: 0.4821690833283304
RMSE: 0.29682978801943105
fold_4_Eh
MAE: 0.05052810783712969
R2: 0.8459945573265675
RMSE: 0.16292013467408517
