## Thermodynamic Property Prediction for H, S, Tc
1. Generates models/predictions for H, S, Tc
2. Uses transfer learning to predict Tc
3. Calculates a prediction for Tc using H and S predictions and the Tc formula

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib as mpl
from cycler import cycler
import joblib
import os


from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.kernel_ridge import KernelRidge

from xgboost import XGBRegressor

import warnings
from sklearn.exceptions import ConvergenceWarning
warnings.filterwarnings("ignore", category=ConvergenceWarning)

from great_tables import GT

In [5]:
plt.style.use("presentation.mplstyle")
mpl.rcParams['axes.prop_cycle'] = cycler(color=['#5c3c8b', '#92c36d', '#ee9432', '#496391', '#85a5cd', '#FDF3CC'])
plotsize = 5

### Define Parameters

In [6]:
file_info = {
    "enthalpy": "2_split_datasets/enthalpy_80_MI_reduced.csv",
    "entropy": "2_split_datasets/entropy_80_MI_reduced.csv",
    "tc": "2_split_datasets/has_both_101_MI_reduced.csv",
    "log_tc": "2_split_datasets/has_both_101_MI_reduced.csv", 
}

feature_sets = [
    (name, pd.read_csv(path, index_col=0))
    for name, path in file_info.items()
]

In [7]:
# store column indices for use in the loop
target_idx = 0
phase_split_idx = 1
category_idx = 2
smiles_idx = 3
global_test_idx = 4
first_feature_idx = 5

In [8]:
models = [
    ('gpr', GaussianProcessRegressor(), {
        "model__kernel": [], 
    }),
    ('rf', RandomForestRegressor(), {
        'model__n_estimators': [25, 50, 100],
        'model__max_depth': [None, 10],
        'model__min_samples_split': [2, 5]
    }),
    ('xgb', XGBRegressor(), {
        'model__n_estimators': [50, 100],
        'model__max_depth': [3, 6],
        'model__learning_rate': [0.01, 0.1],
        'model__subsample': [0.8, 1.0]
    }),
    ('svr', SVR(), {
        'model__C': [0.1, 1, 10],
        'model__gamma': ['scale', 'auto'],
        'model__kernel': ['rbf']
    }),
    ('krr', KernelRidge(), {
        'model__alpha': [1e-3, 1e-2, 1e-1, 1.0],
        'model__kernel': ['rbf', 'linear', 'poly'],
        'model__gamma': [0.01, 0.1, 1],
    }),
]

In [9]:
outer_splits = 10
inner_splits = 5
results = []

model_dir = "3_saved_models"
output_dir = "3_images_and_csvs"

os.makedirs(model_dir, exist_ok=True)
os.makedirs(output_dir, exist_ok=True)

### Generate First Model Table

In [10]:
df = pd.read_csv(f"{output_dir}/model_results_summary.csv")

In [11]:
data = {
    "Model": [item.upper() for item in df['model'][:len(models)].tolist()],
    
    "Split by Chemistry.ΔHₚ.MAE": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Split by Chemistry.ΔHₚ.R²": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    "Split by Chemistry.ΔSₚ.MAE": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Split by Chemistry.ΔSₚ.R²": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    "Split by Chemistry.Tc.MAE": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Split by Chemistry.Tc.R²": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    "Split by Chemistry.log Tc.MAE": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Split by Chemistry.log Tc.R²": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    

    "Split by Phase.ΔHₚ.MAE": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'phase')]['test_mae'].tolist(),
    "Split by Phase.ΔHₚ.R²": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'phase')]['test_r2'].tolist(),
    "Split by Phase.ΔSₚ.MAE": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'phase')]['test_mae'].tolist(),
    "Split by Phase.ΔSₚ.R²": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'phase')]['test_r2'].tolist(),
    "Split by Phase.Tc.MAE": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'phase')]['test_mae'].tolist(),
    "Split by Phase.Tc.R²": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'phase')]['test_r2'].tolist(),
    "Split by Phase.log Tc.MAE": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'phase')]['test_mae'].tolist(),
    "Split by Phase.log Tc.R²": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'phase')]['test_r2'].tolist(),
}

gt = GT(pd.DataFrame(data)).cols_align(align="center")

for split in ['Chemistry', 'Phase']:
    for prop in ['ΔHₚ', 'ΔSₚ', 'Tc', 'log Tc']:
            for metric in ['MAE', 'R²']:
                gt = gt.fmt_number(columns=f"Split by {split}.{prop}.{metric}", decimals=2, drop_trailing_zeros=False)

gt = gt.fmt_integer(columns="Split by Chemistry.Tc.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Phase.Tc.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Chemistry.log Tc.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Phase.log Tc.MAE", use_seps=False)

gt = gt.opt_horizontal_padding(scale=2.0)
gt = gt.opt_vertical_padding(scale=1.5)

gt = (gt.tab_options(container_width = "100%",column_labels_background_color="#C0ABDC",))

gt = gt.tab_options(
    column_labels_border_top_color="black",
    column_labels_border_top_style="solid",
    column_labels_border_top_width="2px",

    column_labels_border_bottom_color="black",
    column_labels_border_bottom_style="solid",
    column_labels_border_bottom_width="2px",

    table_body_border_bottom_color="black",
    table_body_border_bottom_style="solid",
    table_body_border_bottom_width="2px",
)

gt = gt.tab_spanner_delim()
gt.save(f"{output_dir}/model_result_table.png")

Unnamed: 0_level_0,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase
Model,ΔHₚ,ΔHₚ,ΔSₚ,ΔSₚ,Tc,Tc,log Tc,log Tc,ΔHₚ,ΔHₚ,ΔSₚ,ΔSₚ,Tc,Tc,log Tc,log Tc
Model,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²
GPR,1.99,0.85,3.4,0.76,239,−0.72,174,−0.08,1.99,0.85,3.4,0.76,239,−0.72,174,−0.08
RF,1.27,0.91,3.51,0.76,288,−3.09,142,0.23,1.26,0.9,3.39,0.77,226,−0.61,160,0.07
XGB,1.48,0.9,4.05,0.69,194,0.12,204,−0.72,1.48,0.9,3.56,0.75,194,0.12,141,0.20
SVR,1.7,0.87,3.24,0.8,196,−0.06,143,0.31,1.7,0.87,3.24,0.8,196,−0.06,143,0.31
KRR,1.78,0.89,3.91,0.66,197,−0.22,157,0.09,1.99,0.82,3.91,0.66,197,−0.22,157,0.09


### Add Transfer Learning to Table

In [12]:
best_H_model = None
best_S_model = None

for file in os.listdir("3_saved_models"):
    if "enthalpy" in file and "BEST" in file and "chemistry" in file:
        best_H_model = joblib.load(os.path.join("3_saved_models", file))
        print(file)
    elif "entropy" in file and "BEST" in file and "chemistry" in file:
        best_S_model = joblib.load(os.path.join("3_saved_models", file))
        print(file)

entropy_svr_chemistry_cluster_BEST.joblib
enthalpy_rf_chemistry_cluster_BEST.joblib


In [13]:
# enthalpy/entropy pred columns
entc = feature_sets[0][1].iloc[:, first_feature_idx:].columns
entrc = feature_sets[1][1].iloc[:, first_feature_idx:].columns

# make predictions
h_preds = best_H_model.predict(feature_sets[2][1].iloc[:, first_feature_idx:][entc])
s_preds = best_S_model.predict(feature_sets[2][1].iloc[:, first_feature_idx:][entrc])

# add to tc data set
tc_df = feature_sets[2][1].copy()
tc_df['predicted_H_TL'] = h_preds
tc_df['predicted_S_TL'] = s_preds
tc_df['predicted_TC_TL'] = h_preds*1000/s_preds - 273.15

feature_sets.append((f"{feature_sets[2][0]}_transfer", tc_df))

In [14]:
df_tl = pd.read_csv(os.path.join(output_dir, "transfer_model_results_summary.csv"))

In [15]:
# flag best Tc transfer model
for split_type in df_tl['split_type'].unique():
    for target_name in ['tc_transfer']:
        # find the best model with the lowest MAE
        subset = df_tl[(df_tl['split_type'] == split_type) & (df_tl['target_Name'] == target_name)]
        best_model_row = subset.sort_values(by='test_mae').head(1)
        
        if not best_model_row.empty:
            best_model_name = best_model_row['model'].values[0]
            model_path = f"{model_dir}/{target_name}_{best_model_name}_{split_type}_transfer_learning.joblib"
            if os.path.exists(model_path):
                new_model_path = f"{model_dir}/{target_name}_{best_model_name}_{split_type}_TL_BEST.joblib"
                os.rename(model_path, new_model_path)

In [16]:
data = {
    "Model": [item.upper() for item in df['model'][:len(models)].tolist()],
    
    "Split by Chemistry.ΔHₚ.MAE": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Split by Chemistry.ΔHₚ.R²": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    "Split by Chemistry.ΔSₚ.MAE": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Split by Chemistry.ΔSₚ.R²": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    "Split by Chemistry.Tc.MAE": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Split by Chemistry.Tc.R²": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    "Split by Chemistry.log Tc.MAE": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Split by Chemistry.log Tc.R²": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),

    "Split by Chemistry.Tc TL.MAE": df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Split by Chemistry.Tc TL.R²": df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),

    "Split by Phase.ΔHₚ.MAE": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'phase')]['test_mae'].tolist(),
    "Split by Phase.ΔHₚ.R²": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'phase')]['test_r2'].tolist(),
    "Split by Phase.ΔSₚ.MAE": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'phase')]['test_mae'].tolist(),
    "Split by Phase.ΔSₚ.R²": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'phase')]['test_r2'].tolist(),
    "Split by Phase.Tc.MAE": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'phase')]['test_mae'].tolist(),
    "Split by Phase.Tc.R²": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'phase')]['test_r2'].tolist(),
    "Split by Phase.log Tc.MAE": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'phase')]['test_mae'].tolist(),
    "Split by Phase.log Tc.R²": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'phase')]['test_r2'].tolist(),

    "Split by Phase.Tc TL.MAE": df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'phase')]['test_mae'].tolist(),
    "Split by Phase.Tc TL.R²": df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'phase')]['test_r2'].tolist(),
}

gt = GT(pd.DataFrame(data)).cols_align(align="center")

for split in ['Chemistry', 'Phase']:
    for prop in ['ΔHₚ', 'ΔSₚ', 'Tc', 'log Tc']:
            for metric in ['MAE', 'R²']:
                gt = gt.fmt_number(columns=f"Split by {split}.{prop}.{metric}", decimals=2, drop_trailing_zeros=False)

gt = gt.fmt_integer(columns="Split by Chemistry.Tc TL.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Phase.Tc TL.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Chemistry.Tc.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Phase.Tc.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Chemistry.Tc.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Phase.Tc.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Chemistry.log Tc.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Split by Phase.log Tc.MAE", use_seps=False)

gt = gt.opt_horizontal_padding(scale=2.0)
gt = gt.opt_vertical_padding(scale=1.5)

gt = (gt.tab_options(container_width = "100%",column_labels_background_color="#C0ABDC",))

gt = gt.tab_options(
    column_labels_border_top_color="black",
    column_labels_border_top_style="solid",
    column_labels_border_top_width="2px",

    column_labels_border_bottom_color="black",
    column_labels_border_bottom_style="solid",
    column_labels_border_bottom_width="2px",

    table_body_border_bottom_color="black",
    table_body_border_bottom_style="solid",
    table_body_border_bottom_width="2px",
) 

gt = gt.tab_spanner_delim()
gt.save(f"{output_dir}/transfer_model_result_table.png")

Unnamed: 0_level_0,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Chemistry,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase,Split by Phase
Model,ΔHₚ,ΔHₚ,ΔSₚ,ΔSₚ,Tc,Tc,log Tc,log Tc,Tc TL,Tc TL,ΔHₚ,ΔHₚ,ΔSₚ,ΔSₚ,Tc,Tc,log Tc,log Tc,Tc TL,Tc TL
Model,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²
GPR,1.99,0.85,3.4,0.76,239,−0.72,174,−0.08,239,-0.56,1.99,0.85,3.4,0.76,239,−0.72,174,−0.08,239,-0.56
RF,1.27,0.91,3.51,0.76,288,−3.09,142,0.23,332,-12.55,1.26,0.9,3.39,0.77,226,−0.61,160,0.07,336,-12.15
XGB,1.48,0.9,4.05,0.69,194,0.12,204,−0.72,256,-5.04,1.48,0.9,3.56,0.75,194,0.12,141,0.20,207,-1.53
SVR,1.7,0.87,3.24,0.8,196,−0.06,143,0.31,151,0.39,1.7,0.87,3.24,0.8,196,−0.06,143,0.31,151,0.39
KRR,1.78,0.89,3.91,0.66,197,−0.22,157,0.09,226,-3.99,1.99,0.82,3.91,0.66,197,−0.22,157,0.09,226,-3.99


### Chem-only table

In [17]:
data = {
    "Model": [item.upper() for item in df['model'][:len(models)].tolist()],
    
    "ΔHₚ.MAE": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "ΔHₚ.R²": df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    "ΔSₚ.MAE": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "ΔSₚ.R²": df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    "Tc.MAE": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Tc.R²": df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
    "log Tc.MAE": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "log Tc.R²": df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),

    "Tc TL.MAE": df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'chemistry_cluster')]['test_mae'].tolist(),
    "Tc TL.R²": df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'chemistry_cluster')]['test_r2'].tolist(),
}

gt = GT(pd.DataFrame(data)).cols_align(align="center")

for prop in ['ΔHₚ', 'ΔSₚ', 'Tc', 'log Tc', 'Tc TL']:
        for metric in ['MAE', 'R²']:
            gt = gt.fmt_number(columns=f"{prop}.{metric}", decimals=2, drop_trailing_zeros=False)

gt = gt.fmt_integer(columns="Tc TL.MAE", use_seps=False)
gt = gt.fmt_integer(columns="Tc.MAE", use_seps=False)
gt = gt.fmt_integer(columns="log Tc.MAE", use_seps=False)

gt = gt.opt_horizontal_padding(scale=2.0)
gt = gt.opt_vertical_padding(scale=1.5)

gt = (gt.tab_options(container_width = "100%",column_labels_background_color="#BDDAA7",))

gt = gt.tab_options(
    column_labels_border_top_color="black",
    column_labels_border_top_style="solid",
    column_labels_border_top_width="2px",

    column_labels_border_bottom_color="black",
    column_labels_border_bottom_style="solid",
    column_labels_border_bottom_width="2px",

    table_body_border_bottom_color="black",
    table_body_border_bottom_style="solid",
    table_body_border_bottom_width="2px",
)

gt = gt.tab_spanner_delim()
gt.save(f"{output_dir}/transfer_model_result_table_chem_only.png")

Model,ΔHₚ,ΔHₚ,ΔSₚ,ΔSₚ,Tc,Tc,log Tc,log Tc,Tc TL,Tc TL
Model,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²
GPR,1.99,0.85,3.4,0.76,239,−0.72,174,−0.08,239,−0.56
RF,1.27,0.91,3.51,0.76,288,−3.09,142,0.23,332,−12.55
XGB,1.48,0.9,4.05,0.69,194,0.12,204,−0.72,256,−5.04
SVR,1.7,0.87,3.24,0.8,196,−0.06,143,0.31,151,0.39
KRR,1.78,0.89,3.91,0.66,197,−0.22,157,0.09,226,−3.99


### Nested CV

In [18]:
data = {
    "Model": [item.upper() for item in df['model'][:len(models)].tolist()],
    
    "Split by Chemistry - Nested CV.ΔHₚ.MAE": [f"{m:0.2f} ± {s:0.2f}" for m, s in zip(df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_std'].tolist())],
    "Split by Chemistry - Nested CV.ΔHₚ.R²": [f"{m:0.2f} ± {s:0.2f}" for m, s in zip(df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_std'].tolist())],
    "Split by Chemistry - Nested CV.ΔSₚ.MAE": [f"{m:0.2f} ± {s:0.2f}" for m, s in zip(df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_std'].tolist())],
    "Split by Chemistry - Nested CV.ΔSₚ.R²": [f"{m:0.2f} ± {s:0.2f}" for m, s in zip(df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_std'].tolist())],
    "Split by Chemistry - Nested CV.Tc.MAE": [f"{m:.0f} ± {s:.0f}" for m, s in zip(df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_std'].tolist())],
    "Split by Chemistry - Nested CV.Tc.R²": [f"{m:.2f} ± {s:.2f}" for m, s in zip(df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_r2_mean'].tolist(), 
                                                                           df[(df['target_Name'] == 'tc') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_r2_std'].tolist())],
    "Split by Chemistry - Nested CV.log Tc.MAE": [f"{m:.0f} ± {s:.0f}" for m, s in zip(df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_std'].tolist())],
    "Split by Chemistry - Nested CV.log Tc.R²": [f"{m:.2f} ± {s:.2f}" for m, s in zip(df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_r2_mean'].tolist(), 
                                                                           df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'chemistry_cluster')]['nested_cv_r2_std'].tolist())],
    "Split by Chemistry - Nested CV.Tc TL.MAE": [f"{m:.0f} ± {s:.0f}" for m, s in zip(df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'chemistry_cluster')]['nested_cv_mae'].tolist(), 
                                                                           df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'chemistry_cluster')]['nested_cv_std'].tolist())],
    "Split by Chemistry - Nested CV.Tc TL.R²": [f"{m:.2f} ± {s:.2f}" for m, s in zip(df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'chemistry_cluster')]['nested_cv_r2_mean'].tolist(), 
                                                                           df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'chemistry_cluster')]['nested_cv_r2_std'].tolist())],
}

gt = GT(pd.DataFrame(data)).cols_align(align="center")

gt = gt.opt_horizontal_padding(scale=2.0)
gt = gt.opt_vertical_padding(scale=1.5)

gt = (gt.tab_options(container_width = "100%",column_labels_background_color="#C0ABDC",))

gt = gt.tab_options(
    column_labels_border_top_color="black",
    column_labels_border_top_style="solid",
    column_labels_border_top_width="2px",

    column_labels_border_bottom_color="black",
    column_labels_border_bottom_style="solid",
    column_labels_border_bottom_width="2px",

    table_body_border_bottom_color="black",
    table_body_border_bottom_style="solid",
    table_body_border_bottom_width="2px",
) 

gt = gt.tab_spanner_delim()
gt.save(f"{output_dir}/nested_CV_table_chem.png")

Unnamed: 0_level_0,Split by Chemistry - Nested CV,Split by Chemistry - Nested CV,Split by Chemistry - Nested CV,Split by Chemistry - Nested CV,Split by Chemistry - Nested CV,Split by Chemistry - Nested CV,Split by Chemistry - Nested CV,Split by Chemistry - Nested CV,Split by Chemistry - Nested CV,Split by Chemistry - Nested CV
Model,ΔHₚ,ΔHₚ,ΔSₚ,ΔSₚ,Tc,Tc,log Tc,log Tc,Tc TL,Tc TL
Model,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²
GPR,2.88 ± 0.50,2.88 ± 0.50,4.15 ± 0.60,4.15 ± 0.60,390 ± 210,-0.11 ± 0.12,365 ± 203,-0.08 ± 0.14,340 ± 216,0.03 ± 0.20
RF,2.83 ± 0.55,2.83 ± 0.55,4.15 ± 0.67,4.15 ± 0.67,443 ± 255,-0.35 ± 0.58,362 ± 196,-0.09 ± 0.15,325 ± 211,-0.08 ± 0.43
XGB,3.01 ± 0.69,3.01 ± 0.69,4.25 ± 0.83,4.25 ± 0.83,438 ± 270,-0.22 ± 0.27,352 ± 198,-0.01 ± 0.22,355 ± 200,-0.64 ± 1.71
SVR,3.06 ± 0.50,3.06 ± 0.50,4.66 ± 0.89,4.66 ± 0.89,371 ± 199,-0.09 ± 0.10,387 ± 217,-0.50 ± 0.96,326 ± 211,0.07 ± 0.16
KRR,3.35 ± 1.42,3.35 ± 1.42,7.60 ± 9.45,7.60 ± 9.45,400 ± 233,-0.17 ± 0.18,445 ± 321,-1.34 ± 3.09,271 ± 232,0.40 ± 0.39


In [19]:
data = {
    "Model": [item.upper() for item in df['model'][:len(models)].tolist()],
    
    "Split by Phase - Nested CV.ΔHₚ.MAE": [f"{m:0.2f} ± {s:0.2f}" for m, s in zip(df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'phase')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'phase')]['nested_cv_std'].tolist())],
    "Split by Phase - Nested CV.ΔHₚ.R²": [f"{m:0.2f} ± {s:0.2f}" for m, s in zip(df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'phase')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'enthalpy') & (df['split_type'] == 'phase')]['nested_cv_std'].tolist())],
    "Split by Phase - Nested CV.ΔSₚ.MAE": [f"{m:0.2f} ± {s:0.2f}" for m, s in zip(df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'phase')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'phase')]['nested_cv_std'].tolist())],
    "Split by Phase - Nested CV.ΔSₚ.R²": [f"{m:0.2f} ± {s:0.2f}" for m, s in zip(df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'phase')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'entropy') & (df['split_type'] == 'phase')]['nested_cv_std'].tolist())],
    "Split by Phase - Nested CV.Tc.MAE": [f"{m:.0f} ± {s:.0f}" for m, s in zip(df[(df['target_Name'] == 'tc') & (df['split_type'] == 'phase')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'tc') & (df['split_type'] == 'phase')]['nested_cv_std'].tolist())],
    "Split by Phase - Nested CV.Tc.R²": [f"{m:.2f} ± {s:.2f}" for m, s in zip(df[(df['target_Name'] == 'tc') & (df['split_type'] == 'phase')]['nested_cv_r2_mean'].tolist(), 
                                                                           df[(df['target_Name'] == 'tc') & (df['split_type'] == 'phase')]['nested_cv_r2_std'].tolist())],
    "Split by Phase - Nested CV.log Tc.MAE": [f"{m:.0f} ± {s:.0f}" for m, s in zip(df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'phase')]['nested_cv_mae'].tolist(), 
                                                                           df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'phase')]['nested_cv_std'].tolist())],
    "Split by Phase - Nested CV.log Tc.R²": [f"{m:.2f} ± {s:.2f}" for m, s in zip(df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'phase')]['nested_cv_r2_mean'].tolist(), 
                                                                           df[(df['target_Name'] == 'log_tc') & (df['split_type'] == 'phase')]['nested_cv_r2_std'].tolist())],
    "Split by Phase - Nested CV.Tc TL.MAE": [f"{m:.0f} ± {s:.0f}" for m, s in zip(df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'phase')]['nested_cv_mae'].tolist(), 
                                                                           df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'phase')]['nested_cv_std'].tolist())],
    "Split by Phase - Nested CV.Tc TL.R²": [f"{m:.2f} ± {s:.2f}" for m, s in zip(df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'phase')]['nested_cv_r2_mean'].tolist(), 
                                                                           df_tl[(df_tl['target_Name'] == 'tc_transfer') & (df_tl['split_type'] == 'phase')]['nested_cv_r2_std'].tolist())],
}

gt = GT(pd.DataFrame(data)).cols_align(align="center")

gt = gt.opt_horizontal_padding(scale=2)
gt = gt.opt_vertical_padding(scale=1.5)

gt = (gt.tab_options(container_width = "100%",column_labels_background_color="#C0ABDC",))

gt = gt.tab_options(
    column_labels_border_top_color="black",
    column_labels_border_top_style="solid",
    column_labels_border_top_width="2px",

    column_labels_border_bottom_color="black",
    column_labels_border_bottom_style="solid",
    column_labels_border_bottom_width="2px",

    table_body_border_bottom_color="black",
    table_body_border_bottom_style="solid",
    table_body_border_bottom_width="2px",
) 

gt = gt.tab_spanner_delim()
gt.save(f"{output_dir}/nested_CV_table_phase.png")

Unnamed: 0_level_0,Split by Phase - Nested CV,Split by Phase - Nested CV,Split by Phase - Nested CV,Split by Phase - Nested CV,Split by Phase - Nested CV,Split by Phase - Nested CV,Split by Phase - Nested CV,Split by Phase - Nested CV,Split by Phase - Nested CV,Split by Phase - Nested CV
Model,ΔHₚ,ΔHₚ,ΔSₚ,ΔSₚ,Tc,Tc,log Tc,log Tc,Tc TL,Tc TL
Model,MAE,R²,MAE,R²,MAE,R²,MAE,R²,MAE,R²
GPR,2.38 ± 0.40,2.38 ± 0.40,4.05 ± 0.43,4.05 ± 0.43,363 ± 128,0.07 ± 0.16,360 ± 131,0.02 ± 0.12,334 ± 148,0.07 ± 0.11
RF,2.39 ± 0.47,2.39 ± 0.47,3.94 ± 0.33,3.94 ± 0.33,391 ± 142,-0.29 ± 0.66,352 ± 142,0.05 ± 0.13,290 ± 177,0.21 ± 0.43
XGB,2.44 ± 0.41,2.44 ± 0.41,4.00 ± 0.39,4.00 ± 0.39,381 ± 136,0.09 ± 0.18,331 ± 131,0.14 ± 0.19,303 ± 157,0.26 ± 0.28
SVR,2.65 ± 0.42,2.65 ± 0.42,4.20 ± 0.42,4.20 ± 0.42,369 ± 128,-0.03 ± 0.05,354 ± 131,-0.00 ± 0.26,326 ± 148,0.08 ± 0.12
KRR,2.48 ± 0.51,2.48 ± 0.51,7.02 ± 8.88,7.02 ± 8.88,375 ± 138,-0.05 ± 0.29,367 ± 133,-0.12 ± 0.55,259 ± 128,0.14 ± 0.52
