In [69]:
from pathlib import Path as pt
import pandas as pd
import json
import numpy as np
import sigfig

In [43]:
from load_data import (
    processed_data_dirs, plots_dir, titles, total_counts,
    models, models_labels, embeddings_dirname, embeddings_names, root_loc
)

# Echo the embedding names and the project root that later cells rely on.
print(embeddings_names, root_loc)

['Mol2Vec', 'VICGAE'] /Users/aravindhnivas/Library/CloudStorage/OneDrive-MassachusettsInstituteofTechnology/ML-properties


In [47]:
def df_to_latex(
    df, caption='<caption here>', label='tab:<label here>', column_format=None, float_format='{:.2f}',
    escape=False,
):
    """
    Convert pandas DataFrame to LaTeX table string.

    The table is wrapped in a ``table`` float and uses ``\\toprule``,
    ``\\midrule`` and ``\\bottomrule`` rules. The DataFrame index is not
    written; only the columns are.

    Parameters:
    -----------
    df : pandas.DataFrame
        The DataFrame to convert (it is copied, never modified)
    caption : str, optional
        Table caption; omitted from the output when empty/None
    label : str, optional
        Table label for referencing; omitted from the output when empty/None
    column_format : str, optional
        LaTeX column format (e.g., 'lcc' for left-aligned, centered, centered)
        If None, will generate 'c' for numeric (non-boolean) columns of any
        width and 'l' for every other column
    float_format : str, optional
        Format string applied to every value of floating point columns
    escape : bool, optional
        If True, LaTeX special characters (such as '_', '&', '%') in the
        column headers and cell values are escaped. The caption and label are
        always emitted verbatim. Defaults to False (no escaping).

    Returns:
    --------
    str : LaTeX table code
    """
    # Single-pass translation table, so an escaped character is never
    # escaped a second time.
    latex_specials = str.maketrans({
        '\\': r'\textbackslash{}',
        '&': r'\&',
        '%': r'\%',
        '$': r'\$',
        '#': r'\#',
        '_': r'\_',
        '{': r'\{',
        '}': r'\}',
        '~': r'\textasciitilde{}',
        '^': r'\textasciicircum{}',
    })

    def to_cell(value):
        # Headers and values may be non-strings (e.g. integer column labels).
        text = str(value)
        return text.translate(latex_specials) if escape else text

    # Work on a copy to avoid modifying the caller's DataFrame
    df = df.copy()

    # Generate column format if not provided
    if column_format is None:
        column_format = ''.join(
            'c'
            if pd.api.types.is_numeric_dtype(df[col].dtype)
            and not pd.api.types.is_bool_dtype(df[col].dtype)
            else 'l'
            for col in df.columns
        )

    # Format floating point numbers of any precision, not only float64.
    # pd.NA cannot be passed through a float format spec, so it is stringified.
    for col in df.columns:
        if pd.api.types.is_float_dtype(df[col].dtype):
            df[col] = df[col].apply(
                lambda x: str(x) if x is pd.NA else float_format.format(x)
            )

    # Start building LaTeX table
    latex = ['\\begin{table}[!htb]', '\t\\centering']

    # Add caption if provided
    if caption:
        latex.append(f'\t\\caption{{{caption}}}')

    # Add label if provided
    if label:
        latex.append(f'\t\\label{{{label}}}')

    # Begin tabular environment
    latex.append(f'\t\\begin{{tabular}}{{{column_format}}}')
    latex.append('\t\t\\toprule')

    # Add header
    headers = ' & '.join(to_cell(col) for col in df.columns)
    latex.append(f'\t\t{headers} \\\\')
    latex.append('\t\t\\midrule')

    # Add rows; itertuples keeps each column's own values, whereas iterrows
    # may coerce a row to a common dtype.
    for row in df.itertuples(index=False, name=None):
        latex.append('\t\t' + ' & '.join(to_cell(val) for val in row) + ' \\\\')

    # End tabular environment
    latex.append('\t\t\\bottomrule')
    latex.append('\t\\end{tabular}')
    latex.append('\\end{table}')

    return '\n'.join(latex)

### Data shape pre- and post-processing

In [44]:
# For every property/embedder pair, load the preprocessing metadata and record
# its original / validated / cleaned X-shape entries as one table row.
metadata = {}
rows_data_shapes = []
shape_keys = ('original_X_shape', 'validated_X_shape', 'cleaned_X_shape')

for idx, dirs in enumerate(processed_data_dirs):
    title = titles[idx]
    metadata[title] = {}
    embedded_vectors_dir = dirs / 'embedded_vectors'

    for embedder, embedding in zip(embeddings_names, embeddings_dirname):
        metadata[title][embedder] = {}
        metadata_file = embedded_vectors_dir / f'processed_{embedding}' / 'metadata.json'
        with open(metadata_file, 'r') as f:
            meta = json.load(f)
            metadata[title][embedder] = meta
            rows_data_shapes.append([title, embedder] + [meta[key] for key in shape_keys])

# Tabulate the rows, save them next to the other results, and display the table.
data_shapes_df = pd.DataFrame(
    rows_data_shapes,
    columns=['Property', 'Embedder', 'Original', 'Validated', 'Cleaned'],
)
data_shapes_df.to_csv(root_loc / 'results/data_shapes.csv', index=False)
data_shapes_df

Unnamed: 0,Property,Embedder,Original,Validated,Cleaned
0,MP,Mol2Vec,7476,7476,5980
1,MP,VICGAE,7476,7200,5940
2,BP,Mol2Vec,4915,4915,4865
3,BP,VICGAE,4915,4909,4295
4,VP,Mol2Vec,398,398,348
5,VP,VICGAE,398,398,338
6,CP,Mol2Vec,777,777,769
7,CP,VICGAE,777,776,727
8,CT,Mol2Vec,819,819,802
9,CT,VICGAE,819,818,818


In [48]:
# Render the data-shape table as LaTeX, swapping each tab for four spaces.
latex_str = df_to_latex(data_shapes_df).replace('\t', '    ')
print(latex_str)

\begin{table}[!htb]
    \centering
    \caption{<caption here>}
    \label{tab:<label here>}
    \begin{tabular}{llccc}
        \toprule
        Property & Embedder & Original & Validated & Cleaned \\
        \midrule
        MP & Mol2Vec & 7476 & 7476 & 5980 \\
        MP & VICGAE & 7476 & 7200 & 5940 \\
        BP & Mol2Vec & 4915 & 4915 & 4865 \\
        BP & VICGAE & 4915 & 4909 & 4295 \\
        VP & Mol2Vec & 398 & 398 & 348 \\
        VP & VICGAE & 398 & 398 & 338 \\
        CP & Mol2Vec & 777 & 777 & 769 \\
        CP & VICGAE & 777 & 776 & 727 \\
        CT & Mol2Vec & 819 & 819 & 802 \\
        CT & VICGAE & 819 & 818 & 818 \\
        \bottomrule
    \end{tabular}
\end{table}


In [86]:
# Collect cross-validation test metrics for every
# (property, model, embedder, mode) combination into `metrics_rows`.
# Combinations without a cv_scores file get a NaN row and are counted in
# `invalid_entries`.
cleaned_model = 'xgboost'
# Mode names are derived from `cleaned_model` so the two cannot drift apart.
modes = [
    'default',
    f'default_cleaned_{cleaned_model}',
    'best_model',
    f'best_model_cleaned_{cleaned_model}',
]

nfold = 5
metrics_rows = []

invalid_entries = 0

# `data_dir` rather than `dir`: a top-level loop variable stays in the kernel
# namespace and would otherwise shadow the builtin `dir()` in later cells.
for dir_ind, data_dir in enumerate(processed_data_dirs):
    property_name = titles[dir_ind]
    for model_ind, model in enumerate(models):
        model_name = models_labels[model_ind]
        for embedder_ind, embedder in enumerate(embeddings_dirname):
            embedder_name = embeddings_names[embedder_ind]
            for mode in modes:
                # Include the mode so consecutive log lines are distinguishable.
                print(f"Processing {property_name}: {model_name} - {embedder_name} ({mode})")

                pre_trained_filename = f'{model}_{embedder}_pretrained_model_{mode}'
                loc = data_dir / f'pretrained_models/{model}/{embedder}/{mode}/'

                cv_scores_file = loc / f'{pre_trained_filename}.cv_scores.json'
                if not cv_scores_file.exists():
                    print(f"Skipping {cv_scores_file.name}\n")
                    metrics_row = [property_name, model_name, embedder_name, mode, np.nan, np.nan, np.nan]
                    metrics_rows.append(metrics_row)
                    invalid_entries += 1
                    continue

                # Context manager closes the file handle deterministically.
                with open(cv_scores_file, 'r') as f:
                    cv_scores_info = json.load(f)
                # Scores are looked up under the fold count as a string key.
                cv_scores = cv_scores_info[f'{nfold}']

                # get the mean and std of the test scores, rendered by sigfig
                # as a single string such as "0.67(3)"
                test_scores = cv_scores['test']
                r2, rmse, mae = (
                    sigfig.round(
                        test_scores[metric]['mean'],
                        test_scores[metric]['std'],
                        sep="external_brackets",
                    )
                    for metric in ('r2', 'rmse', 'mae')
                )

                print(f"R2: {r2}, RMSE: {rmse}, MAE: {mae}")
                metrics_row = [property_name, model_name, embedder_name, mode, r2, rmse, mae]
                metrics_rows.append(metrics_row)
                print()
            print()
        print()

    print("#".center(80, '#') + '\n')

print(f"Invalid entries: {invalid_entries}")

Processing MP: GBR - Mol2Vec
R2: 0.67(3), RMSE: 59(3), MAE: 45(2)

Processing MP: GBR - Mol2Vec
R2: 0.83(1), RMSE: 39(1), MAE: 31(1)

Processing MP: GBR - Mol2Vec
R2: 0.71(4), RMSE: 54(4), MAE: 40(2)

Processing MP: GBR - Mol2Vec
R2: 0.87(1), RMSE: 35(2), MAE: 27(1)


Processing MP: GBR - VICGAE
R2: 0.56(3), RMSE: 67(2), MAE: 52(1)

Processing MP: GBR - VICGAE
R2: 0.72(1), RMSE: 49(2), MAE: 39(1)

Processing MP: GBR - VICGAE
R2: 0.63(3), RMSE: 61(3), MAE: 47(2)

Processing MP: GBR - VICGAE
R2: 0.80(1), RMSE: 42(2), MAE: 33.1(9)



Processing MP: CatBoost - Mol2Vec
R2: 0.71(4), RMSE: 54(4), MAE: 40(2)

Processing MP: CatBoost - Mol2Vec
R2: 0.83(1), RMSE: 40(2), MAE: 31(1)

Processing MP: CatBoost - Mol2Vec
R2: 0.69(4), RMSE: 56(4), MAE: 42(2)

Processing MP: CatBoost - Mol2Vec
R2: 0.854(9), RMSE: 36(1), MAE: 28(1)


Processing MP: CatBoost - VICGAE
R2: 0.64(3), RMSE: 61(2), MAE: 46(1)

Processing MP: CatBoost - VICGAE
R2: 0.80(1), RMSE: 42(1), MAE: 33.1(7)

Processing MP: CatBoost - VIC

In [88]:
# Turn the collected metric rows into a table, save it as CSV, and display it.
metrics_columns = ["Property", "Model", "Embedder", "Mode", "R2", "RMSE", "MAE"]
metrics_df = pd.DataFrame(data=metrics_rows, columns=metrics_columns)
metrics_df.to_csv(root_loc / 'results/metrics.csv', index=False)
metrics_df

Unnamed: 0,Property,Model,Embedder,Mode,R2,RMSE,MAE
0,MP,GBR,Mol2Vec,default,0.67(3),59(3),45(2)
1,MP,GBR,Mol2Vec,default_cleaned_xgboost,0.83(1),39(1),31(1)
2,MP,GBR,Mol2Vec,best_model,0.71(4),54(4),40(2)
3,MP,GBR,Mol2Vec,best_model_cleaned_xgboost,0.87(1),35(2),27(1)
4,MP,GBR,VICGAE,default,0.56(3),67(2),52(1)
...,...,...,...,...,...,...,...
155,CT,LGBM,Mol2Vec,best_model_cleaned_xgboost,0.93(1),32(4),23(3)
156,CT,LGBM,VICGAE,default,0.86(2),45(3),33(3)
157,CT,LGBM,VICGAE,default_cleaned_xgboost,0.86(2),45(3),33(3)
158,CT,LGBM,VICGAE,best_model,0.86(1),46(2),34(2)
