In [1]:
from pathlib import Path as pt
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from cleanlab.regression.learn import CleanLearning
from load_data import processed_data_dirs, property_names, property_units, titles

In [2]:
# Directory that holds the metrics output read by this notebook.
# NOTE(review): absolute, machine-specific path - this cell only runs where that directory exists.
metrics_loc = pt('/Users/aravindhnivas/Documents/ML-properties/[PHYSICAL CONSTANTS OF ORGANIC COMPOUNDS]/vp_kPa_25C_filtered_ydata_processed_data/analysis_data/filtered/vp_kPa_25C_topelements_processed_data/metrics')
all_metrics_csv = metrics_loc/'all_metrics.csv'
# The first CSV column is used as the row index of the frame.
metrics_df = pd.read_csv(all_metrics_csv, index_col=0)
metrics_df

Unnamed: 0,model,Mode,Embedder,Data shape,R2,MSE,RMSE,MAE
0,xgboost,best_model,VICGAE,398 x 32,-0.1(4),15(5),3.8(6),2.8(5)
1,xgboost,best_model: cleaned_xgboost,VICGAE,323 x 32,0.1(4),6(3),2.5(6),1.8(4)
2,xgboost,best_model: cleaned_xgboost: yeo_johnson_Stand...,VICGAE,323 x 32,0.3(2),5(3),2.3(6),1.6(3)
3,xgboost,best_model: cleaned_xgboost: StandardScaler,VICGAE,323 x 32,0.1(4),6(3),2.5(6),1.8(4)
4,xgboost,best_model: cleaned_xgboost: yeo_johnson_Robus...,VICGAE,323 x 32,0.3(2),5(3),2.3(6),1.6(3)
...,...,...,...,...,...,...,...,...
187,lgbm,default: yeo_johnson_StandardScaler,mol2vec,398 x 300,0.2(2),11(4),3.3(5),2.2(4)
188,lgbm,default: StandardScaler,mol2vec,398 x 300,0.2(2),10(3),3.2(5),2.4(3)
189,lgbm,default: yeo_johnson_RobustScaler,mol2vec,398 x 300,0.2(2),11(4),3.3(5),2.2(4)
190,lgbm,default: yeo_johnson,mol2vec,398 x 300,0.2(2),11(4),3.3(5),2.2(4)


In [92]:
from uncertainties import ufloat_fromstr, ufloat
# Scratch experiment: compare ufloat_fromstr with a hand-rolled parse of the
# 'nominal(uncertainty)' shorthand.
value = '0.3(1)'
print(ufloat_fromstr(value))

# Strip the brackets so the string splits into [nominal, uncertainty].
parts = value.replace(')', '').replace('(', ' ').split()
print(parts)

# Power of ten matching the text after the last '.' of the nominal value.
scale = 10 ** (-len(parts[0].rpartition('.')[2]))
if len(parts) == 2:
    nominal = float(parts[0])
    uncertainty = float(parts[1]) * scale
    print(ufloat(nominal, uncertainty))

scale

0.30+/-0.10
['0.3', '1']
0.30+/-0.10


0.1

In [9]:
import pandas as pd
from uncertainties import ufloat_fromstr, ufloat
import uncertainties as unc

def parse_metric_with_uncertainty(value: str):
    """Parse a metric written as ``'0.3(2)'`` into a ``ufloat``.

    Strings are first handed to ``ufloat_fromstr``. If that raises, the
    exception is printed and the string is parsed manually as
    ``nominal(uncertainty)``:

    * an uncertainty without a decimal point applies to the last digits of
      the nominal value (``'0.3(2)'`` -> 0.3 +/- 0.2, ``'15(5)'`` -> 15 +/- 5);
    * an uncertainty that contains a decimal point is taken literally
      (``'12.3(1.2)'`` -> 12.3 +/- 1.2).

    Anything else (non-string input, or a string that does not split into
    exactly two parts) is converted with ``float`` and given zero uncertainty.

    Args:
        value: Metric text such as ``'0.3(2)'``, or a plain number.

    Returns:
        The value built by ``ufloat`` / ``ufloat_fromstr``.

    Raises:
        ValueError: If the nominal or uncertainty text cannot be converted by
            ``float``.
    """
    if isinstance(value, str):
        try:
            # Try parsing as a direct ufloat string
            return ufloat_fromstr(value)
        except Exception as e:
            print(e)
            # If that fails, try manual parsing
            parts = value.replace('(', ' ').replace(')', '').split()
            if len(parts) == 2:
                nominal_text, uncertainty_text = parts
                nominal = float(nominal_text)
                if '.' in uncertainty_text:
                    # Fully explicit uncertainty: use it as written.
                    uncertainty = float(uncertainty_text)
                else:
                    # Count only the digits after the decimal point; an integer
                    # nominal such as '15' has none, so the scale factor is 1.
                    decimals = nominal_text.partition('.')[2]
                    uncertainty = float(uncertainty_text) * 10 ** (-len(decimals))
                return ufloat(nominal, uncertainty)
    return ufloat(float(value), 0)

def analyze_best_metrics(df: pd.DataFrame):
    """Analyze and find best performing models across different metrics.

    The 'R2', 'MSE', 'RMSE' and 'MAE' columns of ``df`` are parsed with
    ``ufloat_fromstr`` and every comparison uses the parsed ``nominal_value``
    only. R2 is treated as higher-is-better, the other three as
    lower-is-better.

    Side effect: the parsed values are written back to ``df`` as
    ``'<metric>_value'`` columns.

    Args:
        df: Frame with 'model', 'Mode', 'Embedder' and the four metric columns.

    Returns:
        dict with four entries:
            'best_models': metric -> frame with the five best rows.
            'model_performance': model -> {'best_R2', 'best_MSE', 'best_RMSE', 'best_MAE'}.
            'embedder_performance': embedder -> same four keys.
            'model_embedder_performance': list of dicts, one per non-empty
                (model, embedder) pair, holding the best value of each metric
                and the 'Mode' of the row it came from.
    """
    metrics = ['R2', 'MSE', 'RMSE', 'MAE']
    higher_is_better = {'R2': True, 'MSE': False, 'RMSE': False, 'MAE': False}

    def nominal(parsed):
        return parsed.nominal_value

    # Convert metrics to ufloat values (stored on the caller's frame).
    for metric in metrics:
        df[f'{metric}_value'] = df[metric].apply(ufloat_fromstr)

    # Five best rows per metric, ranked by nominal value.
    best_models = {}
    for metric in metrics:
        value_col = f'{metric}_value'
        sorted_df = df.sort_values(
            by=value_col,
            key=lambda column: column.apply(nominal),
            ascending=not higher_is_better[metric],
        )
        best_models[metric] = sorted_df.head(5)[['model', 'Mode', 'Embedder', metric, value_col]]

    def best_per_group(group_col):
        """Best value of every metric for each distinct entry of ``group_col``."""
        summary = {}
        for group in df[group_col].unique():
            group_rows = df[df[group_col] == group]
            summary[group] = {
                f'best_{metric}': (max if higher_is_better[metric] else min)(
                    group_rows[f'{metric}_value'], key=nominal
                )
                for metric in metrics
            }
        return summary

    model_performance = best_per_group('model')
    embedder_performance = best_per_group('Embedder')

    # Model-embedder performance: best value of each metric, and the Mode that
    # produced it, for every (model, embedder) pair present in df.
    # This section reads the `df` argument, not a notebook-level global.
    model_embedder_performance = []
    for model_type in df['model'].unique():
        for embedder in df['Embedder'].unique():
            data = df[(df['model'] == model_type) & (df['Embedder'] == embedder)]
            if len(data) == 0:
                continue

            row_data = {'model': model_type, 'embedder': embedder}
            for metric in metrics:
                value_col = f'{metric}_value'
                nominals = data[value_col].apply(nominal)
                best_idx = nominals.idxmax() if higher_is_better[metric] else nominals.idxmin()
                row_data[f'best_{metric}'] = data.loc[best_idx, value_col]
                row_data[f'{metric}_mode'] = data.loc[best_idx, 'Mode']
            model_embedder_performance.append(row_data)

    return {
        'best_models': best_models,
        'model_performance': model_performance,
        'embedder_performance': embedder_performance,
        'model_embedder_performance': model_embedder_performance
    }

def format_results(results):
    """Render the analysis results as a plain-text report.

    Args:
        results: Mapping with 'best_models' (metric -> DataFrame),
            'model_performance' and 'embedder_performance' (name -> dict with
            'best_R2', 'best_MSE', 'best_RMSE', 'best_MAE').

    Returns:
        One string: a top-models section followed by the model-type and
        embedder sections, joined with newlines.
    """
    metric_names = ('R2', 'MSE', 'RMSE', 'MAE')

    def performance_section(heading, table):
        # Heading, then one titled group of four "best=" lines per entry.
        lines = [heading]
        for label, perf in table.items():
            lines.append(f"\n{label}:")
            lines.extend(f"  {name}: best={perf['best_' + name]}" for name in metric_names)
        return lines

    report = ["=== TOP 5 MODELS BY METRIC ==="]
    for metric, top_rows in results['best_models'].items():
        value_col = f'{metric}_value'
        report.append(f"\nBest {metric}:")
        for _, entry in top_rows.iterrows():
            report.append(
                f"  {entry['model']} ({entry['Mode']}, {entry['Embedder']}): {entry[value_col]}"
            )

    report += performance_section("\n=== MODEL TYPE PERFORMANCE ===", results['model_performance'])
    report += performance_section("\n=== EMBEDDER PERFORMANCE ===", results['embedder_performance'])

    return "\n".join(report)

# Run the analysis on the metrics table loaded earlier in the notebook.
# analyze_best_metrics also writes the parsed '<metric>_value' columns into metrics_df.
best_metrics_results = analyze_best_metrics(metrics_df)

# Print formatted results
print(format_results(best_metrics_results))


=== TOP 5 MODELS BY METRIC ===

Best R2:
  lgbm (default: cleaned_xgboost: yeo_johnson_StandardScaler, VICGAE): 0.40+/-0.20
  lgbm (default: cleaned_xgboost, VICGAE): 0.40+/-0.20
  gbr (default: cleaned_xgboost: yeo_johnson_RobustScaler, VICGAE): 0.40+/-0.20
  gbr (default: cleaned_xgboost: StandardScaler, VICGAE): 0.40+/-0.20
  gbr (default: cleaned_xgboost: yeo_johnson_StandardScaler, VICGAE): 0.40+/-0.20

Best MSE:
  gbr (best_model: cleaned_xgboost: RobustScaler, VICGAE): 4.0+/-2.0
  gbr (best_model: cleaned_xgboost, VICGAE): 4.0+/-2.0
  gbr (best_model: cleaned_xgboost: yeo_johnson_RobustScaler, VICGAE): 4.0+/-3.0
  gbr (default: cleaned_xgboost: yeo_johnson_StandardScaler, VICGAE): 5.0+/-3.0
  lgbm (default: cleaned_xgboost: yeo_johnson, VICGAE): 5.0+/-2.0

Best RMSE:
  gbr (best_model: cleaned_xgboost: yeo_johnson_RobustScaler, VICGAE): 2.0+/-0.6
  gbr (best_model: cleaned_xgboost, VICGAE): 2.1+/-0.5
  lgbm (default: cleaned_xgboost: yeo_johnson, VICGAE): 2.1+/-0.5
  lgbm (defau

In [10]:
# Tabulate the per-(model, embedder) records: each dict in the list becomes one row.
performance_by_embedder_df = pd.DataFrame(best_metrics_results['model_embedder_performance'])
performance_by_embedder_df

Unnamed: 0,model,embedder,best_R2,R2_mode,best_MSE,MSE_mode,best_RMSE,RMSE_mode,best_MAE,MAE_mode
0,xgboost,VICGAE,0.40+/-0.10,default: cleaned_xgboost: yeo_johnson_Standard...,5.0+/-3.0,best_model: cleaned_xgboost: yeo_johnson_Stand...,2.1+/-0.6,default: cleaned_xgboost: yeo_johnson_Standard...,1.5+/-0.4,default: cleaned_xgboost: yeo_johnson_Standard...
1,xgboost,mol2vec,0.24+/-0.04,default: cleaned_xgboost: yeo_johnson_Standard...,6.0+/-1.0,best_model: cleaned_xgboost,2.50+/-0.20,best_model: cleaned_xgboost,1.70+/-0.20,best_model: cleaned_xgboost: yeo_johnson_Stand...
2,catboost,VICGAE,0.40+/-0.20,default: cleaned_xgboost,5.0+/-2.0,default: cleaned_xgboost,2.1+/-0.5,default: cleaned_xgboost,1.50+/-0.30,default: cleaned_xgboost
3,catboost,mol2vec,0.32+/-0.07,default: cleaned_xgboost,5.4+/-0.4,best_model: cleaned_xgboost,2.30+/-0.20,default: cleaned_xgboost,1.60+/-0.10,best_model: cleaned_xgboost: yeo_johnson_Stand...
4,gbr,VICGAE,0.40+/-0.20,best_model: cleaned_xgboost,4.0+/-2.0,best_model: cleaned_xgboost,2.0+/-0.6,best_model: cleaned_xgboost: yeo_johnson_Robus...,1.4+/-0.4,best_model: cleaned_xgboost: yeo_johnson_Robus...
5,gbr,mol2vec,0.36+/-0.09,best_model: cleaned_xgboost: yeo_johnson_Stand...,5.0+/-1.0,best_model: cleaned_xgboost: StandardScaler,2.30+/-0.10,best_model: cleaned_xgboost,1.60+/-0.20,best_model: cleaned_xgboost: yeo_johnson_Stand...
6,lgbm,VICGAE,0.40+/-0.20,default: cleaned_xgboost,5.0+/-2.0,default: cleaned_xgboost,2.1+/-0.5,default: cleaned_xgboost,1.50+/-0.30,default: cleaned_xgboost
7,lgbm,mol2vec,0.30+/-0.20,best_model: cleaned_xgboost,5.4+/-0.4,default: cleaned_xgboost,2.32+/-0.08,default: cleaned_xgboost,1.60+/-0.20,default: cleaned_xgboost: yeo_johnson_Standard...


In [67]:
# Metric columns whose text is parsed below.
metrics = ['R2', 'MSE', 'RMSE', 'MAE']
for metric in metrics:
    # Store the parsed value next to the raw text as '<metric>_value' (modifies metrics_df in place).
    metrics_df[f'{metric}_value'] = metrics_df[metric].apply(parse_metric_with_uncertainty)

metrics_df

Unnamed: 0,model,Mode,Embedder,Data shape,R2,MSE,RMSE,MAE,R2_value,MSE_value,RMSE_value,MAE_value
0,xgboost,best_model,VICGAE,398 x 32,-0.1(4),15(5),3.8(6),2.8(5),-0.1+/-0.4,15+/-5,3.8+/-0.6,2.8+/-0.5
1,xgboost,best_model: cleaned_xgboost,VICGAE,323 x 32,0.1(4),6(3),2.5(6),1.8(4),0.1+/-0.4,6.0+/-3.0,2.5+/-0.6,1.8+/-0.4
2,xgboost,best_model: cleaned_xgboost: yeo_johnson_Stand...,VICGAE,323 x 32,0.3(2),5(3),2.3(6),1.6(3),0.30+/-0.20,5.0+/-3.0,2.3+/-0.6,1.60+/-0.30
3,xgboost,best_model: cleaned_xgboost: StandardScaler,VICGAE,323 x 32,0.1(4),6(3),2.5(6),1.8(4),0.1+/-0.4,6.0+/-3.0,2.5+/-0.6,1.8+/-0.4
4,xgboost,best_model: cleaned_xgboost: yeo_johnson_Robus...,VICGAE,323 x 32,0.3(2),5(3),2.3(6),1.6(3),0.30+/-0.20,5.0+/-3.0,2.3+/-0.6,1.60+/-0.30
...,...,...,...,...,...,...,...,...,...,...,...,...
187,lgbm,default: yeo_johnson_StandardScaler,mol2vec,398 x 300,0.2(2),11(4),3.3(5),2.2(4),0.20+/-0.20,11+/-4,3.3+/-0.5,2.2+/-0.4
188,lgbm,default: StandardScaler,mol2vec,398 x 300,0.2(2),10(3),3.2(5),2.4(3),0.20+/-0.20,10.0+/-3.0,3.2+/-0.5,2.40+/-0.30
189,lgbm,default: yeo_johnson_RobustScaler,mol2vec,398 x 300,0.2(2),11(4),3.3(5),2.2(4),0.20+/-0.20,11+/-4,3.3+/-0.5,2.2+/-0.4
190,lgbm,default: yeo_johnson,mol2vec,398 x 300,0.2(2),11(4),3.3(5),2.2(4),0.20+/-0.20,11+/-4,3.3+/-0.5,2.2+/-0.4


In [122]:
# Inspect the concrete type of the object returned by ufloat(...).
type(ufloat(0.3, 0.1))

uncertainties.core.Variable

In [134]:
import sigfig
from typing import Literal
def round_off(x: unc.core.Variable):
    """Format ``x`` through ``sigfig.round``.

    The nominal value and the standard deviation of ``x`` are passed as the
    first two positional arguments, with ``sep='external_brackets'``.
    """
    nominal, spread = x.nominal_value, x.std_dev
    return sigfig.round(nominal, spread, sep='external_brackets')
    
def get_best_metrics(df: pd.DataFrame, unique_name: Literal["model", "Embedder"]):
    """Summarise the best row of every metric for each distinct ``unique_name`` value.

    Rows are compared by the ``nominal_value`` of the ``'<metric>_value'``
    columns: the highest R2, and the lowest MSE, RMSE and MAE.

    Args:
        df: Frame with 'model', 'Embedder', 'Mode', the four raw metric
            columns and their ``'<metric>_value'`` counterparts.
        unique_name: Column to group by.

    Returns:
        DataFrame with one row per distinct ``unique_name`` value. For every
        metric it holds the raw metric text, the 'Mode', and the value of the
        other grouping column taken from the winning row.
    """
    if unique_name == "Embedder":
        other_name = "model"
    else:
        other_name = "Embedder"

    # (metric, Series method that selects the winning index label)
    selectors = (
        ("R2", "idxmax"),
        ("MSE", "idxmin"),
        ("RMSE", "idxmin"),
        ("MAE", "idxmin"),
    )

    records = []
    for group_key in df[unique_name].unique():
        subset = df[df[unique_name] == group_key]

        record = {unique_name: group_key}
        for metric, picker in selectors:
            nominals = subset[f"{metric}_value"].apply(lambda entry: entry.nominal_value)
            winner = subset.loc[getattr(nominals, picker)()]
            record[f"best_{metric}"] = winner[metric]
            record[f"best_{metric}_mode"] = winner["Mode"]
            record[f"best_{metric}_{other_name}"] = winner[other_name]
        records.append(record)

    return pd.DataFrame(records)

# Best entry per model type. get_best_metrics reads the '<metric>_value' columns,
# so metrics_df must already contain them when this cell runs.
performance_df = get_best_metrics(metrics_df, "model")
performance_df

Unnamed: 0,model,best_R2,best_R2_mode,best_R2_Embedder,best_MSE,best_MSE_mode,best_MSE_Embedder,best_RMSE,best_RMSE_mode,best_RMSE_Embedder,best_MAE,best_MAE_mode,best_MAE_Embedder
0,xgboost,0.4(1),default: cleaned_xgboost: yeo_johnson_Standard...,VICGAE,5(3),best_model: cleaned_xgboost: yeo_johnson_Stand...,VICGAE,2.1(6),default: cleaned_xgboost: yeo_johnson_Standard...,VICGAE,1.5(4),default: cleaned_xgboost: yeo_johnson_Standard...,VICGAE
1,catboost,0.4(2),default: cleaned_xgboost,VICGAE,5(2),default: cleaned_xgboost,VICGAE,2.1(5),default: cleaned_xgboost,VICGAE,1.5(3),default: cleaned_xgboost,VICGAE
2,gbr,0.4(2),best_model: cleaned_xgboost,VICGAE,4(2),best_model: cleaned_xgboost,VICGAE,2.0(6),best_model: cleaned_xgboost: yeo_johnson_Robus...,VICGAE,1.4(4),best_model: cleaned_xgboost: yeo_johnson_Robus...,VICGAE
3,lgbm,0.4(2),default: cleaned_xgboost,VICGAE,5(2),default: cleaned_xgboost,VICGAE,2.1(5),default: cleaned_xgboost,VICGAE,1.5(3),default: cleaned_xgboost,VICGAE


In [76]:
best_models = {}
for metric in metrics:
    # Rank by nominal value: R2 from highest to lowest, MSE / RMSE / MAE from
    # lowest to highest.
    sorted_df = metrics_df.sort_values(
        by=f'{metric}_value',
        key=lambda column: [entry.nominal_value for entry in column],
        ascending=(metric != 'R2'),
    )
    # Keep the five leading rows and only the columns needed to identify them.
    best_models[metric] = sorted_df.head(5)[['model', 'Mode', 'Embedder', metric, f'{metric}_value']]

In [80]:
# Five rows with the lowest nominal MAE (sorted ascending when best_models was built).
best_models['MAE']

Unnamed: 0,model,Mode,Embedder,MAE,MAE_value
100,gbr,best_model: cleaned_xgboost: yeo_johnson_Robus...,VICGAE,1.4(4),1.4+/-0.4
65,catboost,default: cleaned_xgboost: yeo_johnson,VICGAE,1.5(4),1.5+/-0.4
157,lgbm,default: cleaned_xgboost,VICGAE,1.5(3),1.50+/-0.30
110,gbr,default: cleaned_xgboost: yeo_johnson_Standard...,VICGAE,1.5(3),1.50+/-0.30
158,lgbm,default: cleaned_xgboost: yeo_johnson_Standard...,VICGAE,1.5(3),1.50+/-0.30


In [265]:
# Select which property (position in the load_data lists) and which embedding to use.
ind = 0
embeddings = 'VICGAE'
# embeddings = 'mol2vec'
property_name, property_unit, title = property_names[ind], property_units[ind], titles[ind]
property_name_with_unit = f'{property_name} ({property_unit})'
print(property_name_with_unit, title)

# Input locations derived from the processed-data directory of that property.
current_dir = processed_data_dirs[ind]
fname = current_dir.name.replace('_processed_data', '')
csv_file = current_dir.parent / f'{fname}.csv'
vec_dir = current_dir / 'embedded_vectors'
vec_file = vec_dir / f'{embeddings}_embeddings.npy'
processed_vec_dir = vec_dir / f'processed_{embeddings}_embeddings'

# Report whether each expected input exists, followed by its name.
for path_to_check in (csv_file, vec_file, processed_vec_dir):
    print(path_to_check.exists(), path_to_check.name)

def process_data(csv_file, vec_file, y_column='Processed tmp/ºC'):
    """Combine the property CSV with its embedding matrix and drop unusable rows.

    The CSV is read with its 'INDEX' column as index, and row i of the array
    in ``vec_file`` is paired with row i of the CSV. A row is dropped when all
    of its feature values are zero, or when its target is NaN or infinite
    after numeric coercion. Summary counts are printed.

    Args:
        csv_file: CSV containing 'INDEX', 'SMILES' and the target column.
        vec_file: ``.npy`` file holding a 2-D array with one row per CSV row.
        y_column: Name of the CSV column used as target ``y``. The default
            keeps the column this notebook was written against.

    Returns:
        DataFrame indexed like the CSV with columns 'SMILES', 'y' and one
        string-named column ('0', '1', ...) per feature, restricted to the
        rows that passed both filters.

    Raises:
        ValueError: If the array and the CSV have different numbers of rows.
        KeyError: If 'SMILES' or ``y_column`` is missing from the CSV.
    """
    # Load data
    df = pd.read_csv(csv_file, index_col='INDEX')
    # NOTE(review): allow_pickle=True lets np.load unpickle object arrays, which can
    # execute arbitrary code - only point this at trusted files.
    X = np.load(vec_file, allow_pickle=True)

    # Create DataFrame with feature columns
    feature_cols = [str(i) for i in range(X.shape[1])]
    if X.shape[0] != len(df):
        # Rows are paired by position, so a count mismatch means the two files
        # do not belong together.
        raise ValueError(
            f"Row count mismatch: {vec_file} has {X.shape[0]} rows, {csv_file} has {len(df)}"
        )
    data_df = pd.DataFrame(X, index=df.index, columns=feature_cols)

    # Add SMILES and y columns; unparseable targets become NaN and are filtered below.
    data_df.loc[:, 'SMILES'] = df['SMILES']
    data_df.loc[:, 'y'] = pd.to_numeric(df[y_column], errors='coerce')

    # Reorder columns
    cols_order = ['SMILES', 'y'] + feature_cols
    data_df = data_df[cols_order]

    # Filter data efficiently using numpy operations
    features = data_df.iloc[:, 2:].to_numpy()
    y_values = data_df['y'].to_numpy()

    # Create masks
    non_zero_mask = np.any(features != 0, axis=1)
    valid_y_mask = ~(np.isnan(y_values) | np.isinf(y_values))
    final_mask = non_zero_mask & valid_y_mask

    # Apply final filtering; copy so the result is independent of data_df.
    final_df = data_df[final_mask].copy()

    # Print statistics
    print(f"Original number of rows: {len(data_df)}")
    print(f"Rows removed due to all-zero features: {np.sum(~non_zero_mask)}")
    print(f"Rows removed due to invalid y values: {np.sum(~valid_y_mask)}")
    print(f"Final number of rows: {len(final_df)}")

    return final_df

# Build the filtered table: SMILES, y, then one column per embedding dimension.
final_df = process_data(csv_file, vec_file)

# Ensure all column names are strings
final_df.columns = [str(col) if not isinstance(col, str) else col for col in final_df.columns]
final_df

Melting Point (K) MP
True tmpC_topelements.csv
True VICGAE_embeddings.npy
True processed_VICGAE_embeddings
Original number of rows: 7476
Rows removed due to all-zero features: 276
Rows removed due to invalid y values: 0
Final number of rows: 7200


Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,22,23,24,25,26,27,28,29,30,31
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,COP(=S)(OC)Oc1ccc(Sc2ccc(OP(=S)(OC)OC)cc2)cc1,31.6,-0.098059,-0.047184,0.106616,-0.027322,-0.093619,0.066488,-0.064736,-0.111412,...,-0.126450,0.195126,-0.151011,0.005076,-0.020991,0.256710,0.129438,-0.206446,0.233008,-0.077102
1,CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1,173.5,-0.024240,-0.144994,0.199108,0.072760,-0.184217,0.134512,0.053255,0.059104,...,-0.162371,0.178511,-0.228626,-0.168337,-0.140319,0.233482,0.153210,-0.113846,0.198516,-0.038174
2,CC(C=CC1(O)C(C)=CC(=O)CC1(C)C)=CC(=O)O,160.0,-0.027528,-0.102735,0.150704,0.037579,-0.147735,0.115634,0.028802,-0.005540,...,-0.150653,0.151073,-0.144016,-0.136821,-0.122729,0.215734,0.169227,-0.148617,0.206037,-0.014154
3,COc1ccc(-c2cc(=O)c3c(O)cc(O)cc3o2)cc1,263.0,-0.072301,-0.007497,0.101740,-0.017365,-0.115630,0.072632,0.007010,-0.054559,...,-0.040294,0.202049,-0.147742,-0.097413,-0.068532,0.192383,0.185914,-0.184456,0.168783,-0.089093
4,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1,121.0,-0.020509,-0.202147,0.106685,0.065656,-0.172309,0.090950,0.112530,-0.041245,...,-0.180756,0.163883,-0.202998,-0.112563,-0.087562,0.213093,0.216535,-0.087852,0.252828,0.012640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7634,OC1O[C@H](CO[C@@H]2OC[C@@H](O)[C@H](O)[C@H]2O)...,210.0,0.020207,0.053076,0.150216,0.050146,-0.207151,0.176501,-0.088642,0.080877,...,-0.056266,0.128084,-0.179547,-0.127651,-0.009876,0.025442,0.097077,-0.049156,0.214880,-0.172932
7635,CNC(=O)Oc1cc(C)cc(C)c1,99.0,-0.032345,-0.083990,0.064483,0.019927,-0.075109,0.025857,0.043563,-0.063878,...,-0.074251,0.091374,-0.106849,-0.054253,-0.043797,0.108406,0.118283,-0.079988,0.130430,0.003424
7636,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21,241.0,-0.032777,-0.116827,0.146305,0.010973,-0.127730,0.124864,0.011101,-0.016484,...,-0.152184,0.188292,-0.212106,-0.128266,-0.117496,0.240508,0.158816,-0.204683,0.211354,-0.036449
7638,CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c2C(=O)O1,164.0,-0.007161,-0.095031,0.162794,0.073035,-0.163656,0.110062,0.028735,-0.044958,...,-0.130072,0.174966,-0.199527,-0.122258,-0.132071,0.200796,0.169283,-0.179877,0.220272,-0.042694


In [275]:
# Recompute the row filters on final_df itself (same expressions as inside process_data).
# Columns from position 2 onwards are the features; 'y' is the target.
features = final_df.iloc[:, 2:].to_numpy()
y_values = final_df["y"].to_numpy()

# Create masks
non_zero_mask = np.any(features != 0, axis=1)  # True where a row has at least one non-zero feature
valid_y_mask = ~(np.isnan(y_values) | np.isinf(y_values))  # True where y is neither NaN nor infinite
final_mask = non_zero_mask & valid_y_mask

In [286]:
# Two equivalent ways of reading the number of rows.
len(final_df), final_df.shape[0]

(7200, 7200)

In [287]:
# Persist final_df as Parquet with snappy compression.
# The path is relative, so the file lands in the kernel's current working directory.
final_df.to_parquet('final_data.parquet', compression='snappy')

In [291]:
# read_df = pd.read_parquet('final_data.parquet')
# Load a processed table from the tmp_C data directory rather than the local file above.
# NOTE(review): absolute, machine-specific path.
read_df = pd.read_parquet('/Users/aravindhnivas/Documents/ML-properties/[PHYSICAL CONSTANTS OF ORGANIC COMPOUNDS]/tmp_C_processed_data/analysis_data/filtered/tmpC_topelements_processed_data/embedded_vectors/processed_VICGAE_embeddings/processed_df.parquet')
# Check that the index name survived the Parquet round trip.
print(read_df.index.name)
read_df

INDEX


Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,22,23,24,25,26,27,28,29,30,31
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,COP(=S)(OC)Oc1ccc(Sc2ccc(OP(=S)(OC)OC)cc2)cc1,31.6,-0.098059,-0.047184,0.106616,-0.027322,-0.093619,0.066488,-0.064736,-0.111412,...,-0.126450,0.195126,-0.151011,0.005076,-0.020991,0.256710,0.129438,-0.206446,0.233008,-0.077102
1,CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1,173.5,-0.024240,-0.144994,0.199108,0.072760,-0.184217,0.134512,0.053255,0.059104,...,-0.162371,0.178511,-0.228626,-0.168337,-0.140319,0.233482,0.153210,-0.113846,0.198516,-0.038174
2,CC(C=CC1(O)C(C)=CC(=O)CC1(C)C)=CC(=O)O,160.0,-0.027528,-0.102735,0.150704,0.037579,-0.147735,0.115634,0.028802,-0.005540,...,-0.150653,0.151073,-0.144016,-0.136821,-0.122729,0.215734,0.169227,-0.148617,0.206037,-0.014154
3,COc1ccc(-c2cc(=O)c3c(O)cc(O)cc3o2)cc1,263.0,-0.072301,-0.007497,0.101740,-0.017365,-0.115630,0.072632,0.007010,-0.054559,...,-0.040294,0.202049,-0.147742,-0.097413,-0.068532,0.192383,0.185914,-0.184456,0.168783,-0.089093
4,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1,121.0,-0.020509,-0.202147,0.106685,0.065656,-0.172309,0.090950,0.112530,-0.041245,...,-0.180756,0.163883,-0.202998,-0.112563,-0.087562,0.213093,0.216535,-0.087852,0.252828,0.012640
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7634,OC1O[C@H](CO[C@@H]2OC[C@@H](O)[C@H](O)[C@H]2O)...,210.0,0.020207,0.053076,0.150216,0.050146,-0.207151,0.176501,-0.088642,0.080877,...,-0.056266,0.128084,-0.179547,-0.127651,-0.009876,0.025442,0.097077,-0.049156,0.214880,-0.172932
7635,CNC(=O)Oc1cc(C)cc(C)c1,99.0,-0.032345,-0.083990,0.064483,0.019927,-0.075109,0.025857,0.043563,-0.063878,...,-0.074251,0.091374,-0.106849,-0.054253,-0.043797,0.108406,0.118283,-0.079988,0.130430,0.003424
7636,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21,241.0,-0.032777,-0.116827,0.146305,0.010973,-0.127730,0.124864,0.011101,-0.016484,...,-0.152184,0.188292,-0.212106,-0.128266,-0.117496,0.240508,0.158816,-0.204683,0.211354,-0.036449
7638,CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c2C(=O)O1,164.0,-0.007161,-0.095031,0.162794,0.073035,-0.163656,0.110062,0.028735,-0.044958,...,-0.130072,0.174966,-0.199527,-0.122258,-0.132071,0.200796,0.169283,-0.179877,0.220272,-0.042694


In [292]:
# Load the label-issue table stored next to the processed embeddings
# (file name: label_issues_xgboost.parquet).
# NOTE(review): absolute, machine-specific path.
label_issues_df = pd.read_parquet('/Users/aravindhnivas/Documents/ML-properties/[PHYSICAL CONSTANTS OF ORGANIC COMPOUNDS]/tmp_C_processed_data/analysis_data/filtered/tmpC_topelements_processed_data/embedded_vectors/processed_VICGAE_embeddings/label_issues_xgboost.parquet')
label_issues_df

Unnamed: 0_level_0,is_label_issue,label_quality,given_label,predicted_label
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,True,0.019530,31.6,178.806290
1,False,0.915968,173.5,170.546692
2,False,0.466650,160.0,134.655960
3,False,0.457391,263.0,225.751968
4,False,0.414707,121.0,151.042877
...,...,...,...,...
7634,False,0.614436,210.0,181.933868
7635,False,0.647124,99.0,84.992813
7636,False,0.139080,241.0,164.400925
7638,False,0.770702,164.0,153.737183


In [18]:
from xgboost import XGBRegressor

# Base regressor that CleanLearning wraps: silent logging, n_jobs=-1.
clean_model = XGBRegressor(verbosity=0, n_jobs=-1)
cl = CleanLearning(clean_model, verbose=True)

# Columns from position 2 onwards are passed as the feature matrix and the 'y'
# column as the target.
# NOTE(review): assumes the first two columns are the non-feature ones (SMILES, y).
cl.fit(
    X=final_df.iloc[:, 2:],
    y=final_df['y'],
)
# After fitting, the per-row diagnostics can be fetched with cl.get_label_issues().

Identifying label issues ...
Identified 468 examples with label issues.
Pruning 468 examples with label issues ...
Remaining clean data has 7008 examples.
Fitting final model on the clean data ...
Label issues stored in label_issues_df DataFrame accessible via: self.get_label_issues(). Call self.save_space() to delete this potentially large DataFrame attribute.


In [10]:
# Get label issues and give them the same index as `final_df`.
# Work on a copy so relabelling the index does not touch the frame returned by `cl`.
label_issues_df = cl.get_label_issues().copy()

# Verify positional alignment BEFORE overwriting the index. Once the index has been
# reassigned, comparing the two indices is trivially True and proves nothing, so the
# real safeguard is checking that `cl` was fitted on this exact `final_df`:
# same number of rows, and the labels it was given equal `final_df['y']` row by row.
assert len(label_issues_df) == len(final_df), (
    f"cl was fitted on {len(label_issues_df)} rows but final_df has {len(final_df)}; "
    "re-run cl.fit on the current final_df"
)
assert np.allclose(
    label_issues_df["given_label"].to_numpy(dtype=float),
    final_df["y"].to_numpy(dtype=float),
    equal_nan=True,
), "given_label does not match final_df['y'] row by row; cl was fitted on different data"

label_issues_df.index = final_df.index

# Informational summary (the index comparison is True by construction at this point).
print("Do indices match?", (label_issues_df.index == final_df.index).all())
print("final_df index shape:", final_df.index.shape)
print("label_issues_df index shape:", label_issues_df.index.shape)

# Now the boolean mask shares final_df's index and can be used to filter it.
final_df_cleaned = final_df[~label_issues_df["is_label_issue"]]
label_issues_df

Do indices match? True
final_df index shape: (7200,)
label_issues_df index shape: (7200,)


Unnamed: 0_level_0,is_label_issue,label_quality,given_label,predicted_label
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,True,0.044651,31.6,137.787941
1,False,0.423571,173.5,203.630417
2,True,0.089634,160.0,80.963783
3,False,0.162613,263.0,187.759750
4,False,0.868707,121.0,114.434525
...,...,...,...,...
7634,False,0.445912,210.0,262.916443
7635,False,0.947231,99.0,100.737709
7636,False,0.406718,241.0,214.745087
7638,False,0.918281,164.0,160.993088


In [264]:
# Keep only the rows that the label-issue table does not flag.
flagged_rows = label_issues_df["is_label_issue"]
final_df_cleaned = final_df[~flagged_rows]
final_df_cleaned

Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,COP(=S)(OC)Oc1ccc(Sc2ccc(OP(=S)(OC)OC)cc2)cc1,31.6,14.583024,7.004414,10.764027,9.659239,-7.881231,-12.125900,4.289826,11.829774,...,-2.557744,2.143950,-1.143839,7.093911,8.230412,2.932323,-9.742094,-3.931319,4.814355,0.262482
1,CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1,173.5,8.672269,3.382508,-1.009339,8.613342,-5.257068,-3.722235,-2.904521,3.233326,...,-3.215222,1.214921,0.065320,2.262887,3.039335,1.418446,-5.120859,0.569136,6.675544,-3.676702
2,CC(C=CC1(O)C(C)=CC(=O)CC1(C)C)=CC(=O)O,160.0,6.926352,4.612771,-0.524525,6.452645,-5.662361,-8.545525,-2.476228,6.148377,...,-2.327952,-0.310817,-2.341082,1.690612,1.047211,-0.751602,-5.519598,1.978377,6.315439,-2.911364
3,COc1ccc(-c2cc(=O)c3c(O)cc(O)cc3o2)cc1,263.0,9.478333,8.671986,2.309459,5.326269,-4.638491,-9.584875,-1.285632,10.793772,...,-4.620464,-0.298921,-1.304448,6.174148,6.492252,3.929846,-3.667210,-3.263721,3.714898,-0.117793
4,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1,121.0,11.625536,0.939922,3.890236,5.016088,-10.919663,-4.537029,-0.758402,7.000362,...,-3.590667,1.033413,-0.449358,0.507696,4.534042,0.110596,-8.913643,-0.294865,9.153279,-6.478106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7634,OC1O[C@H](CO[C@@H]2OC[C@@H](O)[C@H](O)[C@H]2O)...,210.0,9.415656,14.146734,-6.948192,4.933622,-0.930297,-9.007641,-5.025923,2.624626,...,-3.239574,6.539446,7.643373,4.224568,-0.641620,8.178635,-7.266513,-8.799261,2.400525,3.103480
7635,CNC(=O)Oc1cc(C)cc(C)c1,99.0,6.605989,1.137232,2.373102,3.327681,-5.042240,-4.029456,-0.581940,4.979243,...,-2.800613,0.174910,0.252637,1.030272,3.036746,1.474039,-4.084049,-2.257625,2.323892,-3.162244
7636,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21,241.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7638,CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c2C(=O)O1,164.0,9.669550,5.628396,0.379068,6.077866,-3.538999,-8.804146,-1.054745,8.215943,...,-5.421795,-0.008273,-3.303473,5.494559,5.693076,5.359212,-3.300048,-4.743765,3.138605,-2.784042


In [12]:
# Hold out 20% of the rows for testing.
# A fixed random_state makes the shuffle reproducible: the same rows land in the
# train/test sets on every re-run, so index-based filtering applied to
# `final_df_train` afterwards stays consistent across kernel restarts.
final_df_train, final_df_test = train_test_split(
    final_df,
    test_size=0.2,
    random_state=42,
)
final_df_train

Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,22,23,24,25,26,27,28,29,30,31
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7287,OCCOCCOCCO,-9.4,0.074590,0.001385,0.036988,0.070433,-0.064667,0.067474,0.018467,-0.036345,...,-0.070835,0.033864,-0.035366,-0.051272,-0.016264,0.063585,0.065975,-0.075633,0.062644,-0.015201
2612,S=c1nn[nH]n1-c1ccccc1,145.0,-0.085973,-0.027213,-0.024130,-0.087277,-0.014035,-0.008887,-0.014158,-0.049648,...,-0.007281,0.084851,-0.065530,0.005929,0.024622,0.072121,0.078100,-0.058612,0.072334,-0.028537
40,CC(=O)Nc1ccc(C(=O)O)cc1,256.5,-0.047981,-0.083447,0.069319,0.040202,-0.067987,0.037940,0.065622,-0.051146,...,-0.073453,0.074501,-0.097112,0.002115,-0.051941,0.112823,0.077744,-0.066154,0.131735,-0.002050
1813,COc1cc2c(cc1OC)C(=O)Cc1ccc3c(c1CN(C)CC2)OCO3,223.0,-0.049364,-0.132988,0.124386,0.009060,-0.171241,0.093667,0.022339,-0.099430,...,-0.117811,0.147807,-0.206406,-0.051481,-0.084663,0.235192,0.151422,-0.185062,0.243362,-0.073059
4872,Cc1cccc(N)c1,-30.8,-0.042286,-0.054319,0.000392,-0.035801,-0.030392,-0.020759,0.016465,-0.039812,...,-0.026600,0.054355,-0.058683,-0.009614,-0.005260,0.040889,0.060164,-0.042282,0.062481,-0.010888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5777,NCC(O)c1ccc(O)c(O)c1,217.0,-0.018433,-0.048278,0.024849,0.019308,-0.087372,0.021562,0.051280,-0.061216,...,-0.072510,0.084665,-0.104880,-0.071651,0.000109,0.097689,0.127243,-0.071670,0.117511,-0.032753
4597,CC(C)NCC(O)c1ccc(O)c(O)c1,170.5,-0.014037,-0.078677,0.047386,0.034270,-0.123350,0.038314,0.063532,-0.053550,...,-0.112239,0.106748,-0.132359,-0.103295,-0.017451,0.123556,0.160652,-0.080801,0.150009,-0.021998
2568,S=c1[nH]c2ccccc2[nH]1,316.3,-0.058087,0.012937,-0.002006,-0.064712,-0.027478,-0.000813,0.006489,-0.023106,...,0.012840,0.057007,-0.066623,-0.021001,0.004246,0.038827,0.053446,-0.076124,0.051081,-0.028906
4889,CNC(=O)c1ccccc1,74.3,-0.034717,-0.044686,0.041118,-0.018546,-0.048061,0.016626,0.022363,-0.050720,...,-0.029593,0.050489,-0.063695,0.006311,-0.035995,0.061604,0.056923,-0.079362,0.077946,0.005174


In [15]:
# Index labels of the rows that ended up in the training split.
train_indices = final_df_train.index

# Select the label-issue rows for exactly those labels (same order as the split).
train_label_issues = label_issues_df.loc[train_indices]

# Drop the training rows that are flagged as label issues.
flagged_in_train = train_label_issues["is_label_issue"]
final_df_train_cleaned = final_df_train[~flagged_in_train]
final_df_train_cleaned

Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,22,23,24,25,26,27,28,29,30,31
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7287,OCCOCCOCCO,-9.4,0.074590,0.001385,0.036988,0.070433,-0.064667,0.067474,0.018467,-0.036345,...,-0.070835,0.033864,-0.035366,-0.051272,-0.016264,0.063585,0.065975,-0.075633,0.062644,-0.015201
2612,S=c1nn[nH]n1-c1ccccc1,145.0,-0.085973,-0.027213,-0.024130,-0.087277,-0.014035,-0.008887,-0.014158,-0.049648,...,-0.007281,0.084851,-0.065530,0.005929,0.024622,0.072121,0.078100,-0.058612,0.072334,-0.028537
40,CC(=O)Nc1ccc(C(=O)O)cc1,256.5,-0.047981,-0.083447,0.069319,0.040202,-0.067987,0.037940,0.065622,-0.051146,...,-0.073453,0.074501,-0.097112,0.002115,-0.051941,0.112823,0.077744,-0.066154,0.131735,-0.002050
1813,COc1cc2c(cc1OC)C(=O)Cc1ccc3c(c1CN(C)CC2)OCO3,223.0,-0.049364,-0.132988,0.124386,0.009060,-0.171241,0.093667,0.022339,-0.099430,...,-0.117811,0.147807,-0.206406,-0.051481,-0.084663,0.235192,0.151422,-0.185062,0.243362,-0.073059
4872,Cc1cccc(N)c1,-30.8,-0.042286,-0.054319,0.000392,-0.035801,-0.030392,-0.020759,0.016465,-0.039812,...,-0.026600,0.054355,-0.058683,-0.009614,-0.005260,0.040889,0.060164,-0.042282,0.062481,-0.010888
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,Nc1cccc(N)c1,65.5,-0.042932,-0.067501,-0.015043,-0.044189,-0.018549,-0.022246,0.025539,-0.048912,...,-0.023625,0.045955,-0.062433,0.003940,0.005534,0.044699,0.061991,-0.038910,0.062748,-0.006981
5777,NCC(O)c1ccc(O)c(O)c1,217.0,-0.018433,-0.048278,0.024849,0.019308,-0.087372,0.021562,0.051280,-0.061216,...,-0.072510,0.084665,-0.104880,-0.071651,0.000109,0.097689,0.127243,-0.071670,0.117511,-0.032753
2568,S=c1[nH]c2ccccc2[nH]1,316.3,-0.058087,0.012937,-0.002006,-0.064712,-0.027478,-0.000813,0.006489,-0.023106,...,0.012840,0.057007,-0.066623,-0.021001,0.004246,0.038827,0.053446,-0.076124,0.051081,-0.028906
4889,CNC(=O)c1ccccc1,74.3,-0.034717,-0.044686,0.041118,-0.018546,-0.048061,0.016626,0.022363,-0.050720,...,-0.029593,0.050489,-0.063695,0.006311,-0.035995,0.061604,0.056923,-0.079362,0.077946,0.005174


In [263]:
# Disabled export: uncomment to write label_issues_df to 'label_issues.csv'.
# label_issues_df.to_csv('label_issues.csv')

In [188]:
# Read the label-issue table back from CSV, using its "INDEX" column as the row index.
label_issues_csv = 'label_issues.csv'
read_label_issues_df = pd.read_csv(
    label_issues_csv,
    index_col="INDEX",
)
read_label_issues_df

Unnamed: 0_level_0,is_label_issue,label_quality,given_label,predicted_label
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,0.440461,31.6,52.363441
1,False,0.269478,173.5,135.949295
2,False,0.537392,160.0,140.438766
3,False,0.662444,263.0,252.171234
4,False,0.705964,121.0,110.204659
...,...,...,...,...
7635,False,0.351817,99.0,69.412567
7636,False,0.344846,241.0,187.852722
7637,True,0.089272,302.0,227.395441
7638,False,0.371428,164.0,189.363968
