In [147]:
from pathlib import Path as pt

import numpy as np
import pandas as pd
from cleanlab.regression.learn import CleanLearning
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

from load_data import processed_data_dirs, property_names, property_units, titles

In [135]:
# --- Choose the property dataset and the embedding flavour -----------------
ind = 0
embeddings = 'mol2vec'
property_name = property_names[ind]
property_unit = property_units[ind]
title = titles[ind]
property_name_with_unit = f'{property_name} ({property_unit})'
# NOTE(review): for ind = 0 this prints the unit "K", while the target column
# read below is named 'Processed tmp/ºC' — confirm which unit y is really in.
print(property_name_with_unit, title)

# --- Resolve input locations -----------------------------------------------
current_dir = processed_data_dirs[ind]
fname = current_dir.name.replace('_processed_data', '')

csv_file = current_dir.parent / f'{fname}.csv'
vec_dir = current_dir / 'embedded_vectors'
vec_file = vec_dir / f'{embeddings}_embeddings.npy'

print(csv_file.exists(), csv_file.name)
print(vec_file.exists(), vec_file.name)

processed_vec_dir = vec_dir / f'processed_{embeddings}_embeddings'
print(processed_vec_dir.exists(), processed_vec_dir.name)

# --- Load the property table and the embedding matrix ----------------------
df = pd.read_csv(csv_file, index_col='INDEX')
print(df.columns, df.shape)

# (N, D) array where N is the number of samples and D is the dimensionality of the feature vectors
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects, so
# only trusted files should be loaded here; a plain numeric array would not
# need the flag — confirm how the .npy file was written.
X = np.load(vec_file, allow_pickle=True)

# --- Assemble one frame: SMILES, y, then the D embedding columns -----------
feature_cols = [str(i) for i in range(X.shape[1])]
data_df = pd.DataFrame(X, index=df.index, columns=feature_cols)
# Inserting at position 0 (y first, then SMILES) yields the final column order
# directly, so no re-ordering pass is needed afterwards.
data_df.insert(0, 'y', pd.to_numeric(df['Processed tmp/ºC'], errors='coerce'))
data_df.insert(0, 'SMILES', df['SMILES'])
cols_order = ['SMILES', 'y'] + feature_cols

# --- 1. Drop rows whose embedding is all zeros ------------------------------
non_zero_mask = ~(data_df[feature_cols] == 0).all(axis=1)
# .copy() makes the result an independent frame; writing to a bare mask slice
# is what raises pandas' SettingWithCopyWarning.
filtered_df = data_df.loc[non_zero_mask].copy()

# --- 2. Drop rows whose y is NaN or infinite --------------------------------
# y was already coerced to numeric above (unparseable entries became NaN), so
# it does not need to be converted a second time here.
valid_y_mask = ~(pd.isna(filtered_df['y']) | np.isinf(filtered_df['y']))
final_df = filtered_df.loc[valid_y_mask].copy()

# Print information about removed rows
print(f"Original number of rows: {len(data_df)}")
print(f"Rows removed due to all-zero features: {len(data_df) - len(filtered_df)}")
print(f"Rows removed due to invalid y values: {len(filtered_df) - len(final_df)}")
print(f"Final number of rows: {len(final_df)}")

# Every column label is already a str by construction ('SMILES', 'y' and
# str(i) for the features), so no label conversion is required.
final_df

Melting Point (K) MP
True tmpC_topelements.csv
True mol2vec_embeddings.npy
True processed_mol2vec_embeddings
Index(['CRC Index', 'SMILES', 'tmp/ºC', 'Processed tmp/ºC'], dtype='object') (7476, 4)
Original number of rows: 7476
Rows removed due to all-zero features: 0
Rows removed due to invalid y values: 0
Final number of rows: 7476


Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,COP(=S)(OC)Oc1ccc(Sc2ccc(OP(=S)(OC)OC)cc2)cc1,31.6,14.583024,7.004414,10.764027,9.659239,-7.881231,-12.125900,4.289826,11.829774,...,-2.557744,2.143950,-1.143839,7.093911,8.230412,2.932323,-9.742094,-3.931319,4.814355,0.262482
1,CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1,173.5,8.672269,3.382508,-1.009339,8.613342,-5.257068,-3.722235,-2.904521,3.233326,...,-3.215222,1.214921,0.065320,2.262887,3.039335,1.418446,-5.120859,0.569136,6.675544,-3.676702
2,CC(C=CC1(O)C(C)=CC(=O)CC1(C)C)=CC(=O)O,160.0,6.926352,4.612771,-0.524525,6.452645,-5.662361,-8.545525,-2.476228,6.148377,...,-2.327952,-0.310817,-2.341082,1.690612,1.047211,-0.751602,-5.519598,1.978377,6.315439,-2.911364
3,COc1ccc(-c2cc(=O)c3c(O)cc(O)cc3o2)cc1,263.0,9.478333,8.671986,2.309459,5.326269,-4.638491,-9.584875,-1.285632,10.793772,...,-4.620464,-0.298921,-1.304448,6.174148,6.492252,3.929846,-3.667210,-3.263721,3.714898,-0.117793
4,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1,121.0,11.625536,0.939922,3.890236,5.016088,-10.919663,-4.537029,-0.758402,7.000362,...,-3.590667,1.033413,-0.449358,0.507696,4.534042,0.110596,-8.913643,-0.294865,9.153279,-6.478106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7635,CNC(=O)Oc1cc(C)cc(C)c1,99.0,6.605989,1.137232,2.373102,3.327681,-5.042240,-4.029456,-0.581940,4.979243,...,-2.800613,0.174910,0.252637,1.030272,3.036746,1.474039,-4.084049,-2.257625,2.323892,-3.162244
7636,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21,241.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7637,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21.Cl,302.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7638,CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c2C(=O)O1,164.0,9.669550,5.628396,0.379068,6.077866,-3.538999,-8.804146,-1.054745,8.215943,...,-5.421795,-0.008273,-3.303473,5.494559,5.693076,5.359212,-3.300048,-4.743765,3.138605,-2.784042


In [122]:
# Cache the cleaned table as snappy-compressed Parquet.
# Unlike CSV, Parquet stores the column dtypes alongside the data.
final_df.to_parquet(
    'final_data.parquet',
    compression='snappy',
)

In [123]:
# Load the Parquet cache back in to inspect what was written.
read_df = pd.read_parquet(path='final_data.parquet')
read_df

Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,COP(=S)(OC)Oc1ccc(Sc2ccc(OP(=S)(OC)OC)cc2)cc1,31.6,14.583024,7.004414,10.764027,9.659239,-7.881231,-12.125900,4.289826,11.829774,...,-2.557744,2.143950,-1.143839,7.093911,8.230412,2.932323,-9.742094,-3.931319,4.814355,0.262482
1,CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1,173.5,8.672269,3.382508,-1.009339,8.613342,-5.257068,-3.722235,-2.904521,3.233326,...,-3.215222,1.214921,0.065320,2.262887,3.039335,1.418446,-5.120859,0.569136,6.675544,-3.676702
2,CC(C=CC1(O)C(C)=CC(=O)CC1(C)C)=CC(=O)O,160.0,6.926352,4.612771,-0.524525,6.452645,-5.662361,-8.545525,-2.476228,6.148377,...,-2.327952,-0.310817,-2.341082,1.690612,1.047211,-0.751602,-5.519598,1.978377,6.315439,-2.911364
3,COc1ccc(-c2cc(=O)c3c(O)cc(O)cc3o2)cc1,263.0,9.478333,8.671986,2.309459,5.326269,-4.638491,-9.584875,-1.285632,10.793772,...,-4.620464,-0.298921,-1.304448,6.174148,6.492252,3.929846,-3.667210,-3.263721,3.714898,-0.117793
4,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1,121.0,11.625536,0.939922,3.890236,5.016088,-10.919663,-4.537029,-0.758402,7.000362,...,-3.590667,1.033413,-0.449358,0.507696,4.534042,0.110596,-8.913643,-0.294865,9.153279,-6.478106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7635,CNC(=O)Oc1cc(C)cc(C)c1,99.0,6.605989,1.137232,2.373102,3.327681,-5.042240,-4.029456,-0.581940,4.979243,...,-2.800613,0.174910,0.252637,1.030272,3.036746,1.474039,-4.084049,-2.257625,2.323892,-3.162244
7636,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21,241.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7637,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21.Cl,302.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7638,CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c2C(=O)O1,164.0,9.669550,5.628396,0.379068,6.077866,-3.538999,-8.804146,-1.054745,8.215943,...,-5.421795,-0.008273,-3.303473,5.494559,5.693076,5.359212,-3.300048,-4.743765,3.138605,-2.784042


In [133]:
def process_data(csv_file: pt, vec_file: pt, target_col: str = 'Processed tmp/ºC'):
    """Join a property table with its embedding matrix and drop unusable rows.

    Parameters
    ----------
    csv_file : path
        CSV file containing an 'INDEX' column (used as the row index), a
        'SMILES' column and the column named by ``target_col``.
    vec_file : path
        ``.npy`` file holding a 2-D array with one row per CSV row; rows are
        matched to the CSV by position.
    target_col : str, optional
        Name of the CSV column to expose as ``y``. Values that cannot be
        parsed as numbers become NaN and are then filtered out.

    Returns
    -------
    pandas.DataFrame
        Indexed like the CSV, with columns ``['SMILES', 'y', '0', ..., 'D-1']``.
        Rows are dropped when every feature equals 0 or when ``y`` is NaN or
        infinite. The result is an independent copy, so callers can assign to
        it without triggering SettingWithCopyWarning.

    Raises
    ------
    ValueError
        If the loaded array is not 2-D, or its row count differs from the CSV.
    KeyError
        If 'SMILES' or ``target_col`` is missing from the CSV.
    """
    df = pd.read_csv(csv_file, index_col='INDEX')
    # NOTE(review): allow_pickle=True lets np.load unpickle arbitrary objects,
    # so only trusted files should be passed in; a plain numeric array would
    # not need the flag — confirm how the .npy files are produced.
    X = np.load(vec_file, allow_pickle=True)

    # Fail early with a readable message instead of an IndexError on
    # X.shape[1] or an opaque shape error from the DataFrame constructor.
    if X.ndim != 2:
        raise ValueError(f"Expected a 2-D feature array, got shape {X.shape}")
    if X.shape[0] != len(df):
        raise ValueError(
            f"Feature array has {X.shape[0]} rows but the CSV has {len(df)}"
        )

    feature_cols = [str(i) for i in range(X.shape[1])]
    data_df = pd.DataFrame(X, index=df.index, columns=feature_cols)
    # Inserting at position 0 (y first, then SMILES) produces the final
    # column order SMILES, y, features without a separate re-ordering step.
    data_df.insert(0, 'y', pd.to_numeric(df[target_col], errors='coerce'))
    data_df.insert(0, 'SMILES', df['SMILES'])

    # A row is kept when at least one feature is non-zero and y is finite
    # (np.isfinite is False for both NaN and +/-inf).
    has_signal = ~(data_df[feature_cols] == 0).all(axis=1)
    valid_mask = has_signal & np.isfinite(data_df['y'])
    final_df = data_df.loc[valid_mask].copy()

    # Print summary
    rows_removed = len(data_df) - len(final_df)
    print(f"Original rows: {len(data_df)}")
    print(f"Rows removed: {rows_removed}")
    print(f"Final rows: {len(final_df)}")

    return final_df

# Rebuild the modelling table through the helper, using the paths resolved earlier.
final_df = process_data(csv_file=csv_file, vec_file=vec_file)
final_df

Original rows: 7476
Rows removed: 0
Final rows: 7476


Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,COP(=S)(OC)Oc1ccc(Sc2ccc(OP(=S)(OC)OC)cc2)cc1,31.6,14.583024,7.004414,10.764027,9.659239,-7.881231,-12.125900,4.289826,11.829774,...,-2.557744,2.143950,-1.143839,7.093911,8.230412,2.932323,-9.742094,-3.931319,4.814355,0.262482
1,CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1,173.5,8.672269,3.382508,-1.009339,8.613342,-5.257068,-3.722235,-2.904521,3.233326,...,-3.215222,1.214921,0.065320,2.262887,3.039335,1.418446,-5.120859,0.569136,6.675544,-3.676702
2,CC(C=CC1(O)C(C)=CC(=O)CC1(C)C)=CC(=O)O,160.0,6.926352,4.612771,-0.524525,6.452645,-5.662361,-8.545525,-2.476228,6.148377,...,-2.327952,-0.310817,-2.341082,1.690612,1.047211,-0.751602,-5.519598,1.978377,6.315439,-2.911364
3,COc1ccc(-c2cc(=O)c3c(O)cc(O)cc3o2)cc1,263.0,9.478333,8.671986,2.309459,5.326269,-4.638491,-9.584875,-1.285632,10.793772,...,-4.620464,-0.298921,-1.304448,6.174148,6.492252,3.929846,-3.667210,-3.263721,3.714898,-0.117793
4,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1,121.0,11.625536,0.939922,3.890236,5.016088,-10.919663,-4.537029,-0.758402,7.000362,...,-3.590667,1.033413,-0.449358,0.507696,4.534042,0.110596,-8.913643,-0.294865,9.153279,-6.478106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7635,CNC(=O)Oc1cc(C)cc(C)c1,99.0,6.605989,1.137232,2.373102,3.327681,-5.042240,-4.029456,-0.581940,4.979243,...,-2.800613,0.174910,0.252637,1.030272,3.036746,1.474039,-4.084049,-2.257625,2.323892,-3.162244
7636,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21,241.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7637,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21.Cl,302.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7638,CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c2C(=O)O1,164.0,9.669550,5.628396,0.379068,6.077866,-3.538999,-8.804146,-1.054745,8.215943,...,-5.421795,-0.008273,-3.303473,5.494559,5.693076,5.359212,-3.300048,-4.743765,3.138605,-2.784042


In [164]:
# Detect likely label errors in y with cleanlab, using XGBoost as the
# underlying regressor (XGBRegressor is imported in the top import cell).
# Both the regressor and cleanlab's internal data splitting are seeded so a
# fresh Restart & Run All is repeatable.
clean_model = XGBRegressor(verbosity=0, n_jobs=-1, random_state=42)
cl = CleanLearning(clean_model, verbose=True, seed=42)

# Columns 0-1 of final_df are SMILES and y; everything from column 2 onwards
# is the embedding used as model input.
cl.fit(final_df.iloc[:, 2:], final_df['y'])

Identifying label issues ...
Identified 1402 examples with label issues.
Pruning 1402 examples with label issues ...
Remaining clean data has 6074 examples.
Fitting final model on the clean data ...
Label issues stored in label_issues_df DataFrame accessible via: self.get_label_issues(). Call self.save_space() to delete this potentially large DataFrame attribute.


In [169]:
# Take a copy so that re-labelling the index below does not touch the frame
# stored inside `cl`.
label_issues_df = cl.get_label_issues().copy()

# The index assignment below is purely positional, so first confirm that
# cleanlab's rows are in final_df's row order: the label cleanlab was given
# for each row must equal that row's y.
assert np.allclose(
    label_issues_df["given_label"].to_numpy(dtype=float),
    final_df["y"].to_numpy(dtype=float),
), "label_issues_df rows are not in final_df order; cannot reuse final_df.index"
label_issues_df.index = final_df.index

# Informational only: after the assignment above the indices are equal by
# construction. The real alignment check is the assertion on given_label.
print("Do indices match?", (label_issues_df.index == final_df.index).all())
print("final_df index shape:", final_df.index.shape)
print("label_issues_df index shape:", label_issues_df.index.shape)

# Keep the rows that cleanlab did not flag.
final_df_cleaned = final_df[~label_issues_df["is_label_issue"]]
label_issues_df

Do indices match? True
final_df index shape: (7476,)
label_issues_df index shape: (7476,)


Unnamed: 0_level_0,is_label_issue,label_quality,given_label,predicted_label
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,0.440461,31.6,52.363441
1,False,0.269478,173.5,135.949295
2,False,0.537392,160.0,140.438766
3,False,0.662444,263.0,252.171234
4,False,0.705964,121.0,110.204659
...,...,...,...,...
7635,False,0.351817,99.0,69.412567
7636,False,0.344846,241.0,187.852722
7637,True,0.089272,302.0,227.395441
7638,False,0.371428,164.0,189.363968


In [190]:
# Drop every row cleanlab flagged; the boolean mask shares final_df's index.
final_df_cleaned = final_df[
    ~label_issues_df["is_label_issue"]
]
final_df_cleaned

Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,COP(=S)(OC)Oc1ccc(Sc2ccc(OP(=S)(OC)OC)cc2)cc1,31.6,14.583024,7.004414,10.764027,9.659239,-7.881231,-12.125900,4.289826,11.829774,...,-2.557744,2.143950,-1.143839,7.093911,8.230412,2.932323,-9.742094,-3.931319,4.814355,0.262482
1,CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1,173.5,8.672269,3.382508,-1.009339,8.613342,-5.257068,-3.722235,-2.904521,3.233326,...,-3.215222,1.214921,0.065320,2.262887,3.039335,1.418446,-5.120859,0.569136,6.675544,-3.676702
2,CC(C=CC1(O)C(C)=CC(=O)CC1(C)C)=CC(=O)O,160.0,6.926352,4.612771,-0.524525,6.452645,-5.662361,-8.545525,-2.476228,6.148377,...,-2.327952,-0.310817,-2.341082,1.690612,1.047211,-0.751602,-5.519598,1.978377,6.315439,-2.911364
3,COc1ccc(-c2cc(=O)c3c(O)cc(O)cc3o2)cc1,263.0,9.478333,8.671986,2.309459,5.326269,-4.638491,-9.584875,-1.285632,10.793772,...,-4.620464,-0.298921,-1.304448,6.174148,6.492252,3.929846,-3.667210,-3.263721,3.714898,-0.117793
4,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1,121.0,11.625536,0.939922,3.890236,5.016088,-10.919663,-4.537029,-0.758402,7.000362,...,-3.590667,1.033413,-0.449358,0.507696,4.534042,0.110596,-8.913643,-0.294865,9.153279,-6.478106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7634,OC1O[C@H](CO[C@@H]2OC[C@@H](O)[C@H](O)[C@H]2O)...,210.0,9.415656,14.146734,-6.948192,4.933622,-0.930297,-9.007641,-5.025923,2.624626,...,-3.239574,6.539446,7.643373,4.224568,-0.641620,8.178635,-7.266513,-8.799261,2.400525,3.103480
7635,CNC(=O)Oc1cc(C)cc(C)c1,99.0,6.605989,1.137232,2.373102,3.327681,-5.042240,-4.029456,-0.581940,4.979243,...,-2.800613,0.174910,0.252637,1.030272,3.036746,1.474039,-4.084049,-2.257625,2.323892,-3.162244
7636,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21,241.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7638,CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c2C(=O)O1,164.0,9.669550,5.628396,0.379068,6.077866,-3.538999,-8.804146,-1.054745,8.215943,...,-5.421795,-0.008273,-3.303473,5.494559,5.693076,5.359212,-3.300048,-4.743765,3.138605,-2.784042


In [177]:
# Hold out 20% of the rows for testing. The split is taken on final_df (before
# any label-issue pruning); random_state pins the shuffle so the same rows
# land in train/test on every run.
final_df_train, final_df_test = train_test_split(
    final_df,
    test_size=0.2,
    random_state=42,
)
final_df_train

Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7118,Cc1ccc(S(=O)(=O)O)cc1,104.5,4.897093,3.777663,1.346407,3.535480,-2.693464,-5.323051,-0.886255,5.807439,...,-2.697405,0.244533,-1.589061,3.414617,3.493030,0.380309,-3.669165,-1.365567,1.059546,-0.272054
1748,c1ccc2nnccc2c1,38.0,5.432196,3.451047,0.625473,3.280932,-2.508114,-4.571104,-0.407828,3.397220,...,-2.637960,-0.976570,-0.540310,5.841665,3.951513,0.978751,-2.107548,-1.326095,1.759714,-0.695998
5247,CC=CC(C)C,-134.8,2.426050,0.157611,0.225276,1.833412,-2.955518,-2.237134,-1.184319,1.611603,...,-0.223021,0.683610,0.191155,0.939786,1.070682,-1.248909,-3.729141,-0.571278,3.121681,-3.520040
2628,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,289.5,7.720578,7.258456,2.629590,5.192294,-3.354980,-7.540139,-1.235759,9.810016,...,-3.149164,-1.284903,-3.311849,7.650428,4.372863,4.211869,-3.427923,-2.739337,2.349053,0.059407
4712,c1ccc2sc(SSc3nc4ccccc4s3)nc2c1,180.0,8.846535,8.747698,2.963148,4.489509,-0.754896,-6.799650,-3.600182,8.316263,...,-3.785359,-2.091301,0.032358,9.800165,7.311416,0.149913,-5.355039,-5.684108,4.332882,-1.660756
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
275,Nc1ccc2c(S(=O)(=O)O)cccc2c1S(=O)(=O)O,300.0,7.802561,8.294585,2.754875,5.962450,-3.722327,-7.922889,-1.433091,11.954533,...,-3.360179,-0.366804,-2.022266,4.991146,5.151605,-0.318351,-5.686307,-3.940958,0.691341,0.489728
7081,NC(=S)NC(N)=S,181.0,3.549316,0.522557,1.431816,3.576143,0.288377,-1.451147,0.756446,3.675696,...,0.459687,0.262160,3.371526,-1.160112,0.838372,0.764153,-3.558444,0.339156,0.600241,-3.174201
5678,O=[N+]([O-])c1ccc(N=Nc2c(O)ccc3ccccc23)cc1,257.0,9.673212,8.664092,3.468399,6.441185,-3.730956,-11.683831,1.462775,12.902738,...,-4.674635,0.558924,-2.749690,9.247746,6.510659,3.709771,-4.565406,-4.136503,1.467097,-0.166889
755,ClCC1(CCl)COC1,19.0,1.571314,0.750639,1.564714,-0.686495,-0.959669,-2.020104,3.331656,0.763441,...,-2.227716,1.971287,-3.818565,-3.651531,-0.712679,-0.609357,-1.987914,0.569648,0.176417,0.818494


In [183]:
# Look up cleanlab's verdict for exactly the rows that landed in the training
# split, in the training split's own row order.
train_indices = final_df_train.index
train_label_issues = label_issues_df.loc[train_indices]

# Keep only the training rows that were not flagged as label issues.
final_df_train_cleaned = final_df_train[
    ~train_label_issues["is_label_issue"]
]
final_df_train_cleaned

Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
7118,Cc1ccc(S(=O)(=O)O)cc1,104.50,4.897093,3.777663,1.346407,3.535480,-2.693464,-5.323051,-0.886255,5.807439,...,-2.697405,0.244533,-1.589061,3.414617,3.493030,0.380309,-3.669165,-1.365567,1.059546,-0.272054
5247,CC=CC(C)C,-134.80,2.426050,0.157611,0.225276,1.833412,-2.955518,-2.237134,-1.184319,1.611603,...,-0.223021,0.683610,0.191155,0.939786,1.070682,-1.248909,-3.729141,-0.571278,3.121681,-3.520040
2628,O=C1c2ccccc2C(=O)c2c1ccc(O)c2O,289.50,7.720578,7.258456,2.629590,5.192294,-3.354980,-7.540139,-1.235759,9.810016,...,-3.149164,-1.284903,-3.311849,7.650428,4.372863,4.211869,-3.427923,-2.739337,2.349053,0.059407
4712,c1ccc2sc(SSc3nc4ccccc4s3)nc2c1,180.00,8.846535,8.747698,2.963148,4.489509,-0.754896,-6.799650,-3.600182,8.316263,...,-3.785359,-2.091301,0.032358,9.800165,7.311416,0.149913,-5.355039,-5.684108,4.332882,-1.660756
7224,C[Si](Cl)(Cl)Cl,-75.77,0.907923,0.017815,1.807034,-0.882072,-0.436762,-0.752431,-0.257094,-0.105972,...,-0.501076,0.752777,-1.605621,-1.699278,0.163618,-0.049534,-0.154259,-0.226285,-0.283851,-0.856268
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7535,NC(=O)C(N)Cc1ccc(O)cc1,153.50,5.024655,3.489659,0.900660,4.691381,-3.123414,-3.169268,-0.643614,6.103518,...,-2.102573,1.262073,1.554440,3.184036,1.235606,1.139516,-3.982087,-1.861491,2.655226,-2.603404
275,Nc1ccc2c(S(=O)(=O)O)cccc2c1S(=O)(=O)O,300.00,7.802561,8.294585,2.754875,5.962450,-3.722327,-7.922889,-1.433091,11.954533,...,-3.360179,-0.366804,-2.022266,4.991146,5.151605,-0.318351,-5.686307,-3.940958,0.691341,0.489728
7081,NC(=S)NC(N)=S,181.00,3.549316,0.522557,1.431816,3.576143,0.288377,-1.451147,0.756446,3.675696,...,0.459687,0.262160,3.371526,-1.160112,0.838372,0.764153,-3.558444,0.339156,0.600241,-3.174201
755,ClCC1(CCl)COC1,19.00,1.571314,0.750639,1.564714,-0.686495,-0.959669,-2.020104,3.331656,0.763441,...,-2.227716,1.971287,-3.818565,-3.651531,-0.712679,-0.609357,-1.987914,0.569648,0.176417,0.818494


In [184]:
# Save cleanlab's per-row results as CSV; the row index is written as the
# first column.
label_issues_df.to_csv(
    'label_issues.csv',
    index=True,
)

In [188]:
# Read the saved label-issue table back, restoring the INDEX column as the
# row index.
read_label_issues_df = pd.read_csv(
    'label_issues.csv',
    index_col="INDEX",
)
read_label_issues_df

Unnamed: 0_level_0,is_label_issue,label_quality,given_label,predicted_label
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,False,0.440461,31.6,52.363441
1,False,0.269478,173.5,135.949295
2,False,0.537392,160.0,140.438766
3,False,0.662444,263.0,252.171234
4,False,0.705964,121.0,110.204659
...,...,...,...,...
7635,False,0.351817,99.0,69.412567
7636,False,0.344846,241.0,187.852722
7637,True,0.089272,302.0,227.395441
7638,False,0.371428,164.0,189.363968
