In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from load_data import processed_data_dirs, property_names, property_units, titles

In [103]:
# Assemble one table of (SMILES, target y, embedding features) for the selected
# property, then drop rows with all-zero embeddings or an unusable target.

# --- Selection -------------------------------------------------------------
ind = 0
embeddings = 'mol2vec'
# NOTE(review): the target column is hard-wired to the melting-point CSV and
# does not follow `ind`; update it when another property is selected.
target_column = 'Processed tmp/ºC'

property_name = property_names[ind]
property_unit = property_units[ind]
title = titles[ind]
property_name_with_unit = f'{property_name} ({property_unit})'
# NOTE(review): the recorded run prints "(K)" here while the target column is
# labelled ºC — confirm which unit the y values are actually in.
print(property_name_with_unit, title)

# --- Paths -----------------------------------------------------------------
current_dir = processed_data_dirs[ind]
fname = current_dir.name.replace('_processed_data', '')

csv_file = current_dir.parent / f'{fname}.csv'
vec_dir = current_dir / 'embedded_vectors'
vec_file = vec_dir / f'{embeddings}_embeddings.npy'

print(csv_file.exists(), csv_file.name)
print(vec_file.exists(), vec_file.name)

# --- Load ------------------------------------------------------------------
# Reassign instead of inplace=True so re-running the cell is idempotent.
df = pd.read_csv(csv_file).set_index('INDEX')

processed_vec_dir = vec_dir / f'processed_{embeddings}_embeddings'
print(processed_vec_dir.exists(), processed_vec_dir.name)

print(df.columns)

# SECURITY: allow_pickle=True can execute arbitrary code when loading an
# untrusted file; only keep it if the .npy really stores pickled objects.
X = np.load(vec_file, allow_pickle=True)

# The embeddings are matched to the CSV rows purely by position, so a length
# mismatch would mean the two files are out of sync.
if len(X) != len(df):
    raise ValueError(
        f'{vec_file.name} has {len(X)} rows but {csv_file.name} has {len(df)} rows'
    )

# --- Combine ---------------------------------------------------------------
data_df = pd.DataFrame(X)
data_df.insert(0, 'INDEX', df.index)
data_df.insert(1, 'SMILES', df['SMILES'].values)
data_df.insert(2, 'y', df[target_column].values)
data_df = data_df.set_index('INDEX')

# --- Filter ----------------------------------------------------------------
# 1. Drop rows whose embedding features are all zero. The feature columns are
#    everything except SMILES and y.
feature_df = data_df.drop(columns=['SMILES', 'y'])
non_zero_mask = ~(feature_df == 0).all(axis=1)
# .copy() makes filtered_df independent of data_df, so the column assignment
# below is not a chained assignment on a slice (SettingWithCopyWarning).
filtered_df = data_df[non_zero_mask].copy()

# 2. Drop rows whose y is non-numeric, NaN or infinite. Non-numeric entries are
#    coerced to NaN first so one mask covers all three cases.
filtered_df['y'] = pd.to_numeric(filtered_df['y'], errors='coerce')
valid_y_mask = filtered_df['y'].notna() & ~np.isinf(filtered_df['y'])
final_df = filtered_df[valid_y_mask]

# Report how many rows each filter removed.
print(f"Original number of rows: {len(data_df)}")
print(f"Rows removed due to all-zero features: {len(data_df) - len(filtered_df)}")
print(f"Rows removed due to invalid y values: {len(filtered_df) - len(final_df)}")
print(f"Final number of rows: {len(final_df)}")

final_df

Melting Point (K) MP
True tmpC_topelements.csv
True mol2vec_embeddings.npy
True processed_mol2vec_embeddings
Index(['CRC Index', 'SMILES', 'tmp/ºC', 'Processed tmp/ºC'], dtype='object')
Original number of rows: 7476
Rows removed due to all-zero features: 0
Rows removed due to invalid y values: 0
Final number of rows: 7476


Unnamed: 0_level_0,SMILES,y,0,1,2,3,4,5,6,7,...,290,291,292,293,294,295,296,297,298,299
INDEX,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,COP(=S)(OC)Oc1ccc(Sc2ccc(OP(=S)(OC)OC)cc2)cc1,31.6,14.583024,7.004414,10.764027,9.659239,-7.881231,-12.125900,4.289826,11.829774,...,-2.557744,2.143950,-1.143839,7.093911,8.230412,2.932323,-9.742094,-3.931319,4.814355,0.262482
1,CC(C)C1=CC2=CCC3C(C)(C(=O)O)CCCC3(C)C2CC1,173.5,8.672269,3.382508,-1.009339,8.613342,-5.257068,-3.722235,-2.904521,3.233326,...,-3.215222,1.214921,0.065320,2.262887,3.039335,1.418446,-5.120859,0.569136,6.675544,-3.676702
2,CC(C=CC1(O)C(C)=CC(=O)CC1(C)C)=CC(=O)O,160.0,6.926352,4.612771,-0.524525,6.452645,-5.662361,-8.545525,-2.476228,6.148377,...,-2.327952,-0.310817,-2.341082,1.690612,1.047211,-0.751602,-5.519598,1.978377,6.315439,-2.911364
3,COc1ccc(-c2cc(=O)c3c(O)cc(O)cc3o2)cc1,263.0,9.478333,8.671986,2.309459,5.326269,-4.638491,-9.584875,-1.285632,10.793772,...,-4.620464,-0.298921,-1.304448,6.174148,6.492252,3.929846,-3.667210,-3.263721,3.714898,-0.117793
4,CCCC(=O)Nc1ccc(OCC(O)CNC(C)C)c(C(C)=O)c1,121.0,11.625536,0.939922,3.890236,5.016088,-10.919663,-4.537029,-0.758402,7.000362,...,-3.590667,1.033413,-0.449358,0.507696,4.534042,0.110596,-8.913643,-0.294865,9.153279,-6.478106
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7635,CNC(=O)Oc1cc(C)cc(C)c1,99.0,6.605989,1.137232,2.373102,3.327681,-5.042240,-4.029456,-0.581940,4.979243,...,-2.800613,0.174910,0.252637,1.030272,3.036746,1.474039,-4.084049,-2.257625,2.323892,-3.162244
7636,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21,241.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7637,COC(=O)C1C(O)CCC2CN3CCc4c([nH]c5ccccc45)C3CC21.Cl,302.0,13.432186,6.694004,-0.122617,7.899603,-2.365482,-6.335269,-3.245623,6.332031,...,-4.330900,2.721418,0.066121,5.533253,3.201767,5.296447,-4.901769,-3.042064,8.489907,-0.412169
7638,CC1CCCC(=O)CCCC=Cc2cc(O)cc(O)c2C(=O)O1,164.0,9.669550,5.628396,0.379068,6.077866,-3.538999,-8.804146,-1.054745,8.215943,...,-5.421795,-0.008273,-3.303473,5.494559,5.693076,5.359212,-3.300048,-4.743765,3.138605,-2.784042


In [None]:
# Persist the cleaned table (final_df) in three alternative on-disk formats.

# 1. Most efficient - Apache Parquet format
# Maintains data types, supports compression, very fast read/write.
# Parquet only supports string column names, but final_df mixes 'SMILES'/'y'
# with the integer embedding labels (0, 1, ...). Stringify the labels on a
# renamed copy so the write does not depend on engine-specific handling;
# final_df itself is left unchanged.
final_df.rename(columns=str).to_parquet('final_data.parquet', compression='snappy')

# 2. Alternative - HDF5 format
# Also very efficient, good for large datasets
final_df.to_hdf('final_data.h5', key='df', mode='w', format='table')

# 3. Pickle format - Simple binary format
# Good for smaller datasets, Python-specific
final_df.to_pickle('final_data.pkl')

# To read these files back:
# Parquet (embedding columns come back with string labels '0', '1', ...):
# df = pd.read_parquet('final_data.parquet')

# HDF5:
# df = pd.read_hdf('final_data.h5', 'df')

# Pickle (only load pickle files you trust):
# df = pd.read_pickle('final_data.pkl')

In [98]:
# Compare the dtype of the target column after filtering and before filtering.
tuple(frame['y'].dtype for frame in (final_df, data_df))

(dtype('float64'), dtype('float64'))

In [67]:
# Shape of the assembled target column, followed by the non-null counts of the
# assembled target and of the CSV column it was copied from.
(
    data_df['y'].shape,
    *(series.notna().sum() for series in (data_df['y'], df['Processed tmp/ºC'])),
)

((7476,), 7476, 7476)

In [68]:
# Spot-check one position (7473): the assembled target and the source CSV
# column should hold the same value there.
tuple(series.iloc[7473] for series in (data_df['y'], df['Processed tmp/ºC']))

(302.0, 302.0)

In [77]:
# Non-null counts for the row labels, the SMILES strings and the target.
# 'INDEX' is the index of data_df (it was moved there with set_index), not a
# column, so data_df['INDEX'] raises KeyError on a fresh run; count the labels
# through data_df.index instead.
data_df.index.notna().sum(), data_df['SMILES'].notna().sum(), data_df['y'].notna().sum()

(7476, 7476, 7476)

In [None]:
# Load the stored target array from the processed-data directory and show its shape.
# NOTE(review): the recorded run of this cell raised FileNotFoundError, i.e.
# 'processed_y.npy' was not present in current_dir — confirm the intended path.
processed_y = np.load(current_dir.joinpath('processed_y.npy'))
processed_y.shape

FileNotFoundError: [Errno 2] No such file or directory: '/Users/aravindhnivas/Library/CloudStorage/OneDrive-MassachusettsInstituteofTechnology/ML-properties/[PHYSICAL CONSTANTS OF ORGANIC COMPOUNDS]/tmp_C_processed_data/analysis_data/filtered/tmpC_topelements_processed_data/processed_y.npy'

In [29]:
# train_test_split is imported in the notebook's top import cell.

# Build the split inputs from final_df so this cell runs on a fresh kernel;
# X_df and y are not defined by any visible cell.
# NOTE(review): this reconstruction is inferred from the recorded X_train /
# y_train outputs (INDEX kept as a regular column on a default integer index,
# y as a positional NumPy array) — confirm it matches the original definitions.
# NOTE(review): INDEX is a row identifier, not a feature; drop it before
# fitting a model.
X_df = final_df.drop(columns=['SMILES', 'y']).reset_index()
y = final_df['y'].to_numpy()

# Hold out 20% of the rows for testing; the fixed random_state keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X_df, y, test_size=0.2, random_state=42)

In [30]:
# Display the training feature frame produced by the split above.
X_train

Unnamed: 0,INDEX,0,1,2,3,4,5,6,7,8,...,290,291,292,293,294,295,296,297,298,299
410,419,15.624734,5.835931,4.106830,6.014038,-5.981432,-9.873967,-2.696909,12.454193,-12.039816,...,-6.922585,0.041470,-2.922617,2.911139,5.712147,3.053860,-6.139966,-3.366051,6.342935,-2.403481
6252,6361,6.895595,4.989061,0.064360,2.153202,-2.869662,-3.779230,0.074179,2.639753,-3.587482,...,-0.462688,1.374659,1.203800,4.048262,1.688746,1.629563,-2.347820,-1.724825,1.860019,0.854571
4054,4127,4.713959,3.183613,2.181945,4.048241,-5.646354,-6.644563,0.706696,6.360467,-6.907686,...,-2.541763,-3.067338,-2.249733,4.721620,4.002388,-0.305089,0.168786,-0.643479,1.679067,-0.888601
438,447,24.492773,12.816860,2.849506,3.652983,-6.934154,-17.044130,-2.710021,19.158754,-23.137550,...,-9.258818,-0.121545,-7.099104,12.049026,10.376948,12.099661,-11.190363,-5.495942,9.798221,0.547257
2481,2533,11.698213,4.897184,6.369991,2.154273,-4.957835,-7.229101,-0.556095,8.478689,-10.199132,...,-4.190742,2.863543,-1.356861,4.161515,4.043112,2.084543,-6.488578,-2.626664,8.009536,-1.370593
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5191,5283,7.454230,3.558692,2.708807,3.448868,-4.144558,-7.756985,-0.496209,6.869094,-10.162835,...,-4.875896,-0.115453,-1.599765,5.287510,4.376238,2.131342,-4.108946,-2.175586,4.666823,-0.855000
5226,5318,5.658749,2.097558,3.322736,9.813667,-7.248013,0.723731,-2.273409,1.656151,-2.804730,...,-2.897757,5.915749,0.447422,-2.350207,5.149675,2.135646,-5.995398,3.295216,4.040868,-4.526198
5390,5484,7.123998,5.063552,3.060889,3.621100,-4.230202,-6.357446,-1.002585,6.043196,-8.762973,...,-3.391119,0.024242,-1.939771,4.436522,3.858251,2.340731,-3.613493,-1.233467,3.393109,0.044492
860,879,5.328402,2.736083,1.323762,2.210587,-2.358853,-4.830294,0.592288,2.185076,-6.735043,...,-1.340181,1.325703,-1.340155,4.774585,2.703774,1.016496,-2.250720,-1.337499,2.640474,2.000336


In [31]:
# Display the training targets produced by the split above.
y_train

array([ 158. ,   34. ,  145. , ...,  132.8,  114. , -105.9])

In [34]:
# Spot-check alignment: 6252 is the row label of the second X_train row in the
# recorded output, so this value should equal the second entry of y_train.
y[6252]

34.0