In [1]:
import pandas as pd

In [2]:
# Directory holding the cleaned CSV inputs. The trailing slash matters because
# the file paths below are formed by plain string concatenation.
data_prefix = 'data_cleaned/'

# Training split: base table followed by its feature table.
train_df, train_feat_df = (
    pd.read_csv(f'{data_prefix}{csv_name}')
    for csv_name in ('data.csv', 'data_features.csv')
)

# Test split: same pair of tables.
test_df, test_feat_df = (
    pd.read_csv(f'{data_prefix}{csv_name}')
    for csv_name in ('test.csv', 'test_features.csv')
)

In [3]:
# Combine each split's base table with its feature table horizontally.
# pd.concat(axis=1) aligns on the index and outer-joins, so frames with
# different row counts would be padded with NaN silently; fail fast instead.
assert len(train_df) == len(train_feat_df), (
    f"train rows ({len(train_df)}) != train feature rows ({len(train_feat_df)})"
)
assert len(test_df) == len(test_feat_df), (
    f"test rows ({len(test_df)}) != test feature rows ({len(test_feat_df)})"
)

train_combined_df = pd.concat([train_df, train_feat_df], axis=1)
train_combined_df['benchmark'] = 'train'  # tag rows with their source split

test_combined_df = pd.concat([test_df, test_feat_df], axis=1)
test_combined_df['benchmark'] = 'test'

# Stack both splits under a fresh 0..n-1 index. 'type' marks these rows as
# 'original', as opposed to the 'expanded' rows built later in the notebook.
original_data = pd.concat([train_combined_df, test_combined_df], axis=0).reset_index(drop=True)
original_data['type'] = 'original'

In [4]:
# Build an order-independent pair key for each mixture: the two molecule
# strings sorted alphabetically and joined with '-', so (A, B) and (B, A)
# share the same key.
# Iterating the two columns directly avoids constructing a Series per row,
# which is what DataFrame.apply(axis=1) does, and is also well-defined when
# the frame has no rows.
original_data['combos'] = [
    '-'.join(sorted(pair))
    for pair in zip(original_data['MOL_1'], original_data['MOL_2'])
]


In [5]:
# Preview the first 20 rows of the combined train/test table.
original_data.head(20)

Unnamed: 0,MOL_1,MOL_2,logV,MolFrac_1,T,benchmark,type,combos
0,B(Br)(Br)Br,CCO,-0.140112,1.0,298.0,train,original,B(Br)(Br)Br-CCO
1,BrBr,CCO,-0.011187,1.0,298.0,train,original,BrBr-CCO
2,BrC(Br)(Br)Br,c1ccccc1,-0.219755,0.0,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
3,BrC(Br)(Br)Br,c1ccccc1,-0.080765,0.1,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
4,BrC(Br)(Br)Br,c1ccccc1,0.05007,0.2,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
5,BrC(Br)(Br)Br,c1ccccc1,0.165422,0.3,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
6,BrC(Br)(Br)Br,c1ccccc1,0.2647,0.4,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
7,BrC(Br)(Br)Br,c1ccccc1,0.349316,0.5,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
8,BrC(Br)(Br)Br,c1ccccc1,0.420879,0.6,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
9,BrC(Br)(Br)Br,c1ccccc1,0.48084,0.7,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1


In [6]:
# Summary statistics (count, mean, std, quantiles) for the numeric columns.
original_data.describe()

Unnamed: 0,logV,MolFrac_1,T
count,40905.0,40905.0,40905.0
mean,0.071711,0.497873,305.477083
std,0.406082,0.318345,6.210431
min,-2.188234,0.0,293.15
25%,-0.218826,0.2,298.428625
50%,-0.004557,0.5,305.65
75%,0.266937,0.8,308.15
max,4.065026,1.0,323.15


In [7]:
# Check for data leakage between train and test sets
# Filter each split once instead of re-filtering for every column.
train_rows = original_data.loc[original_data['benchmark'] == 'train']
test_rows = original_data.loc[original_data['benchmark'] == 'test']

# Unique values per split: first slot, second slot, and unordered pair key.
train_mol_1, train_mol_2, train_combos = (
    set(train_rows[column]) for column in ('MOL_1', 'MOL_2', 'combos')
)
test_mol_1, test_mol_2, test_combos = (
    set(test_rows[column]) for column in ('MOL_1', 'MOL_2', 'combos')
)

# A test molecule may sit in either slot, so compare each train slot against
# the union of both test slots.
test_molecules = test_mol_1 | test_mol_2
leakage_mol_1 = train_mol_1 & test_molecules
leakage_mol_2 = train_mol_2 & test_molecules

# Pair keys present in both splits.
leakage_combos = train_combos & test_combos

# Print results
print(
    f"Data leakage found in MOL_1: {leakage_mol_1}"
    if leakage_mol_1
    else "No data leakage in MOL_1"
)

print(
    f"Data leakage found in MOL_2: {leakage_mol_2}"
    if leakage_mol_2
    else "No data leakage in MOL_2"
)

print(
    f"Data leakage found in combos: {leakage_combos}"
    if leakage_combos
    else "No data leakage in combos"
)



Data leakage found in MOL_1: {'COCCOCCOCCOCCOC', 'NCc1ccccc1', 'CC/C(C)=N/O', 'Oc1ccccc1Cl', 'c1ccc2ccccc2c1', 'Cc1ccccc1[N+](=O)[O-]', 'C=CCN=C=S', 'CCCCCCBr', 'CC(=O)O', 'Clc1ccccc1', 'c1ccc2c(c1)CCCC2', 'C=CC(=O)OC', 'NCCO', 'ClC(Cl)(Cl)Cl', 'CCCCCCCCCC(=O)OCC', 'CCC(C)N', 'COC(=O)CCCCl', 'CC(C)CC(C)(C)C', 'CCNCCO', 'OCCCl', 'OCc1ccccc1', 'Brc1ccccc1', 'CCC(O)CC', 'CCCCOCCOCCOCCCC', 'CCCCBr', 'CNCCO', 'CCOCCOCCOCCO', 'Clc1ccc(Cl)c(Cl)c1', 'CCOC(=O)CCC(=O)OCC', 'COCCOCCOCCOC', 'CCOCCOCCO', 'Cc1ccc(C)cc1', 'Cc1ccncc1', 'NC1CCCCC1', 'CCCCN(CCCC)CCCC', 'O=S(=O)(O)O', 'CC(O)CCO', 'O', 'CCOC(C)(C)C', 'CCc1ccccc1', 'CCCN', 'CCO', 'CCCC(C)C', 'CC#N', 'CCCNCCC', 'ClC(Cl)C(Cl)Cl', 'C1CCNCC1', 'CCOC(C)=O', 'CCOC=O', 'O=CO', 'CCCCCC', 'CCCCCCCCCCC', 'BrCCBr', 'C1CCCCC1', 'CCCCCCCCCC(=O)OC', 'C[N+](=O)[O-]', 'CC1CCCCC1', 'ClC(Cl)Cl', 'CCCC(C)O', 'CCCCC1CCCCC1', 'CC(C)CCO', 'CCCCCCCCCCCCCCCC', 'CCOC(=O)CC(C)=O', 'CC(O)CO', 'CCCCOCCOCCO', 'CCCC(=O)OCC', 'OCCCO', 'Cc1cccc(O)c1', 'N#CCc1ccccc1', 'CC

In [8]:
# Display the combined table (pandas renders a truncated head/tail view).
original_data

Unnamed: 0,MOL_1,MOL_2,logV,MolFrac_1,T,benchmark,type,combos
0,B(Br)(Br)Br,CCO,-0.140112,1.0,298.00,train,original,B(Br)(Br)Br-CCO
1,BrBr,CCO,-0.011187,1.0,298.00,train,original,BrBr-CCO
2,BrC(Br)(Br)Br,c1ccccc1,-0.219755,0.0,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
3,BrC(Br)(Br)Br,c1ccccc1,-0.080765,0.1,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
4,BrC(Br)(Br)Br,c1ccccc1,0.050070,0.2,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
...,...,...,...,...,...,...,...,...
40900,c1ccncc1,CCCO,-0.055304,0.6,308.15,test,original,CCCO-c1ccncc1
40901,c1ccncc1,CCCO,-0.073118,0.7,308.15,test,original,CCCO-c1ccncc1
40902,c1ccncc1,CCCO,-0.082963,0.8,308.15,test,original,CCCO-c1ccncc1
40903,c1ccncc1,CCCO,-0.091413,0.9,308.15,test,original,CCCO-c1ccncc1


In [9]:
# Mirror every mixture: exchange the two component labels and express the
# composition as the mole fraction of the new first component.
swap_components = {'MOL_1': 'MOL_2', 'MOL_2': 'MOL_1'}
mirrored = (
    original_data
    .rename(columns=swap_components)
    .assign(
        MolFrac_1=lambda frame: 1.0 - frame['MolFrac_1'],
        type='expanded',
    )
)

# Put the columns back in the same order as the source frame so the two can be
# stacked cleanly.
all_data_mirror = mirrored[original_data.columns]

all_data_mirror.head(20)

Unnamed: 0,MOL_1,MOL_2,logV,MolFrac_1,T,benchmark,type,combos
0,CCO,B(Br)(Br)Br,-0.140112,0.0,298.0,train,expanded,B(Br)(Br)Br-CCO
1,CCO,BrBr,-0.011187,0.0,298.0,train,expanded,BrBr-CCO
2,c1ccccc1,BrC(Br)(Br)Br,-0.219755,1.0,298.15,train,expanded,BrC(Br)(Br)Br-c1ccccc1
3,c1ccccc1,BrC(Br)(Br)Br,-0.080765,0.9,298.15,train,expanded,BrC(Br)(Br)Br-c1ccccc1
4,c1ccccc1,BrC(Br)(Br)Br,0.05007,0.8,298.15,train,expanded,BrC(Br)(Br)Br-c1ccccc1
5,c1ccccc1,BrC(Br)(Br)Br,0.165422,0.7,298.15,train,expanded,BrC(Br)(Br)Br-c1ccccc1
6,c1ccccc1,BrC(Br)(Br)Br,0.2647,0.6,298.15,train,expanded,BrC(Br)(Br)Br-c1ccccc1
7,c1ccccc1,BrC(Br)(Br)Br,0.349316,0.5,298.15,train,expanded,BrC(Br)(Br)Br-c1ccccc1
8,c1ccccc1,BrC(Br)(Br)Br,0.420879,0.4,298.15,train,expanded,BrC(Br)(Br)Br-c1ccccc1
9,c1ccccc1,BrC(Br)(Br)Br,0.48084,0.3,298.15,train,expanded,BrC(Br)(Br)Br-c1ccccc1


In [10]:
# Stack the original rows on top of their mirrored counterparts and renumber
# the result 0..n-1.
all_data = pd.concat([original_data, all_data_mirror], ignore_index=True)

all_data

Unnamed: 0,MOL_1,MOL_2,logV,MolFrac_1,T,benchmark,type,combos
0,B(Br)(Br)Br,CCO,-0.140112,1.0,298.00,train,original,B(Br)(Br)Br-CCO
1,BrBr,CCO,-0.011187,1.0,298.00,train,original,BrBr-CCO
2,BrC(Br)(Br)Br,c1ccccc1,-0.219755,0.0,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
3,BrC(Br)(Br)Br,c1ccccc1,-0.080765,0.1,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
4,BrC(Br)(Br)Br,c1ccccc1,0.050070,0.2,298.15,train,original,BrC(Br)(Br)Br-c1ccccc1
...,...,...,...,...,...,...,...,...
81805,CCCO,c1ccncc1,-0.055304,0.4,308.15,test,expanded,CCCO-c1ccncc1
81806,CCCO,c1ccncc1,-0.073118,0.3,308.15,test,expanded,CCCO-c1ccncc1
81807,CCCO,c1ccncc1,-0.082963,0.2,308.15,test,expanded,CCCO-c1ccncc1
81808,CCCO,c1ccncc1,-0.091413,0.1,308.15,test,expanded,CCCO-c1ccncc1


In [11]:
# Write the stacked frame (original + mirrored rows) to CSV without the index.
# NOTE(review): the file is named 'original_data.csv' although it also holds the
# mirrored 'expanded' rows — confirm downstream readers filter on `type` if they
# need the originals only.
all_data.to_csv(data_prefix + 'original_data.csv', index=False)