In [1]:
import pandas as pd
import rdkit
from rxnfp.transformer_fingerprints import (
    RXNBERTFingerprintGenerator, get_default_model_and_tokenizer, generate_fingerprints, RXNBERTMinhashFingerprintGenerator
)
import os
import numpy as np
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt
import pickle
from sklearn.metrics import classification_report, accuracy_score

In [2]:
train = pd.read_csv('data/train.csv', delimiter=',')

In [3]:
val = pd.read_csv('data/val.csv', delimiter=',')

In [4]:
test = pd.read_csv('data/test.csv', delimiter=',')

### Test example

In [35]:
model, tokenizer = get_default_model_and_tokenizer()

rxnmhfp_generator = RXNBERTFingerprintGenerator(model, tokenizer)

example_rxn1 = "C1CCCCC1>>C1=CC=CC=C1"
example_rxn2 = "C1CCCCC1>>c1ccccc1"
fp1 = rxnmhfp_generator.convert(example_rxn1)
fp2 = rxnmhfp_generator.convert(example_rxn2)
print(fp1[:10])
print(fp2[:10])

[-0.20927272737026215, -0.2657712399959564, 1.1200777292251587, -1.178081750869751, 0.8444035053253174, -1.5568156242370605, -0.046560872346162796, 0.8905509114265442, -0.8488855361938477, 0.55846107006073]
[-0.4799078702926636, -1.4244647026062012, -0.10955993086099625, -1.541298747062683, 1.057775855064392, -2.771310567855835, 0.036745138466358185, 1.4090137481689453, -1.0847077369689941, 0.6043930649757385]


### Embeddings

In [10]:
train_reactions = train['reaction'].astype(str)
subsets = np.array_split(train_reactions, 10)  # Split into 10 subsets

In [11]:
for i, subset in enumerate(subsets):
    subset.to_csv(f'data/train{i+1}.csv', index=False)

In [5]:
from tqdm import tqdm

def process_in_batches(data, batch_size):
    results = []
    # Wrap the range function with tqdm for a progress bar
    for i in tqdm(range(0, len(data), batch_size), desc='Processing batches'):
        batch = data[i:i+batch_size]
        batch_results = rxnmhfp_generator.convert_batch(batch)
        results.extend(batch_results)
    return results

In [5]:
train1 = pd.read_csv('data/train1.csv', delimiter=',')
train2 = pd.read_csv('data/train2.csv', delimiter=',')
train3 = pd.read_csv('data/train3.csv', delimiter=',')
train4 = pd.read_csv('data/train4.csv', delimiter=',')
train5 = pd.read_csv('data/train5.csv', delimiter=',')
train6 = pd.read_csv('data/train6.csv', delimiter=',')
train7 = pd.read_csv('data/train7.csv', delimiter=',')
train8 = pd.read_csv('data/train8.csv', delimiter=',')
train9 = pd.read_csv('data/train9.csv', delimiter=',')
train10 = pd.read_csv('data/train10.csv', delimiter=',')

In [4]:
batch_size=500

In [16]:
train_reactions1 = train1['reaction'].astype(str)
results1 = process_in_batches(train_reactions1, batch_size)
train1['rxnfp'] = results1
X_train1 = np.array(train1['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train1_RXNFP.npy', X_train1)
train1.to_csv('data/RXNFP/train1_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [04:09<00:00,  1.93it/s]


In [17]:
train_reactions2 = train2['reaction'].astype(str)
results2 = process_in_batches(train_reactions2, batch_size)
train2['rxnfp'] = results2
X_train2 = np.array(train2['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train2_RXNFP.npy', X_train2)
train2.to_csv('data/RXNFP/train2_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [03:58<00:00,  2.02it/s]


In [8]:
train_reactions3 = train3['reaction'].astype(str)
results3 = process_in_batches(train_reactions3, batch_size)
train3['rxnfp'] = results3
X_train3 = np.array(train3['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train3_RXNFP.npy', X_train3)
train3.to_csv('data/RXNFP/train3_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [03:06<00:00,  2.58it/s]


In [9]:
train_reactions4 = train4['reaction'].astype(str)
results4 = process_in_batches(train_reactions4, batch_size)
train4['rxnfp'] = results4
X_train4 = np.array(train4['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train4_RXNFP.npy', X_train4)
train4.to_csv('data/RXNFP/train4_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [04:25<00:00,  1.82it/s]


In [10]:
train_reactions5 = train5['reaction'].astype(str)
results5 = process_in_batches(train_reactions5, batch_size)
train5['rxnfp'] = results5
X_train5 = np.array(train5['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train5_RXNFP.npy', X_train4)
train5.to_csv('data/RXNFP/train5_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [04:35<00:00,  1.75it/s]


In [9]:
train_reactions6 = train6['reaction'].astype(str)
results6 = process_in_batches(train_reactions6, batch_size)
train6['rxnfp'] = results6
X_train6 = np.array(train6['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train6_RXNFP.npy', X_train6)
train6.to_csv('data/RXNFP/train6_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [04:33<00:00,  1.76it/s]


In [10]:
train_reactions7 = train7['reaction'].astype(str)
results7 = process_in_batches(train_reactions7, batch_size)
train7['rxnfp'] = results7
X_train7 = np.array(train7['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train7_RXNFP.npy', X_train7)
train7.to_csv('data/RXNFP/train7_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [04:33<00:00,  1.76it/s]


In [7]:
train_reactions8 = train8['reaction'].astype(str)
results8 = process_in_batches(train_reactions8, batch_size)
train8['rxnfp'] = results8
X_train8 = np.array(train8['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train8_RXNFP.npy', X_train8)
train8.to_csv('data/RXNFP/train8_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [04:32<00:00,  1.77it/s]


In [8]:
train_reactions9 = train9['reaction'].astype(str)
results9 = process_in_batches(train_reactions9, batch_size)
train9['rxnfp'] = results9
X_train9 = np.array(train9['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train9_RXNFP.npy', X_train9)
train9.to_csv('data/RXNFP/train9_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [04:37<00:00,  1.74it/s]


In [7]:
train_reactions10 = train10['reaction'].astype(str)
results10 = process_in_batches(train_reactions10, batch_size)
train10['rxnfp'] = results10
X_train10 = np.array(train10['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_train10_RXNFP.npy', X_train10)
train10.to_csv('data/RXNFP/train10_RXNFP.csv', index=False)

Processing batches: 100%|██████████| 482/482 [04:12<00:00,  1.91it/s]


### Test and Val Embedding

In [11]:
val_reactions = val['reaction'].astype(str)
val_results = process_in_batches(val_reactions, batch_size)
val['rxnfp'] = val_results

Processing batches: 100%|██████████| 663/663 [06:13<00:00,  1.78it/s]


In [12]:
X_val = np.array(val['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_val_RXNFP.npy', X_val)

In [13]:
val.to_csv('data/RXNFP/val_RXNFP.csv', index=False)

In [6]:
test_reactions = test['reaction'].astype(str)
test_results = process_in_batches(test_reactions, batch_size)
test['rxnfp'] = test_results

Processing batches: 100%|██████████| 662/662 [06:09<00:00,  1.79it/s]


In [7]:
X_test = np.array(test['rxnfp'].tolist()).astype('float32')
np.save('data/RXNFP/X_test_RXNFP.npy', X_test)

In [8]:
test.to_csv('data/RXNFP/test_RXNFP.csv', index=False)

In [9]:
print('DONE')

DONE


### Data Load

In [5]:
X_train1 = np.load('data/RXNFP/X_train1_RXNFP.npy')
X_train2 = np.load('data/RXNFP/X_train2_RXNFP.npy')
X_train3 = np.load('data/RXNFP/X_train3_RXNFP.npy')
X_train4 = np.load('data/RXNFP/X_train4_RXNFP.npy')
X_train5 = np.load('data/RXNFP/X_train5_RXNFP.npy')
X_train6 = np.load('data/RXNFP/X_train6_RXNFP.npy')
X_train7 = np.load('data/RXNFP/X_train7_RXNFP.npy')
X_train8 = np.load('data/RXNFP/X_train8_RXNFP.npy')
X_train9 = np.load('data/RXNFP/X_train9_RXNFP.npy')
X_train10 = np.load('data/RXNFP/X_train10_RXNFP.npy')

In [6]:
X_train = np.concatenate([X_train1, X_train2, X_train3, X_train4, X_train5,
                          X_train6, X_train7, X_train8, X_train9, X_train10], axis=0)

In [None]:
X_test = np.load('data/RXNFP/X_test_RXNFP.npy')
X_val = np.load('data/RXNFP/X_val_RXNFP.npy')

In [None]:
train = pd.read_csv('data/train.csv', delimiter=',')
test = pd.read_csv('data/test.csv', delimiter=',')
val = pd.read_csv('data/val.csv', delimiter=',')

In [None]:
y_train = train['CLASS-ID'].values
y_test = test['CLASS-ID'].values
y_val = val['CLASS-ID'].values

In [None]:
X_train = np.squeeze(X_train)
X_test = np.squeeze(X_test)
X_val = np.squeeze(X_val)

In [1]:
import torch
torch.cuda.empty_cache()  # Clear unused memory

### Test on Val set

In [10]:
import faiss
index = faiss.IndexFlatL2(X_train.shape[1])
index.add(X_train)

In [11]:
from tqdm import tqdm
k = 1 

batch_size = 1000

num_batches = (X_val.shape[0] + batch_size - 1) // batch_size

In [12]:
D_val_1nn = np.zeros((X_val.shape[0], k), dtype=np.float32)
I_val_1nn = np.zeros((X_val.shape[0], k), dtype=np.int64)

for b in tqdm(range(num_batches), desc='Searching'):
    start = b * batch_size
    end = min((b + 1) * batch_size, X_val.shape[0])
    D, I = index.search(X_val[start:end], k)
    D_val_1nn[start:end, :] = D
    I_val_1nn[start:end, :] = I


Searching: 100%|██████████| 332/332 [2:14:13<00:00, 24.26s/it]  


In [13]:
y_pred_val_1nn = np.array(y_train[I_val_1nn])

In [14]:
np.save('data/RXNFP/y_pred_val_1nn', y_pred_val_1nn)

In [15]:
y_pred_val_1nn = np.load('data/RXNFP/y_pred_val_1nn.npy')

In [16]:
print(f'Overall Accuracy: {accuracy_score(y_val, y_pred_val_1nn)}')

Overall Accuracy: 0.9589620177486027


### Test on Test set

In [19]:
k = 1  
batch_size = 1000  

num_batches = (X_test.shape[0] + batch_size - 1) // batch_size

In [20]:
D_test_1nn = np.zeros((X_test.shape[0], k), dtype=np.float32)
I_test_1nn = np.zeros((X_test.shape[0], k), dtype=np.int64)

for b in tqdm(range(num_batches), desc='FAISS Search on Test Data'):
    start = b * batch_size
    end = min((b + 1) * batch_size, X_test.shape[0])
    D, I = index.search(X_test[start:end], k)  
    D_test_1nn[start:end, :] = D  
    I_test_1nn[start:end, :] = I  

FAISS Search on Test Data: 100%|██████████| 331/331 [2:13:45<00:00, 24.25s/it]t]


In [21]:
y_pred_test_1nn = np.array(y_train[I_test_1nn])

In [22]:
np.save('data/RXNFP/y_pred_test_1nn', y_pred_test_1nn)

In [23]:
print(f'Overall Accuracy: {accuracy_score(y_test, y_pred_test_1nn)}')

Overall Accuracy: 0.9579758773918563


In [24]:
report_test1nn = classification_report(y_test, y_pred_test_1nn, output_dict=True)
report_df = pd.DataFrame(report_test1nn).transpose()
df_test1nn = report_df[:-3].reset_index().rename(columns={'index': 'CLASS-ID'})
df_test1nn['test_support'] = df_test1nn['support'].astype(int)
df_test1nn['CLASS-ID'] = df_test1nn['CLASS-ID'].astype(int)

  _warn_prf(average, modifier, msg_start, len(result))


In [25]:
train_class_support = train['CLASS-ID'].value_counts().sort_index()
train_support_df = train_class_support.reset_index()
train_support_df.columns = ['CLASS-ID', 'train_support']
train_support_df['CLASS-ID'] = train_support_df['CLASS-ID'].astype(int)

In [26]:
# Merge with df_val1nn on 'CLASS-ID'
df_test1nn_report = pd.merge(df_test1nn, train_support_df, on='CLASS-ID', how='left')
# Sort based on the number of train_support
df_test1nn_report = df_test1nn_report.sort_values(by='train_support', ascending=False)
df_test1nn_report = df_test1nn_report.drop(columns=['support'])

In [27]:
df_class = pd.read_csv('data/className.tsv', sep='\t', encoding='ISO-8859-1')

In [28]:
# Convert multi Class-ID into one
def clean_class_id(row):
    # Split the string by comma and convert to a list
    class_ids = str(row['CLASS-ID']).split(',')
    # Return the first element from the list, ensuring it's an integer
    return int(class_ids[0].strip())

# Apply the function to the 'CLASS-ID' column
df_class['CLASS-ID'] = df_class.apply(clean_class_id, axis=1)
df_class['CLASS-ID'] = df_class['CLASS-ID'].astype(int)

In [29]:
df_test1nn = pd.merge(df_test1nn_report, df_class[['CLASS-ID', 'TRANSFORM_NAME', 'TRANSFORM_ID']], on='CLASS-ID', how='left')

In [30]:
df_test1nn

Unnamed: 0,CLASS-ID,precision,recall,f1-score,test_support,train_support,TRANSFORM_NAME,TRANSFORM_ID
0,1085,0.996836,0.996996,0.996916,24966,185024,"Suzuki coupling, Suzuki-Miyaura Cross-Coupling","(ARCOUPLG)4.1.B, (AVNAMEDR)Suzuki-Miyaura"
1,432,0.975303,0.980808,0.978048,21259,164914,N-alkylation of alkylamines,(AG2ALKN)1.1.2
2,1016,0.991660,0.989102,0.990379,15507,113635,hydrolysis of carboxylic esters,(AQCLEAV1)1.1
3,60,0.998389,0.997228,0.997808,11185,85921,reduction of C-NO2 to C-NH2,(AAREDUCT)A.1
4,433,0.943528,0.946722,0.945122,7977,60554,"N-alkylation of benzenoid amines, anilines",(AG2ALKN)1.1.3
...,...,...,...,...,...,...,...,...
1294,1267,1.000000,1.000000,1.000000,1,8,Brackeen Imidazole Synthesis,(AVNAMEDR)Brackeen
1295,1800,1.000000,0.800000,0.888889,5,8,Shestakov Hydrazino Acid Synthesis,(AVNAMEDR)Shestakov
1296,500,0.000000,0.000000,0.000000,1,7,"O-propargylation of N-hydroxy amides, includin...",(AG2ALKO)2.4.1
1297,1622,0.500000,1.000000,0.666667,1,7,ListMacMillan Hydrogenation,(AVNAMEDR)List-MacMillan


In [31]:
# Calculate macro average for precision, recall, and f1-score
macro_precision2 = df_test1nn['precision'].mean()
macro_recall2 = df_test1nn['recall'].mean()
macro_f12 = df_test1nn['f1-score'].mean()

# Calculate weighted average for precision, recall, and f1-score
weighted_precision2 = (df_test1nn['precision'] * df_test1nn['test_support']).sum() / df_test1nn['test_support'].sum()
weighted_recall2 = (df_test1nn['recall'] * df_test1nn['test_support']).sum() / df_test1nn['test_support'].sum()
weighted_f12 = (df_test1nn['f1-score'] * df_test1nn['test_support']).sum() / df_test1nn['test_support'].sum()

In [32]:
output1 = f"""
Macro-averages:
- Precision: {macro_precision2:.4f}
- Recall: {macro_recall2:.4f}
- F1-score: {macro_f12:.4f}

Weighted-averages:
- Precision: {weighted_precision2:.4f}
- Recall: {weighted_recall2:.4f}
- F1-score: {weighted_f12:.4f}
"""

In [33]:
print(output1)


Macro-averages:
- Precision: 0.8747
- Recall: 0.8642
- F1-score: 0.8625

Weighted-averages:
- Precision: 0.9581
- Recall: 0.9580
- F1-score: 0.9578

