In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import logging
import random
import pkg_resources
import sklearn
from sklearn.model_selection import train_test_split
from rxnfp.tokenization import *
# from rdkit import Chem
# from rdkit.Chem import rdChemReactions
torch.cuda.is_available()
import rxnfp
#from rxnfp.models import SmilesClassificationModel
#from rxn_yields.core import SmilesTokenizer, SmilesClassificationModel

logger = logging.getLogger(__name__)
torch.cuda.is_available()
# from simpletransformers.classification import ClassificationModel, ClassificationArgs
from rxnfp.tokenization import SmilesTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from rxnfp.tokenization import SmilesTokenizer

import os
import pickle

import argparse

from tqdm.auto import tqdm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

In [3]:
# Load the public training split (tab-separated). Later cells read its
# 'canonical_rxn_with_fragment_info' and 'labels' columns.
train = pd.read_csv('../ReactionClassification_2024/data/pub_train.tsv', sep='\t')

In [4]:
# Load the public test split (tab-separated). Later cells read its
# 'canonical_rxn_with_fragment_info' and 'labels' columns.
test = pd.read_csv('../ReactionClassification_2024/data/pub_test.tsv', sep='\t')

### Train Embedding

In [5]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (assumed (batch, seq_len, hidden) - confirm
            against the model in use).
        attention_mask: Tensor with non-zero entries at real tokens and zeros at
            padding; it is broadcast over the embedding axis.

    Returns:
        One pooled vector per sequence: the mask-weighted sum of token
        embeddings divided by the mask total along axis 1.
    """
    hidden_states = model_output[0]
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    masked_total = (hidden_states * weights).sum(dim=1)
    # Clamp the denominator so a fully masked sequence yields zeros, not NaN
    weight_total = weights.sum(dim=1).clamp(min=1e-9)
    return masked_total / weight_total

In [6]:
# Directory of the fine-tuned encoder. The tokenizer vocabulary is read from the
# same directory, so switching checkpoints only requires changing model_path.
#model_path = 'rxn-data-from-postgresql/models/reaxys_bert/checkpoint-4356432-epoch-36'
model_path = 'data/fine_tuned_model_Final0.3'

#tokenizer = SmilesTokenizer(vocab_file='rxn-data-from-postgresql/models/reaxys_bert/checkpoint-4235420-epoch-35/vocab.txt')
tokenizer = SmilesTokenizer(vocab_file=f'{model_path}/vocab.txt')
model = AutoModel.from_pretrained(model_path, output_hidden_states=True)

In [8]:
# Number of reactions tokenized and encoded per forward pass
batch = 32

# Buffers that the embedding cells fill with one pooled vector per reaction
train_sentence_embeddings, val_sentence_embeddings, test_sentence_embeddings = [], [], []

# Run on the first GPU when CUDA is available, otherwise on the CPU
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
model = model.to(device)

### Train and Test Embedding

In [9]:
# Encode every training reaction with the fine-tuned model and mean-pool the
# token embeddings into one fixed-length vector per reaction.
train_reactions = train['canonical_rxn_with_fragment_info'].astype(str).tolist()

# Start from an empty buffer so that re-running this cell cannot append a second
# copy of the embeddings (the list must stay exactly as long as `train`).
train_sentence_embeddings = []

for i in tqdm(range(0, len(train_reactions), batch)):
    # Tokenize one batch of reaction strings (padded, truncated to 512 tokens)
    encoded_input = tokenizer(
        train_reactions[i:i+batch],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    ).to(device)

    # Forward pass and pooling without gradient tracking
    with torch.no_grad():
        model_output = model(**encoded_input)
        pooled = mean_pooling(model_output, encoded_input['attention_mask'])

    # One device-to-host copy per batch; extend() then stores one 1-D vector per reaction
    train_sentence_embeddings.extend(pooled.cpu().numpy())

100%|██████████| 12519/12519 [06:09<00:00, 33.84it/s]


In [10]:
# Store one pooled embedding per row on the training frame; the list must have
# exactly as many entries as `train` has rows.
train['OptBertFP'] = train_sentence_embeddings

In [11]:
# Stack the per-reaction vectors into one dense 2-D matrix (reactions x embedding size).
# np.array() on this column produces a 1-D object array of arrays, which np.save
# can only store via pickle. A dense numeric matrix is written as a plain .npy
# payload and still passes unchanged through np.vstack / np.squeeze after reload.
X_train = np.vstack(train['OptBertFP'].tolist())

In [12]:
# Persist the training fingerprints so the evaluation cells can reload them
# without re-running the encoder.
np.save('../ReactionClassification_2024/data/OptBertFP_Pub/X_train_OptBertFP.npy', X_train)

In [17]:
#val.to_csv('data/PreBertFP/val_PreBertFP.csv', index=False)

In [13]:
# Encode every test reaction with the fine-tuned model and mean-pool the
# token embeddings into one fixed-length vector per reaction.
test_reactions = test['canonical_rxn_with_fragment_info'].astype(str).tolist()

# Start from an empty buffer so that re-running this cell cannot append a second
# copy of the embeddings (the list must stay exactly as long as `test`).
test_sentence_embeddings = []

for i in tqdm(range(0, len(test_reactions), batch)):
    # Tokenize one batch of reaction strings (padded, truncated to 512 tokens)
    encoded_input = tokenizer(
        test_reactions[i:i+batch],
        padding=True,
        truncation=True,
        max_length=512,
        return_tensors='pt',
    ).to(device)

    # Forward pass and pooling without gradient tracking
    with torch.no_grad():
        model_output = model(**encoded_input)
        pooled = mean_pooling(model_output, encoded_input['attention_mask'])

    # One device-to-host copy per batch; extend() then stores one 1-D vector per reaction
    test_sentence_embeddings.extend(pooled.cpu().numpy())

100%|██████████| 1391/1391 [00:41<00:00, 33.66it/s]


In [14]:
# Store one pooled embedding per row on the test frame, then stack the vectors
# into a dense 2-D matrix. np.array() on the column would give a 1-D object
# array of arrays, which np.save can only store via pickle.
test['OptBertFP'] = test_sentence_embeddings
X_test = np.vstack(test['OptBertFP'].tolist())

In [15]:
# Persist the test fingerprints so the evaluation cells can reload them
# without re-running the encoder.
np.save('../ReactionClassification_2024/data/OptBertFP_Pub/X_test_OptBertFP.npy', X_test)

In [17]:
#test.to_csv('data/PreBertFP/test_PreBertFP.csv', index=False)

### Data Load

In [5]:
# Reload the cached training fingerprints from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only use it on files produced by this notebook, never on untrusted input.
X_train = np.load('../ReactionClassification_2024/data/OptBertFP_Pub/X_train_OptBertFP.npy',allow_pickle=True)

In [16]:
# Stack the loaded per-reaction vectors row-wise, then drop any length-1 axes
X_train1 = np.vstack(X_train)
X_train = X_train1.squeeze()

In [7]:
# Reload the cached test fingerprints from disk.
# NOTE(review): allow_pickle=True lets np.load unpickle arbitrary Python objects;
# only use it on files produced by this notebook, never on untrusted input.
#X_train = np.load('data/RXNFP/X_train_RXNFP.npy')
X_test = np.load('../ReactionClassification_2024/data/OptBertFP_Pub/X_test_OptBertFP.npy',allow_pickle=True)
#X_val = np.load('../ReactionClassification_2024/data/OptBertFPFinal2/X_val_OptBertFP.npy',allow_pickle=True)

In [17]:
# Stack the loaded per-reaction vectors row-wise, then drop any length-1 axes
X_test1 = np.vstack(X_test)
X_test = X_test1.squeeze()

In [18]:
# Ground-truth class labels of the two splits, as plain arrays
y_train, y_test = (frame['labels'].values for frame in (train, test))

### Test on Test set

In [19]:
import faiss

# Flat L2 index sized to the fingerprint width and filled with every training vector
embedding_dim = X_train.shape[1]
index = faiss.IndexFlatL2(embedding_dim)
index.add(X_train)

In [20]:
batch_size = 100  # query rows sent to the index per search call
k = 5             # neighbours retrieved per query

# Ceiling division: number of batches needed to cover every test row
num_batches = -(-X_test.shape[0] // batch_size)

In [21]:
# Pre-allocate the result matrices: one row per test query, one column per neighbour
n_queries = X_test.shape[0]
D_test_1nn = np.zeros((n_queries, k), dtype=np.float32)
I_test_1nn = np.zeros((n_queries, k), dtype=np.int64)

for batch_no in tqdm(range(num_batches), desc='FAISS Search on Test Data'):
    # Row window of this batch; the final batch is clipped to the number of queries
    rows = slice(batch_no * batch_size, min((batch_no + 1) * batch_size, n_queries))
    D_test_1nn[rows, :], I_test_1nn[rows, :] = index.search(X_test[rows], k)

FAISS Search on Test Data: 100%|██████████| 446/446 [05:17<00:00,  1.40it/s]


In [23]:
# Majority vote over the labels of each query's retrieved neighbours. Despite the
# `_1nn` suffix this votes over all k columns of I_test_1nn (k = 5 above).
# argmax returns the first maximum, so ties resolve to the smallest label value.
y_pred_test_1nn = np.array([
    np.bincount(y_train[neighbor_ids]).argmax()
    for neighbor_ids in I_test_1nn
])

In [25]:
# Cache the predictions. np.save appends '.npy' when the file name lacks it,
# so this writes 'y_pred_test_1nn.npy'.
np.save('../ReactionClassification_2024/data/OptBertFP_Pub/y_pred_test_1nn', y_pred_test_1nn)

In [26]:
# Reload the cached predictions so the metrics below can be recomputed
# without repeating the neighbour search.
y_pred_test_1nn = np.load('../ReactionClassification_2024/data/OptBertFP_Pub/y_pred_test_1nn.npy')

In [24]:
# Fraction of test reactions whose predicted class equals the true label
print(f'Overall Accuracy: {accuracy_score(y_test, y_pred_test_1nn)}')

Overall Accuracy: 0.7069488441059514


In [37]:
# Per-class precision / recall / f1 / support as a frame with one row per class
report_test1nn = classification_report(y_test, y_pred_test_1nn, output_dict=True)
report_df = pd.DataFrame(report_test1nn).T

# The last three report rows are dropped so only per-class rows remain; the class
# key and the support count are then cast to integers.
df_test1nn = (
    report_df.iloc[:-3]
    .reset_index()
    .rename(columns={'index': 'labels'})
    .assign(
        test_support=lambda frame: frame['support'].astype(int),
        labels=lambda frame: frame['labels'].astype(int),
    )
)

In [30]:
# Number of training examples per class, ordered by class label
train_class_support = train['labels'].value_counts().sort_index()

# Same counts as a two-column frame keyed by integer 'labels'
train_support_df = (
    train_class_support
    .rename_axis('labels')
    .reset_index(name='train_support')
    .astype({'labels': int})
)

In [31]:
# Add each class's training support to the per-class test report (joined on 'labels'),
# order classes from most to least training examples, and drop the float 'support'
# column, which 'test_support' already carries as integers.
df_test1nn_report = (
    df_test1nn
    .merge(train_support_df, on='labels', how='left')
    .sort_values(by='train_support', ascending=False)
    .drop(columns=['support'])
)

In [None]:
# Load the class-name lookup table (tab-separated, ISO-8859-1 encoded). Later cells
# read its 'CLASS-ID', 'TRANSFORM_NAME' and 'TRANSFORM_ID' columns.
df_class = pd.read_csv('../ReactionClassification_2024/data/className.tsv', sep='\t', encoding='ISO-8859-1')

In [None]:
# Convert multi Class-ID into one
def clean_class_id(row):
    """Return the first class id listed in ``row['CLASS-ID']`` as an ``int``.

    The cell may hold a single id or several ids separated by commas; only the
    first one is kept. Ids written in float form (e.g. ``12.0``, which is what
    pandas produces when the column was parsed as float) are accepted as long
    as they are whole numbers.

    Args:
        row: Mapping-like record (e.g. a DataFrame row) with a 'CLASS-ID' entry.

    Returns:
        The first listed class id as an integer.

    Raises:
        ValueError: If the first entry is not a whole number (including NaN).
    """
    first_id = str(row['CLASS-ID']).split(',')[0].strip()
    try:
        return int(first_id)
    except ValueError:
        # Not a plain integer literal: accept float notation only for whole numbers
        as_float = float(first_id)
        if not as_float.is_integer():
            raise
        return int(as_float)

# Replace every 'CLASS-ID' entry with its first listed id, stored as integers
df_class['CLASS-ID'] = df_class.apply(clean_class_id, axis=1).astype(int)

In [None]:
# Attach the transform name and id to each per-class row of the report.
# The report frame keys classes by 'labels' while the class table keys them by
# 'CLASS-ID', so both key columns are named explicitly. Merging with
# on='CLASS-ID' raises a KeyError because the report frame has no such column.
df_test1nn = pd.merge(
    df_test1nn_report,
    df_class[['CLASS-ID', 'TRANSFORM_NAME', 'TRANSFORM_ID']],
    left_on='labels',
    right_on='CLASS-ID',
    how='left',
)

In [None]:
# Display the per-class report
df_test1nn

In [None]:
# Write the per-class report to CSV without the index column.
# NOTE(review): every other artefact in this notebook is written under
# '../ReactionClassification_2024/data/OptBertFP_Pub/' - confirm that
# 'data/OptBertFPFinal2/' is the intended destination for this run.
df_test1nn.to_csv('data/OptBertFPFinal2/df_test1nn.csv', index=False)

In [33]:
def _support_weighted_mean(metric):
    """Average the per-class `metric` column of df_test1nn, weighted by 'test_support'."""
    return (df_test1nn[metric] * df_test1nn['test_support']).sum() / df_test1nn['test_support'].sum()

# Macro average: plain mean of the per-class precision, recall and f1-score
macro_precision2, macro_recall2, macro_f12 = (
    df_test1nn[metric].mean() for metric in ('precision', 'recall', 'f1-score')
)

# Weighted average: the same per-class values weighted by each class's test support
weighted_precision2, weighted_recall2, weighted_f12 = (
    _support_weighted_mean(metric) for metric in ('precision', 'recall', 'f1-score')
)

In [34]:
# Text summary of the averaged metrics, four decimals each; the empty first and
# last entries give the text its leading and trailing newline.
output1 = "\n".join([
    "",
    "Macro-averages:",
    f"- Precision: {macro_precision2:.4f}",
    f"- Recall: {macro_recall2:.4f}",
    f"- F1-score: {macro_f12:.4f}",
    "",
    "Weighted-averages:",
    f"- Precision: {weighted_precision2:.4f}",
    f"- Recall: {weighted_recall2:.4f}",
    f"- F1-score: {weighted_f12:.4f}",
    "",
])

In [35]:
# Print the macro- and weighted-average summary built in the previous cell
print(output1)


Macro-averages:
- Precision: 0.7086
- Recall: 0.6986
- F1-score: 0.6974

Weighted-averages:
- Precision: 0.7281
- Recall: 0.7223
- F1-score: 0.7223

