In [1]:
import os
import pickle
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import torch
import logging
import random
import pkg_resources
import sklearn
from sklearn.model_selection import train_test_split
from rxnfp.tokenization import *
# from rdkit import Chem
# from rdkit.Chem import rdChemReactions
torch.cuda.is_available()
import rxnfp
#from rxnfp.models import SmilesClassificationModel
#from rxn_yields.core import SmilesTokenizer, SmilesClassificationModel

logger = logging.getLogger(__name__)
torch.cuda.is_available()
# from simpletransformers.classification import ClassificationModel, ClassificationArgs
from rxnfp.tokenization import SmilesTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoTokenizer, AutoModel, AutoModelForSequenceClassification
from rxnfp.tokenization import SmilesTokenizer

import os
import pickle

import argparse

from tqdm.auto import tqdm
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import confusion_matrix

In [8]:
# Load the training split from a tab-separated file. Later cells read its
# 'canonical_rxn_with_fragment_info' (reaction strings) and 'labels' columns.
train = pd.read_csv('../ReactionClassification_2024/data/pub_train.tsv', sep='\t')

In [9]:
# Load the test split from a tab-separated file. Later cells read its
# 'canonical_rxn_with_fragment_info' (reaction strings) and 'labels' columns.
test = pd.read_csv('../ReactionClassification_2024/data/pub_test.tsv', sep='\t')

In [5]:
def mean_pooling(model_output, attention_mask):
    """Average the token embeddings of each sequence, ignoring masked positions.

    Args:
        model_output: Indexable model output whose element 0 holds the token
            embeddings, indexed as (batch, tokens, hidden).
        attention_mask: Tensor indexed as (batch, tokens); positions with 0 are
            excluded from the average, positions with 1 are included.

    Returns:
        Tensor indexed as (batch, hidden) with the masked mean of the token
        embeddings for every sequence.
    """
    hidden_states = model_output[0]
    # Broadcast the mask over the hidden dimension so it can weight every feature.
    weights = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()
    weighted_total = (hidden_states * weights).sum(dim=1)
    # Clamp the token count so a fully masked sequence does not divide by zero.
    token_count = weights.sum(dim=1).clamp(min=1e-9)
    return weighted_total / token_count

In [8]:
# Directory of the BERT checkpoint; the tokenizer vocabulary (vocab.txt) is read
# from the same directory.
model_path = '../Janssen_project/rxn-data-from-postgresql/models/reaxys_bert/checkpoint-4356432-epoch-36'
#model_path = 'rxn-data-from-postgresql/models/reaxys_bert/checkpoint-4356432-epoch-36'

# mean_pooling only consumes model_output[0] (the token embeddings), so the
# "pooler weights ... newly initialized" warning printed by this call does not
# affect the embeddings computed in this notebook.
model = AutoModel.from_pretrained(model_path, output_hidden_states=True)

# Build the vocabulary path from model_path instead of repeating the literal, so
# the tokenizer cannot silently point at a different checkpoint than the model
# when model_path is switched.
tokenizer = SmilesTokenizer(vocab_file=os.path.join(model_path, 'vocab.txt'))

Some weights of BertModel were not initialized from the model checkpoint at ../Janssen_project/rxn-data-from-postgresql/models/reaxys_bert/checkpoint-4356432-epoch-36 and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
# Use the first GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
model = model.to(device)

# Accumulators for the per-reaction embeddings, and the number of reactions
# sent through the model per forward pass.
train_sentence_embeddings, test_sentence_embeddings = [], []
batch = 64

### Train Embedding

### Test and Val Embedding

In [12]:
# Embed every training reaction with the loaded model, `batch` reactions at a time.
train_reactions = train['canonical_rxn_with_fragment_info'].astype(str).tolist()

# Start from an empty list so that re-running this cell cannot append a second
# copy of the embeddings (which would no longer line up with the rows of `train`).
train_sentence_embeddings = []
for i in tqdm(range(0, len(train_reactions), batch)):
    # Tokenize the batch: pad to the longest reaction, truncate at 512 tokens.
    encoded_input = tokenizer(train_reactions[i:i+batch], padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)

    # Forward pass and mean pooling without autograd bookkeeping.
    with torch.no_grad():
        model_output = model(**encoded_input)
        pooled = mean_pooling(model_output, encoded_input['attention_mask'])

    # One device-to-host copy per batch instead of one per reaction; iterating
    # the 2-D array yields one 1-D vector per reaction.
    train_sentence_embeddings.extend(pooled.cpu().numpy())

100%|██████████| 6260/6260 [06:12<00:00, 16.80it/s]


In [13]:
# Attach the embeddings to the training frame, one vector per row; the list
# must contain exactly len(train) entries for this assignment to succeed.
train['PreBertFP'] = train_sentence_embeddings

In [14]:
# Stack the per-reaction vectors into one 2-D numeric matrix (one row per reaction).
# np.array() on this column would produce a 1-D object array of arrays, which
# np.save can only write through pickle; np.stack keeps a plain numeric matrix.
X_train = np.stack(train['PreBertFP'].tolist())

In [15]:
# Persist the training embeddings so the evaluation cells can reload them
# without re-running the model.
np.save('../ReactionClassification_2024/data/PreBertFP_Pub/X_train_PreBertFP.npy', X_train)

In [16]:
#val.to_csv('data/PreBertFP/val_PreBertFP.csv', index=False)

In [17]:
# Embed every test reaction with the loaded model, `batch` reactions at a time.
test_reactions = test['canonical_rxn_with_fragment_info'].astype(str).tolist()

# Start from an empty list so that re-running this cell cannot append a second
# copy of the embeddings (which would no longer line up with the rows of `test`).
test_sentence_embeddings = []
for i in tqdm(range(0, len(test_reactions), batch)):
    # Tokenize the batch: pad to the longest reaction, truncate at 512 tokens.
    encoded_input = tokenizer(test_reactions[i:i+batch], padding=True, truncation=True, max_length=512, return_tensors='pt').to(device)

    # Forward pass and mean pooling without autograd bookkeeping.
    with torch.no_grad():
        model_output = model(**encoded_input)
        pooled = mean_pooling(model_output, encoded_input['attention_mask'])

    # One device-to-host copy per batch instead of one per reaction; iterating
    # the 2-D array yields one 1-D vector per reaction.
    test_sentence_embeddings.extend(pooled.cpu().numpy())

100%|██████████| 696/696 [00:41<00:00, 16.60it/s]


In [18]:
# Attach the embeddings to the test frame, one vector per row.
test['PreBertFP'] = test_sentence_embeddings
# Stack the per-reaction vectors into one 2-D numeric matrix (one row per reaction).
# np.array() on this column would produce a 1-D object array of arrays, which
# np.save can only write through pickle; np.stack keeps a plain numeric matrix.
X_test = np.stack(test['PreBertFP'].tolist())

In [19]:
# Persist the test embeddings so the evaluation cells can reload them
# without re-running the model.
np.save('../ReactionClassification_2024/data/PreBertFP_Pub/X_test_PreBertFP.npy', X_test)

In [24]:
#test.to_csv('data/PreBertFP/test_PreBertFP.csv', index=False)

### Data Load

In [3]:
# Reload the cached training embeddings.
# SECURITY: allow_pickle=True lets NumPy unpickle object-dtype arrays, and
# unpickling can execute arbitrary code — only load files produced by this project.
X_train = np.load('../ReactionClassification_2024/data/PreBertFP_Pub/X_train_PreBertFP.npy',allow_pickle=True)

In [4]:
# Stack the per-reaction vectors row-wise into one 2-D matrix (one row per reaction).
# np.vstack already returns a 2-D array, so no np.squeeze is applied afterwards:
# squeezing is a no-op for a regular matrix but would collapse a single-row (or
# single-column) matrix to 1-D and break the later `X_train.shape[1]` lookup.
X_train1 = np.vstack(X_train)
X_train = X_train1

In [5]:
#X_train = np.load('data/RXNFP/X_train_RXNFP.npy')
# Reload the cached test embeddings.
# SECURITY: allow_pickle=True lets NumPy unpickle object-dtype arrays, and
# unpickling can execute arbitrary code — only load files produced by this project.
X_test = np.load('../ReactionClassification_2024/data/PreBertFP_Pub/X_test_PreBertFP.npy',allow_pickle=True)
#X_val = np.load('../ReactionClassification_2024/data/OptBertFPFinal2/X_val_OptBertFP.npy',allow_pickle=True)

In [6]:
# Stack the per-reaction vectors row-wise into one 2-D matrix (one row per reaction).
# np.vstack already returns a 2-D array, so no np.squeeze is applied afterwards:
# squeezing is a no-op for a regular matrix but would collapse a single-row (or
# single-column) matrix to 1-D and break the later `X_test.shape[0]` / FAISS calls.
X_test1 = np.vstack(X_test)
X_test = X_test1

In [12]:
# Class labels as NumPy arrays, taken from the TSV frames.
# NOTE(review): this needs `train` / `test` to be loaded in the current kernel
# (the read_csv cells), and assumes their row order matches the rows of
# X_train / X_test — confirm before trusting the metrics.
y_train = train['labels'].values
y_test = test['labels'].values

### k-NN evaluation on the test set

In [13]:
import faiss
# Exact (brute-force) L2-distance index over the training embeddings; its
# dimensionality is the number of columns of X_train.
# NOTE(review): FAISS expects float32 input — confirm X_train's dtype.
index = faiss.IndexFlatL2(X_train.shape[1])
index.add(X_train)

In [14]:
# Number of nearest neighbours retrieved per test reaction.
k = 5
# Number of test reactions sent to the index per search call.
batch_size = 100

# Ceiling division: enough batches to cover every test row, including a
# final partial batch.
num_batches = -(-X_test.shape[0] // batch_size)

In [15]:
# Pre-allocate the result tables: for every test reaction, the distances to and
# the training-row indices of its k retrieved neighbours.
n_test = X_test.shape[0]
D_test_1nn = np.zeros((n_test, k), dtype=np.float32)
I_test_1nn = np.zeros((n_test, k), dtype=np.int64)

# Query the index one slice of `batch_size` rows at a time and copy each
# result into the matching rows of the tables.
for start in tqdm(range(0, n_test, batch_size), desc='FAISS Search on Test Data'):
    end = min(start + batch_size, n_test)
    distances, neighbor_ids = index.search(X_test[start:end], k)
    D_test_1nn[start:end] = distances
    I_test_1nn[start:end] = neighbor_ids

FAISS Search on Test Data: 100%|██████████| 446/446 [05:19<00:00,  1.39it/s]


In [16]:
# Majority vote over the labels of each test reaction's retrieved neighbours.
# Despite the `_1nn` suffix, every row of I_test_1nn holds k neighbour indices,
# so this is a k-NN vote; argmax returns the first maximum, so ties go to the
# smallest label value.
neighbor_labels = y_train[I_test_1nn]
y_pred_test_1nn = np.array([np.bincount(votes).argmax() for votes in neighbor_labels])

In [17]:
# Cache the predictions; np.save appends the '.npy' extension because the
# given file name does not already end with it.
np.save('../ReactionClassification_2024/data/PreBertFP_Pub/y_pred_test_1nn', y_pred_test_1nn)

In [18]:
# Reload the cached predictions so the metrics below can be recomputed
# without repeating the FAISS search.
y_pred_test_1nn = np.load('../ReactionClassification_2024/data/PreBertFP_Pub/y_pred_test_1nn.npy')

In [19]:
# Fraction of test reactions whose predicted class equals the true class.
print(f'Overall Accuracy: {accuracy_score(y_test, y_pred_test_1nn)}')

Overall Accuracy: 0.6428972613511267


In [21]:
# Per-class precision / recall / F1 / support as a dictionary, then as a table
# with one row per report entry.
report_test1nn = classification_report(y_test, y_pred_test_1nn, output_dict=True)
report_df = pd.DataFrame(report_test1nn).T

# Keep only the per-class rows (the last three rows of the report are summary
# rows), expose the class id as an integer 'labels' column and add an integer
# copy of the support as 'test_support'.
df_test1nn = (
    report_df.iloc[:-3]
    .reset_index()
    .rename(columns={'index': 'labels'})
    .assign(
        test_support=lambda frame: frame['support'].astype(int),
        labels=lambda frame: frame['labels'].astype(int),
    )
)

In [23]:
# Number of training examples per class, ordered by class label.
train_class_support = train['labels'].value_counts().sort_index()

# Same counts as a two-column table: integer class label and its training count.
train_support_df = (
    train_class_support
    .rename_axis('labels')
    .reset_index(name='train_support')
    .astype({'labels': int})
)

In [24]:
# Join the per-class test metrics (df_test1nn) with the training-set class
# counts on 'labels', order the classes from most to least frequent in the
# training data, and drop the raw 'support' column.
df_test1nn_report = (
    df_test1nn
    .merge(train_support_df, on='labels', how='left')
    .sort_values(by='train_support', ascending=False)
    .drop(columns=['support'])
)