In [5]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification, RobertaModel
from transformers import TrainingArguments, Trainer, IntervalStrategy

import pandas as pd
import numpy as np

import torch
from torch.utils.data import DataLoader
from torch import nn
import torch.nn.functional as F

from tqdm.auto import tqdm

In [6]:
# Setup
# Load a pretrained transformer model and tokenizer
model_name = "DeepChem/ChemBERTa-77M-MTR"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RobertaModel.from_pretrained(model_name, num_labels=2, add_pooling_layer=True)

Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
def get_embeddings(df):
    embedding_df = pd.DataFrame(columns=['SMILES'] + [f'chemberta2_feature_{i}' for i in range(1, 385)])

    for index, row in tqdm(df.iterrows(), total=df.shape[0], leave=False):
      # truncate to the maximum length accepted by the model if no max_length is provided
        encodings = tokenizer(row['SMILES'], return_tensors='pt', padding='max_length', truncation=True, max_length = 512)
        with torch.no_grad():
            output = model(**encodings)
            smiles_embeddings = output.last_hidden_state[0, 0, :]

        # Ensure you move the tensor back to cpu for numpy conversion
        dic = {**{'SMILES': row['SMILES']}, **dict(zip([f'chemberta2_feature_{i}' for i in range(1, 385)], smiles_embeddings.cpu().numpy().tolist()))}
        embedding_df.loc[len(embedding_df)] = pd.Series(dic)

    return embedding_df

In [8]:
from google.colab import files
# Directly process train, valid, and test datasets
datasets = ['train_dataset','test_dataset']

for dataset_name in tqdm(datasets, desc='Processing'):
    df = pd.read_csv(f'{dataset_name}.csv')  # Adjust path as necessary
    embeddings = get_embeddings(df)
    embeddings.to_csv(f'./chemberta2_{dataset_name}_features.csv')
    files.download(f'./chemberta2_{dataset_name}_features.csv')

Processing:   0%|          | 0/2 [00:00<?, ?it/s]

  0%|          | 0/351 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

  0%|          | 0/353 [00:00<?, ?it/s]

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>