In [6]:
import numpy as np
import pandas as pd
from datasets import load_dataset

ds = load_dataset("scikit-fingerprints/MoleculeNet_ESOL")
dataset = pd.DataFrame(ds['train'])
dataset.head()

Unnamed: 0,SMILES,label
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.77
1,Cc1occc1C(=O)Nc2ccccc2,-3.3
2,CC(C)=CCCC(C)=CC(=O),-2.06
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.87
4,c1ccsc1,-1.33


In [None]:
from tqdm import tqdm
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer

chemberta = AutoModelForMaskedLM.from_pretrained("DeepChem/ChemBERTa-77M-MTR")
tokenizer = AutoTokenizer.from_pretrained("DeepChem/ChemBERTa-77M-MTR")

def featurize_ChemBERTa(smiles_list, padding=True):
    embeddings_cls = torch.zeros(len(smiles_list), 600)
    embeddings_mean = torch.zeros(len(smiles_list), 600)

    with torch.no_grad():
        for i, smiles in enumerate(tqdm(smiles_list)):
            encoded_input = tokenizer(smiles, return_tensors="pt",padding=padding,truncation=True)
            model_output = chemberta(**encoded_input)
            
            embedding = model_output[0][::,0,::]
            embeddings_cls[i] = embedding
            
            embedding = torch.mean(model_output[0],1)
            embeddings_mean[i] = embedding
            
    return embeddings_cls.numpy(), embeddings_mean.numpy()

# Featurize the dataset
X_cls, X_mean = featurize_ChemBERTa(dataset['SMILES'].tolist(), padding=True)

# save the dataset as parquet
dataset['X_cls'] = list(X_cls)
dataset['X_mean'] = list(X_mean)
display(dataset.head())
dataset.to_parquet("esol.parquet", index=False)

Some weights of RobertaForMaskedLM were not initialized from the model checkpoint at DeepChem/ChemBERTa-77M-MTR and are newly initialized: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
100%|██████████| 1128/1128 [00:03<00:00, 351.07it/s]


Unnamed: 0,SMILES,label,X_cls,X_mean
0,OCC3OC(OCC2OC(OC(C#N)c1ccccc1)C(O)C(O)C2O)C(O)...,-0.77,"[0.2102647, 0.0, -0.27269894, 0.23714472, -0.0...","[0.77250636, 0.0, -0.08014017, -0.00040053617,..."
1,Cc1occc1C(=O)Nc2ccccc2,-3.3,"[-0.49697188, 0.0, -0.86021847, -0.5236808, 0....","[0.27756685, 0.0, 0.10707331, -0.11047705, -0...."
2,CC(C)=CCCC(C)=CC(=O),-2.06,"[0.15252441, 0.0, -0.6803664, -0.9552505, -0.2...","[0.248755, 0.0, 0.038657546, -0.42417583, -0.0..."
3,c1ccc2c(c1)ccc3c2ccc4c5ccccc5ccc43,-7.87,"[0.53337896, 0.0, -0.43694103, 0.2638903, 0.17...","[0.42031112, 0.0, 0.03631047, 0.19559778, -0.4..."
4,c1ccsc1,-1.33,"[-0.11247938, 0.0, -0.23641035, -0.39198166, -...","[0.13626462, 0.0, 0.010610735, -0.27940136, -0..."
