In [1]:
from transformers import AutoModel, AutoTokenizer
import torch

class ChemBERTaEmbedder:
    """Turn SMILES strings into fixed-shape embeddings with a ChemBERTa checkpoint.

    The model and tokenizer are loaded once in ``__init__`` and reused by every
    ``embed`` call. Every sequence is padded/truncated to ``max_length`` tokens,
    so the output shape depends only on the number of inputs and the pooling mode.
    """

    def __init__(self, model_name="DeepChem/ChemBERTa-10M-MLM", max_length=128):
        """Load the model and tokenizer once.

        Args:
            model_name: Hugging Face checkpoint id, e.g.
                "DeepChem/ChemBERTa-10M-MLM" (default) or
                "DeepChem/ChemBERTa-77M-MLM".
            max_length: Number of tokens every SMILES string is padded or
                truncated to.
        """
        self.model = AutoModel.from_pretrained(model_name, use_safetensors=True)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model.eval()  # inference only: switch the module to evaluation mode
        self.max_length = max_length
        # Read the hidden width from the loaded checkpoint instead of hard-coding
        # 384, so the attribute stays correct when another model_name is used.
        self.embedding_dim = self.model.config.hidden_size
        print(f"Model loaded: {model_name}")
        print(f"Max sequence length: {max_length}")

    def embed(self, smiles_list, pooling="cls", flatten=False, batch_size=64):
        """Generate embeddings for SMILES strings with fixed output shape.

        Args:
            smiles_list: A list (or other iterable) of SMILES strings. A single
                string is treated as a one-element list.
            pooling: "cls" keeps the hidden state of the first token, "mean"
                averages the hidden states of non-padding tokens, and None
                returns every token's hidden state.
            flatten: Only used when ``pooling`` is None; reshapes the result to
                (n, max_length * embedding_dim).
            batch_size: Number of strings sent through the model per forward
                pass. None processes everything in a single pass.

        Returns:
            A torch tensor of shape (n, embedding_dim) for "cls"/"mean",
            (n, max_length, embedding_dim) for None, or
            (n, max_length * embedding_dim) for None with ``flatten=True``.

        Raises:
            ValueError: If ``pooling`` is not "cls", "mean" or None, or if
                ``batch_size`` is smaller than 1.
        """
        # Reject bad arguments before paying for tokenization and a forward pass.
        if pooling not in ("cls", "mean", None):
            raise ValueError(f"Unknown pooling: {pooling}")
        if batch_size is not None and batch_size < 1:
            raise ValueError(f"batch_size must be >= 1 or None, got {batch_size}")

        if isinstance(smiles_list, str):
            smiles_list = [smiles_list]
        else:
            smiles_list = list(smiles_list)

        if not smiles_list:
            # Nothing to embed: return an empty tensor with the documented shape.
            if pooling is None:
                if flatten:
                    return torch.empty((0, self.max_length * self.embedding_dim))
                return torch.empty((0, self.max_length, self.embedding_dim))
            return torch.empty((0, self.embedding_dim))

        step = len(smiles_list) if batch_size is None else batch_size

        # Embed in chunks so peak memory is bounded by batch_size rather than by
        # the size of the whole dataset.
        pooled_chunks = []
        for start in range(0, len(smiles_list), step):
            inputs = self.tokenizer(
                smiles_list[start:start + step],
                return_tensors="pt",
                padding="max_length",
                truncation=True,
                max_length=self.max_length
            )

            with torch.no_grad():
                outputs = self.model(**inputs)

            pooled_chunks.append(
                self._pool(outputs.last_hidden_state, inputs, pooling, flatten)
            )

        return torch.cat(pooled_chunks, dim=0)

    @staticmethod
    def _pool(token_embeddings, inputs, pooling, flatten):
        """Reduce one batch of per-token hidden states according to ``pooling``."""
        if pooling == "cls":
            return token_embeddings[:, 0, :]

        if pooling == "mean":
            # Weight each token by its attention mask so padding positions do
            # not contribute to the average.
            attention_mask = inputs['attention_mask']
            input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
            sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
            # Clamp to avoid dividing by zero for a row whose mask is all zeros.
            sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
            return sum_embeddings / sum_mask

        # pooling is None: keep every token, optionally flattened to
        # (batch_size, max_length * embedding_dim).
        if flatten:
            return token_embeddings.reshape(token_embeddings.shape[0], -1)
        return token_embeddings




In [None]:
import pandas as pd
# The SMILES map was downloaded manually from S3; the path below points at the
# local copy and has to be adjusted when the notebook runs elsewhere.
smiles_map_path = '/content/smiles_map (1).parquet'
df = pd.read_parquet(smiles_map_path)

df.head()

Unnamed: 0,pert_iname,smiles
0,10-DEBC,CCN(CC)CCCCN1C2=CC=CC=C2OC3=C1C=C(C=C3)Cl
1,1271738-62-5,CCCC1=CC2=C(N=CN=C2S1)N3CCN(CC3)C4=NCC(S4)(C)C
2,"16,16-dimethylprostaglandin-e2",CCCCC(C)(C)C(C=CC1C(CC(=O)C1CC=CCCCC(=O)O)O)O
3,17-hydroxyprogesterone-caproate,CCCCCC(=O)OC1(CCC2C1(CCC3C2CCC4=CC(=O)CCC34C)C...
4,"2',5'-dideoxyadenosine",CC1C(CC(O1)N2C=NC3=C(N=CN=C32)N)O


In [5]:
embedder = ChemBERTaEmbedder(max_length=128)

# Drop rows without a SMILES entry and coerce the rest to str so the tokenizer
# only receives strings. This removes missing values; it does not check that a
# string is chemically valid SMILES.
smiles_present = df['smiles'].dropna()
cleaned_smiles_list = smiles_present.astype(str).tolist()

# Mean-pool the token embeddings, then convert the tensor to a NumPy array.
mean_pooled = embedder.embed(cleaned_smiles_list, pooling="mean")
train_embeddings = mean_pooled.numpy()

print(train_embeddings.shape)

Some weights of RobertaModel were not initialized from the model checkpoint at DeepChem/ChemBERTa-10M-MLM and are newly initialized: ['pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Model loaded: DeepChem/ChemBERTa-10M-MLM
Max sequence length: 128
(1740, 384)


In [None]:
import pandas as pd
# Persist the embedding matrix on its own (no molecule names), one row per
# SMILES string and one column per embedding dimension.
# NOTE(review): the file name is spelled 'embbedings'; it is kept unchanged here
# because other code may read the file under that name.
df_embeddings = pd.DataFrame(data=train_embeddings)
df_embeddings.to_csv(path_or_buf='embbedings.csv', index=False)

In [None]:
# Build one table that pairs each molecule name with its embedding vector.

# Apply the same filtering used when the embeddings were computed (rows with a
# missing SMILES are dropped, the rest cast to str) so the row order lines up.
aligned_df = (
    df
    .dropna(subset=['smiles'])
    .assign(smiles=lambda frame: frame['smiles'].astype(str))
)

# Molecule names in the filtered row order.
pert_inames = aligned_df.loc[:, 'pert_iname'].tolist()

# One row per molecule, one integer-named column per embedding dimension.
df_embeddings_values = pd.DataFrame(data=train_embeddings)

# Guard against silent misalignment between names and embedding rows.
n_names = len(pert_inames)
n_embedding_rows = df_embeddings_values.shape[0]
if n_names != n_embedding_rows:
    raise ValueError("Mismatch between the number of molecule names and embeddings. Please check data alignment.")

# Put the names in the first column and the embedding columns after it. Both
# frames carry a fresh default index, so the column-wise concat pairs row i
# with row i.
names_frame = pd.DataFrame({'pert_iname': pert_inames})
df_final_embeddings = pd.concat([names_frame, df_embeddings_values], axis=1)

# Write the combined table to CSV without the index column.
df_final_embeddings.to_csv('embbedings_with_molecule_names.csv', index=False)

print("Embeddings with molecule names saved to 'embbedings_with_molecule_names.csv'")


Embeddings with molecule names saved to 'embbedings_with_molecule_names.csv'
