### Apply SynFormer to the SMILES strings to generate the dataset of synthesis tokens 

<div class="alert alert-warning">  
By this point, you already need the Huggingface dataset `pdb_protein_ligand_complexes` in the form of `pdb_protein_ligand_train.p` and `pdb_protein_ligand_test.p`. 
</div>

<div class="alert alert-warning">  
The following code is based on the `sample_naive.py` script, i.e. there is no parallelization whatsoever. To make it more efficient, rewrite it based on `scripts/sample.py` instead of `scripts/sample_naive.py`. 
</div>

In [None]:
# I want a dataset (lig_id, original_smiles, projected_smiles), and then possibly multiple projections 
# per lig_id entry, so (lig_id, original_smiles, projected_smiles, projection_id) 
# But besides the SMILES, we need all 3 outputs from SynFormer: token type, reaction token, reactant token 
# If I remember correctly, there is a vocabulary of reaction tokens, so it would be enough to store the reaction token IDs,
# which we can look up in that vocabulary.
# I'm not sure if we can do the same with the reactants, as there is no vocabulary. But from the ID, perhaps we can
# get the fingerprint and then at will we can pass it through the embedding layer to get the embedding vector 
# Because if we store the entire embedding vector for each reactant in all examples, that dataset would be unnecessarily big
# Ideally, we just keep the IDs 
# We want the ground-truth embeddings anyway, not the predicted embeddings which would then be Nearest-Neighbor'ed to the ground truth embedding 
# So now the dataset would look like this: 

# (lig_id, original_smiles, projected_smiles, projection_id, token_types_tensor, reactions_tensor, reactants_tensor) 

# the tensors all seem to have a fixed sequence length of 24 
# I could also simply store all non-END and non-START tokens, and then during the prediction I prepend/append them, as they're always the same 

# optionally, I could also store the similarity score between the original and projected molecules 

The following code is partly based on `scripts/sample_naive.py` from the Synformer repo 

In [None]:
# Imports from scripts/sample_naive.py: 
import pathlib
import pickle
import torch
from omegaconf import OmegaConf
from synformer.chem.fpindex import FingerprintIndex
from synformer.chem.matrix import ReactantReactionMatrix
from synformer.chem.mol import Molecule
from synformer.models.synformer import Synformer

# My own imports:
import numpy as np 
import pandas as pd 
from synformer.scripts.sample_naive import load_model, featurize_smiles 
# from synformer.models.synformer import draw_generation_results
from synformer.data.common import TokenType
from datetime import datetime 

In [None]:
# Checkpoint file handed to load_model, relative to the working directory.
MODEL_PATH = "data/trained_weights/sf_ed_default.ckpt"
# No separate config file is passed to load_model.
CONFIG_PATH = None
# Use the GPU when torch reports one as available, otherwise the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

In [None]:
# Load everything the sampling code needs via load_model (imported from
# synformer.scripts.sample_naive). The three results are used as notebook-level
# globals by get_synthetic_pathway.
# NOTE(review): fpindex / rxn_matrix are presumably the FingerprintIndex and
# ReactantReactionMatrix imported above -- confirm against load_model.
model, fpindex, rxn_matrix = load_model(MODEL_PATH, CONFIG_PATH, DEVICE)

In [None]:
def get_synthetic_pathway(smiles, lig_id=None, repeat=1):
    """Sample synthetic pathways for a single SMILES string.

    The molecule is featurized with ``featurize_smiles`` and passed to
    ``model.generate_without_stack``. Every generated sequence whose stack has
    depth 1 (i.e. the building blocks and reactions combine into exactly one
    final molecule) is turned into one record.

    Relies on the notebook-level globals ``model``, ``fpindex``, ``rxn_matrix``
    and ``DEVICE``.

    Args:
        smiles: SMILES string of the molecule to project.
        lig_id: Identifier copied unchanged into the first field of each record.
        repeat: Forwarded to ``featurize_smiles``; presumably the number of
            sequences sampled for the molecule (TODO confirm).

    Returns:
        A list of records of the form ``[lig_id, smiles, analog_smiles,
        similarity, token_types, rxn_indices, reactant_indices]``. The three
        trailing fields are plain lists truncated just before the first END
        token; a sequence that contains no END token is kept in full.
    """
    end_token_type = 0  # token type 0 marks END (and the padding after it)
    data = []
    mol, feat = featurize_smiles(smiles, DEVICE, repeat=repeat)
    # The log-likelihood of the samples used to be computed here as well, but
    # its value was never used, so that extra model pass has been dropped.
    with torch.inference_mode():
        result = model.generate_without_stack(
            feat,
            rxn_matrix=rxn_matrix,
            fpindex=fpindex,
            temperature_token=1.0,
            temperature_reactant=0.1,
            temperature_reaction=1.0,
        )
    stacks = result.build()
    for i, stack in enumerate(stacks):
        # Only those sequences with stack depth of 1 (i.e. applying the building
        # blocks and reactions leads to 1 final molecule) are considered valid.
        if stack.get_stack_depth() != 1:
            continue
        analog_mol = stack.get_one_top()
        sim = analog_mol.sim(mol)
        # TODO: perhaps only continue if similarity score sufficiently high?
        token_types = result.token_types[i].tolist()
        # Location of the first END token (we only need to store tokens up to
        # this point). A sequence without any END token must not abort the
        # whole call, so it is stored untruncated instead of raising.
        try:
            end_id = token_types.index(end_token_type)
        except ValueError:
            end_id = len(token_types)
        data.append([
            lig_id,
            smiles,
            analog_mol.smiles,
            round(sim, 4),
            token_types[:end_id],
            result.rxn_indices[i, :end_id].tolist(),
            result.reactant_indices[i, :end_id].tolist(),
        ])
    return data

Apply to dataset:

In [None]:
# Load the train/test protein-ligand tables, derive the per-split ligand tables
# and build (or load from the CSV cache) the combined unique ligand/protein tables.

protein_ligand_train = pd.read_pickle("data/pdb_protein_ligand_train.p")
protein_ligand_test = pd.read_pickle("data/pdb_protein_ligand_test.p")

# The per-split ligand tables are used by the prints below and by the sampling
# cell, so they are always derived from the pickles. The CSV cache only holds
# the combined table and cannot be split back into train and test.
ligands_train = protein_ligand_train[["lig_id", "smiles"]].drop_duplicates()
ligands_test = protein_ligand_test[["lig_id", "smiles"]].drop_duplicates()

unique_ligands_path = pathlib.Path("data/pdb_protein_ligand__unique_ligands.csv")
if unique_ligands_path.exists():
    # The cache is written with the DataFrame index as its first column, so it
    # is read back as the index rather than as an extra "Unnamed: 0" column.
    unique_ligands = pd.read_csv(unique_ligands_path, index_col=0)
else:
    unique_ligands = pd.concat([ligands_train, ligands_test]).drop_duplicates()
    unique_ligands.to_csv(unique_ligands_path)

unique_proteins_path = pathlib.Path("data/pdb_protein_ligand__unique_proteins.csv")
if unique_proteins_path.exists():
    unique_proteins = pd.read_csv(unique_proteins_path, index_col=0)
else:
    proteins_train = protein_ligand_train[["pdb_id", "seq"]]
    proteins_test = protein_ligand_test[["pdb_id", "seq"]]
    unique_proteins = pd.concat([proteins_train, proteins_test]).drop_duplicates()
    unique_proteins.to_csv(unique_proteins_path)

# The full tables are no longer needed; drop the references to free memory.
del protein_ligand_train
del protein_ligand_test

print(ligands_train.shape[0], "unique ligands")
print(ligands_test.shape[0], "unique ligands")

In [None]:
%%time 

# Run SynFormer on the first ligands of one split, collect the de-duplicated
# records in data[dataset_name] and pickle them to a timestamped file.
data = {}
repeat = 2

# for ligands, dataset_name in [(ligands_test, "test"), (ligands_train, "train")]:
ligands = ligands_test
dataset_name = "test"
data[dataset_name] = []
# Hashable keys of the records stored so far; avoids re-scanning the whole
# result list for every new record.
seen_records = set()

for i, row in ligands[:10].iterrows():
    try:
        lig_id = row["lig_id"]
        # Unwrap numpy scalars into plain Python values; plain int/str values
        # have no .item() and are used as they are.
        if hasattr(lig_id, "item"):
            lig_id = lig_id.item()
        smiles = row["smiles"]
        print(i, lig_id, smiles)
        records = get_synthetic_pathway(smiles, lig_id=lig_id, repeat=repeat)
        for record in records:
            # Records contain lists (the token sequences), which are not
            # hashable, so they are converted to tuples for the lookup key.
            record_key = tuple(
                tuple(field) if isinstance(field, list) else field
                for field in record
            )
            if record_key not in seen_records:
                seen_records.add(record_key)
                data[dataset_name].append(record)
    except Exception as e:
        # Best effort: a ligand that fails is reported and skipped so that one
        # bad SMILES does not abort the whole run.
        print(f"Error processing SMILES {i} ({e})")

output_path = f"data/synformer_ligands_{dataset_name}_{datetime.now().strftime('%Y-%m-%d %H-%M-%S')}.pkl"
# The context manager closes (and flushes) the file even if pickling fails.
with open(output_path, "wb") as output_file:
    pickle.dump(data[dataset_name], output_file)

In [None]:
# Column names, in the same order as the fields of each record produced by
# get_synthetic_pathway.
columns = [
    "lig_id",
    "smiles_original",
    "smiles_proj",
    "similarity",
    "token_types",
    "rxn_indices",
    "reactant_indices",
]

# One row per stored record of the "test" split.
df = pd.DataFrame(columns=columns, data=data["test"])
df

Just for convenience: reconstructing the sequence of building blocks and reactions: 

In [None]:
def reconstruct_pathway(token_types, rxn_indices, reactant_indices):
    """
    Translate parallel lists of token types, reaction indices and reactant
    indices into a readable synthetic pathway (a list of token labels).

    The three lists are read position by position. The token type decides the
    label and which of the two index lists is consulted:
      0: END token (also used for padding, following the actual END token)
      1: START token            -> "START"
      2: REACTION               -> "R<rxn_indices[i]>"
      3: REACTANT               -> "B<reactant_indices[i]>"
    Any other token type (including 0) yields None at that position.
    A final "END" label is always appended after the last position.

    Example: token_types [1, 3], rxn_indices [0, 99], reactant_indices [0, 10]
             gives ["START", "B10", "END"]: position 0 has type 1 (START),
             position 1 has type 3 (REACTANT) with reactant ID 10.
    """

    def label_for(position, kind):
        # The index lists are only touched for the matching token type.
        if kind == 1:
            return "START"
        if kind == 2:
            return f"R{rxn_indices[position]}"
        if kind == 3:
            return f"B{reactant_indices[position]}"
        return None

    labels = [label_for(position, kind) for position, kind in enumerate(token_types)]
    return labels + ["END"]

In [None]:
# Print the readable pathway of the first ten rows of df.
for _, row in df.iloc[:10].iterrows():
    pathway = reconstruct_pathway(
        row["token_types"],
        row["rxn_indices"],
        row["reactant_indices"],
    )
    print(pathway)