In [1]:
import os
import re
import numpy as np
import pandas as pd
import textwrap
import pickle
import torch
from accelerate import init_empty_weights, Accelerator
from transformers import AutoModelForCausalLM, AutoTokenizer
from custom_modeling_opt import CustomOPTForCausalLM
from matplotlib import pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from torch.nn.utils.rnn import pad_sequence

import multiprocessing


In [2]:
seed_value=42
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

In [3]:
import rdkit.Chem as Chem
import sys
from rdkit.Chem import RDConfig, MACCSkeys, QED
from rdkit.Chem.rdMolDescriptors import CalcTPSA, CalcCrippenDescriptors
from rdkit.Chem import Descriptors
sys.path.append(os.path.join(RDConfig.RDContribDir, 'SA_Score'))
import sascorer

In [4]:
device = "cuda:1"

In [5]:
tokenizer = AutoTokenizer.from_pretrained("/home/menuab/code/ChemLacticaTestSuite/src/tokenizer/ChemLacticaTokenizer_50066/")
print('tokenizer size: ', len(tokenizer))

tokenizer size:  50066


In [6]:
checkpoint_path = "/home/menuab/code/checkpoints/f2c6ebb289994595a478f513/125m_126k_f2c6/"
checkpoint_path = "/home/menuab/code/checkpoints/f3fbd012918247a388efa732/125m_126k_f3fb/"
checkpoint_path = "/home/menuab/code/checkpoints/90758da0b8564bae8a14bbef/125m_63k_9075/"
checkpoint_path

'/home/menuab/code/checkpoints/90758da0b8564bae8a14bbef/125m_63k_9075/'

In [7]:
model = CustomOPTForCausalLM.from_pretrained(
            checkpoint_path,
            use_flash_attn=True,
            torch_dtype=torch.bfloat16
            )
model.eval()
model.to(device)
print(f'model loaded with embedding size of : {model.model.decoder.embed_tokens.num_embeddings}')
assert(model.model.decoder.embed_tokens.num_embeddings == len(tokenizer))

model loaded with embedding size of : 50066


In [8]:
# moses = pd.read_csv('./data/dataset_v1.csv')
# moses.head()

In [9]:
# moses = moses[moses.SPLIT == 'train']
# moses.head()

In [10]:
# moses[['SAS', 'QED', 'CLOGP', 'WEIGHT']] = 0

In [11]:
# moses.head()

In [None]:

# Define the worker function
def process_molecule(mol):
    print(mol)
    mol_source = Chem.MolFromSmiles(mol)
    sas_score = sascorer.calculateScore(mol_source) 
    qed_score = round(QED.qed(mol_source), 3)
    clogp_score = round(Descriptors.MolLogP(mol_source), 3)
    weight_score = round(Descriptors.ExactMolWt(mol_source), 3)
    return sas_score, qed_score, clogp_score, weight_score

# Main function to use multiprocessing
def process_many_molecules(df):
    # Create a pool of workers
    with multiprocessing.Pool(20) as pool:
        # Process the molecules using the pool
        results = list(tqdm(pool.imap(process_molecule, df.iloc[:,0]), total=len(df)))

    # Store the results back into the DataFrame
    for en, (sas_score, qed_score, clogp_score, weight_score) in enumerate(results):
        df.iloc[en, 2] = sas_score
        df.iloc[en, 3] = qed_score
        df.iloc[en, 4] = clogp_score
        df.iloc[en, 5] = weight_score



In [24]:
# process_many_molecules(moses)

In [14]:
# moses.to_csv('moses_train_properties.csv')

In [15]:
moses = pd.read_csv('moses_train_properties.csv')

In [16]:
moses.head(5)

Unnamed: 0.1,Unnamed: 0,SMILES,SPLIT,SAS,QED,CLOGP,WEIGHT
0,0,CCCS(=O)c1ccc2[nH]c(=NC(=O)OC)[nH]c2c1,train,3.812618,0.897,1.681,281.083
1,1,CC(C)(C)C(=O)C(Oc1ccc(Cl)cc1)n1ccnc1,train,2.936327,0.862,3.729,292.098
2,3,Cc1c(Cl)cccc1Nc1ncccc1C(=O)OCC(O)CO,train,2.663795,0.701,2.297,336.088
3,4,Cn1cnc2c1c(=O)n(CC(O)CO)c(=O)n2C,train,2.989161,0.646,-2.213,254.102
4,5,CC1Oc2ccc(Cl)cc2N(CC(O)CO)C1=O,train,3.05575,0.853,0.807,271.061


In [17]:
batch_size = 64

# Initialize lists to store results
targets1, targets2, targets3, targets4 = [], [], [], []
sas_scores, clogp_scores, qed_scores, weight_scores = [], [], [], []
sac_invalids, mols = [], []

# Process in batches
for batch_start in tqdm(range(0, min(1280, len(moses)), batch_size)):
    batch_end = min(batch_start + batch_size, len(moses))
    inputs_batch = []

    for i in range(batch_start, batch_end):
        _, mol, split, sas, qed, clogp, weight = moses.iloc[i, :] # one more column in CSV
        prompt = f"</s>[SAS]{sas:.2f}[/SAS][CLOGP]{clogp:.2f}[/CLOGP][QED]{qed:.2f}[/QED][WEIGHT]{weight:.2f}[/WEIGHT]"
        inputs = tokenizer(prompt, return_tensors="pt").input_ids
        inputs = inputs.to(device)
        # print(inputs.shape, prompt)
        inputs_batch.append(inputs.squeeze(0))  # Remove the extra dimension

    min_length = min([len(i) for i in inputs_batch])

    inputs_batch = torch.stack([i[:min_length] for i in inputs_batch])  # Cutting the last tokens of long sequences
    # print(torch.stack(inputs_batch))

    # Generate outputs for the entire batch
    out = model.generate(inputs_batch, max_new_tokens=600, do_sample=False, eos_token_id=20, return_dict_in_generate=True, output_scores=True)

    # Process outputs for each sequence in the batch
    for seq in out.sequences:
        decoded = tokenizer.decode(seq)
        # print(decoded)
        try:
            if "[END_SMILES]" not in decoded:
                continue
            captured_text = decoded[decoded.find("[START_SMILES]")+len("[START_SMILES]"):decoded.find("[END_SMILES]")]
            mols.append(captured_text)
        except:
            continue


  0%|          | 0/20 [00:00<?, ?it/s]

In [18]:
len(mols)

856

In [30]:
generated = pd.DataFrame(mols, columns=['SMILES'])

In [31]:
generated[['split', 'SAS', 'QED', 'CLOGP', 'WEIGHT']] = 0

In [32]:
generated.head(10)

Unnamed: 0,SMILES,split,SAS,QED,CLOGP,WEIGHT
0,CC1=C(C=CC(=C1)C2=NN(C(=O)C2)C3CCS(=O)(=O)C3)O,0,0,0,0,0
1,CC1=CC(=C(C=C1)C)C(C)NC(=O)C2=CC(=C(C=C2)F)Cl,0,0,0,0,0
2,CC1=C(C=CC(=C1)OCC(=O)OCC2=NC(=NO2)C(C)C)O,0,0,0,0,0
3,CC(C)(C)OC(=O)NCC(=O)NCC1=CC(=O)NN1,0,0,0,0,0
4,CC(C)(C)NS(=O)(=O)C1=CC(=C(C=C1)F)C(=O)N,0,0,0,0,0
5,CC1=CC(=C(C=C1)C)C(C)NC(=O)C2=C(N=CC=C2)OC,0,0,0,0,0
6,CC1=C(C=CC(=C1)Br)NC(=O)C(C)SC2=NN=C(S2)C,0,0,0,0,0
7,CC(C)(C)OC(=O)NCC(C1=CC(=CC=C1)Br)OC,0,0,0,0,0
8,CC(C)(C)OC(=O)NC(CC1=CC=CC=C1)C(=O)O,0,0,0,0,0
9,CC1=CC(=C(C=C1)C)N2C(=O)C(N(C2=O)C)CC(=O)O,0,0,0,0,0


In [None]:
process_many_molecules(generated)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

  0%|          | 0/856 [00:00<?, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
CC1=CC(=C(C=C1)C)C(C)NC(=O)C2=C(N=CC=C2)OCCC(C)(C)OC(=O)NC(CC1=CC=CC=C1)C(=O)OCC(C)(C)NS(=O)(=O)C1=CC(=C(C=C1)F)C(=O)NCC(C)(C)OC(=O)NCC(C1=CC(=CC=C1)Br)OCCC1=C(C=C(C=C1)C(C)NC(=O)C2=C(N=CC(=C2)C)OC)ClCC1=C(C=CC(=C1)C(=O)C(C)SC2=NNC(=N2)C)Fhuggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
CC1=CC(=C(C=C1)C(=O)N2CC(C(C2)O)C(=O)OC)FCC1=CC(=C(C=C1)C)N2C(=O)C(N(C2=O)C)CC(=O)OCC1=CC=CC=C1NC(=O)C2=CC=CC=C2N(C)CC(=O)N



CC(C)(C)C(CC1=CC(=C(C=C1)OC)OC)OCCO

CC1=CC(=C(C=C1)C)OCC(=O)NC2=CC=

In [None]:
generated.head(10)

In [None]:
plt.hist(sas_scores, range=(0,8), bins=50, alpha=.5, density=True);
plt.hist(targets1, range=(0,8), bins=50, alpha=.5, density=True);

In [None]:
plt.hist(clogp_scores, range=(-2,10), bins=50, alpha=.5, density=True);
plt.hist(targets2, range=(-2,10), bins=50, alpha=.5, density=True);

In [None]:
plt.hist(qed_scores, range=(0,1), bins=50, alpha=.5, density=True);
plt.hist(targets3, range=(0,1), bins=50, alpha=.5, density=True);

In [None]:
plt.hist(weight_scores, range=(100,500), bins=100, alpha=.5, density=True);
plt.hist(targets4, range=(100,500), bins=100, alpha=.5, density=True);

In [None]:
(np.array(targets4) < 250).sum()

In [None]:
plt.scatter(sac_targets1, sac_scores)

In [None]:
plt.plot(sorted(zip(sac_targets1, sac_scores)))

In [None]:
for _, sas, qed, clogp, weight in moses.iloc[1:10,[2,3,4,5]]:
          # mol, split, sas, qed, clogp, weight = sample
          print(sas, qed, clogp, weight)

In [None]:

for i in range(len(moses)):
          a,v=moses.iloc[i,3:5]
          print(a,v)
          break