In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Assuming `molecular_data` is a list of molecular structures in SELFIES format
molecular_data = [
    'C[C@H](N)C(=O)O',       # Alanine
    'CC(C)C[C@H](N)C(=O)O',  # Leucine
    'N[C@@H](CC1=CNC2=C1C=CC=C2)C(=O)O',  # Tryptophan
    'CC(C)(C)C(=O)N[C@@H](CCC(=O)O)C(=O)O',  # Valine
    'CC(C)CC[C@@H](C(=O)O)N',  # Isoleucine
    'CC1=CC(=CC=C1)C[C@H](N)C(=O)O',  # Phenylalanine
    'CC(C)CC(=O)O',           # Butanoic acid
    'C1CCC(CC1)NC(=O)C2=CC=CC=C2',  # Cyclohexylphenylurea
    'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
    'CCO',  # Ethanol
]

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenized_data = [tokenizer.encode(molecule, return_tensors='pt') for molecule in molecular_data]


In [3]:
config = GPT2Config(vocab_size=tokenizer.vocab_size, n_positions=512, n_ctx=512, n_embd=768, n_layer=12, n_head=12)
model = GPT2LMHeadModel(config)


In [6]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 20
for epoch in range(num_epochs):
    for batch in tokenized_data:
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


In [7]:
model.eval()
with torch.no_grad():
    for batch in tokenized_data:
        outputs = model(batch, labels=batch)
        eval_loss = outputs.loss.item()
        print(f'Evaluation loss: {eval_loss}')


Evaluation loss: 0.09984041750431061
Evaluation loss: 0.29020264744758606
Evaluation loss: 0.04059005528688431
Evaluation loss: 0.19716304540634155
Evaluation loss: 0.30552205443382263
Evaluation loss: 0.20819862186908722
Evaluation loss: 0.2720796763896942
Evaluation loss: 0.0556388720870018
Evaluation loss: 0.18612971901893616
Evaluation loss: 0.32996782660484314


In [11]:
def generate_molecule(prompt, max_length=50, temperature=1.0):
    # Handle empty prompt case
    if not prompt:
        prompt = "[START]"  # Use a special token or any non-empty string

    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )
    
    # Decode the generated text
    generated_molecule = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_molecule

# Example prompt (can be a partial SELFIES string or non-empty string)
prompt = "[START]"
new_molecule = generate_molecule(prompt)
print("Generated Molecule:", new_molecule)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Molecule: [START]CC1NC2=CNC2=C1C=CC=C2)C(=O)O)O)O)O)O)O)O)O)N[C(=O)
