In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer, GPT2Config


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

# Assuming `molecular_data` is a list of molecular structures in SELFIES format
molecular_data = [
    'C[C@H](N)C(=O)O',       # Alanine
    'CC(C)C[C@H](N)C(=O)O',  # Leucine
    'N[C@@H](CC1=CNC2=C1C=CC=C2)C(=O)O',  # Tryptophan
    'CC(C)(C)C(=O)N[C@@H](CCC(=O)O)C(=O)O',  # Valine
    'CC(C)CC[C@@H](C(=O)O)N',  # Isoleucine
    'CC1=CC(=CC=C1)C[C@H](N)C(=O)O',  # Phenylalanine
    'CC(C)CC(=O)O',           # Butanoic acid
    'C1CCC(CC1)NC(=O)C2=CC=CC=C2',  # Cyclohexylphenylurea
    'CC(=O)OC1=CC=CC=C1C(=O)O',  # Aspirin
    'CCO',  # Ethanol
]

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenized_data = [tokenizer.encode(molecule, return_tensors='pt') for molecule in molecular_data]


In [3]:
config = GPT2Config(vocab_size=tokenizer.vocab_size, n_positions=512, n_ctx=512, n_embd=768, n_layer=12, n_head=12)
model = GPT2LMHeadModel(config)


In [6]:
optimizer = torch.optim.AdamW(model.parameters(), lr=5e-5)
num_epochs = 20
for epoch in range(num_epochs):
    for batch in tokenized_data:
        outputs = model(batch, labels=batch)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()


In [7]:
model.eval()
with torch.no_grad():
    for batch in tokenized_data:
        outputs = model(batch, labels=batch)
        eval_loss = outputs.loss.item()
        print(f'Evaluation loss: {eval_loss}')


Evaluation loss: 0.09984041750431061
Evaluation loss: 0.29020264744758606
Evaluation loss: 0.04059005528688431
Evaluation loss: 0.19716304540634155
Evaluation loss: 0.30552205443382263
Evaluation loss: 0.20819862186908722
Evaluation loss: 0.2720796763896942
Evaluation loss: 0.0556388720870018
Evaluation loss: 0.18612971901893616
Evaluation loss: 0.32996782660484314


In [11]:
def generate_molecule(prompt, max_length=50, temperature=1.0):
    # Handle empty prompt case
    if not prompt:
        prompt = "[START]"  # Use a special token or any non-empty string

    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    # Generate text
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )
    
    # Decode the generated text
    generated_molecule = tokenizer.decode(output[0], skip_special_tokens=True)
    return generated_molecule

# Example prompt (can be a partial SELFIES string or non-empty string)
prompt = "[START]"
new_molecule = generate_molecule(prompt)
print("Generated Molecule:", new_molecule)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Generated Molecule: [START]CC1NC2=CNC2=C1C=CC=C2)C(=O)O)O)O)O)O)O)O)O)N[C(=O)


In [2]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
from qiskit import QuantumCircuit, transpile, assemble
from qiskit_aer import AerSimulator
from qiskit.visualization import plot_histogram
import selfies as sf
import numpy as np




# Initialize the quantum simulator
simulator = AerSimulator()

# Load the tokenizer and model
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
model = GPT2LMHeadModel.from_pretrained('gpt2')
model.eval()


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

In [10]:
# Example mapping from SELFIES tokens to quantum gates
token_to_gate = {
    'C': 'h',  # Hadamard gate
    'O': 'x',  # Pauli-X gate
    'N': 'z',  # Pauli-Z gate
    '[=O]': 'y',  # Pauli-Y gate
    '[C@H]': 'rx',  # Rotation around X-axis
    '[C@@H]': 'ry',  # Rotation around Y-axis
    '[C@@]': 'rz',  # Rotation around Z-axis
    '(': 'cx',  # CNOT gate (control X)
    ')': 'cz',  # CNOT gate (control Z)
    # Add more mappings as needed
}

# Function to add gates to the quantum circuit based on token
def add_gate(qc, token, qubit):
    if token in token_to_gate:
        gate = token_to_gate[token]
        if gate == 'h':
            qc.h(qubit)
        elif gate == 'x':
            qc.x(qubit)
        elif gate == 'z':
            qc.z(qubit)
        elif gate == 'y':
            qc.y(qubit)
        elif gate == 'rx':
            qc.rx(np.pi/2, qubit)
        elif gate == 'ry':
            qc.ry(np.pi/2, qubit)
        elif gate == 'rz':
            qc.rz(np.pi/2, qubit)
        elif gate == 'cx':
            qc.cx(qubit, (qubit + 1) % qc.num_qubits)
        elif gate == 'cz':
            qc.cz(qubit, (qubit + 1) % qc.num_qubits)
        # Add more gates as needed


In [7]:
def generate_quantum_circuit(tokens):
    tokens = list(tokens)  # Convert generator to list
    num_qubits = len(tokens)
    qc = QuantumCircuit(num_qubits, num_qubits)
    
    for i, token in enumerate(tokens):
        add_gate(qc, token, i)
    
    # Measure the qubits
    qc.measure(range(num_qubits), range(num_qubits))
    
    return qc


In [8]:
def simulate_circuit(qc):
    compiled_circuit = transpile(qc, simulator)
    qobj = assemble(compiled_circuit)
    result = simulator.run(qobj).result()
    counts = result.get_counts(qc)
    return counts

def decode_output(counts):
    # Example decoding logic (needs to be customized based on your encoding)
    most_common_state = max(counts, key=counts.get)
    decoded_selfies = ""
    for bit in most_common_state:
        decoded_selfies += "C" if bit == "0" else "O"  # Example decoding
    return decoded_selfies


In [21]:
def generate_molecule(prompt, max_length=50, temperature=1.0):
    if not prompt:
        prompt = "[START]"  # Use a special token or any non-empty string

    # Encode the prompt
    input_ids = tokenizer.encode(prompt, return_tensors='pt')
    
    # Generate tokens
    with torch.no_grad():
        output = model.generate(
            input_ids,
            max_length=max_length,
            temperature=temperature,
            num_return_sequences=1,
            do_sample=True,
            top_k=50,
            top_p=0.95
        )
    
    # Decode tokens to string
    generated_selfies = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Validate the generated SELFIES string
    try:
        tokens = list(sf.split_selfies(generated_selfies))  # Convert generator to list
    except ValueError as e:
        print(f"Malformed SELFIES string: {generated_selfies}")
        return None  # Or handle the malformed SELFIES appropriately

    # Generate quantum circuit
    print(tokens)
    qc = generate_quantum_circuit(tokens)
    
    # Simulate quantum circuit
    counts = simulate_circuit(qc)
    
    # Decode the output to SELFIES
    decoded_selfies = decode_output(counts)
    
    return decoded_selfies

# Example prompt (can be a partial SELFIES string or non-empty string)
prompt = "[START]"
new_molecule = generate_molecule(prompt)
print("Generated Molecule:", new_molecule)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Malformed SELFIES string: [START]

HISTORY. A new statute, by a combination of clauses, will soon be made of the most ancient or most solemn articles: namely: A new statute, by a combination of clauses, will soon be made of the
Generated Molecule: None
