In [1]:
# Import packages
import numpy as np
import tensorflow as tf
from keras._tf_keras.keras import layers
from keras._tf_keras.keras.preprocessing.text import Tokenizer
from keras._tf_keras.keras.preprocessing.sequence import pad_sequences

In [2]:
# Esempio di stringhe SMILES (in un'applicazione reale, useresti un dataset più ampio)
smiles_corpus = [
    "CC(C)CC1=CC=CC=C1", "CC(C)C1=CC=C(C=C1)C(O)=O", "CC(C)C1=CC=CC=C1O", "CCC1=CC=C(C=C1)C=O",
    "CCOC(=O)C1=CC=CC=C1", "CC(C)C1=CC=C(C=C1)O", "COC1=CC=C(C=C1)C(O)=O", "CCCC1=CC=C(C=C1)O",
    "CC(C)CC1=CC=CC=C1O", "CC(C)COC1=CC=CC=C1", "CCC1=CC=CC=C1OC", "CCOC1=CC=CC=C1C",
    "CCCC1=CC=CC=C1C", "CC(C)C1=CC=C(C=C1)C=O", "COC1=CC=C(C=C1)C=O", "CCC1=CC=C(C=C1)C=O",
    "CC(C)C1=CC=C(C=C1)C(O)=O", "CCOC1=CC=CC=C1O", "CCCC1=CC=C(C=C1)O", "CC(C)C1=CC=CC=C1C",
    "CCCC1=CC=CC=C1C=O", "CCOC1=CC=CC=C1C=O", "CC(C)COC1=CC=CC=C1O", "COC1=CC=C(C=C1)C(O)=O",
    "CC(C)C1=CC=CC=C1OC", "CCCC1=CC=CC=C1OC", "COC1=CC=CC=C1OC", "CCC1=CC=C(C=C1)C(O)=O",
    "CCCC1=CC=CC=C1OC=O", "CCOC1=CC=CC=C1OC", "CC(C)C1=CC=CC=C1CO", "CCCC1=CC=C(C=C1)C=O",
    "CCOC1=CC=CC=C1CO", "COC1=CC=CC=C1CO", "CC(C)CC1=CC=C(C=C1)O", "CC(C)CC1=CC=CC=C1CO",
    "CCCC1=CC=CC=C1CO", "CCOC1=CC=CC=C1COC", "COC1=CC=C(C=C1)CO", "CCCC1=CC=CC=C1COC",
    "CC(C)C1=CC=CC=C1COC", "CCCC1=CC=C(C=C1)COC", "COC1=CC=C(C=C1)COC", "CCOC1=CC=CC=C1COC",
    "CC(C)C1=CC=C(C=C1)C=O", "CCCC1=CC=CC=C1COC=O", "CC(C)CC1=CC=CC=C1COC=O", "CCOC1=CC=CC=C1COC=O",
    "COC1=CC=CC=C1COC=O", "CCCC1=CC=CC=C1COC=O", "CCOC1=CC=CC=C1C(O)=O", "CC(C)COC1=CC=CC=C1C(O)=O",
    "CC(C)C1=CC=C(C=C1)C=O", "CCCC1=CC=C(C=C1)COC=O", "CCOC1=CC=CC=C1C(O)=O", "COC1=CC=C(C=C1)COC=O",
    "CCCC1=CC=CC=C1C(O)=O", "CC(C)C1=CC=CC=C1COC=O", "COC1=CC=CC=C1C(O)=O", "CCCC1=CC=CC=C1C(O)=O",
    "CCOC1=CC=CC=C1C=O", "CC(C)C1=CC=CC=C1COC", "COC1=CC=C(C=C1)COC=O", "CCCC1=CC=CC=C1COC=O",
    "CC(C)C1=CC=C(C=C1)COC=O", "CCCC1=CC=CC=C1COC=O", "COC1=CC=C(C=C1)COC", "CCOC1=CC=CC=C1COC=O",
    "CC(C)COC1=CC=CC=C1COC=O", "CCCC1=CC=CC=C1COC=O", "CC(C)COC1=CC=CC=C1COC", "COC1=CC=CC=C1COC",
    "CCOC1=CC=CC=C1COC=O", "CC(C)C1=CC=CC=C1COC=O", "CCCC1=CC=CC=C1COC", "CC(C)COC1=CC=CC=C1COC",
    "COC1=CC=CC=C1COC=O", "CCCC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC=O", "COC1=CC=CC=C1COC",
    "CC(C)COC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC=O", "CCOC1=CC=CC=C1COC=O",
    "CC(C)COC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC",
    "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC=O", "CCOC1=CC=CC=C1COC=O", "CCCC1=CC=CC=C1COC",
    "CCOC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC=O",
    "CCOC1=CC=CC=C1COC=O", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC",
    "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC",
    "COC1=CC=CC=C1COC=O", "CCOC1=CC=CC=C1COC=O", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC=O",
    "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC",
    "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC",
    "COC1=CC=CC=C1COC=O", "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC",
    "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC",
    "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC",
    "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC",
    "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC",
    "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC",
    "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC",
    "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC",
    "CCCC1=CC=CC=C1COC", "COC1=CC=CC=C1COC", "CCOC1=CC=CC=C1COC", "CCCC1=CC=CC=C1COC"
]


In [3]:
# Tokenizzazione del testo
tokenizer = Tokenizer(char_level=True) # ci interessa scendere nel dettaglio delle molecole
tokenizer.fit_on_texts(smiles_corpus) # estrae i token dal corpus
total_words = len(tokenizer.word_index) + 1 # la lunghezza del vocabolario
print(tokenizer.word_index)

# Creazione delle sequenze di input
input_sequences = [] #lista vuota delle sequenze di input
for line in smiles_corpus: #itero sulle line del corpus i.e. sulle SMILE
    #print(line)
    token_list = tokenizer.texts_to_sequences([line])[0]
    #print(token_list)
    for i in range(1, len(token_list)):
        n_gram_sequence = token_list[:i+1]
        #print(n_gram_sequence)
        input_sequences.append(n_gram_sequence)

# Padding delle sequenze per avere tutte le stesse lunghezze
max_sequence_len = max([len(x) for x in input_sequences])
input_sequences = np.array(pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre'))

# Creazione dei dati di input e output
X = input_sequences[:, :-1]
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

print("Forma di X:", X.shape)
print("Forma di y:", y.shape)
print("Numero totale di parole:", total_words)

{'c': 1, '=': 2, '1': 3, 'o': 4, '(': 5, ')': 6}
Forma di X: (2570, 23)
Forma di y: (2570, 7)
Numero totale di parole: 7


# Transformer

In [4]:
def create_transformer_model(vocab_size, embed_dim, num_heads, ff_dim, maxlen):
    inputs = layers.Input(shape=(maxlen - 1,))

    # Strato di embedding
    embedding_layer = layers.Embedding(vocab_size, embed_dim, input_length=maxlen - 1)(inputs)

    # Blocco Trasformatore
    attention_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)(embedding_layer, embedding_layer)
    attention_output = layers.LayerNormalization(epsilon=1e-6)(attention_output + embedding_layer)

    ff_output = layers.Dense(ff_dim, activation="relu")(attention_output)
    ff_output = layers.Dense(embed_dim)(ff_output)
    x = layers.LayerNormalization(epsilon=1e-6)(ff_output + attention_output)

    # Pooling e classificatore finale
    x = layers.GlobalAveragePooling1D()(x)
    x = layers.Dense(20, activation="relu")(x)
    outputs = layers.Dense(vocab_size, activation="softmax")(x)

    model = tf.keras.Model(inputs=inputs, outputs=outputs)

    model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
    return model

Compile the TRANSFORMER

In [5]:
# Parametri del modello -- da fine-tunare
embed_dim = 64  # Dimensione degli embeddings
num_heads = 2   # Numero di teste nel Multi-head attention
ff_dim = 64     # Dimensione del feed-forward
vocab_size = total_words

# Creazione del modello
generator = create_transformer_model(vocab_size, embed_dim, num_heads, ff_dim, max_sequence_len)

# summary del modello
generator.summary()



In [6]:
# train the transformer
generator.fit(X, y, epochs=20, batch_size=2)

Epoch 1/20
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - accuracy: 0.5030 - loss: 1.3675
Epoch 2/20
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5314 - loss: 1.2587
Epoch 3/20
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.5988 - loss: 1.0416
Epoch 4/20
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.6305 - loss: 0.9318
Epoch 5/20
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 4ms/step - accuracy: 0.6495 - loss: 0.8739
Epoch 6/20
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 7ms/step - accuracy: 0.6886 - loss: 0.7866
Epoch 7/20
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.6996 - loss: 0.7543
Epoch 8/20
[1m1285/1285[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - accuracy: 0.7144 - loss: 0.7575
Epoch 9/20
[1m1285/1285

<keras.src.callbacks.history.History at 0x1fa5efc8820>

# Generate

In [7]:
def generate_text(seed_text, next_words, model, max_sequence_len):
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
        predicted = np.argmax(model.predict(token_list), axis=-1)

        output_char = ''
        for char, index in tokenizer.word_index.items():
            if index == predicted:
                output_char = char
                break
        seed_text += output_char
    return seed_text

# Example of generating a SMILES string
# "CC(C)CC1=CC=    CC=C1"
# "CC(C)CC1=CC=    CC=C1O"
# CC(C)ccc1=c=cc=c1o

seed_text = "CC(C)CC1=CC="
next_words = 5
generated_smiles = generate_text(seed_text, next_words, generator, max_sequence_len)
print("Generated SMILES:", generated_smiles)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 114ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step
Generated SMILES: CC(C)CC1=CC=c=c1o
