In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import InputLayer, LSTM, Dropout, Dense, Softmax
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split

# SmilesToOneHotEncoding Class
`SmilesToOneHotEncoding` is the main class: it handles the complete pipeline for converting SMILES strings into one-hot encoded sequences suitable for LSTM training. It includes methods for data loading, preprocessing, tokenization, encoding, model building, training, and molecule generation.

In [None]:
class SmilesToOneHotEncoding:
    """
    Pipeline that turns SMILES (Simplified Molecular Input Line Entry System) strings
    into one-hot encoded sequences for a character-level LSTM, and that builds, trains
    and samples from such a model.

    Every preprocessed molecule has the form ``'G' + smiles + 'E'`` right-padded with
    ``'A'`` to exactly ``max_length`` characters. Tokenizer ids are 1-based; the one-hot
    column of a character is its tokenizer id minus one.
    """
    def __init__(self, filename, size_dataset = 100,  min_length = 20, max_length = 100):
        """
        Stores the dataset parameters, then immediately loads and preprocesses the
        data and fits the character vocabulary.

        Args:
            filename (str): Path to the file containing SMILES strings (one per line).
            size_dataset (int): Number of lines to read from the start of the file.
                                Defaults to 100.
            min_length (int): Minimum raw SMILES length; shorter molecules are discarded.
                              Defaults to 20.
            max_length (int): Length of every encoded sequence, including the 'G' and
                              'E' tokens. Molecules that do not fit are discarded.
                              Defaults to 100.
        """
        self.filename = filename
        self.size_dataset = size_dataset
        self.min_length = min_length
        self.max_length = max_length
        # Load, translate and pad the molecules right away.
        self.molecule_list = self.preprocess_data()
        # Fit the tokenizer and keep the character -> id mapping (ids start at 1).
        self.tokenizer, self.char_to_index = self.make_vocabulary()

    def _num_classes(self):
        """
        Returns the width of the one-hot vectors: vocabulary size plus one.

        Only the first ``len(self.char_to_index)`` columns are ever set; the extra
        last column is kept so shapes stay compatible with previously built models.
        """
        return len(self.char_to_index) + 1

    def load_file(self):
        """
        Loads SMILES strings from ``self.filename``.

        Returns:
            list: One string per line of the file, truncated at the first comma and
                  stripped of surrounding whitespace (including the newline).
        """
        with open(self.filename, 'r') as file:
            # Splitting on ',' and stripping handles both "SMILES ,extra" and
            # "SMILES,extra" rows, and lines without any delimiter.
            return [line.split(",")[0].strip() for line in file]

    def translation(self, molecule_list):
        """
        Replaces the two-letter halogens so every atom is a single character:
        'Br' becomes 'R' and 'Cl' becomes 'L'.

        Args:
            molecule_list (list): Raw SMILES strings. The list is modified in place.

        Returns:
            list: The same list object, with the replacements applied.
        """
        for position, molecule in enumerate(molecule_list):
            molecule_list[position] = molecule.replace('Br', 'R').replace('Cl', 'L')
        return molecule_list

    def padding(self, molecule_list):
        """
        Wraps each molecule in the start ('G') and end ('E') tokens and right-pads it
        with 'A' to exactly ``max_length`` characters.

        Molecules shorter than ``min_length`` are dropped, as are molecules longer
        than ``max_length - 2`` (they would not fit once 'G' and 'E' are added).

        Args:
            molecule_list (list): Translated SMILES strings.

        Returns:
            list: A new list of padded strings, each ``max_length`` characters long.
        """
        # Two positions are reserved for the 'G' and 'E' tokens.
        longest_allowed = self.max_length - 2
        new_list = []
        for molecule in molecule_list:
            if self.min_length <= len(molecule) <= longest_allowed:
                framed = 'G' + molecule + 'E'
                new_list.append(framed.ljust(self.max_length, 'A'))
        return new_list

    def preprocess_data(self):
        """
        Runs the preprocessing pipeline: load the first ``size_dataset`` lines,
        translate the halogens, then pad and filter by length.

        Returns:
            list: Fully preprocessed SMILES strings.
        """
        molecule_list = self.load_file()[0:self.size_dataset]
        molecule_list = self.translation(molecule_list)
        return self.padding(molecule_list)

    def make_vocabulary(self):
        """
        Fits a character-level Keras Tokenizer on the preprocessed molecules.

        Returns:
            tuple: A tuple containing:
                - tokenizer (keras.preprocessing.text.Tokenizer): The fitted Tokenizer object.
                - char_to_index (dict): Maps each character to its 1-based tokenizer id.
        """
        tokenizer = Tokenizer(
            # NOTE(review): filters/split appear to be used only for word-level
            # tokenization and ignored when char_level=True; kept unchanged.
            filters = '.',
            split = '',
            char_level = True,  # One token per character.
            lower = False,      # SMILES is case-sensitive ('c' aromatic vs 'C' aliphatic).
            # No cap on the vocabulary: a cap would make texts_to_sequences silently
            # drop rare characters and produce sequences shorter than max_length.
            num_words = None
        )

        tokenizer.fit_on_texts(self.molecule_list)
        char_to_index = tokenizer.word_index
        return tokenizer, char_to_index

    def str_to_encode(self):
        """
        Converts the preprocessed SMILES strings into one-hot encoded sequences.

        Returns:
            numpy.ndarray: A 3D array of shape
                           (num_molecules, max_length, len(char_to_index) + 1).
        """
        sequences = self.tokenizer.texts_to_sequences(self.molecule_list)

        # Tokenizer ids start at 1; shift to 0-based columns for one-hot encoding.
        indices = np.asarray(sequences, dtype=int) - 1
        return to_categorical(indices, num_classes = self._num_classes())

    def get_targets(self, sequences):
        """
        Builds next-character targets for the given one-hot inputs.

        ``y[i, t, :]`` is ``sequences[i, t + 1, :]`` for every step except the last;
        the last step targets the padding character 'A'.

        Args:
            sequences (numpy.ndarray): One-hot input sequences (X) as produced by
                                       ``str_to_encode``.

        Returns:
            numpy.ndarray: Target array (y) with the same shape as ``sequences``.
        """
        y = np.zeros(sequences.shape)
        # Shift the inputs one step to the left.
        y[:, :-1, :] = sequences[:, 1:, :]

        padding_char_id = self.char_to_index.get('A')
        if padding_char_id is None:
            # No molecule needed padding, so 'A' is not in the vocabulary:
            # repeat the final input character as the final target instead.
            y[:, -1, :] = sequences[:, -1, :]
        else:
            # One-hot columns are the tokenizer ids shifted down by one.
            y[:, -1, padding_char_id - 1] = 1
        return y

    def build_lstm_model(self, num_layers_LSTM = 2):
        """
        Builds and compiles a Sequential Keras LSTM model for character generation.

        Args:
            num_layers_LSTM (int): The number of LSTM layers to include in the model.
                                   Defaults to 2.

        Returns:
            tensorflow.keras.models.Sequential: The compiled Keras LSTM model.
        """
        vocab_size = self._num_classes()
        # Input shape of one sample: (sequence_length, one_hot_width).
        input_shape = (self.max_length, vocab_size)

        model = Sequential()
        model.add(InputLayer(input_shape=input_shape))

        for _ in range(num_layers_LSTM):
            # return_sequences=True so every time step yields a prediction.
            # NOTE(review): 'relu' replaces the LSTM default activation; confirm this
            # is intended, as it can be less numerically stable than the default.
            model.add(LSTM(256, return_sequences=True, activation='relu'))
            model.add(Dropout(0.2))

        # Per-time-step projection onto the vocabulary, then softmax probabilities.
        model.add(Dense(vocab_size))
        model.add(Softmax())

        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics = ['accuracy'])

        return model

    def train_lstm_model(self, model, sequences, epochs=25, batch_size = 16):
        """
        Trains the model on next-character prediction.

        20% of the data is held out and reported as validation metrics each epoch.

        Args:
            model (tensorflow.keras.models.Sequential): The compiled Keras LSTM model to be trained.
            sequences (numpy.ndarray): The one-hot encoded input sequences (X).
            epochs (int): Number of epochs to train the model for. Defaults to 25.
            batch_size (int): Number of samples per gradient update. Defaults to 16.

        Returns:
            tensorflow.keras.callbacks.History: A History object containing training
            and validation metrics.
        """
        x = sequences
        y = self.get_targets(x)

        train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state = 42)

        # The held-out split is used for validation so it is not wasted.
        return model.fit(
            train_x,
            train_y,
            epochs=epochs,
            batch_size=batch_size,
            validation_data=(test_x, test_y),
        )

    def sample_with_temperature(self, predictions, temperature=0.8):
        """
        Samples an index from a probability distribution rescaled by a temperature.

        Temperatures above 1 flatten the distribution (more random); temperatures
        below 1 sharpen it. A temperature of exactly 0 returns the most likely index.

        Args:
            predictions (array-like): Predicted probabilities for the next character.
            temperature (float): Non-negative sampling temperature. Defaults to 0.8.

        Returns:
            int: The index of the sampled character.

        Raises:
            ValueError: If ``temperature`` is negative.
        """
        if temperature < 0:
            raise ValueError("temperature must be non-negative")

        predictions = np.asarray(predictions).astype('float64')
        if temperature == 0:
            # Limit of the rescaled distribution as the temperature goes to zero.
            return int(np.argmax(predictions))

        # log(0) = -inf is intended here: such entries get probability 0 again below.
        with np.errstate(divide='ignore'):
            scaled_logs = np.log(predictions) / temperature
        # Subtracting the maximum leaves the normalised result unchanged but keeps
        # exp() from underflowing to all zeros at small temperatures.
        scaled_logs = scaled_logs - np.max(scaled_logs)
        weights = np.exp(scaled_logs)
        probabilities = weights / np.sum(weights)

        # One multinomial draw gives a one-hot vector; its argmax is the sample.
        draw = np.random.multinomial(1, probabilities, 1)
        return int(np.argmax(draw))

    def generate_molecule(self, model, temperature=1.0):
        """
        Generates a SMILES string character by character with the trained model.

        Generation starts from the 'G' token and stops when 'E' is sampled or when
        the string reaches ``max_length`` characters.

        Args:
            model (tensorflow.keras.models.Sequential): The trained Keras LSTM model.
            temperature (float): Sampling temperature passed to
                                 ``sample_with_temperature``. Defaults to 1.0.

        Returns:
            str: The generated string, including the leading 'G' and, if sampled,
                 the trailing 'E'.
        """
        start_token = 'G'
        generated_molecule = start_token

        num_classes = self._num_classes()
        vocab_size = len(self.char_to_index)
        # Pad on the right with the padding character, exactly like the training data.
        # If 'A' is absent, any valid id works: the model only looks at earlier steps.
        padding_char_id = self.char_to_index.get('A', 1)

        # 'G' is already present, so at most max_length - 1 characters are added.
        for _ in range(self.max_length - 1):
            token_ids = self.tokenizer.texts_to_sequences([generated_molecule])[0]
            padded_ids = pad_sequences(
                [token_ids],
                maxlen=self.max_length,
                padding='post',
                value=padding_char_id,
            )

            # Shift the 1-based tokenizer ids to 0-based one-hot columns.
            model_input = to_categorical(padded_ids - 1, num_classes=num_classes)

            # The output at the position of the last real character is the
            # distribution over the next character.
            last_position = len(generated_molecule) - 1
            next_char_probs = model.predict(model_input, verbose=0)[0, last_position]

            # Ignore the unused extra class so every sample maps to a real character.
            sampled_column = self.sample_with_temperature(
                next_char_probs[:vocab_size], temperature=temperature
            )
            predicted_char = self.tokenizer.index_word[sampled_column + 1]

            generated_molecule += predicted_char

            if predicted_char == 'E':
                break

        return generated_molecule

# Testing the Complete Pipeline
This section loads SMILES data, builds an LSTM model, trains it, and generates new molecules.

In [None]:
# End-to-end run of the pipeline on a small sample:
# 15 lines of the file, molecules of length 20..100.
generating = SmilesToOneHotEncoding("/data_50_len100.txt",15,20,100)

# Load file + Translation + Padding
# The constructor already ran the preprocessing, so reuse its result
# instead of reading and processing the file a second time.
list_of_molecules = generating.molecule_list

# Make Vocabulary
# Likewise, the tokenizer was fitted in the constructor.
tokenizer, char_to_index = generating.tokenizer, generating.char_to_index
print("Index that represent the char",char_to_index)

# String to Encode
sequence = generating.str_to_encode()

# Build LSTM model
model = generating.build_lstm_model(num_layers_LSTM = 2)
# summary() prints the table itself and returns None, so it is not wrapped in print().
model.summary()

# Train LSTM model
training = generating.train_lstm_model(model, sequence, 25, 16)
# Show the recorded per-epoch metrics rather than the History object's repr.
print(training.history)

# Generate Molecule (1)
generate_molecule = generating.generate_molecule(model)
print(generate_molecule)

Index that represent the char {'A': 1, 'c': 2, 'C': 3, '(': 4, ')': 5, '1': 6, 'n': 7, '2': 8, 'N': 9, 'O': 10, '3': 11, 'G': 12, 'E': 13, '-': 14, '=': 15, '4': 16, '#': 17, '[': 18, 'H': 19, ']': 20, 'L': 21, 's': 22, 'F': 23}
Sequences do str_to_encode [[12, 3, 10, 2, 6, 2, 2, 2, 2, 2, 6, 14, 2, 6, 2, 2, 2, 7, 8, 7, 2, 4, 9, 5, 7, 2, 6, 8, 13, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [12, 9, 3, 4, 15, 10, 5, 2, 6, 2, 2, 4, 14, 2, 8, 2, 2, 2, 2, 2, 8, 5, 22, 2, 6, 9, 2, 6, 2, 2, 2, 7, 2, 6, 13, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1], [12, 9, 3, 3, 7, 6, 2, 4, 3, 8, 3, 3, 9, 3, 3, 8, 5, 7, 2, 8, 2, 2, 4, 3, 4, 9, 5, 15, 10, 5, 2, 2, 2, 8, 6, 13, 1, 1, 1, 1, 1, 1