In [13]:
import numpy as np
import pickle 


# Function for one-hot encoding 
def one_hot_encode(smiles_string, char_to_int, vocabulary_size):
    """Build a one-hot matrix for a single string.

    Row ``r`` of the result corresponds to the ``r``-th character of
    ``smiles_string``; it holds a 1 in the column that ``char_to_int``
    assigns to that character and 0 everywhere else.

    Parameters
    ----------
    smiles_string : str
        String to encode, one row per character.
    char_to_int : dict
        Maps each character to its column index.
    vocabulary_size : int
        Number of columns in the result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(smiles_string), vocabulary_size)`` in
        NumPy's default float dtype.

    Raises
    ------
    KeyError
        If a character of ``smiles_string`` is not a key of ``char_to_int``.
    """
    n_rows = len(smiles_string)
    matrix = np.zeros((n_rows, vocabulary_size))
    for row, symbol in zip(range(n_rows), smiles_string):
        column = char_to_int[symbol]
        matrix[row, column] = 1
    return matrix

# Read the data from the TSV file
def read_data_tsv(file_path):
    """Read the SMILES strings from the first column of a tab-separated file.

    Each line is split on tabs and the first field is taken as the SMILES
    string, with surrounding whitespace removed. Lines that are blank, or
    whose first field is empty, are skipped so they do not turn into
    empty entries in the result.

    Parameters
    ----------
    file_path : str or path-like
        Path of the TSV file to read.

    Returns
    -------
    list of str
        The non-empty first-column values, in file order.
    """
    data = []
    with open(file_path, 'r') as file:
        for line in file:
            # Remove only the line terminator. A full strip() would also eat
            # a leading tab and shift the second column into position 0.
            fields = line.rstrip('\r\n').split('\t')
            smiles = fields[0].strip()
            if not smiles:
                # Blank line or empty SMILES field: nothing to encode.
                continue
            data.append(smiles)
    return data


# Create a vocabulary of unique characters
def create_vocabulary(data, output_file_path):
    """Build the character vocabulary of ``data`` and pickle the mappings.

    The vocabulary is every distinct character occurring in the strings of
    ``data``. Characters are numbered in sorted order so that the mapping
    is identical on every run for the same input.

    Parameters
    ----------
    data : iterable of str
        Strings whose characters make up the vocabulary.
    output_file_path : str or path-like
        Destination file; receives the pickled tuple
        ``(char_to_int, int_to_char)``.

    Returns
    -------
    tuple of (dict, dict)
        ``char_to_int`` maps each character to its index and
        ``int_to_char`` is the inverse mapping.
    """
    # Sort before numbering: iteration order of a set of strings depends on
    # per-process hash randomization, so unsorted indices would change from
    # one interpreter run to the next.
    vocabulary = sorted(set(''.join(data)))
    char_to_int = {char: i for i, char in enumerate(vocabulary)}
    int_to_char = dict(enumerate(vocabulary))
    with open(output_file_path, 'wb') as file:
        pickle.dump((char_to_int, int_to_char), file)
    return char_to_int, int_to_char

# Perform one-hot encoding on the entire dataset
def one_hot_encode_dataset(data, char_to_int, vocabulary_size):
    """One-hot encode every string in ``data`` into one zero-padded array.

    Each string is encoded with ``one_hot_encode`` and placed at the start
    of its slot; positions past the end of a shorter string stay all-zero.

    Parameters
    ----------
    data : sequence of str
        Strings to encode. Must support ``len()`` and repeated iteration.
    char_to_int : dict
        Maps each character to its column index.
    vocabulary_size : int
        Size of the last axis of the result.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(len(data), max_length, vocabulary_size)`` where
        ``max_length`` is the length of the longest string, or 0 when
        ``data`` is empty.
    """
    # default=0 lets an empty dataset yield an empty array instead of
    # raising "max() arg is an empty sequence".
    max_length = max((len(smiles_string) for smiles_string in data), default=0)
    encoded_data = np.zeros((len(data), max_length, vocabulary_size))
    for i, smiles_string in enumerate(data):
        # The array is already zero-filled, so writing the leading rows is
        # the same as padding the tail with zeros, without a temporary copy.
        encoded_data[i, :len(smiles_string)] = one_hot_encode(
            smiles_string, char_to_int, vocabulary_size
        )
    return encoded_data


# Save the encoded data to a new NPY file
def save_encoded_data(encoded_data, output_file_path):
    """Write ``encoded_data`` to ``output_file_path`` using ``numpy.save``.

    Parameters
    ----------
    encoded_data : numpy.ndarray
        Array to store.
    output_file_path : str or path-like
        Destination passed straight through to ``numpy.save``.
    """
    np.save(file=output_file_path, arr=encoded_data)

# Main function
if __name__ == "__main__":
    # Input and output locations, relative to the working directory.
    input_file_path = "structures.smiles.tsv"
    output_file_path = "encoded_data.npy"  # encoded array, NumPy binary format
    vocab_file_path = "char_int_mapping.pkl"  # pickled (char_to_int, int_to_char)

    # Load the SMILES strings, then derive the character vocabulary from
    # them; create_vocabulary also writes the mappings to vocab_file_path.
    data = read_data_tsv(input_file_path)
    char_to_int, int_to_char = create_vocabulary(data, vocab_file_path)

    # Encode every string with one column per vocabulary character.
    vocabulary_size = len(char_to_int)
    encoded_data = one_hot_encode_dataset(data, char_to_int, vocabulary_size)

    # Persist the encoded array and report where it went.
    save_encoded_data(encoded_data, output_file_path)
    print("One-hot encoding completed and saved to", output_file_path)


One-hot encoding completed and saved to encoded_data.npy
