<a href="https://colab.research.google.com/github/aahmadf123/ML_ChemE/blob/main/LSTM_AMPs_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Generative ML Models in Protein Engineering

Reference:  
Mardikoraem M, Wang Z, Pascual N, Woldring D. Generative models for protein sequence modeling: recent advances and future directions. Brief Bioinform. 2023 Sep 22;24(6):bbad358. doi: 10.1093/bib/bbad358. PMID: 37864295; PMCID: PMC10589401.  
[paper](https://academic.oup.com/bib/article/24/6/bbad358/7325909)  
[Models](https://drive.google.com/file/d/1-hoO3tdwTOhPSf0qR6WcuifX3MD-FUN0/view?usp=drive_link)  


# LSTM Generative Model for Antimicrobial Peptides
===================================================================

This script shows how to:

1.   Preprocess AMP sequences (tokenize amino acids).
2.   Train an LSTM-based model to predict the next amino acid.
3.   Generate new sequences by sampling from the trained model.

Examples of AMP structures and sequences  
https://www.rcsb.org/structure/6G4I --> 'FLPILASLAAKFGPKLFCLVTKKC'  
https://www.rcsb.org/structure/2L36 --> 'GIGKALKKAKKGIGAVLKVLTTGL'  

AMP Sequence Database: https://aps.unmc.edu/downloads

[Real AMP Structures](https://drive.google.com/file/d/1-d_51oVdIBav2ls8hFX9pxv6ayfWqycl/view?usp=sharing)

In [1]:
import numpy as np
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.optimizers import Adam

## 1. Antimicrobial Peptide (AMP) Sequence Dataset
Note: With only ~150 AMP sequences (each length 24), overfitting is likely.
      Consider data augmentation, dropout, or pretraining on larger protein sets.

In [2]:
amp_sequences = [
            'FLPLLAGLAANFLPTIICKISYKC',
            'FLPFIARLAAKVFPSIICSVTKKC',
            'GVLSNVIGYLKKLGTGALNAVLKQ',
            'GLFSVLGAVAKHVLPHVVPVIAEK',
            'GLFKVLGSVAKHLLPHVAPVIAEK',
            'GLFKVLGSVAKHLLPHVVPVIAEK',
            'GLFGVLGSIAKHVLPHVVPVIAEK',
            'MFFSSKKCKTVSKTFRGPCVRNAN',
            'LLKELWTKMKGAGKAVLGKIKGLL',
            'LLKELWTKIKGAGKAVLGKIKGLL',
            'FWGALIKGAAKLIPSVVGLFKKKQ',
            'FLPVVAGLAAKVLPSIICAVTKKC',
            'FLPAIVGAAGQFLPKIFCAISKKC',
            'FLPAIVGAAGKFLPKIFCAISKKC',
            'FFPIVAGVAGQVLKKIYCTISKKC',
            'FLPIIAGIAAKVFPKIFCAISKKC',
            'FLPMLAGLAASMVPKLVCLITKKC',
            'FLPMLAGLAASMVPKFVCLITKKC',
            'FLPFIAGMAAKFLPKIFCAISKKC',
            'FLPAIAGMAAKFLPKIFCAISKKC',
            'FLPFIAGVAAKFLPKIFCAISKKC',
            'FLPAIAGVAAKFLPKIFCAISKKC',
            'FLPAIVGAAAKFLPKIFCVISKKC',
            'FLPFIAGMAANFLPKIFCAISKKC',
            'FLPIIAGVAAKVFPKIFCAISKKC',
            'FLPIIASVAAKVFSKIFCAISKKC',
            'FLPIIASVAANVFSKIFCAISKKC',
            'GLNTLKKVFQGLHEAIKLINNHVQ',
            'GLNALKKVFQGIHEAIKLINNHVQ',
            'DSHAKRHHGYKRKFHEKHHSHRGY',
            'FLPLLAGLAANFLPKIFCKITKKC',
            'FLPILAGLAAKIVPKLFCLATKKC',
            'FLPLIAGLAANFLPKIFCAITKKC',
            'FLPVIAGVAAKFLPKIFCAITKKC',
            'FWGALAKGALKLIPSLFSSFSKKD',
            'ITSVSWCTPGCTSEGGGSGCSHCC',
            'GLLNGLALRLGKRALKKIIKRLCR',
            'ALWKDILKNAGKAALNEINQLVNQ',
            'GLRSKIWLWVLLMIWQESNKFKKM',
            'GKGRWLERIGKAGGIIIGGALDHL',
            'FLGALIKGAIHGGRFIHGMIQNHH',
            'FLGLLFHGVHHVGKWIHGLIHGHH',
            'FLPMLAGLAANFLPKLFCKITKKC',
            'FLPLAVSLAANFLPKLFCKITKKC',
            'FLPLLAGLAANFFPKIFCKITRKC',
            'FLPILASLAAKFGPKLFCLVTKKC',
            'FLPILASLAAKLGPKLFCLVTKKC',
            'FLPILASLAATLGPKLLCLITKKC',
            'GIFSNMYARTPAGYFRGPAGYAAN',
            'GLKDKFKSMGEKLKQYIQTWKAKF',
            'SLKDKVKSMGEKLKQYIQTWKAKF',
            'GFRDVLKGAAKAFVKTVAGHIANI',
            'GIKDWIKGAAKKLIKTVASNIANQ',
            'GFKDWIKGAAKKLIKTVASSIANQ',
            'VIPFVASVAAEMMQHVYCAASKKC',
            'FFGTALKIAANVLPTAICKILKKC',
            'FFGTALKIAANILPTAICKILKKC',
            'ILPFVAGVAAEMMQHVYCAASKKC',
            'FLPAIVGAAAKFLPKIFCAISKKC',
            'FLPIIAGVAAKVLPKIFCAISKKC',
            'FLPIIAGIAAKFLPKIFCTISKKC',
            'FLPVIAGVAANFLPKLFCAISKKC',
            'FLPIIAGAAAKVVQKIFCAISKKC',
            'FLPIIAGAAAKVVEKIFCAISKKC',
            'FLPAVLRVAAKIVPTVFCAISKKC',
            'FLPAVLRVAAQVVPTVFCAISKKC',
            'FMGGLIKAATKIVPAAYCAITKKC',
            'FLPILAGLAAKLVPKVFCSITKKC',
            'FLPILAGLAANILPKVFCSITKKC',
            'FFPIIAGMAAKLIPSLFCKITKKC',
            'FMGSALRIAAKVLPAALCQIFKKC',
            'DSHEKRHHEHRRKFHEKHHSHRGY',
            'WRSLGRTLLRLSHALKPLARRSGW',
            'VTSWSLCTPGCTSPGGGSNCSFCC',
            'VIPFVASVAAEMMHHVYCAASKRC',
            'SPAGCRFCCGCCPNMRGCGVCCRF',
            'GRGREFMSNLKEKLSGVKEKMKNS',
            'FLPVLTGLTPSIVPKLVCLLTKKC',
            'FLPVLAGLTPSIVPKLVCLLTKKC',
            'FFPMLAGVAARVVPKVICLITKKC',
            'DSMGAVKLAKLLIDKMKCEVTKAC',
            'FLPGVLRLVTKVGPAVVCAITRNC',
            'VIVFVASVAAEMMQHVYCAASKKC',
            'FLPAVIRVAANVLPTAFCAISKKC',
            'IDPFVAGVAAEMMQHVYCAASKKC',
            'INPFVAGVAAEMMQHVYCAASKKC',
            'ILPFVAGVAAEMMKHVYCAASKKC',
            'IIPFVAGVAAEMMEHVYCAASKKC',
            'QLPFVAGVACEMCQCVYCAASKKC',
            'ILPFVAGVAAEMMEHVYCAASKKC',
            'ILPFVAGVAAMEMEHVYCAASKKC',
            'FLPAVLLVATHVLPTVFCAITRKC',
            'IPWKLPATFRPVERPFSKPFCRKD',
            'FLPLLAGVVANFLPQIICKIARKC',
            'FLGSLLGLVGKVVPTLFCKISKKC',
            'FIGPVLKIAAGILPTAICKIFKKC',
            'FVGPVLKIAAGILPTAICKIYKKC',
            'FLGPIIKIATGILPTAICKFLKKC',
            'FLPLIASLAANFVPKIFCKITKKC',
            'FLPLIASVAANLVPKIFCKITKKC',
            'FLSTLLKVAFKVVPTLFCPITKKC',
            'KRKCPKTPFDNTPGAWFAHLILGC',
            'FLGLIFHGLVHAGKLIHGLIHRNR',
            'FLPAVIRVAANVLPTVFCAISKKC',
            'FLPAVLRVAAKVVPTVFCLISKKC',
            'FLSTALKVAANVVPTLFCKITKKC',
            'FLPIVAGLAANFLPKIVCKITKKC',
            'FLSTLLNVASNVVPTLICKITKKC',
            'FLSTLLNVASKVVPTLFCKITKKC',
            'FLPMLAGLAANFLPKIVCKITKKC',
            'FIGPVLKMATSILPTAICKGFKKC',
            'FLGPIIKMATGILPTAICKGLKKC',
            'FLPIIAGVAAKVLPKLFCAITKKC',
            'FLPVIAGLAAKVLPKLFCAITKKC',
            'RKGWFKAMKSIAKFIAKEKLKEHL',
            'FLPAVLKVAAHILPTAICAISRRC',
            'FMGTALKIAANVLPAAFCKIFKKC',
            'KLGFENFLVKALKTVMHVPTSPLL',
            'GWLPTFGKILRKAMQLGPKLIQPI',
            'GNGVVLTLTHECNLATWTKKLKCC',
            'ITIPPIVKNTLKKFIKGAVSALMS',
            'FLPGLIKAAVGVGSTILCKITKKC',
            'FLPGLIKAAVGIGSTIFCKISKKC',
            'FLPGLIKVAVGVGSTILCKITKKC',
            'FLPGLIKAAVGIGSTIFCKISRKC',
            'FLPMLAGLAANFLPKIICKITKKC',
            'FLPIVASLAANFLPKIICKITKKC',
            'FWGALAKGALKLIPSLVSSFTKKD',
            'FFPLIAGLAARFLPKIFCSITKRC',
            'VIPFVASVAAEMMQHVYCAASKRC',
            'FFPSIAGLAAKFLPKIFCSITKRC',
            'FLPAVLRVAAKVGPAVFCAITQKC',
            'FLGMLLHGVGHAIHGLIHGKQNVE',
            'NPAGCRFCCGCCPNMIGCGVCCRF',
            'IWSFLIKAATKLLPSLFGGGKKDS',
            'RNGCIVDPRCPYQQCRRPLYCRRR',
            'ILELAGNAARDNKKTRIIPRHLQL',
            'FLPLLAGLAANFLPTIICKIARKC',
            'FLPAIIGMAAKVLPAFLCKITKKC',
            'RRRRRFRRVIRRIRLPKYLTINTE',
            'GNGVLKTISHECNMNTWQFLFTCC',
            'FLPILAGLAANLVPKLICSITKKC',
            'FLGAVLKVAGKLVPAAICKISKKC',
            'FLGALFKVASKLVPAAICSISKKC',
            'FLPVIAGIAANVLPKLFCKLTKRC',
            'FFPIIARLAAKVIPSLVCAVTKKC',
            'KRVNWRKVGRNTALGASYVLSFLG',
            'GHSVDRIPEYFGPPGLPGPVLFYS',
            'FLPLIAGVAAKVLPKIFCAISKKC',
            'SDSVVSDIICTTFCSVTWCQSNCC',
            'FLPLLAGLAANFLPQIICKIARKC',
            'FLGTVLKVAAKVLPAALCQIFKKC',
            'QSHLSMCRYCCCKGNKGCGFCCKF',
            'VFDIIKDAGKQLVAHAMGKIAEKV',
            'VFDIIKDAGRQLVAHAMGKIAEKV',
            'FLPLLAGLAASFLPTIFCKISRKC',
            'FFPIVAGVAAKVLKKIFCTISKKC',
    # AMP sequences, each of length 24
]

## 2. Build a character-to-index mapping
Depending on your data, you might have 20 canonical amino acids + special tokens if needed.

In [3]:
# Alphabet of the corpus: every distinct residue letter that occurs in any peptide,
# sorted so the letter -> id assignment is the same on every run.
# e.g., unique_amino_acids might look like: ["A", "C", "D", "E", ..., "Y"]
unique_amino_acids = sorted(set("".join(amp_sequences)))

# Reverse table (integer id -> letter) first, then invert it for the forward table.
idx_to_char = dict(enumerate(unique_amino_acids))
char_to_idx = {letter: token_id for token_id, letter in idx_to_char.items()}

# Number of distinct tokens (20 if the corpus uses exactly the canonical amino acids).
vocab_size = len(idx_to_char)


## 3. Convert sequences to integer arrays

In [4]:
# Translate each peptide into a row of integer token ids.
# Assumes every peptide has the same length so the result is a rectangular array.
encoded_sequences = np.array(
    [[char_to_idx[residue] for residue in peptide] for peptide in amp_sequences]
)  # shape: (num_sequences, seq_length)




## 4. Prepare training data
   We can train a "next-character prediction" model.   
   We can treat the amino acid sequence as a time series.    
   For each position t in a sequence, predict the amino acid at position t+1.       
   We'll "shift" the sequence by 1 for targets.    
   Input: [X_0, X_1, ..., X_{22}],   
   Target: [X_1, X_2, ..., X_{23}].
   
   We do this for all sequences.

In [5]:
# Next-token setup: inputs drop the final residue, targets drop the first one,
# so y[:, t] is the residue that follows X[:, t] in the original peptide.
X, y = encoded_sequences[:, :-1], encoded_sequences[:, 1:]

## 5. Define LSTM model

**Sequential**: This creates a linear stack of layers to build the LSTM model.  

**Embedding**: This layer converts each amino acid index into a dense vector representation (embedding) of size embedding_dim. This allows the model to capture relationships between amino acids.  

**LSTM**: This is the core layer, learning long-term dependencies in the sequence data. lstm_units sets the dimensionality of the LSTM's hidden state.
return_sequences=True makes the LSTM output a sequence for each input sequence,
  necessary for predicting the next amino acid at each position.  

**Dense**: This is the output layer, with vocab_size neurons. It uses the 'softmax'
  activation to produce a probability distribution over all possible amino acids,
  representing the model's prediction for the next amino acid in the sequence.  

**Adam**: An optimization algorithm that helps the model learn more effectively.  

**compile**: Configures the model for training, specifying the loss function, optimizer, and evaluation metrics.  

**model.summary()**: Prints a summary of the model's architecture.


In [6]:
# Hyperparameters
embedding_dim = 8   # size of the dense vector learned for each amino acid
lstm_units = 64     # dimensionality of the LSTM hidden state

# Number of time steps per training example, taken from the data instead of a
# hard-coded 23 so the model stays in sync with X if the peptide length changes.
input_len = X.shape[1]

model = Sequential()

# Embedding layer: (vocab_size) distinct amino acid characters -> embedding_dim vectors.
# The `input_length` argument is deprecated (and ignored) by Keras 3, so it is not
# passed here; the input shape is fixed by the explicit build() call below.
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))

# LSTM layer; return_sequences=True yields one output per time step, which is what
# per-position next-residue prediction needs.
model.add(LSTM(lstm_units, return_sequences=True))

# Final Dense layer for classification over the vocabulary
model.add(Dense(vocab_size, activation='softmax'))

optimizer = Adam(learning_rate=0.01)
model.compile(loss='sparse_categorical_crossentropy', optimizer=optimizer, metrics=['accuracy'])

# Build with a concrete (batch, time) shape so summary() reports real output
# shapes and parameter counts instead of an unbuilt model.
model.build(input_shape=(None, input_len))
model.summary()





## 6. Train the model
*Note: Because the dataset is small, this is primarily an illustrative example.*

**X** and **y**: Represent the input and target data for training.
  X contains each encoded AMP sequence without its last residue (positions 0–22),
  and y contains the same sequence without its first residue (positions 1–23),
  so at every position the model learns to predict the next amino acid in the sequence.  
**epochs**: The number of times the model sees the entire training dataset.  
**batch_size**: The number of samples processed before the model's internal parameters are updated.  
**model.fit**: Starts the training process.


In [7]:
# Training schedule: 50 passes over the data, 16 sequences per gradient update.
epochs, batch_size = 50, 16
model.fit(x=X, y=y, batch_size=batch_size, epochs=epochs)

Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 21ms/step - accuracy: 0.1388 - loss: 2.8691
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 20ms/step - accuracy: 0.1848 - loss: 2.6058
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.2054 - loss: 2.5384
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step - accuracy: 0.2187 - loss: 2.4263
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - accuracy: 0.2674 - loss: 2.3375
Epoch 6/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step - accuracy: 0.3166 - loss: 2.1431
Epoch 7/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - accuracy: 0.3821 - loss: 2.0508
Epoch 8/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - accuracy: 0.4203 - loss: 1.9191
Epoch 9/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7b2a05d23ad0>

## 7. Generating new sequences

**generate_sequence**: This function takes the trained model, a starting sequence (seed_seq),
  and a desired sequence length as input. It uses the model to predict the next amino acid step-by-step, generating a new sequence.  
**seed**: The starting point for sequence generation, in this case, the amino acid 'F'.  
The loop runs 20 times, generating and printing 20 new AMP sequences.


In [8]:
def generate_sequence(model, seed_seq, length=24, context_len=23, index_to_char=None):
    """
    Generate a new sequence of desired length using the trained model.

    At every step the model is fed the sequence generated so far (right-padded
    with zeros to ``context_len``) and the next residue is sampled from the
    distribution predicted at the position of the last real token. Feeding the
    whole prefix matters: passing only the last token behind a run of zeros
    would condition every step on a fake prefix of token 0 and throw away
    everything generated so far.

    :param model: trained LSTM model; ``model.predict`` must return an array of
        shape (1, context_len, vocab_size) holding per-position probabilities
    :param seed_seq: non-empty list of integer-encoded amino acids (starting sequence)
    :param length: desired total length of generated sequence; if the seed is
        already at least this long it is returned unchanged (decoded)
    :param context_len: number of time steps fed to the model per call; the
        default of 23 matches the training inputs (24-residue peptides minus
        the last residue). Longer prefixes are truncated to their most recent
        ``context_len`` tokens.
    :param index_to_char: mapping from token id to amino-acid letter; defaults
        to the notebook-level ``idx_to_char`` table
    :return: string of amino acids
    :raises ValueError: if ``seed_seq`` is empty
    """
    if len(seed_seq) == 0:
        raise ValueError("seed_seq must contain at least one token")
    if index_to_char is None:
        index_to_char = idx_to_char  # notebook-level decoding table

    generated = [int(token) for token in seed_seq]  # copy; the caller's seed is not modified

    while len(generated) < length:
        # Most recent tokens that fit into the model's input window.
        window = generated[-context_len:]

        # Integer ids, right-padded with zeros. The LSTM runs left to right, so the
        # padding after the last real token cannot affect the position read below.
        padded_seq = np.zeros((1, context_len), dtype="int32")
        padded_seq[0, :len(window)] = window

        # Distribution over the next residue, read at the last real token's position.
        preds = model.predict(padded_seq, verbose=0)[0, len(window) - 1, :]

        next_idx = int(np.random.choice(len(preds), p=preds))
        generated.append(next_idx)

    # Convert generated integer tokens to string
    return "".join(index_to_char[idx] for idx in generated)

# Demo: sample 20 peptides, each grown from the single-residue seed 'F'.
# Any token present in the vocabulary can be used as the seed instead.
seed = [char_to_idx['F']]
for _ in range(20):
    new_peptide = generate_sequence(model, seed, length=24)
    print("Generated Peptide:", new_peptide)

Generated Peptide: FKKNKNNKLKKKKFNNKNKNKKKG
Generated Peptide: FKNKNKNNKKNNKKKKNKKNNKLG
Generated Peptide: FKNKKNNKSKKNKFKLFKKNKNNN
Generated Peptide: FKNNKKKKLKFNKKKNKKNNKGKK
Generated Peptide: FKKLKKKKNNKNKKNKKFKNKFKK
Generated Peptide: FKNKLKFKGKGKKKKNNNNNKNKN
Generated Peptide: FKKKKKKNNKNNKFKFKNKNKFKF
Generated Peptide: FKKKNKNKNKKKLKKNKGKKGNKK
Generated Peptide: FKKNKKLKNNKNNNNKNKKFKGNN
Generated Peptide: FKNNKNKKNNKVGKKFKFKKNKNK
Generated Peptide: FKVKFKKKLNNKLKNNNNKKKKLK
Generated Peptide: FKFKKKNNNGNNKKKNKFKKNNGK
Generated Peptide: FKKFKKNKKKNKNNNKKNKNKFKN
Generated Peptide: FKKGNKKVKNKKNKKVKKGKKNKN
Generated Peptide: FKKNNKVKKKNNKKKNKLKNNKLG
Generated Peptide: FKFKKKNKFKVKKKKKFKKKNNKK
Generated Peptide: FKFKNKKFKNKNKNKKKNKFNKNN
Generated Peptide: FKFKKVGKKNKNGNKNKNKKKKFK
Generated Peptide: FCKGNNKNNNNKNNKGKFKKNKGK
Generated Peptide: FKKKKFKNKKKNKKKKFKKKLKFK


### LSTM Generated AMP Sequences (previously generated):



In [9]:
'''
FLLRYYLRFRYLRFLRYLRYYYYL
FLCRYYCRYYYLRRYCNYFLNLNL
FLRFCRYYYLGYYLLRFRYYLRRY
FLRYCRYYFGLLRYLRYLRYCRYL
FLRYLRYYYYCRFLRFLRYCRYLR
FYCRFRYLRKYLRYYYLRYYYRFR
FCNRYRYRYYRYYYLLFRYYYYLR
FLCRYCRYYYYYLRFSRFRRYYCR
FCRYCRYRYLCLRCRRYYLRRYLR
FLRLRLRRYLRLLRYCRFLRYYYL
FRYRYLRRYYFYCRLCRYLRYCRY
FCRYYYYRYCRYLRYYYLGYLRYL
FRYYLRFLRFCRRLRYLCRCRYRY
FLRYYYCRRYCRYCKYLGYCRRFR
FCRYCRYMNYFLRFLRYRFRYYYF
FRYYRYYYYYLLRYYYRRRCRYCR
FCNLYCRFCRFLRCLRYYYCRYRY
FCRRYLRYYYYYYYCRYLRYLRYY
FCRYMNLRRRLRYYLCRYCRYRYR
FLRRYYRCNRFLRYFYYLRYLRRY
'''

'\nFLLRYYLRFRYLRFLRYLRYYYYL\nFLCRYYCRYYYLRRYCNYFLNLNL\nFLRFCRYYYLGYYLLRFRYYLRRY\nFLRYCRYYFGLLRYLRYLRYCRYL\nFLRYLRYYYYCRFLRFLRYCRYLR\nFYCRFRYLRKYLRYYYLRYYYRFR\nFCNRYRYRYYRYYYLLFRYYYYLR\nFLCRYCRYYYYYLRFSRFRRYYCR\nFCRYCRYRYLCLRCRRYYLRRYLR\nFLRLRLRRYLRLLRYCRFLRYYYL\nFRYRYLRRYYFYCRLCRYLRYCRY\nFCRYYYYRYCRYLRYYYLGYLRYL\nFRYYLRFLRFCRRLRYLCRCRYRY\nFLRYYYCRRYCRYCKYLGYCRRFR\nFCRYCRYMNYFLRFLRYRFRYYYF\nFRYYRYYYYYLLRYYYRRRCRYCR\nFCNLYCRFCRFLRCLRYYYCRYRY\nFCRRYLRYYYYYYYCRYLRYLRYY\nFCRYMNLRRRLRYYLCRYCRYRYR\nFLRRYYRCNRFLRYFYYLRYLRRY\n'

# Transformer Generative Model for Antimicrobial Peptides
===============================================================================

This script demonstrates a simplified Transformer for generating short protein sequences.




In [10]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.models import Model


## 1. Antimicrobial Peptide Sequence Database
**amp_sequences**: list of the AMP protein sequences the model will learn from.

In [11]:
amp_sequences = [
            'FLPLLAGLAANFLPTIICKISYKC',
            'FLPFIARLAAKVFPSIICSVTKKC',
            'GVLSNVIGYLKKLGTGALNAVLKQ',
            'GLFSVLGAVAKHVLPHVVPVIAEK',
            'GLFKVLGSVAKHLLPHVAPVIAEK',
            'GLFKVLGSVAKHLLPHVVPVIAEK',
            'GLFGVLGSIAKHVLPHVVPVIAEK',
            'MFFSSKKCKTVSKTFRGPCVRNAN',
            'LLKELWTKMKGAGKAVLGKIKGLL',
            'LLKELWTKIKGAGKAVLGKIKGLL',
            'FWGALIKGAAKLIPSVVGLFKKKQ',
            'FLPVVAGLAAKVLPSIICAVTKKC',
            'FLPAIVGAAGQFLPKIFCAISKKC',
            'FLPAIVGAAGKFLPKIFCAISKKC',
            'FFPIVAGVAGQVLKKIYCTISKKC',
            'FLPIIAGIAAKVFPKIFCAISKKC',
            'FLPMLAGLAASMVPKLVCLITKKC',
            'FLPMLAGLAASMVPKFVCLITKKC',
            'FLPFIAGMAAKFLPKIFCAISKKC',
            'FLPAIAGMAAKFLPKIFCAISKKC',
            'FLPFIAGVAAKFLPKIFCAISKKC',
            'FLPAIAGVAAKFLPKIFCAISKKC',
            'FLPAIVGAAAKFLPKIFCVISKKC',
            'FLPFIAGMAANFLPKIFCAISKKC',
            'FLPIIAGVAAKVFPKIFCAISKKC',
            'FLPIIASVAAKVFSKIFCAISKKC',
            'FLPIIASVAANVFSKIFCAISKKC',
            'GLNTLKKVFQGLHEAIKLINNHVQ',
            'GLNALKKVFQGIHEAIKLINNHVQ',
            'DSHAKRHHGYKRKFHEKHHSHRGY',
            'FLPLLAGLAANFLPKIFCKITKKC',
            'FLPILAGLAAKIVPKLFCLATKKC',
            'FLPLIAGLAANFLPKIFCAITKKC',
            'FLPVIAGVAAKFLPKIFCAITKKC',
            'FWGALAKGALKLIPSLFSSFSKKD',
            'ITSVSWCTPGCTSEGGGSGCSHCC',
            'GLLNGLALRLGKRALKKIIKRLCR',
            'ALWKDILKNAGKAALNEINQLVNQ',
            'GLRSKIWLWVLLMIWQESNKFKKM',
            'GKGRWLERIGKAGGIIIGGALDHL',
            'FLGALIKGAIHGGRFIHGMIQNHH',
            'FLGLLFHGVHHVGKWIHGLIHGHH',
            'FLPMLAGLAANFLPKLFCKITKKC',
            'FLPLAVSLAANFLPKLFCKITKKC',
            'FLPLLAGLAANFFPKIFCKITRKC',
            'FLPILASLAAKFGPKLFCLVTKKC',
            'FLPILASLAAKLGPKLFCLVTKKC',
            'FLPILASLAATLGPKLLCLITKKC',
            'GIFSNMYARTPAGYFRGPAGYAAN',
            'GLKDKFKSMGEKLKQYIQTWKAKF',
            'SLKDKVKSMGEKLKQYIQTWKAKF',
            'GFRDVLKGAAKAFVKTVAGHIANI',
            'GIKDWIKGAAKKLIKTVASNIANQ',
            'GFKDWIKGAAKKLIKTVASSIANQ',
            'VIPFVASVAAEMMQHVYCAASKKC',
            'FFGTALKIAANVLPTAICKILKKC',
            'FFGTALKIAANILPTAICKILKKC',
            'ILPFVAGVAAEMMQHVYCAASKKC',
            'FLPAIVGAAAKFLPKIFCAISKKC',
            'FLPIIAGVAAKVLPKIFCAISKKC',
            'FLPIIAGIAAKFLPKIFCTISKKC',
            'FLPVIAGVAANFLPKLFCAISKKC',
            'FLPIIAGAAAKVVQKIFCAISKKC',
            'FLPIIAGAAAKVVEKIFCAISKKC',
            'FLPAVLRVAAKIVPTVFCAISKKC',
            'FLPAVLRVAAQVVPTVFCAISKKC',
            'FMGGLIKAATKIVPAAYCAITKKC',
            'FLPILAGLAAKLVPKVFCSITKKC',
            'FLPILAGLAANILPKVFCSITKKC',
            'FFPIIAGMAAKLIPSLFCKITKKC',
            'FMGSALRIAAKVLPAALCQIFKKC',
            'DSHEKRHHEHRRKFHEKHHSHRGY',
            'WRSLGRTLLRLSHALKPLARRSGW',
            'VTSWSLCTPGCTSPGGGSNCSFCC',
            'VIPFVASVAAEMMHHVYCAASKRC',
            'SPAGCRFCCGCCPNMRGCGVCCRF',
            'GRGREFMSNLKEKLSGVKEKMKNS',
            'FLPVLTGLTPSIVPKLVCLLTKKC',
            'FLPVLAGLTPSIVPKLVCLLTKKC',
            'FFPMLAGVAARVVPKVICLITKKC',
            'DSMGAVKLAKLLIDKMKCEVTKAC',
            'FLPGVLRLVTKVGPAVVCAITRNC',
            'VIVFVASVAAEMMQHVYCAASKKC',
            'FLPAVIRVAANVLPTAFCAISKKC',
            'IDPFVAGVAAEMMQHVYCAASKKC',
            'INPFVAGVAAEMMQHVYCAASKKC',
            'ILPFVAGVAAEMMKHVYCAASKKC',
            'IIPFVAGVAAEMMEHVYCAASKKC',
            'QLPFVAGVACEMCQCVYCAASKKC',
            'ILPFVAGVAAEMMEHVYCAASKKC',
            'ILPFVAGVAAMEMEHVYCAASKKC',
            'FLPAVLLVATHVLPTVFCAITRKC',
            'IPWKLPATFRPVERPFSKPFCRKD',
            'FLPLLAGVVANFLPQIICKIARKC',
            'FLGSLLGLVGKVVPTLFCKISKKC',
            'FIGPVLKIAAGILPTAICKIFKKC',
            'FVGPVLKIAAGILPTAICKIYKKC',
            'FLGPIIKIATGILPTAICKFLKKC',
            'FLPLIASLAANFVPKIFCKITKKC',
            'FLPLIASVAANLVPKIFCKITKKC',
            'FLSTLLKVAFKVVPTLFCPITKKC',
            'KRKCPKTPFDNTPGAWFAHLILGC',
            'FLGLIFHGLVHAGKLIHGLIHRNR',
            'FLPAVIRVAANVLPTVFCAISKKC',
            'FLPAVLRVAAKVVPTVFCLISKKC',
            'FLSTALKVAANVVPTLFCKITKKC',
            'FLPIVAGLAANFLPKIVCKITKKC',
            'FLSTLLNVASNVVPTLICKITKKC',
            'FLSTLLNVASKVVPTLFCKITKKC',
            'FLPMLAGLAANFLPKIVCKITKKC',
            'FIGPVLKMATSILPTAICKGFKKC',
            'FLGPIIKMATGILPTAICKGLKKC',
            'FLPIIAGVAAKVLPKLFCAITKKC',
            'FLPVIAGLAAKVLPKLFCAITKKC',
            'RKGWFKAMKSIAKFIAKEKLKEHL',
            'FLPAVLKVAAHILPTAICAISRRC',
            'FMGTALKIAANVLPAAFCKIFKKC',
            'KLGFENFLVKALKTVMHVPTSPLL',
            'GWLPTFGKILRKAMQLGPKLIQPI',
            'GNGVVLTLTHECNLATWTKKLKCC',
            'ITIPPIVKNTLKKFIKGAVSALMS',
            'FLPGLIKAAVGVGSTILCKITKKC',
            'FLPGLIKAAVGIGSTIFCKISKKC',
            'FLPGLIKVAVGVGSTILCKITKKC',
            'FLPGLIKAAVGIGSTIFCKISRKC',
            'FLPMLAGLAANFLPKIICKITKKC',
            'FLPIVASLAANFLPKIICKITKKC',
            'FWGALAKGALKLIPSLVSSFTKKD',
            'FFPLIAGLAARFLPKIFCSITKRC',
            'VIPFVASVAAEMMQHVYCAASKRC',
            'FFPSIAGLAAKFLPKIFCSITKRC',
            'FLPAVLRVAAKVGPAVFCAITQKC',
            'FLGMLLHGVGHAIHGLIHGKQNVE',
            'NPAGCRFCCGCCPNMIGCGVCCRF',
            'IWSFLIKAATKLLPSLFGGGKKDS',
            'RNGCIVDPRCPYQQCRRPLYCRRR',
            'ILELAGNAARDNKKTRIIPRHLQL',
            'FLPLLAGLAANFLPTIICKIARKC',
            'FLPAIIGMAAKVLPAFLCKITKKC',
            'RRRRRFRRVIRRIRLPKYLTINTE',
            'GNGVLKTISHECNMNTWQFLFTCC',
            'FLPILAGLAANLVPKLICSITKKC',
            'FLGAVLKVAGKLVPAAICKISKKC',
            'FLGALFKVASKLVPAAICSISKKC',
            'FLPVIAGIAANVLPKLFCKLTKRC',
            'FFPIIARLAAKVIPSLVCAVTKKC',
            'KRVNWRKVGRNTALGASYVLSFLG',
            'GHSVDRIPEYFGPPGLPGPVLFYS',
            'FLPLIAGVAAKVLPKIFCAISKKC',
            'SDSVVSDIICTTFCSVTWCQSNCC',
            'FLPLLAGLAANFLPQIICKIARKC',
            'FLGTVLKVAAKVLPAALCQIFKKC',
            'QSHLSMCRYCCCKGNKGCGFCCKF',
            'VFDIIKDAGKQLVAHAMGKIAEKV',
            'VFDIIKDAGRQLVAHAMGKIAEKV',
            'FLPLLAGLAASFLPTIFCKISRKC',
            'FFPIVAGVAAKVLKKIFCTISKKC',
    # ...
]


## 2. Build character-to-index mapping
**unique_amino_acids**: This extracts all the unique characters (amino acid letters) from the sequences.  
**char_to_idx**: This dictionary maps each amino acid character to a unique numerical index (e.g., 'F' might be 0, 'L' might be 1, etc.).  
**idx_to_char**: This dictionary does the reverse, mapping numerical indices back to amino acid characters.  
**vocab_size**: This stores the total number of unique amino acids in the dataset.

In [12]:
# Sorted alphabet of residue letters found in the corpus (sorted => stable ids).
unique_amino_acids = sorted(set("".join(amp_sequences)))

# id -> letter table, and its inverse letter -> id table.
idx_to_char = dict(enumerate(unique_amino_acids))
char_to_idx = {letter: token_id for token_id, letter in idx_to_char.items()}

# Total number of distinct amino-acid tokens.
vocab_size = len(idx_to_char)

## 3. Convert to integer arrays
**encoded_sequences**: This converts the original protein sequences (amp_sequences) into numerical representations using the **char_to_idx** mapping.
  Each amino acid is replaced with its corresponding index.  
**seq_length**: This sets the maximum length of the sequences the model will handle
  (24 amino acids in this case).  
**X** and **y**: These are created to train the model. **X** contains the input sequences
  (all but the last amino acid), and y contains the target sequences (all but the first amino acid). This setup is for next-token prediction, where the model learns to predict the next amino acid in a sequence.  


In [13]:
# Integer-encode every peptide: one row per sequence, one column per residue.
# Assumes all peptides share the same length so the array is rectangular.
encoded_sequences = np.array(
    [[char_to_idx[residue] for residue in peptide] for peptide in amp_sequences]
)  # shape: (num_sequences, seq_length)

# Prepare training data for next-token prediction:
# inputs are residues 0..seq_length-2, targets are residues 1..seq_length-1.
seq_length = 24
X, y = encoded_sequences[:, :-1], encoded_sequences[:, 1:]  # each: (num_sequences, seq_length-1)



## 4. Build a small Transformer model
We'll define the input, embedding, transformer block, and final dense layer.  
This section defines the architecture of the Transformer model using TensorFlow's Keras API.  
**embedding_dim**, **num_heads**, **ff_dim**: These are hyperparameters that control the size and complexity of the model.
The model consists of an input layer, an embedding layer (to represent amino acids as vectors),
  a positional encoding layer (to provide information about the order of amino acids),
  a transformer encoder block (the core of the model for learning relationships between amino acids),
  and a final dense layer (to output predictions for the next amino acid).  
**model.compile**: This configures the model for training, specifying the optimizer (adam),
  loss function (sparse_categorical_crossentropy), and metrics to track (accuracy).  
**model.summary()**: This displays a summary of the model's architecture.  

In [14]:
embedding_dim = 16
num_heads = 2
ff_dim = 32  # feed-forward layer size in transformer


class PositionalEmbedding(layers.Layer):
    """Add a learned per-position vector to a batch of token embeddings.

    The position table is a sub-layer of this layer, so its weights are tracked
    by any model that uses it and are updated during training. Calling a bare
    ``Embedding`` on a concrete ``tf.range`` tensor while wiring a functional
    model would instead bake one random, never-trained table into the graph as
    a constant.

    :param max_len: number of distinct positions (rows of the position table)
    :param output_dim: width of each position vector; must equal the token
        embedding width so the two can be summed
    """

    def __init__(self, max_len, output_dim, **kwargs):
        super().__init__(**kwargs)
        self.max_len = max_len
        self.output_dim = output_dim
        self.position_table = layers.Embedding(input_dim=max_len, output_dim=output_dim)

    def call(self, token_vectors):
        # token_vectors is (batch, steps, output_dim); look up one vector per step.
        steps = tf.shape(token_vectors)[1]
        positions = tf.range(start=0, limit=steps, delta=1)
        # (steps, output_dim) broadcasts over the batch axis.
        return token_vectors + self.position_table(positions)

    def get_config(self):
        config = super().get_config()
        config.update({"max_len": self.max_len, "output_dim": self.output_dim})
        return config


# Define Input
inputs = layers.Input(shape=(seq_length-1,))  # each example is length-1 = 23

# Token Embedding: one trainable vector per amino-acid id
token_embedding = layers.Embedding(input_dim=vocab_size, output_dim=embedding_dim)(inputs)

# Add the trainable positional embedding (one vector per input position)
x = PositionalEmbedding(max_len=seq_length-1, output_dim=embedding_dim)(token_embedding)



### Transformer Encoder Block (simplified)  
This is where self-attention is handled.  
The model uses query, key, and attention weighting, although implicitly.  
The **layers.MultiHeadAttention** layer handles these steps internally.  
By passing x as both the query and the key/value (using (x, x)), the model is essentially performing self-attention, comparing different parts of the input sequence with itself.  
The **key_dim** argument specifies the dimensionality of the keys and queries, influencing the complexity of the attention calculations.


In [15]:
# --- Self-attention sub-layer ---
# use_causal_mask=True lets each position attend only to itself and to earlier
# positions. Without the mask, position t can read token t+1, which is exactly
# its training target y[:, t], so the model can copy the answer during training;
# and at generation time the zero padding to the right of the prefix would
# influence every prediction.
attention_output = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embedding_dim)(
    x, x, use_causal_mask=True
)
attention_output = layers.Dropout(0.1)(attention_output)
x = layers.LayerNormalization(epsilon=1e-6)(x + attention_output)  # residual connection + norm

# --- Position-wise feed-forward sub-layer ---
ffn = layers.Dense(ff_dim, activation='relu')(x)
ffn = layers.Dense(embedding_dim)(ffn)  # project back to the embedding width for the residual sum
ffn = layers.Dropout(0.1)(ffn)
x = layers.LayerNormalization(epsilon=1e-6)(x + ffn)  # residual connection + norm

# Final Dense Layer over vocab: per-position probability of the next amino acid
outputs = layers.Dense(vocab_size, activation='softmax')(x)

model = Model(inputs=inputs, outputs=outputs)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
model.summary()


## 5. Train the Transformer

**epochs**: The number of times the model will go through the entire training data.  
**batch_size**: The number of training examples processed in each iteration.  
**model.fit**: This starts the training process, using the prepared data (X, y) and the specified training parameters.  

In [16]:
# Training schedule: 50 passes over the data, 16 sequences per gradient update.
epochs, batch_size = 50, 16
model.fit(x=X, y=y, batch_size=batch_size, epochs=epochs)


Epoch 1/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 8ms/step - accuracy: 0.1075 - loss: 3.1347
Epoch 2/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.1842 - loss: 2.7542
Epoch 3/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2122 - loss: 2.6545
Epoch 4/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.2606 - loss: 2.5294
Epoch 5/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.2624 - loss: 2.5138 
Epoch 6/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2798 - loss: 2.4100
Epoch 7/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3085 - loss: 2.3780
Epoch 8/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.3193 - loss: 2.3599
Epoch 9/50
[1m10/10[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

<keras.src.callbacks.history.History at 0x7b2a01b2c590>

## 6. Generation function
**generate_transformer_sequence**: This function takes the trained model and a starting amino acid (**start_token**) and generates a new peptide sequence of the specified length.  
It works by repeatedly predicting the next amino acid based on the previous ones, using the model's learned knowledge.  
The example usage demonstrates how to generate 20 new sequences starting with 'F' and 20 starting with 'G'.  


In [17]:
def generate_transformer_sequence(model, start_token, length=24, window=None, index_to_char=None):
    """
    Generate a new peptide sequence from a transformer model.

    The sequence is grown one residue at a time: the most recent tokens are
    right-padded with zeros to the model's input width, the model is queried,
    and the next residue is sampled from the distribution predicted at the
    position of the last real token.

    :param model: trained Keras model; ``model.predict`` must return an array
        indexable as ``preds[0, position]`` giving a probability vector over
        the vocabulary
    :param start_token: integer index of first amino acid
    :param length: desired total length (the start token counts towards it);
        values below 1 still return the single start residue
    :param window: number of tokens the model takes as input; defaults to the
        notebook-level ``seq_length - 1``
    :param index_to_char: mapping from integer index to amino-acid letter;
        defaults to the notebook-level ``idx_to_char``
    :return: generated amino acid sequence (string)
    :raises ValueError: if ``window`` is not positive or the model returns a
        distribution that cannot be normalised
    """
    # Fall back to the notebook globals only when the caller did not pass explicit
    # values, so existing calls keep working while the function no longer *requires*
    # hidden state (both names are rebound again by the VAE section of the notebook).
    if window is None:
        window = seq_length - 1
    if index_to_char is None:
        index_to_char = idx_to_char
    if window < 1:
        raise ValueError("window must be at least 1")

    generated = [int(start_token)]

    for _ in range(length - 1):
        # Keep at most the last `window` tokens and right-pad with zeros.
        # NOTE(review): index 0 is presumably also a real residue in the vocabulary,
        # so this padding is only harmless if the model is causal (a position's
        # prediction ignores later positions) - confirm against the model definition.
        context = generated[-window:]
        input_seq = np.zeros((1, window), dtype=np.int64)
        input_seq[0, :len(context)] = context

        preds = model.predict(input_seq, verbose=0)

        # Distribution predicted at the position of the most recent real token.
        prob_dist = np.asarray(preds[0, len(context) - 1], dtype=np.float64)

        # Softmax outputs are typically float32 and may not sum to exactly 1;
        # renormalise in float64 so np.random.choice does not reject them.
        total = prob_dist.sum()
        if not np.isfinite(total) or total <= 0:
            raise ValueError("model returned an invalid probability distribution")
        prob_dist = prob_dist / total

        # The vocabulary width is taken from the prediction itself.
        next_idx = int(np.random.choice(prob_dist.shape[0], p=prob_dist))
        generated.append(next_idx)

    # Convert to string
    return "".join(index_to_char[idx] for idx in generated)

###Transformer Generated AMP Sequences

In [18]:
# Sample 20 peptides for each seed residue: first all the 'F' runs, then all the 'G' runs.
for seed_residue in ('F', 'G'):
    start_token = char_to_idx[seed_residue]
    for i in range(20):
        new_peptide = generate_transformer_sequence(model, start_token, length=24)
        print("Generated Peptide (Transformer):", new_peptide)

Generated Peptide (Transformer): FMAAIAMAACIFHVIVKCAAYIPV
Generated Peptide (Transformer): FAAAMVAAIAMVAEAAAAAAAFFA
Generated Peptide (Transformer): FIVYAAVAAAMAVIVAMSKKHMEM
Generated Peptide (Transformer): FHEAIVIVAAMAVAIAMAALYAHI
Generated Peptide (Transformer): FAAAAIACAAMAAKVAIYVAVYAV
Generated Peptide (Transformer): FIVHHAVVALMVAMQYMAEAAGCV
Generated Peptide (Transformer): FAAAFARVAAMAAFQIIFAAAGMY
Generated Peptide (Transformer): FAAVIAYQAADAMAMYAMMAAAIQ
Generated Peptide (Transformer): FVFAVALAAAAALFFVICAAAAVV
Generated Peptide (Transformer): FLAAVAAAAAMQVIIAQVVAQALA
Generated Peptide (Transformer): FLAAAAMAAAVAKAQAAAVAKALM
Generated Peptide (Transformer): FMAIIAQHVAAAANDAQIAYAHAA
Generated Peptide (Transformer): FAAAAAAAAAEAAIVAAAAAAIIL
Generated Peptide (Transformer): FVATAIAMAAMAVFQIYCALKANM
Generated Peptide (Transformer): FIFAAAMMAAFAMEAAACAIIAMA
Generated Peptide (Transformer): FMLAAAQAAAVVAVAAVKKAIAIA
Generated Peptide (Transformer): FVYLAAAAAANVQFYVAAAAVMAM
Generated Pept

###Transformer Generated AMP Sequences (previously generated):

In [19]:
# Archived transformer samples from an earlier run: 20 peptides starting with 'F'
# followed by 20 starting with 'G'. The string is a bare expression (not bound to a
# name), so running the cell only echoes it as the cell output.
'''
FIPAIAAAAAAAAAAVFIPAACNF
FFFEIAAAAAPAAAAAACNESKAS
FYCAAADPAFNVAAYVPAPFPPCP
FKDSIAAAAPAFTAPIFCAIFKCC
FFFPIFGVAAAAAPAAFCAIAPFC
FFPAVAAAAAAVPAAAYCAATCFS
FADAVAAAAAAAAAAAYCAATAAP
FFPAAYCAAAAYCPTCCPCCPCCC
FVAAAAAAQAAGAAATAAAAAFAC
FFAYIPAAAAAFPAAAYCPYCCCC
FFDAFDAAAAAAPLAAFPAIPPVF
FCAAAAAAAAAAAAAIFFPFFPAA
FRVAAAGAAAEAAAAAIAIIAFFF
FFPAAAAAEFCAAAAYIPVIPPVC
FEEIFVAAAAAAAAAAIEAAAEAA
FFPANMVAAAEAAAAVFFPATAAY
FFFMIAAAAAAAACAPAAWHECQC
FIFAIAHAAAAAAAAFWCAAATAA
FVADFEIAAAFFFPAASFMITAAT
FYAFLAAAAAAYVAAMCAPFPPVF
GAAAFAAAAAAFAAAAYCAAAFFC
GETAAAAAAAAFVAAVYCAAFPAY
GVPAIFDTAAPAAPAAICAASWRC
GVLIAAAAAAAYLPAAAPAVIPMW
GVAARGFLAAACAAAAFCADAPSH
GAAGEAAAAAEHAAFVFAAAAAAA
GIDARAAAAAACAAAAICAFFLDC
GAAAVAAAAGAAAAAAYPAAAAEC
GFAAAAAAAAAFTVAAFCAEMTVA
GRPAAAAAAAAEECAVYCGVIPFC
GAAAIKAAAAAIVPHVFKAAEWND
GSAAAAAAAAFVAAAAVEAISKKC
GRVAFAAAAAHAAAAAVLPAAFYF
GVAAAAEIIALAAAAAAAAEEFFD
GAAAAAAVAAIAAPAAFCAAAVNC
GFPAIAAMAAFYAPAAPAMIPPAC
GAFPAPQIAAAYVPTARCAASKCC
GAAAIAPAAANDAPAAGPAASKKC
GAAAVAIVWVEMMAAMIAAAMMEG
GLFFPAAAAAAPAPAAGEFFYLFC
'''

'\nFIPAIAAAAAAAAAAVFIPAACNF\nFFFEIAAAAAPAAAAAACNESKAS\nFYCAAADPAFNVAAYVPAPFPPCP\nFKDSIAAAAPAFTAPIFCAIFKCC\nFFFPIFGVAAAAAPAAFCAIAPFC\nFFPAVAAAAAAVPAAAYCAATCFS\nFADAVAAAAAAAAAAAYCAATAAP\nFFPAAYCAAAAYCPTCCPCCPCCC\nFVAAAAAAQAAGAAATAAAAAFAC\nFFAYIPAAAAAFPAAAYCPYCCCC\nFFDAFDAAAAAAPLAAFPAIPPVF\nFCAAAAAAAAAAAAAIFFPFFPAA\nFRVAAAGAAAEAAAAAIAIIAFFF\nFFPAAAAAEFCAAAAYIPVIPPVC\nFEEIFVAAAAAAAAAAIEAAAEAA\nFFPANMVAAAEAAAAVFFPATAAY\nFFFMIAAAAAAAACAPAAWHECQC\nFIFAIAHAAAAAAAAFWCAAATAA\nFVADFEIAAAFFFPAASFMITAAT\nFYAFLAAAAAAYVAAMCAPFPPVF\nGAAAFAAAAAAFAAAAYCAAAFFC\nGETAAAAAAAAFVAAVYCAAFPAY\nGVPAIFDTAAPAAPAAICAASWRC\nGVLIAAAAAAAYLPAAAPAVIPMW\nGVAARGFLAAACAAAAFCADAPSH\nGAAGEAAAAAEHAAFVFAAAAAAA\nGIDARAAAAAACAAAAICAFFLDC\nGAAAVAAAAGAAAAAAYPAAAAEC\nGFAAAAAAAAAFTVAAFCAEMTVA\nGRPAAAAAAAAEECAVYCGVIPFC\nGAAAIKAAAAAIVPHVFKAAEWND\nGSAAAAAAAAFVAAAAVEAISKKC\nGRVAFAAAAAHAAAAAVLPAAFYF\nGVAAAAEIIALAAAAAAAAEEFFD\nGAAAAAAVAAIAAPAAFCAAAVNC\nGFPAIAAMAAFYAPAAPAMIPPAC\nGAFPAPQIAAAYVPTARCAASKCC\nGAAAIAPAAANDAPAAGPAASKKC\nGAAAVAIVW

#Variational Autoencoder (VAE) for AMP Sequences  
======================================================================  

This script demonstrates a basic character-level VAE for protein sequences.  
The encoder compresses sequences into a latent vector.  
The decoder reconstructs the original sequence from the latent vector.  
New sequences can be generated by sampling from the latent space.


In [20]:
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers, Model

##1. Antimicrobial Peptide Sequence Database  

In [21]:
# AMP training sequences for the VAE, one single-letter amino-acid string per peptide.
# This rebinds `amp_sequences`, which the LSTM section at the top of the notebook also
# defines (its first entries match these) - presumably the same dataset; TODO confirm
# and define it once.
# Downstream cells assume every entry has the same length (seq_length = 24), since the
# encoded list is turned into a rectangular NumPy array.
amp_sequences = [
            'FLPLLAGLAANFLPTIICKISYKC',
            'FLPFIARLAAKVFPSIICSVTKKC',
            'GVLSNVIGYLKKLGTGALNAVLKQ',
            'GLFSVLGAVAKHVLPHVVPVIAEK',
            'GLFKVLGSVAKHLLPHVAPVIAEK',
            'GLFKVLGSVAKHLLPHVVPVIAEK',
            'GLFGVLGSIAKHVLPHVVPVIAEK',
            'MFFSSKKCKTVSKTFRGPCVRNAN',
            'LLKELWTKMKGAGKAVLGKIKGLL',
            'LLKELWTKIKGAGKAVLGKIKGLL',
            'FWGALIKGAAKLIPSVVGLFKKKQ',
            'FLPVVAGLAAKVLPSIICAVTKKC',
            'FLPAIVGAAGQFLPKIFCAISKKC',
            'FLPAIVGAAGKFLPKIFCAISKKC',
            'FFPIVAGVAGQVLKKIYCTISKKC',
            'FLPIIAGIAAKVFPKIFCAISKKC',
            'FLPMLAGLAASMVPKLVCLITKKC',
            'FLPMLAGLAASMVPKFVCLITKKC',
            'FLPFIAGMAAKFLPKIFCAISKKC',
            'FLPAIAGMAAKFLPKIFCAISKKC',
            'FLPFIAGVAAKFLPKIFCAISKKC',
            'FLPAIAGVAAKFLPKIFCAISKKC',
            'FLPAIVGAAAKFLPKIFCVISKKC',
            'FLPFIAGMAANFLPKIFCAISKKC',
            'FLPIIAGVAAKVFPKIFCAISKKC',
            'FLPIIASVAAKVFSKIFCAISKKC',
            'FLPIIASVAANVFSKIFCAISKKC',
            'GLNTLKKVFQGLHEAIKLINNHVQ',
            'GLNALKKVFQGIHEAIKLINNHVQ',
            'DSHAKRHHGYKRKFHEKHHSHRGY',
            'FLPLLAGLAANFLPKIFCKITKKC',
            'FLPILAGLAAKIVPKLFCLATKKC',
            'FLPLIAGLAANFLPKIFCAITKKC',
            'FLPVIAGVAAKFLPKIFCAITKKC',
            'FWGALAKGALKLIPSLFSSFSKKD',
            'ITSVSWCTPGCTSEGGGSGCSHCC',
            'GLLNGLALRLGKRALKKIIKRLCR',
            'ALWKDILKNAGKAALNEINQLVNQ',
            'GLRSKIWLWVLLMIWQESNKFKKM',
            'GKGRWLERIGKAGGIIIGGALDHL',
            'FLGALIKGAIHGGRFIHGMIQNHH',
            'FLGLLFHGVHHVGKWIHGLIHGHH',
            'FLPMLAGLAANFLPKLFCKITKKC',
            'FLPLAVSLAANFLPKLFCKITKKC',
            'FLPLLAGLAANFFPKIFCKITRKC',
            'FLPILASLAAKFGPKLFCLVTKKC',
            'FLPILASLAAKLGPKLFCLVTKKC',
            'FLPILASLAATLGPKLLCLITKKC',
            'GIFSNMYARTPAGYFRGPAGYAAN',
            'GLKDKFKSMGEKLKQYIQTWKAKF',
            'SLKDKVKSMGEKLKQYIQTWKAKF',
            'GFRDVLKGAAKAFVKTVAGHIANI',
            'GIKDWIKGAAKKLIKTVASNIANQ',
            'GFKDWIKGAAKKLIKTVASSIANQ',
            'VIPFVASVAAEMMQHVYCAASKKC',
            'FFGTALKIAANVLPTAICKILKKC',
            'FFGTALKIAANILPTAICKILKKC',
            'ILPFVAGVAAEMMQHVYCAASKKC',
            'FLPAIVGAAAKFLPKIFCAISKKC',
            'FLPIIAGVAAKVLPKIFCAISKKC',
            'FLPIIAGIAAKFLPKIFCTISKKC',
            'FLPVIAGVAANFLPKLFCAISKKC',
            'FLPIIAGAAAKVVQKIFCAISKKC',
            'FLPIIAGAAAKVVEKIFCAISKKC',
            'FLPAVLRVAAKIVPTVFCAISKKC',
            'FLPAVLRVAAQVVPTVFCAISKKC',
            'FMGGLIKAATKIVPAAYCAITKKC',
            'FLPILAGLAAKLVPKVFCSITKKC',
            'FLPILAGLAANILPKVFCSITKKC',
            'FFPIIAGMAAKLIPSLFCKITKKC',
            'FMGSALRIAAKVLPAALCQIFKKC',
            'DSHEKRHHEHRRKFHEKHHSHRGY',
            'WRSLGRTLLRLSHALKPLARRSGW',
            'VTSWSLCTPGCTSPGGGSNCSFCC',
            'VIPFVASVAAEMMHHVYCAASKRC',
            'SPAGCRFCCGCCPNMRGCGVCCRF',
            'GRGREFMSNLKEKLSGVKEKMKNS',
            'FLPVLTGLTPSIVPKLVCLLTKKC',
            'FLPVLAGLTPSIVPKLVCLLTKKC',
            'FFPMLAGVAARVVPKVICLITKKC',
            'DSMGAVKLAKLLIDKMKCEVTKAC',
            'FLPGVLRLVTKVGPAVVCAITRNC',
            'VIVFVASVAAEMMQHVYCAASKKC',
            'FLPAVIRVAANVLPTAFCAISKKC',
            'IDPFVAGVAAEMMQHVYCAASKKC',
            'INPFVAGVAAEMMQHVYCAASKKC',
            'ILPFVAGVAAEMMKHVYCAASKKC',
            'IIPFVAGVAAEMMEHVYCAASKKC',
            'QLPFVAGVACEMCQCVYCAASKKC',
            'ILPFVAGVAAEMMEHVYCAASKKC',
            'ILPFVAGVAAMEMEHVYCAASKKC',
            'FLPAVLLVATHVLPTVFCAITRKC',
            'IPWKLPATFRPVERPFSKPFCRKD',
            'FLPLLAGVVANFLPQIICKIARKC',
            'FLGSLLGLVGKVVPTLFCKISKKC',
            'FIGPVLKIAAGILPTAICKIFKKC',
            'FVGPVLKIAAGILPTAICKIYKKC',
            'FLGPIIKIATGILPTAICKFLKKC',
            'FLPLIASLAANFVPKIFCKITKKC',
            'FLPLIASVAANLVPKIFCKITKKC',
            'FLSTLLKVAFKVVPTLFCPITKKC',
            'KRKCPKTPFDNTPGAWFAHLILGC',
            'FLGLIFHGLVHAGKLIHGLIHRNR',
            'FLPAVIRVAANVLPTVFCAISKKC',
            'FLPAVLRVAAKVVPTVFCLISKKC',
            'FLSTALKVAANVVPTLFCKITKKC',
            'FLPIVAGLAANFLPKIVCKITKKC',
            'FLSTLLNVASNVVPTLICKITKKC',
            'FLSTLLNVASKVVPTLFCKITKKC',
            'FLPMLAGLAANFLPKIVCKITKKC',
            'FIGPVLKMATSILPTAICKGFKKC',
            'FLGPIIKMATGILPTAICKGLKKC',
            'FLPIIAGVAAKVLPKLFCAITKKC',
            'FLPVIAGLAAKVLPKLFCAITKKC',
            'RKGWFKAMKSIAKFIAKEKLKEHL',
            'FLPAVLKVAAHILPTAICAISRRC',
            'FMGTALKIAANVLPAAFCKIFKKC',
            'KLGFENFLVKALKTVMHVPTSPLL',
            'GWLPTFGKILRKAMQLGPKLIQPI',
            'GNGVVLTLTHECNLATWTKKLKCC',
            'ITIPPIVKNTLKKFIKGAVSALMS',
            'FLPGLIKAAVGVGSTILCKITKKC',
            'FLPGLIKAAVGIGSTIFCKISKKC',
            'FLPGLIKVAVGVGSTILCKITKKC',
            'FLPGLIKAAVGIGSTIFCKISRKC',
            'FLPMLAGLAANFLPKIICKITKKC',
            'FLPIVASLAANFLPKIICKITKKC',
            'FWGALAKGALKLIPSLVSSFTKKD',
            'FFPLIAGLAARFLPKIFCSITKRC',
            'VIPFVASVAAEMMQHVYCAASKRC',
            'FFPSIAGLAAKFLPKIFCSITKRC',
            'FLPAVLRVAAKVGPAVFCAITQKC',
            'FLGMLLHGVGHAIHGLIHGKQNVE',
            'NPAGCRFCCGCCPNMIGCGVCCRF',
            'IWSFLIKAATKLLPSLFGGGKKDS',
            'RNGCIVDPRCPYQQCRRPLYCRRR',
            'ILELAGNAARDNKKTRIIPRHLQL',
            'FLPLLAGLAANFLPTIICKIARKC',
            'FLPAIIGMAAKVLPAFLCKITKKC',
            'RRRRRFRRVIRRIRLPKYLTINTE',
            'GNGVLKTISHECNMNTWQFLFTCC',
            'FLPILAGLAANLVPKLICSITKKC',
            'FLGAVLKVAGKLVPAAICKISKKC',
            'FLGALFKVASKLVPAAICSISKKC',
            'FLPVIAGIAANVLPKLFCKLTKRC',
            'FFPIIARLAAKVIPSLVCAVTKKC',
            'KRVNWRKVGRNTALGASYVLSFLG',
            'GHSVDRIPEYFGPPGLPGPVLFYS',
            'FLPLIAGVAAKVLPKIFCAISKKC',
            'SDSVVSDIICTTFCSVTWCQSNCC',
            'FLPLLAGLAANFLPQIICKIARKC',
            'FLGTVLKVAAKVLPAALCQIFKKC',
            'QSHLSMCRYCCCKGNKGCGFCCKF',
            'VFDIIKDAGKQLVAHAMGKIAEKV',
            'VFDIIKDAGRQLVAHAMGKIAEKV',
            'FLPLLAGLAASFLPTIFCKISRKC',
            'FFPIVAGVAAKVLKKIFCTISKKC',
]

In [22]:
# Residue vocabulary: every distinct letter in the dataset, in alphabetical order,
# plus lookup tables in both directions (indices start at 0).
unique_amino_acids = sorted(set("".join(amp_sequences)))
vocab_size = len(unique_amino_acids)
idx_to_char = dict(enumerate(unique_amino_acids))
char_to_idx = {char: idx for idx, char in idx_to_char.items()}

##2. Encode sequences as integers

In [23]:
# Map every residue letter to its vocabulary index; equal-length sequences give
# a 2-D integer array of shape (num_sequences, 24).
encoded_data = np.array(
    [[char_to_idx[c] for c in seq] for seq in amp_sequences]
)

##3. One-hot encode for VAE

In [24]:
# Expand each integer index into a one-hot vector of width vocab_size,
# giving an array of shape (num_sequences, 24, vocab_size).
one_hot_data = tf.keras.utils.to_categorical(
    encoded_data,
    num_classes=vocab_size,
)

##4. Define hyperparameters

In [25]:
# VAE hyperparameters:
#   seq_length - number of residues per peptide
#   latent_dim - dimension of the latent space
#   hidden_dim - dimension of LSTM or dense hidden units
seq_length, latent_dim, hidden_dim = 24, 16, 64

##5. Sampling function for the VAE  

In [26]:
def sampling(args):
    """
    Draw a latent vector with the reparameterisation trick.

    Computes ``z = z_mean + exp(0.5 * z_log_var) * epsilon`` with ``epsilon``
    drawn from a standard normal, so the random draw stays outside the path
    that gradients take through ``z_mean`` and ``z_log_var``.

    :param args: pair ``(z_mean, z_log_var)`` of tensors with identical shape
    :return: sampled latent tensor with the same shape as ``z_mean``
    """
    z_mean, z_log_var = args
    # Size the noise from z_mean itself rather than from the notebook-level
    # `latent_dim`, so the function cannot drift out of sync with the encoder
    # if that global is changed or rebound in another cell.
    epsilon = tf.random.normal(shape=tf.shape(z_mean), dtype=z_mean.dtype)
    return z_mean + tf.exp(0.5 * z_log_var) * epsilon

##6. Define a custom loss function for the VAE  

In [27]:
def vae_loss_fn(y_true, y_pred, z_mean, z_log_var):
    """
    VAE objective: reconstruction cross-entropy plus KL regulariser.

    :param y_true: one-hot target sequences (the encoder input during training)
    :param y_pred: per-position residue probabilities produced by the decoder
    :param z_mean: latent means from the encoder
    :param z_log_var: latent log-variances from the encoder
    :return: scalar loss, averaged over the batch
    """
    # KL divergence term, summed over the latent dimensions of each sample.
    kl_per_sample = -0.5 * tf.reduce_sum(
        1 + z_log_var - tf.square(z_mean) - tf.exp(z_log_var), axis=1
    )

    # Cross-entropy at every position, then summed along the sequence axis.
    per_position_ce = tf.keras.losses.categorical_crossentropy(y_true, y_pred)
    recon_per_sample = tf.reduce_sum(per_position_ce, axis=1)

    return tf.reduce_mean(recon_per_sample + kl_per_sample)


##7. Encoder model  


In [28]:
# Encoder: one-hot sequence of shape (seq_length, vocab_size) -> LSTM summary vector
# -> parameters of a Gaussian over the latent space -> one sampled latent vector.
encoder_inputs = layers.Input(shape=(seq_length, vocab_size))
x = layers.LSTM(hidden_dim)(encoder_inputs)  # return_sequences left at its default: one vector per sequence
z_mean = layers.Dense(latent_dim)(x)  # no activation, so the mean is unconstrained
z_log_var = layers.Dense(latent_dim)(x)  # no activation; the variance is recovered with exp() in sampling()
z = layers.Lambda(sampling)([z_mean, z_log_var])  # random draw via sampling()
# All three tensors are exposed: VAEModel.train_step feeds z_mean / z_log_var to the
# loss and z to the decoder.
encoder = Model(encoder_inputs, [z_mean, z_log_var, z], name="encoder")

##8. Decoder model  

In [29]:
# Decoder: latent vector -> residue probabilities at each of the seq_length positions.
latent_inputs = layers.Input(shape=(latent_dim,))
dec_x = layers.RepeatVector(seq_length)(latent_inputs)  # the same latent vector is fed at every time step
dec_x = layers.LSTM(hidden_dim, return_sequences=True)(dec_x)  # return_sequences=True keeps one output per position
decoder_outputs = layers.TimeDistributed(layers.Dense(vocab_size, activation='softmax'))(dec_x)  # softmax over the vocabulary, applied per position
decoder = Model(latent_inputs, decoder_outputs, name="decoder")

##9. VAE model that connects encoder and decoder  


In [30]:
# Chain encoder and decoder into a plain functional model (input sequence -> reconstruction).
# Only the sampled z is passed on; z_mean_tensor and z_log_var_tensor are not used in this cell.
# NOTE(review): in the visible cells `vae` is never compiled or trained (training goes
# through VAEModel) - confirm whether this model is still needed.
z_mean_tensor, z_log_var_tensor, z_tensor = encoder(encoder_inputs)
outputs = decoder(z_tensor)
vae = Model(encoder_inputs, outputs, name="vae")

##10. Create a custom model class to incorporate the custom loss  


In [31]:
class VAEModel(tf.keras.Model):
    """
    Keras model that trains an encoder/decoder pair with the VAE loss.

    The loss needs the encoder's ``z_mean`` and ``z_log_var`` in addition to the
    reconstruction, so ``train_step`` is overridden instead of passing a loss to
    ``compile``.

    :param encoder: model returning ``(z_mean, z_log_var, z)`` for a batch
    :param decoder: model mapping a batch of latent vectors to reconstructions
    """

    def __init__(self, encoder, decoder, **kwargs):
        super().__init__(**kwargs)
        self.encoder = encoder
        self.decoder = decoder
        # Running mean of the batch losses, so the progress bar and History report
        # an epoch average rather than whatever the most recent batch produced.
        self.loss_tracker = tf.keras.metrics.Mean(name="loss")

    @property
    def metrics(self):
        # Listing the tracker here lets fit() reset it at the start of each epoch.
        return [self.loss_tracker]

    def call(self, inputs, training=None):
        """Encode ``inputs``, sample a latent vector, and return its reconstruction."""
        _, _, z = self.encoder(inputs, training=training)
        return self.decoder(z, training=training)

    def train_step(self, data):
        """
        Run one optimisation step on a batch.

        :param data: batch of one-hot sequences; if fit() delivers a tuple, only
            its first element is used
        :return: dict with the running mean of the loss under the key "loss"
        """
        if isinstance(data, tuple):
            data = data[0]

        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(data, training=True)
            reconstruction = self.decoder(z, training=True)
            # The input doubles as the target: the decoder must reproduce it.
            loss = vae_loss_fn(data, reconstruction, z_mean, z_log_var)

        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))

        self.loss_tracker.update_state(loss)
        return {"loss": self.loss_tracker.result()}


##11. Create and compile the custom VAE model  


In [32]:
# Wrap the encoder/decoder pair in the custom training model. No loss is passed to
# compile() because VAEModel.train_step computes it itself.
custom_vae = VAEModel(encoder=encoder, decoder=decoder)
custom_vae.compile(optimizer="adam")

##12. Build the model  


In [33]:
# Declare the input shape (batch axis left open) and print the model summary.
custom_vae.build((None, seq_length, vocab_size))
custom_vae.summary()

##13. Train VAE  


In [None]:
# Training schedule for the VAE: 50 passes over the one-hot data in mini-batches of 8.
# No targets are passed; the model reconstructs its own input.
epochs, batch_size = 50, 8
custom_vae.fit(one_hot_data, batch_size=batch_size, epochs=epochs)

Epoch 1/50
[1m10/20[0m [32m━━━━━━━━━━[0m[37m━━━━━━━━━━[0m [1m0s[0m 20ms/step - loss: 72.4462

##14. Generate new sequences by sampling from latent space  
    
  Sample from the latent space and decode a new sequence.  
  **decoder**: the trained decoder model  
  **sample_z**: optional latent vector. If None, randomly sample from N(0,1).  
  **return**: generated protein sequence as a string
    

In [None]:
def generate_new_sequence(decoder, sample_z=None, index_to_char=None):
    """
    Decode one peptide from a point in the VAE latent space.

    Decoding is greedy (the most probable residue is taken at every position),
    so the only source of randomness is the latent vector itself.

    :param decoder: trained decoder model; ``decoder.predict`` must return an
        array whose first element has shape (sequence_length, vocab_size)
    :param sample_z: optional latent vector of shape (1, latent_dim). If None,
        one is drawn from a standard normal using the notebook-level
        ``latent_dim``.
    :param index_to_char: mapping from integer index to amino-acid letter;
        defaults to the notebook-level ``idx_to_char``
    :return: generated protein sequence as a string
    """
    if sample_z is None:
        sample_z = np.random.randn(1, latent_dim)  # random from normal distribution
    if index_to_char is None:
        index_to_char = idx_to_char

    # verbose=0 matches the transformer generator: without it Keras prints a
    # progress bar for every call, burying the generated peptides in log noise.
    pred = decoder.predict(sample_z, verbose=0)[0]  # shape: (24, vocab_size)

    # Pick the most probable amino acid at each position.
    seq_indices = np.argmax(pred, axis=-1)
    return "".join(index_to_char[int(idx)] for idx in seq_indices)



###Generating VAE AMP Sequences

In [None]:
# Decode 20 peptides, each from a freshly sampled latent vector.
for sample_number in range(20):
    print("Generated Peptide (VAE):", generate_new_sequence(decoder))

###VAE Generated AMP Sequences (previously generated):  


In [None]:
# Archived VAE samples from an earlier run, kept as a bare string so the cell only
# echoes them. The closing delimiter must match the opening ''' - a mismatched
# closing quote leaves the string unterminated and the cell fails with a SyntaxError.
'''
FFPLAAGAAAALLPPIFCCIKKKC
FFPLIAAAAAAVLPPICCIIKKKC
FLLLLLAAAAKLPPPICAIKKKKC
FPPIIAGAAAALLPPIFCIIKKKC
FFPLVAAAAAVVVPPICCIIKKKC
FFFLLAGAAAAVVPPVCCAIKKKC
FFPLVAAAAAVVVPPVCCAIKKKC
FLLLVVAAAAVVPPVVCAAIKKKC
FFLLLAGAAAKVPPPICCAAKKKC
FFPPAAGAAAAVLPPICCAIKKKC
FFLLLAGAAAKLLPKICCIIKKKC
FFFLLLGGGGGGAAVHHHVAAAKK
FFLLLAGAAAKLPPPICCIIKKKC
FFLLLGGAAAKKLPPIICAAAKKK
FLLLLLLAAAKLPPPICCIKKKCC
FLLLGGAAAKKLPPIICAAKKKCC
FLLLLLLAAAKLPPVICCIIKKKC
FFLLIAGAAAAKLPPIICCIIKKC
LLLLLLLLLLLLLLKKKKKKKKKK
FFLLVVAAAAVVPPPVAAAAKKCC
'''

[Structural Comparison](https://drive.google.com/file/d/10NPBDNodfBnLrOzYK-g3XFJd26KnJ-5y/view?usp=sharing)