In [3]:
#Written with help of ChatGPT

import numpy as np
import keras
from keras.layers import Dense, Dropout, Layer, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D, Input, Masking
from keras.models import Model, load_model
from sklearn.model_selection import train_test_split

# Amino acid vocabulary: the 20 standard residues, one letter each
amino_acids = 'ACDEFGHIKLMNPQRSTVWY'
vocab_size = len(amino_acids)
# Residue letter -> column index along the one-hot axis
aa_to_index = dict(zip(amino_acids, range(vocab_size)))

# Transformer Block
class TransformerBlock(Layer):
    """Transformer encoder block: self-attention followed by a feed-forward net.

    The output of each sub-layer goes through dropout, is added back to that
    sub-layer's input (residual connection) and is then layer-normalised.
    No attention mask is passed to the attention layer.

    Args:
        embed_dim: Width of the last axis of the block's input and output.
            It is also used as ``key_dim`` of the attention layer and as the
            output width of the feed-forward network.
        num_heads: Number of attention heads.
        ff_dim: Width of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
        **kwargs: Forwarded to the base ``Layer`` constructor.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super().__init__(**kwargs)
        # Kept as attributes so get_config() can serialise them.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = keras.Sequential(
            [Dense(ff_dim, activation="relu"), Dense(embed_dim)]
        )
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Apply the block to ``inputs``.

        Args:
            inputs: Tensor used as both query and value of the attention layer.
            training: Forwarded to the two dropout layers. Defaults to ``None``
                so the method can also be called without the argument.

        Returns:
            The layer-normalised output of the second residual connection.
        """
        # Self-attention: the same tensor is passed as query and value.
        attn_output = self.att(inputs, inputs)
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)

    def get_config(self):
        """Return the base layer config extended with the constructor arguments."""
        config = super().get_config()
        config.update({
            'embed_dim': self.embed_dim,
            'num_heads': self.num_heads,
            'ff_dim': self.ff_dim,
            'rate': self.rate,
        })
        return config

# Function to one-hot encode peptide sequences
def one_hot_encode_sequences(sequences, vocab_size):
    """One-hot encode peptide strings into a zero-padded 3-D array.

    Args:
        sequences: Sized collection (e.g. a list) of strings. Every character
            must be a key of the module-level ``aa_to_index`` mapping.
        vocab_size: Length of the one-hot axis.

    Returns:
        ``numpy.ndarray`` of shape ``(len(sequences), max_length, vocab_size)``,
        where ``max_length`` is the length of the longest sequence (0 when
        ``sequences`` is empty). Positions past the end of a shorter sequence
        are left all-zero.

    Raises:
        KeyError: If a sequence contains a character missing from
            ``aa_to_index``.
    """
    # default=0 makes an empty batch produce an empty array instead of
    # raising ValueError from max() on an empty iterable.
    max_length = max((len(seq) for seq in sequences), default=0)
    encoded = np.zeros((len(sequences), max_length, vocab_size))
    for row, seq in enumerate(sequences):
        for pos, aa in enumerate(seq):
            encoded[row, pos, aa_to_index[aa]] = 1
    return encoded

# Sample peptide sequences (with variable lengths)
peptide_sequences = ['ARG', 'TC', 'G', 'CDEF', 'HIKLMN', 'PQR', 'STVWY']  # Example peptide sequences
# Random binary targets (0 or 1). They are drawn from a seeded generator so
# that Restart & Run All reproduces the same labels on every run.
targets = np.random.default_rng(42).integers(0, 2, size=len(peptide_sequences))

# One-hot encode the peptide sequences
encoded_sequences = one_hot_encode_sequences(peptide_sequences, vocab_size)

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(encoded_sequences, targets, test_size=0.2, random_state=42)

# Model Creation
def create_model(embed_dim, num_heads, ff_dim, vocab_size, input_length):
    """Build the (uncompiled) peptide classifier.

    Architecture: linear projection of the one-hot input to ``embed_dim``,
    one ``TransformerBlock``, average pooling over the sequence axis, then a
    small dense head ending in a single sigmoid unit.

    Args:
        embed_dim: Width of the projected token representation.
        num_heads: Number of attention heads in the transformer block.
        ff_dim: Hidden width of the transformer block's feed-forward network.
        vocab_size: Size of the one-hot axis of the input.
        input_length: Sequence length of the input, or ``None`` to accept
            variable-length sequences.

    Returns:
        A ``Model`` taking inputs of shape ``(batch, input_length, vocab_size)``
        and producing one sigmoid output per sample.
    """
    # Honour input_length instead of hard-coding None; passing None keeps the
    # sequence axis unspecified, as before.
    inputs = Input(shape=(input_length, vocab_size))
    x = Dense(embed_dim)(inputs)  # Project input to embed_dim
    x = TransformerBlock(embed_dim, num_heads, ff_dim)(x)
    x = GlobalAveragePooling1D()(x)
    x = Dropout(0.1)(x)
    x = Dense(20, activation="relu")(x)
    x = Dropout(0.1)(x)
    outputs = Dense(1, activation="sigmoid")(x)
    return Model(inputs=inputs, outputs=outputs)

In [4]:
# Model hyper-parameters
embed_dim = 32  # Embedding size for each token
num_heads = 2  # Number of attention heads
ff_dim = 32  # Hidden layer size in the feed-forward network

# Build the model (sequence length left unspecified) and compile it
model = create_model(
    embed_dim=embed_dim,
    num_heads=num_heads,
    ff_dim=ff_dim,
    vocab_size=vocab_size,
    input_length=None,
)
model.compile(
    optimizer='Adam',
    loss="binary_crossentropy",
    metrics=["accuracy"],
)

# Train the model; the History object returned by fit() is the cell's output
model.fit(
    X_train,
    y_train,
    validation_data=(X_test, y_test),
    epochs=5,
    batch_size=8,
)

2024-09-24 14:41:41.486094: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fcdaf1844c0>

In [5]:
# Write the trained model to an .h5 file (path is relative to the working directory)
model.save(filepath="peptide_transformer_model.h5")

In [11]:
# Restore the saved model. The custom TransformerBlock class is passed via
# custom_objects so load_model can resolve it by name.
loaded_model = load_model(
    "peptide_transformer_model.h5",
    custom_objects={"TransformerBlock": TransformerBlock},
)

In [14]:
from keras.callbacks import LearningRateScheduler

# Fine-tune the model on a new dataset
new_peptide_sequences = ['AR', 'GTC', 'F', 'DE', 'HIK']  # New sequences
# New random binary targets (0 or 1). They are drawn from a seeded generator
# so that Restart & Run All reproduces the same labels on every run.
new_targets = np.random.default_rng(42).integers(0, 2, size=len(new_peptide_sequences))

# One-hot encode new sequences
new_encoded_sequences = one_hot_encode_sequences(new_peptide_sequences, vocab_size)

# Train/test split for new data
X_train_new, X_test_new, y_train_new, y_test_new = train_test_split(new_encoded_sequences, new_targets, test_size=0.2, random_state=42)

# Learning-rate schedule used while fine-tuning
def scheduler(epoch, lr):
    """Return a constant low learning rate, ignoring ``epoch`` and ``lr``."""
    fine_tune_lr = 0.0001
    return fine_tune_lr

# Create the learning rate scheduler callback
lr_scheduler = LearningRateScheduler(scheduler)

# Re-compile with 'Adam' and binary cross-entropy. The model ends in a single
# sigmoid unit and the targets are 0/1 labels, so the loss must match the one
# used for the initial training rather than categorical cross-entropy.
loaded_model.compile(optimizer='Adam', loss='binary_crossentropy', metrics=['accuracy'])

# Fit the model with the learning rate scheduler callback
loaded_model.fit(X_train_new, y_train_new, validation_data=(X_test_new, y_test_new), 
                 epochs=5, batch_size=8, callbacks=[lr_scheduler])

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fcd93f87d30>