**GAN**

In [None]:
!git clone https://github.com/AvonYangXX1/AMPLify-Feedback.git
import tensorflow as tf
from tensorflow.keras import layers
import numpy as np
import os

Cloning into 'AMPLify-Feedback'...
remote: Enumerating objects: 469, done.[K
remote: Counting objects: 100% (118/118), done.[K
remote: Compressing objects: 100% (73/73), done.[K
remote: Total 469 (delta 60), reused 97 (delta 45), pack-reused 351[K
Receiving objects: 100% (469/469), 204.06 MiB | 16.13 MiB/s, done.
Resolving deltas: 100% (227/227), done.
Updating files: 100% (94/94), done.


In [None]:
# Generator
def build_generator(seq_length, depth, latent_dim):
    """Build the fully connected generator network.

    Args:
        seq_length: Number of positions in each generated sequence.
        depth: Size of the per-position token distribution (one-hot width).
        latent_dim: Dimensionality of the latent noise vector fed to the model.

    Returns:
        A Keras ``Model`` mapping a ``(batch, latent_dim)`` input to a
        ``(batch, seq_length, depth)`` tensor whose last axis is a softmax.
    """
    noise_in = layers.Input(shape=(latent_dim,), name="Input0")

    # Four Dense+BatchNorm pairs followed by one Dense without normalization.
    hidden = noise_in
    for idx in range(4):
        hidden = layers.Dense(256, activation='relu', name=f"Dense{idx}")(hidden)
        hidden = layers.BatchNormalization(name=f"Norm{idx}")(hidden)
    hidden = layers.Dense(256, activation='relu', name="Dense4")(hidden)

    # Project to one flat vector, then fold it into (position, feature) form.
    flat = layers.Dense(seq_length * depth, activation='linear', name="DenseResize")(hidden)
    per_position = layers.Reshape((seq_length, depth), name="Reshape")(flat)

    # Dense acts on the last axis, so the softmax is applied per position.
    token_probs = layers.Dense(depth, activation="softmax", name="Output")(per_position)
    return tf.keras.models.Model(inputs=noise_in, outputs=token_probs)

In [None]:
# Discriminator
def build_discriminator(seq_length, depth):
    """Build the convolutional discriminator.

    The input shape is declared up front from ``seq_length`` and ``depth`` so
    the model is built (weights created, ``summary()`` available) as soon as
    it is constructed, and a mismatched input shape is reported clearly
    instead of surfacing only on the first training call.

    Args:
        seq_length: Number of positions in each input sequence.
        depth: Width of the per-position one-hot / probability vector.

    Returns:
        A ``tf.keras.Sequential`` named ``"discriminator"`` that maps a
        ``(batch, seq_length, depth)`` tensor to a ``(batch, 1)`` sigmoid score.
    """
    model = tf.keras.Sequential(name="discriminator")
    # Previously both arguments were ignored and the model was left unbuilt.
    model.add(layers.Input(shape=(seq_length, depth)))
    model.add(layers.Conv1D(32, 5, name="Conv1D"))
    model.add(layers.Flatten(name="Flatten"))
    model.add(layers.Dense(512, activation='relu', name="Dense0"))
    model.add(layers.Dropout(0.3, name="Dropout"))
    model.add(layers.Dense(256, activation='relu', name="Dense1"))
    model.add(layers.Dense(1, activation='sigmoid', name="Output"))
    return model

In [None]:
# GAN
def compile_gan(generator, discriminator):
    """Compile the discriminator and build the stacked generator->discriminator model.

    Args:
        generator: Keras model mapping latent noise to sequences.
        discriminator: Keras model scoring sequences; it is compiled here with
            binary cross-entropy, the ``'adam'`` optimizer and false-positive /
            false-negative count metrics.

    Returns:
        The compiled stacked model (noise -> generator -> discriminator),
        using binary cross-entropy and ``Adam(1e-4)``.
    """
    discriminator.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=[tf.keras.metrics.FalsePositives(),
                                   tf.keras.metrics.FalseNegatives()])
    # Flag the discriminator as non-trainable before it is wired into the
    # stacked model, which is compiled afterwards.
    discriminator.trainable = False

    # Take the latent size from the generator itself instead of relying on a
    # notebook-level ``latent_dim`` global that may be undefined or stale.
    latent_dim = generator.input_shape[-1]
    gan_input0 = layers.Input(shape=(latent_dim,))
    gan_output = discriminator(generator(gan_input0))
    gan = tf.keras.Model(gan_input0, gan_output)
    gan.compile(loss='binary_crossentropy', optimizer=tf.keras.optimizers.Adam(1e-4))
    return gan

In [None]:
# Vocabulary array shipped with the cloned repository.
aa_vocal = np.load("AMPLify-Feedback/model_weights/SeqTV_vocal.npy")

# Inverse lookup: integer token ids -> vocabulary strings. The first vocabulary
# entry is skipped and out-of-vocabulary ids decode to the empty string.
# NOTE(review): presumably entry 0 is a padding/mask token -- confirm.
pep_decoder = layers.StringLookup(
    vocabulary=aa_vocal[1:],
    invert=True,
    oov_token='',
)

In [None]:
def train_gan(generator, discriminator, gan, path, epochs, batch_size, latent_dim, demo_noise):
    """Alternately train the discriminator and the generator on every file in ``path``.

    For each epoch and each data file, the sequences are one-hot encoded and
    consumed in mini-batches. After each file, one sequence decoded from
    ``demo_noise`` and a summary line (FP, FN, mean generator loss) are printed.

    Args:
        generator: Generator model (noise -> per-position token probabilities).
        discriminator: Compiled discriminator model.
        gan: Compiled stacked model returned by ``compile_gan``.
        path: Directory containing the training arrays loadable by ``np.load``.
        epochs: Number of passes over all files.
        batch_size: Number of real sequences per mini-batch.
        latent_dim: Dimensionality of the generator's noise input.
        demo_noise: Fixed noise array used to print a sample after each file.
    """
    for epoch in range(epochs):
        # os.listdir returns entries in arbitrary order; sort so that a fixed
        # random seed gives the same training order on every run.
        files = sorted(os.listdir(path))
        for file in files:
            # NOTE(review): allow_pickle=True can execute arbitrary code when
            # loading untrusted files; only use with data you trust.
            seq = np.load(f"{path}/{file}", allow_pickle=True)
            # depth=43 must match the depth the models were built with.
            seq = tf.one_hot(seq.squeeze(), depth=43)

            num_samples = seq.shape[0]
            if num_samples == 0:
                # Nothing to train on; avoids indexing/dividing empty totals below.
                del seq
                continue

            total_d_loss = 0
            total_g_loss = 0
            # Ceiling division: the loop below also processes a trailing partial
            # batch, so it must be counted when averaging the generator loss.
            num_batches = (num_samples + batch_size - 1) // batch_size
            for i in range(0, num_samples, batch_size):
                real_sequences = seq[i:i + batch_size]
                current_batch_size = real_sequences.shape[0]

                # Generate fake sequences from uniform noise in [-1, 1)
                noise = (np.random.rand(current_batch_size, latent_dim)-0.5)*2
                generated_sequences = generator.predict(noise, verbose=0)

                # Labels for real and fake data
                real_labels = np.ones((current_batch_size, 1))
                fake_labels = np.zeros((current_batch_size, 1))

                # Train discriminator on a real batch, then on a fake batch
                discriminator.trainable = True
                d_loss_real = discriminator.train_on_batch(real_sequences, real_labels)
                d_loss_fake = discriminator.train_on_batch(generated_sequences, fake_labels)
                # Element-wise mean of [loss, FP, FN] over the two updates
                d_loss = 0.5 * np.add(d_loss_real, d_loss_fake)
                discriminator.trainable = False

                # Train generator: ask the stacked model to label fakes as real
                g_loss = gan.train_on_batch(noise, np.ones((current_batch_size, 1)))

                total_d_loss += d_loss
                total_g_loss += g_loss

            # Decode and show one sample from the fixed demo noise to track progress
            demo_seq = generator(demo_noise)
            demo_seq = tf.math.argmax(demo_seq, axis=2)
            demo_seq = pep_decoder(demo_seq).numpy().astype('str')
            demo_seq = ["".join(chars) for chars in demo_seq]
            print(demo_seq[0])
            print(f"Epoch {epoch+1}/{epochs}; FP {total_d_loss[1]/num_samples:.4f}; FN {total_d_loss[2]/num_samples:.4f}; G_Loss {total_g_loss/num_batches:.4f}")
            del seq

In [None]:
# Model / data configuration
latent_dim = 2      # size of the generator's noise input
seq_length = 190    # positions per generated sequence
depth = 43          # one-hot width per position
path = "AMPLify-Feedback/processed_data/gan_train_data"

# Fixed seed so the demo noise (and NumPy-driven sampling) is repeatable.
np.random.seed(8701)
# One noise vector in [-1, 1), reused to print a sample during training.
demo_noise = (np.random.rand(1, latent_dim) - 0.5) * 2
noise = demo_noise

# Build the two networks and the stacked GAN used to train the generator.
generator = build_generator(seq_length, depth, latent_dim)
discriminator = build_discriminator(seq_length, depth)
gan = compile_gan(generator, discriminator)

In [None]:
# Train GAN: two passes over the data directory, 22 real sequences per batch.
train_gan(
    generator,
    discriminator,
    gan,
    path,
    epochs=2,
    batch_size=22,
    latent_dim=latent_dim,
    demo_noise=demo_noise,
)

MCIYSIEVVRILIEEKEKKTIASLKSSEIGIKEEKISRASISRERRIEEIIRESEKEISFTGIRPISKALEVAIEEITYIIPIK
Epoch 1/2; FP 0.0044; FN 0.0022; G_Loss 4.0894
MAKLLALTVAALVKIGVVLGAALILALLAVLVQQLKRSSSKRLRQSAREGDLLALETALLDNATTGATELSLVAILVISIVELAGVESGVDGGRTDGVSLLLIGLLRAKLSLRSA
Epoch 1/2; FP 0.0043; FN 0.0055; G_Loss 10.9583
MSNNLVGGGTLVKELNLELLELKEEAAKRVLGVGSSGLNKGLLLGVIGGNLEEGFTGGVVLLLNYTDGKGGGEKLDVETFLKIQTGEVDEENGGDGGFGELGEIGGGRDNEE
Epoch 1/2; FP 0.0073; FN 0.0152; G_Loss 5.4502
MSQRVQSLVLLVVLLLLQLQFDLSRALLLLEQRLQSLLVALAEKNQLLQLSSLLLQQLQLCQSCLQSQRKLISGGNLFSDVVEKKGLEGVLNGSQLLN
Epoch 1/2; FP 0.0060; FN 0.0095; G_Loss 6.6378
MSKETSTGIDDVETKRIVSTEELTTYTVLEENVTNVETVSVASETDKLTTNYVQGNTDTTTVVGVTETDATRTGDESYSVQSERGLCQ
Epoch 1/2; FP 0.0048; FN 0.0067; G_Loss 6.9888
MSGGLLSEASVLSERLSREVTDVADWSSDLSELCGLSGLHKSCVKEV
Epoch 1/2; FP 0.0047; FN 0.0055; G_Loss 8.2052
MSLKSACNTAAIIAALIFAHKESGSQESSKIFNASL
Epoch 1/2; FP 0.0039; FN 0.0049; G_Loss 8.4688
SYPTIASGRIAILLGGVALGSYAIGRFVSRVV
Epoch 1/2; FP 0.0033; FN 0.0037; G_Loss 9.1144
GFV

In [None]:
# After GAN is trained
def generate_sequences(generator, latent_dim, num_sequences):
    """Sample sequences from the generator and decode them to strings.

    Args:
        generator: Generator model exposing ``predict``.
        latent_dim: Dimensionality of the noise input.
        num_sequences: How many sequences to sample.

    Returns:
        The list of decoded strings produced by ``onehot2seq``.
    """
    # Uniform noise rescaled from [0, 1) to [-1, 1).
    uniform01 = np.random.rand(num_sequences, latent_dim)
    latent = (uniform01 - 0.5) * 2
    onehot = generator.predict(latent, verbose=0)
    return onehot2seq(onehot)

def onehot2seq(onehot):
    """Decode per-position score/one-hot arrays into joined strings.

    Args:
        onehot: Array-like indexed as (sequence, position, token); the argmax
            is taken over axis 2.

    Returns:
        A list with one string per sequence, built by decoding each position
        with ``pep_decoder`` and concatenating the results.
    """
    token_ids = tf.math.argmax(onehot, axis=2)
    decoded = pep_decoder(token_ids).numpy().astype('str')
    return ["".join(row) for row in decoded]

In [None]:
# Sample a handful of sequences from the trained generator and display them.
num_sequences = 10
generated_seqs = generate_sequences(generator, latent_dim, num_sequences)
generated_seqs

['AVTVSLDLK',
 'DVLDFIF',
 'AVTVSLDLK',
 'DVLDPIF',
 'EVGALEGKQKKLAPK',
 'DVGALDGMFAL',
 'EVGALEGKkKKLAK',
 'DVLDTI',
 'AVTVSLDLK',
 'AVRRGDRFH']

In [None]:
# Save into the repository cloned at the top of the notebook. The clone creates
# "AMPLify-Feedback" (not "AMPLify-Feedback-main"), which is also where the
# vocabulary and training data are read from.
weights_dir = os.path.join("AMPLify-Feedback", "model_weights")
os.makedirs(weights_dir, exist_ok=True)  # make sure the target directory exists
generator.save(os.path.join(weights_dir, "PeptideGenerator_new.keras"))
discriminator.save(os.path.join(weights_dir, "PeptideDiscriminator_new.keras"))

In [None]:
# np.save("/content/AMPLify-Feedback/processed_data/GAN_seq/generated_seqs_10",generated_seqs)

In [None]:
#Convert to One_hot
# generated_seqs_one_hot = tf.one_hot(generated_seqs.squeeze(), depth=43)
# np.save("/content/AMPLify-Feedback/processed_data/GAN_seq/generated_seqs_one_hot_10",generated_seqs_one_hot)