<a href="https://colab.research.google.com/github/bjatkin/Pepto-GAN/blob/master/Pepto_GAN.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Imports for everything here

In [0]:
import argparse
import os
import numpy as np
import math
import random
import csv
from tqdm import tqdm

import matplotlib.pyplot as plt

import torchvision.transforms as transforms
from torchvision.utils import save_image

from torch.utils.data import Dataset, DataLoader
from torch.autograd import Variable

import torch.nn as nn
import torch.nn.functional as F
import torch

# Global device switch: `cuda` is True when a CUDA device is usable, and
# `Tensor` is the matching float tensor type used for `.type(Tensor)` casts.
cuda = torch.cuda.is_available()
Tensor = torch.FloatTensor
if cuda:
  Tensor = torch.cuda.FloatTensor

# Utility Functions

General functions that have broad use across different sections of the code base. These should be functions rather than classes.

In [0]:
def one_hot(peptide):
  """One-hot encode a single peptide string.

  Each character of `peptide` is looked up in `gan_opt.amino_acids` and
  turned into a row with a single 1.0 at that letter's index.

  Returns a float tensor with one row per residue and one column per
  letter of the alphabet. Raises ValueError (from `str.index`) when a
  residue is not in the alphabet.
  """
  alphabet = gan_opt.amino_acids
  width = len(alphabet)

  def encode(residue):
    row = torch.zeros(width)
    row[alphabet.index(residue)] = 1.0
    return row

  return torch.stack([encode(residue) for residue in peptide])

def one_hot_s(peptides):
  """One-hot encode several peptides and stack them into one batch tensor.

  Every peptide is encoded with `one_hot`; `torch.stack` requires all of
  them to have the same length.
  """
  return torch.stack([one_hot(sequence) for sequence in peptides])

def score_peptide(peptide):
  """Return the toxicity score `tox_predictor` assigns to one peptide string.

  The peptide is one-hot encoded, given a leading batch dimension of 1 and
  cast with the module-wide `Tensor` type, so it lands on the GPU only when
  CUDA is actually available (the previous unconditional `.cuda()` call
  failed on CPU-only machines).

  Returns the prediction as a plain Python float.
  """
  batch = one_hot(peptide).unsqueeze(0).type(Tensor)
  # Scoring is inference only; skip building an autograd graph.
  with torch.no_grad():
    return tox_predictor(batch).item()

def decode_peptide(peptide):
  """Turn an encoded peptide back into its amino-acid string.

  For each row of `peptide` the index of the largest entry is taken and
  mapped to the corresponding letter of `gan_opt.amino_acids`, so the
  function works for hard one-hot rows as well as soft/probabilistic rows.
  """
  return "".join(gan_opt.amino_acids[row.argmax()] for row in peptide)

def decode_peptide_s(peptides):
  """Decode a batch of encoded peptides into a NumPy array of strings."""
  return np.asarray([decode_peptide(encoded) for encoded in peptides])

def generate_random_peptide():
  """Build the one-hot encoding of a uniformly random peptide.

  Returns a float tensor with `gan_opt.peptide_length` rows (one per
  position) and `len(gan_opt.amino_acids)` columns (one per letter), the
  same layout `one_hot` produces.

  The previous version had the two axes swapped: it emitted one row per
  alphabet letter, each of width `peptide_length`. That still flattened to
  the right number of features, so the predictor accepted it, but it was
  not a valid peptide encoding.
  """
  alphabet_size = len(gan_opt.amino_acids)
  encodings = []
  for _ in range(gan_opt.peptide_length):
    encoding = torch.zeros(alphabet_size)
    encoding[random.randrange(alphabet_size)] = 1.0
    encodings.append(encoding)
  return torch.stack(encodings)

def generate_random_peptide_s(count):
  """Stack `count` independently drawn random peptide encodings into a batch."""
  return torch.stack([generate_random_peptide() for _ in range(count)])

def save_model(model, file_name):
  """Write the model's parameters (its state dict) to `file_name`."""
  state = model.state_dict()
  torch.save(state, file_name)

def load_model(model, file_name):
  """Load parameters saved by `save_model` from `file_name` into `model`.

  When no CUDA device is available the checkpoint is remapped onto the CPU,
  so a file written on a GPU machine can still be loaded; otherwise tensors
  are restored to the devices they were saved from.

  Raises whatever `torch.load` / `load_state_dict` raise for a missing file
  or a state dict that does not match `model`.
  """
  map_location = None if torch.cuda.is_available() else torch.device("cpu")
  model.load_state_dict(torch.load(file_name, map_location=map_location))

# Network Opts

This section should contain global options that configure each network

In [0]:
# Toxic Classifier Options

class tox_opt:
  # Training schedule for the toxicity predictor.
  n_epochs = 10
  batch_size = 64
  lr = 0.0005
  # Batch interval used by tox_train to decide when to run validation.
  validate_every = 500

  # When True the predictor is loaded from `load_model_file` instead of
  # being trained; after training it is written to `save_model_file`.
  load_network = True
  load_model_file = "tox_classifier.pt"
  save_model_file = "tox_classifier.pt"


# GAN Options

class gan_opt():
  # Training schedule and Adam hyper-parameters (b1/b2 are the Adam betas).
  n_epochs=120
  batch_size=64
  lr=0.0002
  b1=0.5
  b2=0.999
  # Size of the noise vector fed to the generator.
  latent_dim=100
  # Peptide shape: number of positions and the amino-acid alphabet used for
  # every one-hot / soft encoding in this notebook.
  peptide_length=8
  amino_acids="CDFGHILNRSVY"
  # The discriminator is only updated on batches whose global index is a
  # multiple of this value.
  d_update_every=100
  # Checkpoint handling. The load path used to be misspelled
  # ("tox_discriminatro.pt"), so it could never find the file written via
  # save_disc_file.
  load_discriminator=False
  load_disc_file="tox_discriminator.pt"
  save_disc_file="tox_discriminator.pt"
  load_generator=False
  load_gen_file="tox_generator.pt"
  save_gen_file="tox_generator.pt"
  train_gan=True

# Wasserstein GAN Options

class was_opt:
  # Training schedule.
  n_epochs = 120
  batch_size = 64
  # The generator is updated on batches whose index is a multiple of n_critic.
  n_critic = 5
  lr = 0.001
  b1 = 0.5
  b2 = 0.999
  # Size of the noise vector fed to the generator.
  latent_dim = 100
  # Discriminator weights are clamped to [-clip_value, clip_value].
  clip_value = 0.01

  # Checkpoint handling for the generator.
  load_generator = False
  load_gen_file = "was_tox_generator.pt"
  save_gen_file = "was_tox_generator.pt"
  train_gan = True


# Toxic Peptide Classifier

The goal of this model is to predict the toxicity score of a peptide. This can then be used to assess the success of the GAN.

In [0]:
# -------------------------
# Toxic Peptide classifier
# -------------------------

class ToxicityPredictor(nn.Module):
  """Fully connected regressor mapping an encoded peptide to one score.

  The input is flattened to `len(gan_opt.amino_acids) * gan_opt.peptide_length`
  features and passed through Linear+ReLU stages (widths 512, then six
  times 256) followed by a final Linear layer with a single output.
  """

  def __init__(self):
    super().__init__()

    in_features = int(len(gan_opt.amino_acids) * gan_opt.peptide_length)
    widths = [in_features, 512] + [256] * 6

    # Layers are created in the same order as before, so the indices inside
    # `self.model` (and therefore the state-dict keys) are unchanged.
    layers = []
    for n_in, n_out in zip(widths, widths[1:]):
      layers.append(nn.Linear(n_in, n_out))
      layers.append(nn.ReLU())
    layers.append(nn.Linear(widths[-1], 1))

    self.model = nn.Sequential(*layers)

  def forward(self, pep):
    """Flatten each sample of `pep` and return one score per sample."""
    flattened = pep.view(pep.size(0), -1)
    return self.model(flattened)

# Loss function
# L1 (mean absolute error) criterion. Note that tox_train below does not call
# it in the visible code: it sums absolute errors by hand instead.
toxicity_loss = nn.L1Loss()

# Initialize Predictor
# Module-level instance shared by training, score_peptide and evaluate_gan.
tox_predictor = ToxicityPredictor()

# Move the model (and criterion) to the GPU when one is available.
if cuda:
  tox_predictor.cuda()
  toxicity_loss.cuda()

# Optimizer
# Adam over all predictor parameters, using the learning rate from tox_opt.
optimizer_tox = torch.optim.Adam(tox_predictor.parameters(), lr=tox_opt.lr)

In [0]:
# --------------------
# Peptides Dataloader
# --------------------
class ToxicPeptideDataset(Dataset):
  """Dataset of encoded peptides, optionally paired with toxicity labels.

  Peptide strings are read from the .npy file `file_name` and encoded once
  at construction time, either as hard one-hot rows or, with `soft=True`,
  as label-smoothed rows. When `labels_file_name` is given, labels are read
  from that .npy file, converted to float, and the data is split: the first
  tenth is the test portion (`train=False`), the rest the training portion.
  Without labels no split is applied and items are bare peptide tensors.
  """

  def __init__(self, file_name, labels_file_name="", train=True, soft=False):
    raw_peptides = np.load(file_name)
    encode = self.one_soft if soft else self.one_hot
    self.peptides = encode(raw_peptides)

    self.use_labels = labels_file_name != ""
    if not self.use_labels:
      return

    # Labels are stored as strings on disk and need to be floats.
    self.labels = [float(raw) for raw in np.load(labels_file_name)]

    # Test/train split: the leading 10% is held out for testing.
    cut = len(self.labels) // 10
    chosen = slice(cut, None) if train else slice(None, cut)
    self.peptides = self.peptides[chosen]
    self.labels = self.labels[chosen]

  def one_hot(self, peptides):
    """Return a list with one hard one-hot tensor per peptide string."""
    alphabet = gan_opt.amino_acids
    encoded = []
    for peptide in peptides:
      rows = []
      for residue in peptide:
        row = torch.zeros(len(alphabet))
        row[alphabet.index(residue)] = 1.0
        rows.append(row)
      encoded.append(torch.stack(rows))
    return encoded

  def one_soft(self, peptides, alpha=0.5):
    """Return a list with one label-smoothed tensor per peptide string.

    In each row the true residue gets `1 - alpha` and the remaining
    `alpha` is spread evenly over the other letters of the alphabet.
    """
    alphabet = gan_opt.amino_acids
    encoded = []
    for peptide in peptides:
      rows = []
      for residue in peptide:
        n_letters = len(alphabet)
        row = torch.ones(n_letters) * (alpha / (n_letters - 1))
        row[alphabet.index(residue)] = 1 - alpha
        rows.append(row)
      encoded.append(torch.stack(rows))
    return encoded

  def __getitem__(self, index):
    """Return `(peptide, label)` when labels were loaded, else `peptide`."""
    sample = self.peptides[index]
    if not self.use_labels:
      return sample
    return sample, self.labels[index]

  def __len__(self):
    """Number of peptides held by this dataset."""
    return len(self.peptides)

# The labelled train/test loaders are only needed when the predictor is going
# to be trained; when it is loaded from disk they are never created.
if not tox_opt.load_network:
  # Training portion of the labelled data (everything after the first tenth).
  tox_data_train = ToxicPeptideDataset("allPeptideSequences.npy", "allPeptideScores.npy", train=True)
  tox_dataloader_train = torch.utils.data.DataLoader(tox_data_train, batch_size=tox_opt.batch_size, shuffle=True)

  # Held-out portion (the first tenth), used for validation in tox_train.
  tox_data_test = ToxicPeptideDataset("allPeptideSequences.npy", "allPeptideScores.npy", train=False)
  tox_dataloader_test = torch.utils.data.DataLoader(tox_data_test, batch_size=tox_opt.batch_size, shuffle=True)

In [0]:
def accuracy(y_hat, y_truth):
  """Return `(n - sum(|y_hat - y_truth|)) / n`, where n is `y_hat.size(0)`.

  This is one minus the mean absolute error over the first dimension, so it
  equals 1 for perfect predictions and can go negative for large errors.
  The result is a tensor.
  """
  n = y_hat.size()[0]
  total_error = torch.sum(torch.abs(y_hat - y_truth))
  return (n - total_error) / n

In [9]:
# ------------------------------
#  Toxicity Predictor Training
# ------------------------------

# Training history, filled in by tox_train and plotted afterwards.
losses = []    # summed absolute error of every training batch
v_losses = []  # kept for compatibility; not written to by tox_train
acc = []       # (number of batches seen so far, mean validation accuracy)
def tox_train():
  """Train `tox_predictor` on `tox_dataloader_train`.

  Runs `tox_opt.n_epochs` passes over the training loader, minimising the
  summed absolute error between predictions and labels. Every
  `tox_opt.validate_every` batches (batch index 0, validate_every, ...)
  the whole test loader is scored with `accuracy` and the mean is appended
  to `acc`; the loss of every batch is appended to `losses`.

  Fixes relative to the previous version:
    * validation used `if i % validate_every:`, which is true for every
      batch that is NOT a multiple of the interval, so the full validation
      pass ran on almost every step;
    * `last_acc` was reset to 0 on every batch, so the progress bar would
      lose the most recent accuracy between validations;
    * validation now runs under `torch.no_grad()`;
    * predictions are squeezed along dim 1 only, so a final batch of size 1
      keeps its batch dimension (required by `accuracy`).
  """
  loop = tqdm(total=len(tox_dataloader_train) * tox_opt.n_epochs, position=0)
  last_acc = 0
  for epoch in range(tox_opt.n_epochs):

    for i, (peps, toxs) in enumerate(tox_dataloader_train):
      peps = Variable(peps.type(Tensor))
      toxs = Variable(toxs.type(Tensor))
      optimizer_tox.zero_grad()

      y_hat = tox_predictor(peps)

      # Summed (not averaged) absolute error over the batch.
      loss = torch.sum(torch.abs(y_hat.squeeze(1) - toxs))
      loss.backward()
      losses.append(loss.item())
      optimizer_tox.step()

      if i % tox_opt.validate_every == 0:
        batch_accuracies = []
        with torch.no_grad():
          for v_peps, v_toxs in tox_dataloader_test:
            v_peps = v_peps.type(Tensor)
            v_toxs = v_toxs.type(Tensor)
            v_y_hat = tox_predictor(v_peps)
            batch_accuracies.append(accuracy(v_y_hat.squeeze(1), v_toxs).item())
        last_acc = np.mean(batch_accuracies)
        acc.append((len(losses), last_acc))

      loop.set_description("Epoch {}, Batch {}, Toxic_Loss {:.4f}, Accuracy {:.4f}".format(epoch, i, loss.item(), last_acc))
      loop.update()

  loop.close()

# Either restore the predictor from disk or train it from scratch; there is
# no need to retrain when a saved network can simply be loaded.
if tox_opt.load_network:
  load_model(tox_predictor, tox_opt.load_model_file)
  print("Network Successfully Loaded!")
else:
  tox_train()
  save_model(tox_predictor, tox_opt.save_model_file)

  # Per-batch training loss.
  plt.plot(losses, label='losses')
  plt.title('Toxicicity Predictor Losses')
  plt.legend()
  plt.show()

  # Validation accuracy against the number of batches seen.
  a, b = zip(*acc)
  plt.plot(a, b, label='accuracy')
  plt.title('Tocicity Accuracy')
  plt.legend()
  plt.show()


Network Successfully Loaded!


# Pepto GAN

This section contains the code to run a simple GAN on the peptide dataset. Ultimately it should produce toxic peptides as its output.

*Be sure to re-run all the cells in this section each time the GAN is run; otherwise the Generator and the Discriminator will not be re-initialized.*

## Tweak #1

**Problem:** The generator network was really struggling to learn how to produce peptides. The problem appeared to be that the discriminator was learning too quickly, and turning the learning rate for the discriminator down far enough to allow the generator to catch up prevented the discriminator from learning useful information that would help improve the generator. I hypothesize that the reason for this is that the one-hot encodings we were using were too high-contrast. Before the discriminator learned anything about valid peptide configurations or toxicities, it instead looked only at contrast. It learned this so quickly that the generator could not adjust in time and was buried by the discriminator.

**Fix:** We fixed this by using label smoothing on our one-hot peptide encoding.

**Result:** This resulted in a marked improvement of the GAN. The discriminator learning rate was increased by a factor of 5 (every 500 steps to every 100 steps). Also, the number of training epochs that the GAN was able to sustain jumped by an order of magnitude (10 epochs to 100 epochs).

* Unique Peptides Generated: 15
* Mean Score: -0.439821
* Standard Deviation: 0.219031

## Tweak #2
**Problem:**(See tweak #1)

**Fix:** Change the contrast on the latent z vector so it's very high, to see if that promotes high contrast in the output.

**Result:** This change did not improve the output in the same way that label smoothing did. Unfortunately, with the same learning rate (100 steps), the number of training epochs that were successful before the Discriminator ran away with things was only about 10.

Wait, maybe this did work... even though it can't train as long...

* Unique Peptides Generated: 65
* Mean Score: -0.732333
* Standard Deviation: 0.177144

## Tweak #3
**Problem:**

In [0]:
# ---------------------------------------
# Define the Generator and Discriminator
# ---------------------------------------
class Generator(nn.Module):
    """MLP mapping a latent vector to a soft peptide encoding.

    The network widens 128 -> 256 -> 512 -> 1024 (Linear + LeakyReLU, with
    BatchNorm1d on every stage but the first), then projects to
    `peptide_length * len(amino_acids)` values through a Tanh. `forward`
    reshapes that to (batch, peptide_length, alphabet) and applies a softmax
    over the alphabet axis, so every position is a probability distribution.
    """

    def __init__(self):
        super().__init__()

        out_features = int(gan_opt.peptide_length * len(gan_opt.amino_acids))
        widths = [gan_opt.latent_dim, 128, 256, 512, 1024]

        # Modules are created in the original order so the indices inside
        # `self.model` (and therefore the state-dict keys) are unchanged.
        layers = []
        for stage, (n_in, n_out) in enumerate(zip(widths, widths[1:])):
            layers.append(nn.Linear(n_in, n_out))
            if stage > 0:
                # NOTE(review): the second positional argument of BatchNorm1d
                # is `eps`, not `momentum`; 0.8 is an unusually large epsilon.
                # Spelled out here so the current behaviour is visible -
                # confirm whether `momentum` was intended.
                layers.append(nn.BatchNorm1d(n_out, eps=0.8))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        layers.append(nn.Linear(widths[-1], out_features))
        layers.append(nn.Tanh())

        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Return per-position amino-acid probabilities for each latent row."""
        raw = self.model(z)
        per_position = raw.view(raw.size()[0], gan_opt.peptide_length, len(gan_opt.amino_acids))
        # NOTE(review): Tanh bounds every value to [-1, 1] before the softmax,
        # which limits how peaked the resulting distribution can be.
        return F.softmax(per_position, dim=2)


class Discriminator(nn.Module):
    """MLP that scores an encoded peptide with a probability in (0, 1).

    The peptide is flattened and passed through 512- and 256-unit
    LeakyReLU layers, then a single sigmoid output suitable for BCELoss.
    """

    def __init__(self):
        super().__init__()

        in_features = int(len(gan_opt.amino_acids) * gan_opt.peptide_length)
        layers = [
            nn.Linear(in_features, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 1),
            nn.Sigmoid(),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, pep):
        """Flatten each sample of `pep` and return its validity score."""
        flattened = pep.view(pep.size(0), -1)
        return self.model(flattened)

# Dataset
# Unlabelled peptides used as the "real" samples. soft=True selects the
# label-smoothed encoding (see Tweak #1 above) instead of hard one-hot rows.
gan_data = ToxicPeptideDataset("newMostToxicNCSequences.npy", soft=True)
gan_dataloader = torch.utils.data.DataLoader(gan_data, batch_size=gan_opt.batch_size, shuffle=True)

# Loss function
# Binary cross-entropy; pairs with the Sigmoid output of Discriminator.
adversarial_loss = torch.nn.BCELoss()

# Initialize generator and discriminator
# Fresh, randomly initialised networks every time this cell is run.
generator = Generator()
discriminator = Discriminator()

# Move everything to the GPU when one is available.
if cuda:
    generator.cuda()
    discriminator.cuda()
    adversarial_loss.cuda()

# Optimizers
# One Adam optimizer per network, sharing the learning rate and betas from gan_opt.
optimizer_G = torch.optim.Adam(generator.parameters(), lr=gan_opt.lr, betas=(gan_opt.b1, gan_opt.b2))
optimizer_D = torch.optim.Adam(discriminator.parameters(), lr=gan_opt.lr, betas=(gan_opt.b1, gan_opt.b2))

In [26]:
# --------------
#  GAN Training
# --------------

def gan_train():
  """Train `generator` against `discriminator` for `gan_opt.n_epochs` epochs.

  For every batch of `gan_dataloader` the generator takes one optimisation
  step. The discriminator is only updated when the global batch counter is
  a multiple of `gan_opt.d_update_every`.

  At the end of each epoch the most recent discriminator and generator
  losses are printed (the discriminator loss may stem from an earlier
  batch, since it is not recomputed every step). The last generated batch
  is then decoded and scored with `score_peptide`: peptides scoring strictly
  between -1 and -0.4 are printed, and those scoring -1 or lower are printed
  inside a box.

  Returns nothing; the networks are updated in place through their
  optimizers.
  """
  for epoch in range(gan_opt.n_epochs):

      for i, peps in enumerate(gan_dataloader):

          # Adversarial ground truths: 1.0 for "real", 0.0 for "generated",
          # one target per sample in the batch.
          valid = Variable(Tensor(peps.size(0), 1).fill_(1.0), requires_grad=False)
          fake = Variable(Tensor(peps.size(0), 1).fill_(0.0), requires_grad=False)

          # Configure input: cast the real peptides to the active tensor type
          real_peps = Variable(peps.type(Tensor))

          # -----------------
          #  Train Generator
          # -----------------

          optimizer_G.zero_grad()

          # Sample standard-normal noise as generator input
          z = Variable(Tensor(np.random.normal(0, 1, (peps.shape[0], gan_opt.latent_dim))))

          # Sample high contrast (0/1) noise as generator input (Tweak #2)
          # z = Variable(Tensor(np.random.randint(0, 2, (peps.shape[0], gan_opt.latent_dim))))

          # Generate a batch of peptides
          gen_peps = generator(z)

          # Loss measures generator's ability to fool the discriminator
          g_loss = adversarial_loss(discriminator(gen_peps), valid)

          g_loss.backward()
          optimizer_G.step()

          # ---------------------
          #  Train Discriminator - every d_update_every steps
          # ---------------------

          # Global batch index across epochs.
          batches_done = epoch * len(gan_dataloader) + i
          if batches_done % gan_opt.d_update_every == 0:
            optimizer_D.zero_grad()

            # Measure discriminator's ability to classify real from generated samples.
            # gen_peps is detached so this step does not backpropagate into the generator.
            real_loss = adversarial_loss(discriminator(real_peps), valid)
            fake_loss = adversarial_loss(discriminator(gen_peps.detach()), fake)
            d_loss = (real_loss + fake_loss) / 2

            d_loss.backward()
            optimizer_D.step()

      # End-of-epoch report, based on the last batch generated in this epoch.
      print("epoch {}/{} d loss {:.4f}, g loss {:.4f}".format(epoch+1, gan_opt.n_epochs, d_loss.item(), g_loss.item()))
      for pep in gen_peps:
        name = decode_peptide(pep)
        score = score_peptide(name)
        # Notable peptides: score strictly between -1 and -0.4.
        if score < -0.4 and score > -1:
          print("{}: {:.6f}".format(name, score))
        # Peptides scoring -1 or lower are highlighted in a box.
        if score <= -1:
          print("#---------------------#")
          print("# {}: {:.6f} #".format(name, score))
          print("#---------------------#")

# Optionally warm-start either network from a saved checkpoint.
if gan_opt.load_discriminator:
  load_model(discriminator, gan_opt.load_disc_file)

if gan_opt.load_generator:
  load_model(generator, gan_opt.load_gen_file)

# Train, then persist both networks. Nothing is saved if training is
# interrupted before gan_train() returns.
if gan_opt.train_gan:
  gan_train()
  save_model(discriminator, gan_opt.save_disc_file)
  save_model(generator, gan_opt.save_gen_file)

epoch 1/120 d loss 0.6822, g loss 0.6669
FYIFDYDH: -0.487841
epoch 2/120 d loss 0.6718, g loss 0.6563
FIFFLLGC: -0.991104
FIFFLLGC: -0.991104
FIFFLSGC: -0.903147
epoch 3/120 d loss 0.6569, g loss 0.6598
epoch 4/120 d loss 0.6334, g loss 0.6822
IYVVHYFV: -0.928313
FYVVCNCC: -0.580193
IFCVHYCV: -0.470415
FYVVCNCC: -0.580193
FFVVFNCV: -0.858832
epoch 5/120 d loss 0.6282, g loss 0.6593
FCLDVVRF: -0.424812
FCLGVVRF: -0.417656
#---------------------#
# FCVFVVRC: -1.053175 #
#---------------------#
epoch 6/120 d loss 0.5953, g loss 0.7060
CCFCVIYF: -0.876874
#---------------------#
# CCIYIIYF: -1.073743 #
#---------------------#
CCFCVFYF: -0.862477
#---------------------#
# VCFYIIYF: -1.098245 #
#---------------------#
VCICIFYF: -0.963212
epoch 7/120 d loss 0.5424, g loss 0.7645
#---------------------#
# VLIYFFHY: -1.038467 #
#---------------------#
YHIISSVY: -0.470117
YHIISSVY: -0.470117
YHIISSVY: -0.470117
epoch 8/120 d loss 0.5492, g loss 0.7479
FYCFCCCG: -0.628265
FYVFCCCG: -0.640252
CYVF

KeyboardInterrupt: ignored

# Pepto GAN - Wasserstein GAN

In [0]:
# ---------------------------------
# Define the Wasserstein Generator and Discriminator
# ---------------------------------
class WGenerator(nn.Module):
    """Generator for the Wasserstein GAN variant.

    Same architecture as the plain generator, but the input width comes
    from `was_opt.latent_dim`: four Linear + LeakyReLU stages (128, 256,
    512, 1024; BatchNorm1d on all but the first), a projection to
    `peptide_length * len(amino_acids)` values through a Tanh, and a softmax
    over the alphabet axis in `forward`.
    """

    def __init__(self):
        super().__init__()

        out_features = int(gan_opt.peptide_length * len(gan_opt.amino_acids))
        widths = [was_opt.latent_dim, 128, 256, 512, 1024]

        # Modules are created in the original order so the indices inside
        # `self.model` (and therefore the state-dict keys) are unchanged.
        layers = []
        for stage, (n_in, n_out) in enumerate(zip(widths, widths[1:])):
            layers.append(nn.Linear(n_in, n_out))
            if stage > 0:
                # NOTE(review): the second positional argument of BatchNorm1d
                # is `eps`, not `momentum`; 0.8 is an unusually large epsilon.
                # Spelled out here so the current behaviour is visible -
                # confirm whether `momentum` was intended.
                layers.append(nn.BatchNorm1d(n_out, eps=0.8))
            layers.append(nn.LeakyReLU(0.2, inplace=True))
        layers.append(nn.Linear(widths[-1], out_features))
        layers.append(nn.Tanh())

        self.model = nn.Sequential(*layers)

    def forward(self, z):
        """Return per-position amino-acid probabilities for each latent row."""
        raw = self.model(z)
        per_position = raw.view(raw.size()[0], gan_opt.peptide_length, len(gan_opt.amino_acids))
        return F.softmax(per_position, dim=2)

class WDiscriminator(nn.Module):
    """Critic for the Wasserstein GAN variant.

    Identical to the plain discriminator except that the final layer has no
    sigmoid: the output is an unbounded score rather than a probability.
    """

    def __init__(self):
        super().__init__()

        in_features = int(len(gan_opt.amino_acids) * gan_opt.peptide_length)
        layers = [
            nn.Linear(in_features, 512),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(512, 256),
            nn.LeakyReLU(0.2, inplace=True),
            nn.Linear(256, 1),
        ]
        self.model = nn.Sequential(*layers)

    def forward(self, pep):
        """Flatten each sample of `pep` and return its critic score."""
        flattened = pep.view(pep.size(0), -1)
        return self.model(flattened)

# Initialize generator and discriminator
# Fresh, randomly initialised networks every time this cell is run.
w_generator = WGenerator()
w_discriminator = WDiscriminator()

# Move both networks to the GPU when one is available.
if cuda:
    w_generator.cuda()
    w_discriminator.cuda()

# Configure data loader
# Same unlabelled peptide file as the plain GAN, with the label-smoothed
# (soft=True) encoding.
w_gan_data = ToxicPeptideDataset("newMostToxicNCSequences.npy", soft=True)
w_gan_dataloader = torch.utils.data.DataLoader(w_gan_data, batch_size=was_opt.batch_size, shuffle=True)

# Optimizers
# RMSprop for both networks; was_opt.b1 / was_opt.b2 are not used here.
w_optimizer_G = torch.optim.RMSprop(w_generator.parameters(), lr=was_opt.lr)
w_optimizer_D = torch.optim.RMSprop(w_discriminator.parameters(), lr=was_opt.lr)

In [50]:
# --------------------------
#  Wasserstein GAN Training
# --------------------------

# Wasserstein GAN training loop, run directly at cell level.
# The discriminator (critic) is updated on every batch and its weights are
# clipped afterwards; the generator is updated on every n_critic-th batch.
# NOTE: evaluate_gan is defined in a later cell of this notebook, so that cell
# must have been executed before this one. In the visible code this loop does
# not consult was_opt.train_gan and does not save the trained generator.
for epoch in range(was_opt.n_epochs):

    for i, peps in enumerate(w_gan_dataloader):

        # Configure input: cast the real peptides to the active tensor type
        real_peps = Variable(peps.type(Tensor))

        # ---------------------
        #  Train Discriminator
        # ---------------------

        w_optimizer_D.zero_grad()

        # Sample standard-normal noise as generator input
        z = Variable(Tensor(np.random.normal(0, 1, (peps.shape[0], was_opt.latent_dim))))

        # Generate a batch of peptides; detached so the critic step does not
        # backpropagate into the generator
        gen_peps = w_generator(z).detach()
        # Adversarial loss: mean critic score on generated minus mean score on real
        d_loss = -torch.mean(w_discriminator(real_peps)) + torch.mean(w_discriminator(gen_peps))

        d_loss.backward()
        w_optimizer_D.step()

        # Clip weights of discriminator to [-clip_value, clip_value]
        for p in w_discriminator.parameters():
            p.data.clamp_(-was_opt.clip_value, was_opt.clip_value)

        # Train the generator every n_critic iterations
        if i % was_opt.n_critic == 0:

            # -----------------
            #  Train Generator
            # -----------------

            w_optimizer_G.zero_grad()

            # Generate a batch of peptides from the same noise, this time
            # keeping the graph so gradients reach the generator
            gen_peps = w_generator(z)
            # Adversarial loss: maximise the critic's score on generated peptides
            g_loss = -torch.mean(w_discriminator(gen_peps))

            g_loss.backward()
            w_optimizer_G.step()

    # End-of-epoch report: uniqueness / score statistics from evaluate_gan,
    # plus the losses of the most recent critic and generator updates.
    unique, mean, std = evaluate_gan(w_generator)
    print("epoch {}/{} d loss {:.4f}, g loss {:.4f}, unique {:.4f}%, mean: {:.4f}, std: {:.4f}".format(epoch+1, was_opt.n_epochs, d_loss.item(), g_loss.item(), unique, mean, std))
    # Score the last generated batch and box every peptide scoring -1 or lower.
    for pep in gen_peps:
        name = decode_peptide(pep)
        score = score_peptide(name)
        if score <= -1:
            print("#---------------------#")
            print("# {}: {:.6f} #".format(name, score))
            print("#---------------------#")


epoch 1/120 d loss 0.0035, g loss -0.0250, unique % 76.3000, mean: -0.6012, std: 0.2517
YFYYVFHY: -0.830692
FCYFCCGY: -0.720864
CSCCGCLF: -0.738111
FCCVLSCL: -0.662414
FYFVCGCV: -0.704689
epoch 2/120 d loss -0.0076, g loss -0.0242, unique % 70.0000, mean: -0.6072, std: 0.2796
NYVFVVFF: -0.907981
YIDCFVYF: -0.664242
#---------------------#
# FFYYCFGI: -1.093925 #
#---------------------#
SYFGLCLF: -0.576327
FCLCCCGI: -0.668881
epoch 3/120 d loss -0.0044, g loss -0.0247, unique % 70.7000, mean: -0.6380, std: 0.2853
LFVCGCGY: -0.461138
FCICCSNI: -0.634364
ICCLLILG: -0.726356
CFCCVFYV: -0.867083
FYYFGCVY: -0.442133
epoch 4/120 d loss -0.0305, g loss -0.0300, unique % 71.6000, mean: -0.6441, std: 0.2928
VSLVCIYG: -0.572658
#---------------------#
# FYGFRFFV: -1.174257 #
#---------------------#
LFVCYYGY: -0.734607
epoch 5/120 d loss -0.0339, g loss -0.0226, unique % 75.0000, mean: -0.6123, std: 0.2989
#---------------------#
# FFIVLSFI: -1.156998 #
#---------------------#
LYVFGVFY: -0.633343


KeyboardInterrupt: ignored

# GAN evaluation

Code to evaluate gan performance on several heuristics to see which setup provides the best results.

In [0]:
# ---------------
# GAN evaluation
# ---------------

def evaluate_gan(test_model, draw_histogram=False):
  """Sample peptides from a generator and summarise their predicted toxicity.

  1000 latent vectors of width `gan_opt.latent_dim` are drawn, decoded to
  peptide strings, de-duplicated, re-encoded as hard one-hot tensors and
  scored with `tox_predictor`.

  Args:
    test_model: generator to sample from. It must accept latent vectors of
      width `gan_opt.latent_dim` (the Wasserstein generator qualifies
      because `was_opt.latent_dim` has the same value).
    draw_histogram: when True, print the summary and draw a histogram
      comparing the generated scores with those of random peptides.

  Returns:
    (uniqueness, mean, std): the percentage of sampled peptides that are
    distinct, and the mean and standard deviation of their scores.

  Changes from the previous version: tensors are cast with the module-wide
  `Tensor` type instead of an unconditional `.cuda()` (which failed on
  CPU-only machines), inference runs under `torch.no_grad()`, and scores
  are flattened to 1-D before being summarised and plotted.
  """
  count = 1000
  z = Variable(Tensor(np.random.normal(0, 1, (count, gan_opt.latent_dim))))

  with torch.no_grad():
    gen = test_model(z)
    # Decode to strings so duplicates can be removed, then re-encode.
    peptides = one_hot_s(np.unique(decode_peptide_s(gen)))
    gen_scores = tox_predictor(peptides.type(Tensor)).view(-1)

  if draw_histogram:
    print("Unique Peptides Generated: {}".format(peptides.size(0)))
    print("Mean Score: {:.6f}".format(gen_scores.mean()))
    print("Standard Deviation: {:.6f}".format(gen_scores.std()))

    # Baseline: scores of uniformly random peptides.
    data = generate_random_peptide_s(count)
    with torch.no_grad():
      data_scores = tox_predictor(data.type(Tensor)).view(-1)

    colors = ['red', 'blue']
    labels = ['generated peptides', 'random peptides']
    plt.hist(
        [gen_scores.cpu().numpy(), data_scores.cpu().numpy()],
        bins=10, density=True, histtype='bar', color=colors, label=labels)
    plt.legend(prop={'size': 10})
    plt.title('distribution of peptide data')

  uniqueness = (peptides.size(0)/count)*100
  return uniqueness, gen_scores.mean().item(), gen_scores.std().item()


# Data Generation

Code that takes the original dataset and modifies it to be better and more usable.

In [9]:
# Convert the filtered CSV into the two .npy files consumed by
# ToxicPeptideDataset: column 0 of each row is the peptide sequence and
# column 1 its score. Scores are stored as the strings read from the CSV
# (the dataset converts them to float when loading).
all_sequences = []
all_scores = []
with open('all_data_filtered.csv') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=",")
    line_count = 0
    # Every row is treated as data; no header row is skipped.
    for row in csv_reader:
      line_count += 1
      all_sequences.append(row[0])
      all_scores.append(row[1])
    
    print(f'\nProcessed {line_count} lines.\n')
  
# np.save appends the ".npy" extension, giving the file names that are loaded
# elsewhere in this notebook.
np.save("allPeptideSequences", all_sequences)
np.save("allPeptideScores", all_scores)

 64%|██████▍   | 67729/105416 [00:00<00:00, 677279.34it/s]


Processed 105416 lines.



# Toxic Peptide Hall Of Fame

All the best, most toxic peptides that get generated should get stuck here along with their score so we don't lose track of them.

* FRGRFFFL: -1.397936
* FYFYSFCF: -1.116865
* FYFHVFCF: -1.346310
* FYVLCCFY: -1.095583
* FFLYVCFG: -1.038978
* FRLYVVVC: -1.035730
* FFLYVCFG: -1.038978
* FFYICFCC: -1.063583
* FFFICFCF: -1.361509
* VFLFSLFI: -1.535002
* VFLFGLFI: -1.367290
* VFVFGLFI: -1.190928
* VFLFGLFI: -1.367290
* LYLVCHFL: -1.002115
* LVYFVLCF: -1.046022
* LVYFFLLF: -1.109444
* CCVIRVLI: -1.294149
* FICFFSVF: -1.055895
* YCCLFFFF: -1.043908
* GICVICYV: -1.096067
* YICVFRIV: -1.064286
* LYCCVLFF: -1.020174
* FYLVCCIV: -1.010827
* FCLCICIY: -1.142711
* FCLCICIV: -1.031946
* VFVFYGCF: -1.045995
* FFCCICIF: -1.163224
* FYCCICIF: -1.138242
* IIRLCFSY: -1.070690
* LYIICLLI: -1.017955
* GVLFVCCF: -1.033618
* FFLCICFY: -1.167259
* FYLCICFY: -1.154940
* VYLCVCFY: -1.114934
* FCVFFFRR: -1.077060
* CFFVYRVL: -1.119970
* LFCCVLIV: -1.043952
* FFRFLVVV: -1.301919
* FFRFLYVV: -1.355798
* YFFFFYGF: -1.236266
* ICLSVYFF: -1.224726




# Tests

Please ignore everything below this line. This is garbage code just used for testing random things.


In [0]:
# Scratch cell: rebuild full-length sequences for the older "most toxic" set.
# Per the recorded output, the stored sequences are 7 characters long while
# the CSV holds 8-character peptides, so each old sequence is matched back to
# its CSV row and the CSV's sequence is kept instead.
sequences = np.load("mostToxicNCSequences.npy")
scores = np.load("mostToxicNCScores.npy")
print(len(sequences))
print(sequences[:5]) #Why are these peptides 7 aa's long? That seems wrong?


import csv

# Repair old data
new_sequences = []
with open('all_data_filtered.csv') as csv_file:
  csv_reader = csv.reader(csv_file, delimiter=',')
  line_count = 0
  # Progress bar total is hard-coded to the length printed for `sequences`
  # in the recorded output.
  pbar = tqdm(total=45957)
  for row in csv_reader:
    line_count += 1
    # Every CSV row is compared against every old sequence, which makes this
    # cell slow (the recorded run took roughly 25 minutes).
    for i, sequence in enumerate(sequences):
      # A match needs the old sequence to be a substring of the CSV sequence
      # and the scores to compare equal.
      # NOTE(review): row[1] is a string from the CSV; this presumably relies
      # on `scores` also holding strings - confirm.
      if sequence in row[0] and scores[i] == row[1]:
        pbar.update()
        new_sequences.append(row[0])

  print(f'Processed {line_count} lines.')

print(len(new_sequences))
print(new_sequences[:5])

# Written as "newMostToxicNCSequences.npy", the file the GAN dataloaders read.
np.save("newMostToxicNCSequences", new_sequences)

  0%|          | 4/45957 [00:00<21:02, 36.38it/s]

45957
['DCHRGFV' 'YRCCIIV' 'VCVHFLC' 'CCDIYVC' 'HCFCFDI']


100%|█████████▉| 45954/45957 [25:52<00:00, 28.47it/s]

Processed 105416 lines.
45957
['CDCHRGFV', 'IYRCCIIV', 'HVCVHFLC', 'DCCDIYVC', 'FHCFCFDI']
