In [7]:
from google.colab import drive

# Attach Google Drive under /content/drive; force_remount=True is passed so
# the mount is redone even when this cell is re-run.
drive.mount(
    "/content/drive",
    force_remount=True,
)

Mounted at /content/drive


In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader

import os
import sys
sys.path.append("/content/drive/MyDrive/bedford_lab/code/EVE")
from EVE import VAE_model
from utils import data_utils

In [3]:
# Use the %pip magic (installs into the running kernel's environment) and pin
# the version that this notebook was last run against.
%pip install -q biopython==1.85

Collecting biopython
  Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Downloading biopython-1.85-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.3 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/3.3 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/3.3 MB[0m [31m38.6 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.3/3.3 MB[0m [31m51.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: biopython
Successfully installed biopython-1.85


In [54]:
sys.path.append("/content/drive/MyDrive/bedford_lab/code/embedded-pathways/latent-diffusion")
from models import VAE, DNADataset, ALPHABET, SEQ_LENGTH, LATENT_DIM
from sklearn.decomposition import PCA
from matplotlib import pyplot as plt
import numpy as np

In [13]:
# Re-weighting parameter forwarded to the MSA loader below.
theta = 0.2
print(f"Theta MSA re-weighting: {theta}")

# Alternative alignment, kept for reference:
# msa_location = "/content/drive/MyDrive/bedford_lab/code/embedded-pathways/data/alignment.fasta"
msa_location = "/content/drive/MyDrive/bedford_lab/code/EVE/data/MSA/P53_HUMAN_b0.1.a2m"

# NOTE(review): the weights file is named "Cov_theta_*" while the MSA loaded
# here is P53 -- confirm the weights at this path belong to this alignment.
weights_location = f"/content/drive/MyDrive/bedford_lab/code/EVE/data/weights/Cov_theta_{theta}.npy"

data = data_utils.MSA_processing(
    MSA_location=msa_location,
    theta=theta,
    use_weights=True,
    weights_location=weights_location,
)

Theta MSA re-weighting: 0.2
Proportion of sequences dropped due to fraction of gaps: 0.0%
Proportion of non-focus columns removed: 16.28%
Encoding sequences
Computing sequence weights
Neff = 443.49035766802706
Data Shape = (3630, 329, 20)


In [28]:
# Keep a short handle on the one-hot encoded alignment and display its shape
# as the cell's output.
X = data.one_hot_encoding
X.shape

(3630, 329, 20)


In [48]:
import argparse

In [34]:
# Show the array dimensions, then the vector stored at index [0, 1]
# (all entries along the last axis).
print(X.shape)
print(X[0, 1])

(3630, 329, 20)
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]


In [25]:
# Batch size used by the loader below; EPOCHS is defined here but not used in
# this cell.
BATCH_SIZE, EPOCHS = 64, 30

# Load the alignment FASTA through the project's DNADataset and wrap it in a
# shuffling DataLoader.
dataset = DNADataset(
    "/content/drive/MyDrive/bedford_lab/code/embedded-pathways/data/alignment.fasta"
)
dataloader = DataLoader(
    dataset,
    batch_size=BATCH_SIZE,
    pin_memory=True,
    shuffle=True,
)

In [46]:
# Take element 0 of every dataset item and stack them into one NumPy array.
new_dataset = np.array(
    [dataset[idx][0] for idx in range(len(dataset))]
)

In [47]:
# Report the dimensions of the stacked array built from the dataset.
print(new_dataset.shape)

(466, 29903, 5)


In [35]:
# Pull one batch from the loader and keep its first element. The loader was
# built with shuffle=True, so the printed vector can differ between runs.
A = next(iter(dataloader))[0]
print(A.shape)
print(A[0, 1])

torch.Size([64, 29903, 5])
tensor([0., 1., 0., 0., 0.])


In [None]:
import os, sys
import argparse
import pandas as pd
import json
import numpy as np

sys.path.append("/content/drive/MyDrive/bedford_lab/code/embedded-pathways/latent-diffusion")
from models import VAE, DNADataset, ALPHABET, SEQ_LENGTH, LATENT_DIM

from EVE import VAE_model, VAE_encoder, VAE_decoder
from utils import data_utils

class DummyData:
    """Lightweight data container built from a ``DNADataset`` alignment.

    The instance is handed to ``VAE_model.VAE_model(data=...)`` and
    ``train_model(data=...)`` in place of the object returned by
    ``data_utils.MSA_processing``. It exposes only the attributes assigned in
    ``__init__`` (presumably the subset EVE reads -- verify against
    ``VAE_model`` if new attribute errors appear).

    Parameters
    ----------
    fasta_path : str, optional
        Alignment file passed to ``DNADataset``. Defaults to
        ``DummyData.DEFAULT_FASTA`` so ``DummyData()`` keeps working.
    dataset : indexable with ``len()``, optional
        Already-constructed dataset whose items are indexable and carry the
        encoded sequence at position 0. When given, ``fasta_path`` is ignored
        and no file is read.

    Raises
    ------
    ValueError
        If the stacked array has fewer than two dimensions (for example when
        the dataset is empty), since ``seq_len`` cannot be determined.
    """

    # Alignment used when neither ``fasta_path`` nor ``dataset`` is supplied.
    DEFAULT_FASTA = "/content/drive/MyDrive/bedford_lab/code/embedded-pathways/data/alignment.fasta"

    def __init__(self, fasta_path=None, dataset=None):
        print("using dummy data class")

        if dataset is None:
            dataset = DNADataset(fasta_path or self.DEFAULT_FASTA)

        # Element 0 of every item is stacked into a single array.
        new_dataset = np.array([dataset[x][0] for x in range(len(dataset))])
        if new_dataset.ndim < 2:
            raise ValueError(
                "DummyData needs at least one encoded sequence; got array of shape "
                + str(new_dataset.shape)
            )

        self.num_sequences = new_dataset.shape[0]
        self.seq_len = new_dataset.shape[1]
        self.alphabet_size = new_dataset.shape[-1]
        # No re-weighting is done here: every sequence gets weight 1, and Neff
        # is set to the raw sequence count.
        self.Neff = new_dataset.shape[0]

        self.one_hot_encoding = new_dataset
        self.weights = np.ones(new_dataset.shape[0])

if __name__ == '__main__':
    # Training driver: build the data container, configure the encoder and
    # decoder, train the EVE VAE, then write a final checkpoint.

    data = DummyData()

    # The encoder and decoder are configured with the same latent size;
    # defining it once keeps the two dictionaries in sync.
    z_dim = 50

    enc_params = {
        "hidden_layers_sizes"               :   [2000,1000,300],
        "z_dim"                             :   z_dim,
        "convolve_input"                    :   False,
        "convolution_input_depth"           :   40,
        "nonlinear_activation"              :   "relu",
        "dropout_proba"                     :   0.0,
        'seq_len'                           :   data.seq_len,
        'alphabet_size'                     :   data.alphabet_size,
    }

    dec_params = {
        "hidden_layers_sizes"               :   [300,1000,2000],
        "z_dim"                             :   z_dim,
        "bayesian_decoder"                  :   True,
        "first_hidden_nonlinearity"         :   "relu",
        "last_hidden_nonlinearity"          :   "relu",
        "dropout_proba"                     :   0.1,
        "convolve_output"                   :   True,
        "convolution_output_depth"          :   40,
        "include_temperature_scaler"        :   True,
        "include_sparsity"                  :   False,
        "num_tiles_sparsity"                :   0,
        "logit_sparsity_p"                  :   0,
        'seq_len'                           :   data.seq_len,
        'alphabet_size'                     :   data.alphabet_size,
    }

    model_name = "Covid_model1"
    print("Model name: "+str(model_name))
    model = VAE_model.VAE_model(
                    model_name=model_name,
                    data=data,
                    encoder_parameters=enc_params,
                    decoder_parameters=dec_params,
                    random_seed=42
    )
    # Move the model onto whichever device it selected for itself.
    model = model.to(model.device)

    print("Starting to train model: " + model_name)

    # NOTE(review): save_model_params_freq (500000) is larger than
    # num_training_steps (400000), so presumably no intermediate checkpoint is
    # written during training -- confirm this is intended for a long Colab run.
    # NOTE(review): logs and checkpoints go to "./" (the notebook's working
    # directory), not the mounted Drive -- confirm they persist where needed.
    training_params = {
        "num_training_steps"                :   400000,
        "learning_rate"                     :   1e-4,
        "batch_size"                        :   256,
        "annealing_warm_up"                 :   0,
        "kl_latent_scale"                   :   1.0,
        "kl_global_params_scale"            :   1.0,
        "l2_regularization"                 :   0.0,
        "use_lr_scheduler"                  :   False,
        "use_validation_set"                :   False,
        "validation_set_pct"                :   0.2,
        "validation_freq"                   :   1000,
        "log_training_info"                 :   True,
        "log_training_freq"                 :   1000,
        "save_model_params_freq"            :   500000,
        'training_logs_location'            :   "./",
        'model_checkpoint_location'         :   "./"
    }
    model.train_model(data=data, training_parameters=training_params)

    print("Saving model: " + model_name)
    # os.path.join avoids the doubled separator that plain concatenation
    # produced when the directory already ends in "/".
    final_checkpoint = os.path.join(
        training_params['model_checkpoint_location'], model_name + "_final"
    )
    model.save(model_checkpoint=final_checkpoint,
                encoder_parameters=enc_params,
                decoder_parameters=dec_params,
                training_parameters=training_params
    )

using dummy data class
Model name: Covid_model1
