In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
import sys
import os

In [2]:
abspath = ".."
print(torch.cuda.is_available())

True


In [3]:
sys.path.append("..")
from models import DNADataset, ALPHABET, SEQ_LENGTH, LATENT_DIM
from matplotlib import pyplot as plt
import numpy as np
import pandas as pd
from collections import namedtuple

sys.path.append("../../DRAUPNIR_ASR/draupnir/src/")
import draupnir

  from .autonotebook import tqdm as notebook_tqdm

Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


In [4]:
BATCH_SIZE = 64
dataset = DNADataset(f"{abspath}/data/alignment.fasta")
dataloader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True, pin_memory=True)

In [5]:
import argparse
from argparse import RawTextHelpFormatter
from draupnir import str2bool, str2None

In [6]:
parser = argparse.ArgumentParser(description="Draupnir args",formatter_class=RawTextHelpFormatter)
parser.add_argument('-name','--dataset-name', type=str, nargs='?',
                    default="simulations_blactamase_1",
                    #default="ABO", #TODO: fix fasta and tree file to have same names
                    help='Dataset project name, look at draupnir.available_datasets()')
parser.add_argument('-use-custom','--use-custom', type=str2bool, nargs='?',
                    default=False,
                    help='True: Use a custom dataset (create your own dataset). First create a folder with the same name as args.dataset_name where to store the necessary files here: draupnir/src/draupnir/data) '
                         'False: Use a default dataset (those shown in the paper) (they will automatically be downloaded at draupnir/src/draupnir/data)')
parser.add_argument('-n', '--num-epochs', default=15000, type=int, help='number of training epochs')
parser.add_argument('--alignment-file', type=str2None, nargs='?',
                    #default="/home/lys/Dropbox/PhD/DRAUPNIR_ASR/PF0096/PF0096.mafft",
                    default=None,
                    help='Path to alignment in fasta format (use with args.use_custom = True), with ALIGNED sequences. '
                         'PLEASE make sure that the fasta header names and the names in the tree are the same')
parser.add_argument('--tree-file', type=str2None, nargs='?',
                    #default="/home/lys/Dropbox/PhD/DRAUPNIR_ASR/PF0096/PF0096.fasta.treefile",
                    default=None,
                    help='Path to newick tree (in format 1 from ete3) (use with args.use_custom = True).'
                         'PLEASE make sure that the fasta header names and the names in the tree are the same')
parser.add_argument('--fasta-file', type=str2None, nargs='?',
                    default=None,
                    help='Path to fasta file (use with args.use_custom = True) with UNALIGNED sequences and NO tree (tree is inferred using IQtree). '
                         'PLEASE make sure that the fasta header names and the names in the tree are the same')
parser.add_argument('--leaf-embeddings', type=str2None, nargs='?',
                    default=None,
                    help='Path to dataframe containing pre-computed embeddings for the leaf sequences (i.e ESMB embeddings)') #TODO: IMPLEMENT
parser.add_argument('-build', '--build-dataset', default=False, type=str2bool,
                    help='True: Create and store the dataset from a given alignment file/tree or the unaligned sequences;'
                         'False: Use previously stored data files under folder with -dataset-name or at draupnir/src/draupnir/data. '
                         'Once you have built once the dataset you do not have to do it again (if everything went fine)'
                         'Further customization can be found under draupnir/src/draupnir/datasets.py')
parser.add_argument('-bsize','--batch-size', default=1, type=str2None,nargs='?',help='set batch size.\n '
                                                            'Set to 1 to NOT batch (batch_size == 1 batch == entire dataset).\n '
                                                            'Set to None it automatically suggests a batch size and activates batching (it is slow, only use for very large datasets).\n '
                                                            'If batch_by_clade=True: 1 batch= 1 clade (size given by clades_dict).'
                                                            'Else set the batchsize to the given number')
parser.add_argument('-aa-probs', default=21, type=int, help='21: 20 amino acids,1 gap probabilities \n '
                                                            ' 24: 23 amino acids, 1 gap')
parser.add_argument('-n-samples','-n_samples', default=10, type=int, help='Number of samples (sequences sampled) per node')
parser.add_argument('-use-blosum','--use-blosum', type=str2bool, nargs='?',default=True,help='Use blosum matrix embedding')
parser.add_argument('-subs_matrix', default="BLOSUM62", type=str, help='blosum matrix to create blosum embeddings, choose one from ~/anaconda3/pkgs/biopython-1.76-py37h516909a_0/lib/python3.7/site-packages/Bio/Align/substitution_matrices/data')
parser.add_argument('-embedding-dim', default=50, type=int, help='Blosum embedding dim')
parser.add_argument('-use-cuda', type=str2bool, nargs='?', default=True,
                    help='True: Use GPU; False: Use CPU')
parser.add_argument('-use-scheduler', type=str2bool, nargs='?', default=False, help='Use learning rate scheduler, to modify the learning rate during training. Only used with 1 large dataset in the paper')
parser.add_argument('-test-frequency', default=100, type=int, help='sampling frequency (in epochs) during training, every <n> epochs, sample')
parser.add_argument('-guide', '--select_guide', default="delta_map", type=str,help='choose a guide, available types: "delta_map" , "diagonal_normal" or "variational"')
#Highlight: Sample from a pre-trained model
parser.add_argument('-load-pretrained-path', type=str, nargs='?',default="None",
                    help='Load pretrained Draupnir Checkpoints (folder path) to generate samples')
parser.add_argument('-generate-samples', type=str2bool, nargs='?', default=False,help='Load fixed pretrained parameters (stored in Draupnir Checkpoints) and generate new samples')
#Highlight: EXPERIMENTAL FEATURES
parser.add_argument('-one-hot','--one-hot-encoded', type=str2bool, nargs='?',
                    default=False,
                    help='Build a one-hot-encoded dataset. Do not use, for now, Draupnir works with blosum-encoded and integers as amino acid representations, '
                         'so this is not needed for Draupnir inference at the moment')
parser.add_argument('-bbc','--batch-by-clade', type=str2bool, nargs='?', default=False, help='Experimental. Use the leaves divided by their corresponding clades into batches. Do not use with leaf-testing')
parser.add_argument('-pdb_folder', default=None, type=str,
                    help='Path to folder of PDB structures. The engine can read them and parse them into a dataset that the model can use.')
parser.add_argument('-angles','--infer-angles', type=str2bool, nargs='?', default=False,help='Experimental. Additional Inference of angles. Use only with sequences associated PDB structures and their angles.')
parser.add_argument('-kappa-addition', default=5, type=int, help='lower bound on the angles distribution parameters')
parser.add_argument('-plate','--plating',  type=str2bool, nargs='?', default=False, help='Plating/Subsampling the mapping of the sequences (ONLY the sequences, not the latent space, '
                                                                                         'see example in DRAUPNIRModel_classic_plating under models.py).\n'
                                                                                         ' Remember to set plating/subsampling size, otherwise it is done automatically')
parser.add_argument('-plate-size','--plating_size', type=str2None, nargs='?',default=None,help='Set plating/subsampling size:\n '
                                                                'If set to None it automatically suggests a plate size, only if args.plating is TRUE!. Otherwise it remains as None and no plating occurs\n '
                                                                'Else it sets the plate size to a given integer')
parser.add_argument('-plate-idx-shuffle','--plate-unordered', type=str2bool, nargs='?',const=None, default=False,help='When subsampling/plating, shuffle (True) or not (False) the idx of the sequences which are given in tree level order')
parser.add_argument('-position-embedding-dim', default=30, type=int, help='Tree position embedding dimension size')
parser.add_argument('-max-indel-size', default=5, type=int, help='maximum insertion deletion size (not used)')
parser.add_argument('-activate-elbo-convergence', default=False, type=bool, help='extends the running time until a convergence criteria in the elbo loss is met')
parser.add_argument('-activate-entropy-convergence', default=False, type=bool, help='extends the running time until a convergence criteria in the sequence entropy is met')
#TODO: Ray HPO? Would need to do for each protein family
parser.add_argument('-d', '--config-dict', default=None,type=str, help="Used with parameter search")
parser.add_argument('--parameter-search', type=str2bool, default=False, help="Activates a mini grid search for parameter search. TODO: Improve") #TODO: Change to something that makes more sense

_StoreAction(option_strings=['--parameter-search'], dest='parameter_search', nargs=None, const=None, default=False, type=<function str2bool at 0x77386ea14940>, choices=None, required=False, help='Activates a mini grid search for parameter search. TODO: Improve', metavar=None)

In [7]:
# args = parser.parse_args(["--dataset-name", "benchmark_randall_original_naming", 
#                           "--alignment-file", "../../DRAUPNIR_ASR/data/benchmark_randall_original_naming/benchmark_randall_original_naming.mafft",
#                          "--tree-file","../../DRAUPNIR_ASR/data/benchmark_randall_original_naming/RandallBenchmarkTree_OriginalNaming.tree",
#                          "--select_guide", "variational"])

In [8]:
args = parser.parse_args(["--dataset-name", "draupnir_data", 
                            "--use-custom", "True",
                            "--alignment-file", "../../DRAUPNIR_ASR/data/benchmark_randall_original_naming/draupnir_data.fasta/",
                            "--tree-file","../../DRAUPNIR_ASR/data/benchmark_randall_original_naming/draupnir_data.tree",
                            "--select_guide", "variational"])

In [9]:
# args.__dict__["device"] = "cuda:1" if torch.cuda.is_available() else "cpu"
args.__dict__["device"] = "cuda" if torch.cuda.is_available() else "cpu"
print("args:\n------------------------------")
for k,v in sorted(vars(args).items()):
    print(f"%-40s:\t{v}"%f"{k}")
torch.set_default_dtype(torch.float64)


args:
------------------------------
aa_probs                                :	21
activate_elbo_convergence               :	False
activate_entropy_convergence            :	False
alignment_file                          :	../../DRAUPNIR_ASR/data/benchmark_randall_original_naming/draupnir_data.fasta/
batch_by_clade                          :	False
batch_size                              :	1
build_dataset                           :	False
config_dict                             :	None
dataset_name                            :	draupnir_data
device                                  :	cuda
embedding_dim                           :	50
fasta_file                              :	None
generate_samples                        :	False
infer_angles                            :	False
kappa_addition                          :	5
leaf_embeddings                         :	None
load_pretrained_path                    :	None
max_indel_size                          :	5
n_samples                               :

In [16]:
# data = None
# with open("../data/draupnir_data/draupnir_data.mafft","r") as f:
#     data = f.readlines()
#     data = [x.upper() if '>' not in x else x for x in data]

# with open("../data/draupnir_data/draupnir_data_alignment.fasta", "w") as f:
#     f.write("".join(data))

In [18]:
build_config, settings_config, root_sequence_name = draupnir.datasets.create_draupnir_dataset("draupnir_data", 
                                                                                             fasta_file="../data/draupnir_data/draupnir_data.fasta", 
                                                                                             tree_file="../data/draupnir_data/draupnir_data.tree", 
                                                                                             alignment_file = "../data/draupnir_data/draupnir_data_alignment.fasta",
                                                                                             use_custom=True,
                                                                                             script_dir=".",
                                                                                             args=args,
                                                                                             build=True)



Analyzing alignment...
Reading given alignment file ...
Using given tree file...
Building patristic and cladistic matrices ...
Dataset larger than 200 sequences: Using R script for patristic distances (cladistic matrix is NOT available)!




[1] "suscessfully running R script"

Phylogenetic tree with 466 tips and 464 internal nodes.

Tip labels:
  NODE_0000000, Wuhan/Hu-1/2019, NODE_0000003, NODE_0000004, Australia/QLD1738/2021, USA/CA-LACPHL-AF01176/2021, ...
Node labels:
  , A467, A468, A469, A470, A472, ...

Unrooted; includes branch length(s).
Ready and saved!




Creating not aligned dataset...


In [11]:
# BuildConfig = namedtuple('BuildConfig',['alignment_file','use_ancestral','n_test','build_graph',"aa_probs","triTSNE",
#                                         "leaves_testing","script_dir","no_testing"])
# SettingsConfig = namedtuple("SettingsConfig",["one_hot_encoding", "model_design","aligned_seq","data_folder","full_name","tree_file"])

In [12]:
param_config = {
            "lr": 1e-3,
            "beta1": 0.9, #coefficients used for computing running averages of gradient and its square (default: (0.9, 0.999))
            "beta2": 0.999,
            "eps": 1e-8,#term added to the denominator to improve numerical stability (default: 1e-8)
            "weight_decay": 0,#weight_decay: weight decay (L2 penalty) (default: 0)
            "clip_norm": 10,#clip_norm: magnitude of norm to which gradients are clipped (default: 10.0)
            "lrd": 1, #rate at which learning rate decays (default: 1.0)
            "z_dim": 30,
            "gru_hidden_dim": 60, #60
        }

# root_sequence_name = None
name = "draupnir_data"

# build_config = BuildConfig(alignment_file='../../DRAUPNIR_ASR/data/benchmark_randall_original_naming/benchmark_randall_original_naming.mafft', 
#             use_ancestral=True, 
#             n_test=0, 
#             build_graph=True, 
#             aa_probs=21, 
#             triTSNE=False, 
#             leaves_testing=False, 
#             script_dir='.', 
#             no_testing=False)

# settings_config = SettingsConfig(one_hot_encoding=False, 
#                model_design='GP_VAE', 
#                aligned_seq=True, 
#                data_folder='/home/averma2/code/DRAUPNIR_ASR/data/benchmark_randall_original_naming', 
#                full_name="Randall's Coral fluorescent proteins (CFP) benchmark dataset", 
#                tree_file='/home/averma2/code/DRAUPNIR_ASR/data/benchmark_randall_original_naming/RandallBenchmarkTree_OriginalNaming.tree')

results_dir = os.getcwd() + "/results"

script_dir = os.getcwd()

In [13]:
train_load,test_load,additional_load,build_config = draupnir.main.load_data(name,settings_config,build_config,param_config,results_dir,script_dir,args)
additional_info = draupnir.utils.extra_processing(additional_load.ancestor_info_numbers, additional_load.patristic_matrix_full,results_dir,args,build_config)
train_load,test_load,additional_load= draupnir.load_utils.datasets_pretreatment(name,root_sequence_name,train_load,test_load,additional_load,build_config,args,settings_config,script_dir)

No cladistic matrix available. Evolutionary matrix = Patristic matrix
No testing, there is not a test dataset, we will just predict the ancestors without checking their accuracy due to abscence of test data
Creating empty test dataset ONLY with the internal nodes names (no sequences) 




In [14]:
# names = ["train_load","test_load","additional_load","build_config","additional_info"]
# vs = [train_load,test_load,additional_load,build_config,additional_info]

# for i in range(len(vs)):
#     info_dict = vs[i]._asdict()
#     for key,val in info_dict.items():
#         if str(type(val)) == "<class 'torch.Tensor'>":
#             vs[i] = vs[i]._replace(**{key:val.to(args.device)})

# train_load = vs[0]
# test_load = vs[1]
# additional_load = vs[2]
# build_config = vs[3]
# additional_info = vs[4]

# for i,v in enumerate(vs):
#     print(names[i] + "\n-----------")
#     info_dict = v._asdict()
#     for key,val in info_dict.items():        
#         print(f"%-30s:\t{str(type(val))}"%f"{key}")
#     print()

In [15]:
if args.one_hot_encoded:
    raise ValueError("Please set one_hot_encoding to False")

print("Starting Draupnir ...")
print("Dataset: {}".format(name))
print("Number epochs: {}".format(args.num_epochs))
print("Z/latent Size: {}".format(param_config["z_dim"]))
print("GRU hidden size: {}".format(param_config["gru_hidden_dim"]))
print("Number train sequences: {}".format(train_load.dataset_train.shape[0]))
n_test = [test_load.dataset_test.shape[0] if test_load.dataset_test is not None else 0][0]
print("Number test sequences: {}".format(n_test))
print("Selected Substitution matrix : {}".format(args.subs_matrix))

Starting Draupnir ...
Dataset: draupnir_data
Number epochs: 15000
Z/latent Size: 30
GRU hidden size: 60
Number train sequences: 466
Number test sequences: 464
Selected Substitution matrix : BLOSUM62


In [16]:
print("Training Draupnir with the entire tree at once, not batching")
if not args.batch_by_clade:
    clades_dict=None
else:
    clades_dict = additional_load.clades_dict_leaves
graph_coo = None #Highlight: use only with the GNN models (7)---> Otherwise it is found in additional_info
draupnir.main.draupnir_train(train_load,
                            test_load,
                            additional_load,
                            additional_info,
                            build_config,
                            settings_config,
                            param_config,
                            args.n_samples,
                            args,
                            script_dir,
                            results_dir,
                            graph_coo,
                            clades_dict)

Training Draupnir with the entire tree at once, not batching
Using model DRAUPNIRModel_classic WITHOUT plating
Using variational as guide
 Train_loader size:  1 batches
('f', 'svi', 'training_function_input')


RuntimeError: NVML_SUCCESS == DriverAPI::get()->nvmlInit_v2_() INTERNAL ASSERT FAILED at "/opt/conda/conda-bld/pytorch_1695392035629/work/c10/cuda/CUDACachingAllocator.cpp":1123, please report a bug to PyTorch. 
Trace Shapes:         
 Param Sites:         
Sample Sites:         
   alpha dist |  3    
        value |  3    
 sigma_f dist | 30    
        value | 30    
 sigma_n dist | 30    
        value | 30    
   lambd dist | 30    
        value | 30    
latent_z dist | 30 466
        value | 30 466

In [None]:
!pwd