In [None]:
# Colab-only setup: mount Google Drive at /content/drive. The functions defined
# further down read and write their data under /content/drive/MyDrive/...
from google.colab import drive
drive.mount("/content/drive", force_remount=True)

Mounted at /content/drive


In [None]:
# Install the Qt5 development packages with apt.
# NOTE(review): presumably required by PyQt5 / ete3 tree rendering - confirm.
!apt install qtbase5-dev qt5-qmake

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
qt5-qmake is already the newest version (5.15.3+dfsg-2ubuntu0.2).
qtbase5-dev is already the newest version (5.15.3+dfsg-2ubuntu0.2).
0 upgraded, 0 newly installed, 0 to remove and 35 not upgraded.


In [None]:
# Install the Python packages used by this notebook (versions are not pinned).
!pip install biopython; pip install gdown; pip3 install opencv-python; pip install PyQt5; pip install ete3;



In [None]:
import warnings
from collections import namedtuple
import pprint
import os
import numpy as np
from Bio import SeqIO
import gdown

In [None]:
from scipy.sparse import coo_matrix
import argparse
import dill
import ast
import matplotlib.pyplot as plt
from matplotlib.pyplot import cm
from matplotlib.colors import LogNorm
import torch
from torch.utils.data import Dataset, DataLoader
import statistics
import seaborn as sns
from collections import defaultdict,namedtuple
import pickle
#Biopython
import Bio.PDB as PDB
from Bio.PDB.Polypeptide import PPBuilder, CaPPBuilder
from Bio.Data.IUPACData import protein_letters_3to1
from Bio.SeqRecord import SeqRecord
from Bio.Seq import Seq
from Bio import BiopythonWarning
from Bio import AlignIO, SeqIO
from Bio.PDB.PDBList import PDBList
import Bio.Align
from Bio.Align.Applications import MafftCommandline
from Bio.Phylo.TreeConstruction import *
from Bio import Phylo
#Numpy
import numpy as np
import numpy.random as npr
import pandas as pd
import matplotlib
import re


from ete3 import Tree as TreeEte3
from ete3 import TreeStyle,NodeStyle, AttrFace


Due to the on going maintenance burden of keeping command line application
wrappers up to date, we have decided to deprecate and eventually remove these
modules.

We instead now recommend building your command line and invoking it directly
with the subprocess module.


In [None]:

def create_dataset(name_file,
                   #args,
                   one_hot_encoding,
                   min_len=30,
                   fasta_file=None,
                   PDB_folder=None,
                   alignment_file=None,
                   tree_file = None,
                   pfam_dict= None,
                   method="iqtree",
                   aa_probs=21,
                   rename_internal_nodes=False,
                   storage_folder="/content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/data"):
    """Build the aligned and NOT-aligned numpy datasets (plus auxiliary tree files) that Draupnir uses for inference.

    All outputs are written inside ``{storage_folder}/{name_file}/``.

    :param str name_file: dataset name (also used as sub-folder and file prefix)
    :param bool one_hot_encoding: {True,False} WARNING (from the original authors): one hot encoding is faulty, DO NOT USE
    :param int min_len: minimum sequence length, forwarded to ``process_pdb_files``
    :param str or None fasta_file: path to fasta with unaligned sequences (overwritten when sequences are extracted from PDB files)
    :param str or None PDB_folder: folder with PDB files from which sequences (and angles) are extracted
    :param str or None alignment_file: path to fasta with aligned sequences; when missing MAFFT output is written to ``{name_file}.mafft``
    :param str or None tree_file: path to newick tree, format 1 (ete3 nomenclature)
    :param dict or None pfam_dict: dictionary with PDB file names, forwarded to ``process_pdb_files``
    :param str method: tree inference methodology forwarded to ``infer_tree``:
                      "iqtree": ML tree inference by IQtree (make sure it is installed globally),
                      "nj": neighbour joining unrooted tree inference (biopython),
                      "nj_rooted": NJ rooted tree inference (biopython),
                      "upgma": UPGMA (biopython),
                      "rapidnj": fast NJ unrooted inference (make sure it is installed globally)
    :param int aa_probs: number of amino acid classes; raised automatically when the alignment uses a larger alphabet
    :param bool rename_internal_nodes: {True,False} rename the internal/ancestral nodes of the tree
    :param str storage_folder: parent folder that contains the ``name_file`` dataset folder
    :returns: ``tree_file`` exactly as received (it is NOT replaced by the path of an inferred tree)

    Saved arrays (dtype=object, last dimension 30):
        aligned:     [Nsequences] x [alignment length + 3] x [30]
        NOT aligned: [Nsequences] x [longest ungapped sequence + 3] x [30]
        row 0 -> sequence name in column 0
        row 1 -> [sequence length, position of the node in the tree traversal, node.dist, 0, ...]
        row 2 -> "git vector" (all zeros for sequences that only come from the alignment)
        row 3.. -> one row per residue: integer code in column 0 (0 means gap), or a one-hot vector in the
                   first ``aa_probs`` columns when ``one_hot_encoding`` is True (gaps then get column 0 set to 1)
    """

    warnings.simplefilter('ignore', BiopythonWarning)
    one_hot_label = "onehot" if one_hot_encoding else "integers"
    dataset_folder = "{}/{}".format(storage_folder,name_file)
    mafft_file = "{}/{}.mafft".format(dataset_folder,name_file)

    prot_info_dict = {}
    prot_aa_dict = {}
    if PDB_folder:# and not alignment_file:---> We allow to have sequences that have and don't have 3D structure
        print("Creating dataset from PDB files...")
        prot_aa_dict,prot_info_dict = process_pdb_files(PDB_folder,aa_probs,pfam_dict,one_hot_encoding,min_len)
    if prot_aa_dict:
        fasta_file = "{}/{}.fasta".format(dataset_folder,name_file)
        print("Writing polypeptides to fasta file to {}".format(fasta_file))
        pdb_records = [SeqRecord(Seq(''.join(sequence)),
                                 id=str(id),
                                 description="",
                                 annotations={"molecule_type": "protein"})
                       for id,sequence in prot_aa_dict.items()]
        # The context manager closes the handle; no explicit close() is needed afterwards
        with open(fasta_file, "w") as output_handle:
            SeqIO.write(pdb_records, output_handle, "fasta")

    # Highlight: Align the polypeptides/sequences (or read the given alignment)
    dict_alignment,alignment = infer_alignment(alignment_file,input_name_file=fasta_file,output_name_file=mafft_file)
    if not alignment_file:
        alignment_file = mafft_file

    #Highlight: checking that the selected number of probabilities is correct
    summary_aa_probs = [validate_sequence_alphabet(value) for key,value in dict_alignment.items()] #alphabet size of each sequence in the alignment, rejects dna
    aa_probs = max(aa_probs,max(summary_aa_probs)) #if the alignment needs a larger alphabet than requested, aa_probs grows
    aa_names_dict = aminoacid_names_dict(aa_probs)

    #Highlight: Encode every sequence of the alignment (without gaps) as a [seq_len + 2] x [30] array
    not_aligned_seqs_from_alignment_file ={}
    extra_space = 30-aa_probs #padding after the one-hot vector: 9 for 21 aa_probs, 6 for 24 aa_probs
    for key,value in dict_alignment.items():
        not_aligned_seq = [aa_name for aa_name in dict_alignment[key] if aa_name != "-"]
        seq_len = len(not_aligned_seq)
        aa_info = np.zeros((seq_len + 2, 30))
        aa_info[0] = np.hstack([seq_len,[0]*29]) #first row contains the sequence length
        aa_info[1] = np.zeros(30) #second row contains a fake git vector (never used)
        if one_hot_encoding:
            for index_a, aa_name in enumerate(not_aligned_seq):
                one_hot = np.zeros(aa_probs)
                one_hot[aa_names_dict[aa_name]] = 1
                aa_info[index_a+2] = np.hstack([one_hot,[0]*extra_space])
        else:
            for index_aa, aa_name in enumerate(not_aligned_seq):
                aa_info[index_aa+2] = np.hstack([aa_names_dict[aa_name], [0]*29])
        not_aligned_seqs_from_alignment_file[key] = aa_info

    tree = infer_tree(alignment=alignment,
                      alignment_file_name=alignment_file,
                      name=name_file,
                      method=method,
                      tree_file_name="{}/{}.tree".format(dataset_folder,name_file),
                      tree_file=tree_file,
                      storage_folder=dataset_folder)
    max_lenght = alignment.get_alignment_length()

    #Highlight: Combining sequences in the alignment that have a PDB structure and those who don't.
    # Only keys already present in the alignment are replaced by their PDB-derived information
    # (Mafft and the tree program might discard different proteins, e.g. identical ones).
    for key in not_aligned_seqs_from_alignment_file:
        if key in prot_info_dict:
            not_aligned_seqs_from_alignment_file[key] = prot_info_dict[key]
    Combined_dict = not_aligned_seqs_from_alignment_file

    if rename_internal_nodes:
        if name_file.startswith("simulations"):
           tree = rename_tree_internal_nodes_simulations(tree,with_indexes=False)
        else:
           tree = rename_tree_internal_nodes(tree)

    leafs_names = tree.get_leaf_names()
    # Use a context manager so the pickle file handle is closed (the original leaked it)
    with open('{}/{}_Leafs_names_list.p'.format(dataset_folder,name_file), 'wb') as leafs_handle:
        pickle.dump(leafs_names,leafs_handle)
    internal_nodes_names = [node.name for node in tree.traverse() if not node.is_leaf()]

    # One row per node: [node name, node.dist, ancestor names...], padded with None to a common length
    ancestors_all =[]
    for node in tree.traverse():
        ancestors_node =[node.name.replace("'","")]+[node.dist] +[ancestor.name.replace("'","") for ancestor in node.get_ancestors()]
        ancestors_all.append(ancestors_node)
    length = max(map(len, ancestors_all))
    ancestors_info = np.array([xi + [None] * (length - len(xi)) for xi in ancestors_all])

    tree_levelorder_names = np.asarray([node.name.replace("'","") for node in tree.traverse()])
    tree_levelorder_dist =np.asarray([node.dist for node in tree.traverse()])

    #Highlight: Aligned dataset. Careful with the +3 (row/protein name, seq length row and git vector row)
    dataset_aligned = np.zeros((len(Combined_dict), max_lenght + 1 + 1 +1, 30),dtype=object)
    for i, (key,val) in enumerate(Combined_dict.items()):
        aligned_seq = list(dict_alignment[key].strip(","))
        no_gap_indexes = np.where(np.array(aligned_seq) != "-")[0] + 3  # plus 3 in order to make the indexes fit in the final dataframe
        dataset_aligned[i,0,0] = key.replace("'","") #row name/sequence name
        dataset_aligned[i, 1:3] = Combined_dict[key][:2] #Insert seq len and git vector
        if name_file in ["benchmark_randall","benchmark_randall_original","benchmark_randall_original_naming"]:#their leaves have number names, so we keep them instead of changing them for the tree level order ones
            dataset_aligned[i, 1, 1] = int(key.replace("'", ""))
        elif name_file in ["PF01038_lipcti_msa_fungi"]:
            dataset_aligned[i, 1, 1] = np.where(tree_levelorder_names == key.replace(":","_"))[0][0]  # the node name will be its position in the tree
        else:
            dataset_aligned[i,1,1] = np.where(tree_levelorder_names == key.replace("'",""))[0][0] #the node name will be its position in the tree
        # NOTE(review): this stores ete3's node.dist, which differs from the patristic distance to the root - confirm intent
        dataset_aligned[i, 1, 2] =  tree_levelorder_dist[dataset_aligned[i,1,1]]
        dataset_aligned[i, no_gap_indexes] = Combined_dict[key][2:]  # Assign the aa info to those positions where there is not a gap
        if one_hot_encoding:
            # Boolean mask of the gap rows. The builtin bool is used because the np.bool alias was removed in NumPy 1.24
            gaps_mask = np.ones(max_lenght+3,bool)
            gaps_mask[no_gap_indexes] = False #do not keep the positions where there is not a gap
            gaps_mask[:3] = False #the name, seq length and git vector rows are not gaps
            dataset_aligned[i,gaps_mask,0] = 1. #gaps are assigned the first position of the one hot encoding

    print("Building patristic and cladistic matrices ...")
    tree_save = pd.DataFrame(ancestors_info)
    tree_save.to_csv("{}/{}_tree_levelorder_info.csv".format(dataset_folder,name_file),sep="\t")
    nodes_and_leafs_names = internal_nodes_names + leafs_names
    calculate_patristic_distance(name_file,Combined_dict,nodes_and_leafs_names,tree,tree_file,dataset_folder)
    calculate_closest_leaves(name_file,tree,dataset_folder)
    calculate_directly_linked_nodes(name_file, tree,dataset_folder)
    calculate_descendants(name_file,tree,dataset_folder)

    print("Ready and saved!")
    warnings.warn("Building clades (Collapses the original tree into monophyletic clades!)")
    divide_into_monophyletic_clades(tree,dataset_folder,name_file)
    np.save("{}/{}_dataset_numpy_aligned_{}.npy".format(dataset_folder,name_file,one_hot_label), dataset_aligned)

    #Highlight: NOT aligned dataset. Careful with the +3 (same three header rows as above)
    max_lenght_not_aligned = max([int(sequence[0][0]) for idx,sequence in Combined_dict.items()]) #Find the largest sequence without being aligned
    print("Creating not aligned dataset...")
    dataset_not_aligned = np.zeros((len(Combined_dict), max_lenght_not_aligned +3, 30), dtype=object)
    for i,(key,value) in enumerate(Combined_dict.items()):
        seq_len = int(Combined_dict[key][0][0])
        dataset_not_aligned[i,0,0] = key
        dataset_not_aligned[i, 1:3] = Combined_dict[key][:2] #Fill in the sequence length and the git vector
        if name_file in ["benchmark_randall_original_naming"]:
            tree_position = int(key.replace("'", ""))
        else:
            tree_position = np.where(tree_levelorder_names == key.replace("'",""))[0][0] #position in the tree
        # Fix: the original stored the tree position in the (already saved) aligned array,
        # so the NOT-aligned file never received it.
        dataset_not_aligned[i, 1, 1] = tree_position
        dataset_not_aligned[i, 1, 2] = tree_levelorder_dist[tree_position]
        dataset_not_aligned[i, 3:seq_len +3] = Combined_dict[key][2:] #Fill in the amino acids "letters"/"numbers" and their angles
        if one_hot_encoding:
            dataset_not_aligned[i, (seq_len + 3):,0] = 1. #padding rows after the sequence are encoded as gaps
    np.save("{}/{}_dataset_numpy_NOT_aligned_{}.npy".format(dataset_folder,name_file,one_hot_label), dataset_not_aligned)

    return tree_file

In [None]:
def infer_alignment(alignment_file,input_name_file,output_name_file):
    """
    Reads a pre-computed alignment or performs the alignment with MAFFT [MAFFT multiple sequence alignment software
    version 7: improvements in performance and usability].

    :param str or None alignment_file: path to a pre-computed alignment (fasta) to read; when given, MAFFT is not run
    :param str or None input_name_file: path to the file containing unaligned sequences, in fasta format (used only when alignment_file is missing)
    :param str output_name_file: path of the file where the MAFFT alignment is written (used only when alignment_file is missing)
    :returns: (dict_alignment, alignment): a dictionary {sequence id: aligned sequence} and the biopython alignment object.
              Trailing/leading "*" (stop codons) are stripped from the dictionary sequences only when reading a given alignment file.
    :raises ValueError: if neither alignment_file nor input_name_file is given
    :raises subprocess.CalledProcessError: if the mafft executable exits with a non-zero status
    """
    # Local import: Biopython deprecated its command line wrappers (MafftCommandline) and recommends
    # invoking the tool directly through subprocess.
    import subprocess

    print("Analyzing alignment...")
    if alignment_file: #The alignment file should contain the polypeptides of the PDB structures and sequences without structures
        print("Reading given alignment file ...")
        alignment = AlignIO.read("{}".format(alignment_file), "fasta")
        dict_alignment = {record.id: record.seq.strip("*") for record in alignment} #Highlight: Remove stop codons
        return dict_alignment, alignment

    if not input_name_file:
        raise ValueError("Either an alignment file or a fasta file with unaligned sequences is required to build the alignment")

    print("Using mafft to align...")
    # Equivalent to the former MafftCommandline(input=input_name_file)(): runs `mafft <input>` and captures stdout as text
    mafft_run = subprocess.run(["mafft", str(input_name_file)], capture_output=True, text=True, check=True)
    with open(output_name_file, "w") as handle: # the context manager closes the file
        handle.write(mafft_run.stdout)

    # Read the aligned sequences back
    alignment = AlignIO.read(output_name_file, "fasta")
    dict_alignment = {record.id: record.seq for record in alignment}
    return dict_alignment, alignment


In [None]:
def aminoacid_names_dict(aa_probs):
    """Map every amino acid letter to its integer code (gap "-" is always 0).

    :param int aa_probs: number of amino acid classes, it correlates with the number of different aa types in the input alignment
        21   -> gap + the 20 standard amino acids (codes 1-20)
        22   -> same as 21 plus the stop codon "*", which shares code 0 with the gap
        > 22 -> same as 21 plus the ambiguous letters B (21), Z (22) and X (23)
    :returns: dict {letter: code}; None for any other value of aa_probs
    """
    codes = {"-": 0}
    if aa_probs == 22:  # includes stop codons---> fix in Create blosum
        codes["*"] = 0
    elif aa_probs != 21 and not aa_probs > 22:
        # Unsupported number of classes: no mapping is available
        return None

    # The position of each letter in this string (starting at 1) is its integer code
    for code, residue in enumerate("RHKDESTNQCGPAVILMFYW", start=1):
        codes[residue] = code

    if aa_probs > 22:
        codes.update(B=21, Z=22, X=23)
    return codes

In [None]:
def benchmark_randalls_dataset_train(name,args,storage_folder,aa_prob):
  """Processing of the leaves dataset from "An experimental phylogeny to benchmark ancestral sequence reconstruction".

  Writes the sequences of the observed (leaf) nodes to
  ``{storage_folder}/benchmark_randall_original_naming/original_data/Randall_Benchmark_Observed.fasta``
  and then builds the dataset with ``create_dataset`` using the pre-computed alignment and tree of that folder.

  :param str name: project dataset name
  :param args: namespace-like object; only ``args.one_hot_encoded`` is read
  :param str storage_folder: folder that contains the ``benchmark_randall_original_naming`` dataset folder
  :param int aa_prob: amino acid probabilities"""
  observed_nodes = {19,18,17,16,15,14,13,12,11,10,9,8,7,6,4,5,3,2,1}
  # Every path is derived from storage_folder so that input and output files live in the same dataset folder
  dataset_folder = "{}/benchmark_randall_original_naming".format(storage_folder)
  sequences_file = "{}/original_data/RandallExperimentalPhylogenyAASeqs.fasta".format(dataset_folder)
  observed_fasta_file = "{}/original_data/Randall_Benchmark_Observed.fasta".format(dataset_folder)

  #Select the sequences of only the observed nodes
  observed_fasta = [seq for seq in SeqIO.parse(sequences_file, "fasta") if int(seq.id) in observed_nodes]
  with open(observed_fasta_file, "w") as output_handle:
    SeqIO.write(observed_fasta, output_handle, "fasta")

  create_dataset(name,
                  one_hot_encoding=args.one_hot_encoded,
                  fasta_file = observed_fasta_file, # same file that was just written (the original pointed to the parent folder)
                  alignment_file = "{}/benchmark_randall_original_naming.mafft".format(dataset_folder),
                  tree_file = "{}/RandallBenchmarkTree_OriginalNaming.tree".format(dataset_folder),
                  aa_probs=aa_prob,
                  rename_internal_nodes=False,
                  storage_folder=storage_folder)

In [None]:
def benchmark_randalls_dataset_test(settings_config,aa_probs=21):
  """Processing of the internal nodes dataset from "An experimental phylogeny to benchmark ancestral sequence reconstruction".

  :param settings_config: object with a ``data_folder`` attribute; the sequences are read from
                          ``{data_folder}/original_data/RandallExperimentalPhylogenyAASeqs.fasta``
  :param int aa_probs: number of amino acid classes, selects the letter -> integer mapping
  :returns: (dataset, internal_nodes)
            dataset: object array [N internal sequences] x [max sequence length + 2] x [30];
                     row 0 holds [sequence length, node name, 0 (fake distance to the root)],
                     rows 2.. hold the integer code of each residue in column 0 (gap positions stay 0)
            internal_nodes: list with the names of the internal nodes that were requested
  """
  internal_nodes = [21,30,37,32,31,34,35,36,33,28,29,22,23,27,24,26,25]
  sequences_file = "{}/original_data/RandallExperimentalPhylogenyAASeqs.fasta".format(settings_config.data_folder)
  aminoacid_names= aminoacid_names_dict(aa_probs)

  # Select the sequences of only the internal nodes and translate their letters to integers
  internal_fasta_dict = {}
  for seq in SeqIO.parse(sequences_file, "fasta"):
    if int(seq.id) in internal_nodes:
      seq_numbers = [aminoacid_names[aa_name] for aa_name in seq.seq]
      internal_fasta_dict[int(seq.id)] = [seq.seq,seq_numbers]
  max_length = max([int(len(sequence[0])) for idx,sequence in internal_fasta_dict.items()]) #225

  dataset = np.zeros((len(internal_fasta_dict), max_length + 1 + 1, 30),dtype=object)  # 30 dim to accomodate git vectors. Careful with the +2 (to include git, seqlen)
  for i, (key,val) in enumerate(internal_fasta_dict.items()):
    sequence, seq_numbers = val
    aligned_seq = list(sequence.strip(","))
    no_gap_indexes = np.where(np.array(aligned_seq) != "-")[0] + 2  # plus 2 in order to make the indexes fit in the final dataframe
    # Keep only the codes of the non-gap positions so that values and indexes have the same length
    # (the original assigned every code, which is a shape mismatch as soon as a sequence contains gaps)
    residue_numbers = [number for aa_name, number in zip(aligned_seq, seq_numbers) if aa_name != "-"]
    dataset[i, 0,0] = len(seq_numbers) #sequence length
    dataset[i,0,1] = key #position in the tree
    dataset[i, 0, 2] =  0 #fake distance to the root
    dataset[i, no_gap_indexes,0] = residue_numbers # Assign the aa codes to those positions where there is not a gap

  return dataset, internal_nodes

In [None]:
def create_draupnir_dataset(name,use_custom,script_dir,args,build=False,fasta_file=None,tree_file=None,alignment_file=None):
  """Creates the build and settings configuration of a Draupnir dataset and optionally (re)builds the dataset files.

  In this notebook only the pre-defined dataset "benchmark_randall_original_naming" is configured.

  :param str name: Dataset name
  :param bool use_custom: True (custom dataset, NOT supported here) or False (uses a Draupnir default dataset (used in the publication))
  :param str script_dir: Working directory of Draupnir, only stored in the returned build_config #TODO: remove
  :param args: namespace-like object; only ``args.one_hot_encoded`` is read
  :param bool build: Activates the construction of the dataset, might take a while if it requires to build tree, so it's recommended to use the pre-saved files
  :param str or None fasta_file: Path to NOT aligned sequences (currently unused)
  :param str or None tree_file: Path to Newick tree, format 1 in ete3 (replaced by the benchmark tree path)
  :param str or None alignment_file: Path to pre-aligned sequences (replaced by the benchmark alignment path)
  :returns: (build_config, settings_config, root_sequence_name)
      build_config (namedtuple BuildConfig):
          :str alignment_file:
          :bool use_ancestral: True (patristic_matrix_train = patristic_matrix_full (leaves + ancestors)), False (the ancestral nodes are removed from patristic_matrix_train)
          :int n_test: percentage of train/leaves sequences to be used as test, i.e n_test = 20 ---> 20% of the leaves will be the test dataset
          build_graph: make a graph for CNN #TODO: remove?
          :int aa_prob: Number of amino acid probabilities (21 or 24), depends on the different types of amino acids in the sequence alignment
          :bool triTSNE: Whether to plot TSNE in 3D (True) or not #TODO: Remove
          :bool leaves_testing: True (uses all the leaves' evolutionary distances for training but only observes (n-n_test) leaf sequences. USE WITH n_test),
                                False (uses all the leaves' evolutionary distances and observes all the leaf sequences)
          script_dir, no_testing
      settings_config (namedtuple SettingsConfig): one_hot_encoding, model_design, aligned_seq, data_folder, full_name, tree_file
      root_sequence_name: entry for ``name`` in the first element returned by ``available_datasets()``
  :raises NotImplementedError: if use_custom is True
  :raises ValueError: if ``name`` is a known dataset without a configuration in this notebook
  """
  BuildConfig = namedtuple('BuildConfig',['alignment_file','use_ancestral','n_test','build_graph',"aa_prob","triTSNE","leaves_testing","script_dir","no_testing"],module="build_config") #__name__ + ".namespace"
  SettingsConfig = namedtuple("SettingsConfig", ["one_hot_encoding", "model_design", "aligned_seq","data_folder","full_name","tree_file"],module="settings_config")
  if args.one_hot_encoded:
    warnings.warn("Draupnir was constructed to be used with integers for Categorical likelihood,not OneHotCategorical. And blosum-encoding for the guide. You can build the one-hot-encoded dataset for other purposes")

  if use_custom:
    # The storage folder, full name and root sequence are only defined for the pre-defined datasets;
    # fail with an explicit message instead of an UnboundLocalError further down.
    raise NotImplementedError("Custom datasets are not supported by this notebook, set use_custom to False and use a pre-defined dataset")

  warnings.warn("You have selected a pre-defined dataset, if not present, it will be downloaded. Otherwise set args.use_custom to True")
  datasets_info = available_datasets()
  root_sequence_name = datasets_info[0][name]
  full_name = datasets_info[1][name]
  storage_folder = os.path.abspath(os.path.join(os.path.dirname("/content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/"), "data")) #changed from "datasets/default"
  # Google Drive folders of the pre-defined datasets. The automatic download
  # (gdown.download_folder(dict_urls[name], output='{}/{}'.format(storage_folder, name), ...)) is disabled,
  # so this mapping is kept for reference only: fetch the folder manually if the data is missing.
  dict_urls = {
    "aminopeptidase":"https://drive.google.com/drive/folders/1fLsOJbD1hczX15NW0clCgL6Yf4mnx_yl?usp=sharing",
    "benchmark_randall_original_naming":"https://drive.google.com/drive/folders/1oE5-22lqcobZMIguatOU_Ki3N2Fl9b4e?usp=sharing",
    "Coral_all":"https://drive.google.com/drive/folders/1IbfiM2ww5PDcDSpTjrWklRnugP8RdUTu?usp=sharing",
    "Coral_Faviina":"https://drive.google.com/drive/folders/1Ehn5xNNYHRu1iaf7vS66sbAESB-dPJRx?usp=sharing",
    "PDB_files_Draupnir_PF00018_116":"https://drive.google.com/drive/folders/1YJDS_oHHq-5qh2qszwk-CucaYWa9YDOD?usp=sharing",
    "PDB_files_Draupnir_PF00400_185": "https://drive.google.com/drive/folders/1LTOt-dhksW1ZsBjb2uzi2NB_333hLeu2?usp=sharing",
    "PF00096":"https://drive.google.com/drive/folders/103itCfxiH8jIjKYY9Cvy7pRGyDl9cnej?usp=sharing",
    "PF00400":"https://drive.google.com/drive/folders/1Ql10yTItcdX93Xpz3Oh-sl9Md6pyJSZ3?usp=sharing",
    "SH3_pf00018_larger_than_30aa":"https://drive.google.com/drive/folders/1Mww3uvF_WonpMXhESBl9Jjes6vAKPj5f?usp=sharing",
    "simulations_blactamase_1":"https://drive.google.com/drive/folders/1ecHyqnimdnsbeoIh54g2Wi6NdGE8tjP4?usp=sharing",
    "simulations_calcitonin_1":"https://drive.google.com/drive/folders/1jJ5RCfLnJyAq0ApGIPrXROErcJK3COvK?usp=sharing",
    "simulations_insulin_2":"https://drive.google.com/drive/folders/1xB03AF_DYv0EBTwzUD3pj03zBcQDDC67?usp=sharing",
    "simulations_PIGBOS_1":"https://drive.google.com/drive/folders/1KTzfINBVo0MqztlHaiJFoNDt5gGsc0dK?usp=sharing",
    "simulations_sirtuins_1":"https://drive.google.com/drive/folders/1llT_HvcuJQps0e0RhlfsI1OLq251_s5S?usp=sharing",
    "simulations_src_sh3_1":"https://drive.google.com/drive/folders/1tZOn7PrCjprPYmyjqREbW9PFTsPb29YZ?usp=sharing",
    "simulations_src_sh3_2":"https://drive.google.com/drive/folders/1ji4wyUU4aZQTaha-Uha1GBaYruVJWgdh?usp=sharing",
    "simulations_src_sh3_3":"https://drive.google.com/drive/folders/13xLOqW2ldRNm8OeU-bnp9DPEqU1d31Wy?usp=sharing"
      }

  if name != "benchmark_randall_original_naming":
    # No build configuration exists for the other datasets in this notebook
    raise ValueError("Dataset '{}' is not configured in this notebook, only 'benchmark_randall_original_naming' is available".format(name))

  alignment_file = "{}/{}/benchmark_randall_original_naming.mafft".format(storage_folder,name)
  tree_file = "{}/{}/RandallBenchmarkTree_OriginalNaming.tree".format(storage_folder,name)
  build_config = BuildConfig(alignment_file=alignment_file,
                              use_ancestral=True, n_test=0,
                              build_graph=True,aa_prob=21,
                              triTSNE=False,leaves_testing=False,script_dir=script_dir,no_testing=False)
  if build:
    benchmark_randalls_dataset_train(name, args,storage_folder,aa_prob=21)

  settings_config = SettingsConfig(one_hot_encoding=args.one_hot_encoded,
                             model_design="GP_VAE",
                             aligned_seq=True,
                             data_folder="{}/{}".format(storage_folder,name),
                             full_name=full_name,
                             tree_file=tree_file)
  return build_config,settings_config, root_sequence_name

In [None]:
def validate_sequence_alphabet(seq):
    """
    Checks that a sequence from an alignment only contains characters from one of the protein alphabets
    (protein21 or protein21plus) and rejects DNA sequences.

    A sequence is considered DNA when, ignoring alignment gaps ("-"), it is made only of the letters a, c, g, t, n
    (case insensitive). An empty sequence is rejected with the same error.

    :param str seq: sequence of characters (anything convertible with str(), e.g. a biopython Seq)
    :returns int: 21 if the sequence fits the protein21 alphabet (-acdefghiklmnpqrstvwy),
                  24 if it additionally needs the letters b, z or x (protein21plus)
    :raises ValueError: if the sequence looks like DNA or contains characters outside both alphabets (e.g. "*")
    """
    alphabets = {'dna': re.compile('^[acgtn]*$', re.I),
             'protein21': re.compile('^[-acdefghiklmnpqrstvwy]*$', flags=re.IGNORECASE),
            'protein21plus': re.compile('^[-acdefghiklmnpqrstvwybzx]*$', flags= re.IGNORECASE)}

    sequence = str(seq)
    # The sequences come from an alignment, so gaps must be ignored for the DNA check: otherwise a gapped
    # nucleotide sequence such as "AC-GT" would be accepted as protein21. A sequence made only of gaps is not DNA.
    ungapped = sequence.replace("-", "")
    is_dna = alphabets["dna"].search(sequence) is not None
    if not is_dna and ungapped != "":
        is_dna = alphabets["dna"].search(ungapped) is not None
    if is_dna:
        raise ValueError("Please use amino acids in your sequences, accepted alphabets are protein21: -acdefghiklmnpqrstvwy or protein21plus: -*acdefghiklmnpqrstvwybzx")

    if alphabets["protein21"].search(sequence) is not None:
        return 21
    if alphabets["protein21plus"].search(sequence) is not None:
        return 24
    raise ValueError("Your sequences contain not allowed characters. Available alphabets are: {protein21}: -acdefghiklmnpqrstvwy or {protein21plus} -*acdefghiklmnpqrstvwybzx. If your sequence contains stop codons perhaps you can trim them.")

In [None]:
def infer_tree(alignment, alignment_file_name,name,method=None,tree_file_name=None,tree_file=None,storage_folder=""):
    """ Performs tree inference or reads an input given tree, returns an ete3 formatted tree
    :param biopython alignment alignment: biopython alignment class
    :param str alignment_file_name: path to alignment file
    :param str name: dataset project name
    :param str method: tree inference method (if tree is not given): "nj", "nj_rooted", "upgma" (only used when the
        alignment has fewer than 200 sequences), "iqtree" or "rapidnj"
    :param str tree_file_name: name to give to the tree file
    :param str tree_file: path to a possible given tree in newick format 1
    :param str storage_folder: folder where to store the results of the tree inference
    :returns: the ete3 tree, or None (with a warning) when no branch handles the given method / alignment size
    """
    # Local imports: the external-tool branches need them and they are not among the import cells at the top of the notebook.
    import subprocess
    import sys

    if tree_file:
        print("Using given tree file...")
        return TreeEte3(tree_file,format=1,quoted_node_names=True)

    # Pairwise distance matrix
    print("Building distance matrices and {} tree...".format(method))
    if len(alignment) < 200 and method in ["nj","nj_rooted","upgma"]:
        calculator = DistanceCalculator('blosum62')  # DNA ---> Identity// Protein ---> blosum62
        distance_matrix_cal = calculator.get_distance(alignment)
        distance_matrix_cal_pandas = convert_to_pandas(distance_matrix_cal)
        distance_matrix_cal_pandas.to_csv("{}/{}_distance_matrix.csv".format(storage_folder,name))
        #https://stackoverflow.com/questions/30247359/how-does-biopython-determine-the-root-of-a-phylogenetic-tree
        if method == "nj":
            print("Tree inference via Neighbour Joining NOT rooted method...")
            constructor = DistanceTreeConstructor(method="nj")
            return to_ete3(constructor.nj(distance_matrix_cal))
        elif method == "nj_rooted":
            print("Tree inference via Neighbour Joining with additional rooting method...")
            constructor = DistanceTreeConstructor(method="nj")
            tree = to_ete3(constructor.nj(distance_matrix_cal))
            # Making a root: only the first sequence of the pair is used as outgroup
            sequence_0, sequence_1 = tree_pair_for_rooting(distance_matrix_cal_pandas)
            tree.set_outgroup(tree & sequence_0)
            return tree
        elif method == "upgma":
            print("Tree inference via Upgma rooted method...")
            constructor = DistanceTreeConstructor(method="upgma") # nj method is unrooted in biopython. upgma is rooted
            return to_ete3(constructor.upgma(distance_matrix_cal))
    elif method == "iqtree":
        print("Iqtree ML method...")
        alignment_f = alignment_file_name if alignment_file_name else "{}/{}.mafft".format(storage_folder,name)
        # NOTE(review): split(".")[0] cuts at the FIRST dot of the whole path (e.g. "./data/x.mafft" -> ""), and the
        # results are read back from alignment_f + ".treefile" below, not from this name -- confirm the intended prefix.
        tree_file_name = alignment_f.split(".")[0] + ".treefile"

        if not os.path.exists(tree_file_name):
            iqtree_command = ["iqtree","-s",alignment_f.split(".")[0],"--aln",alignment_f,"-nt","AUTO"]
            #-o	Specify an outgroup taxon name to root the tree. The output tree in .treefile will be rooted accordingly. DEFAULT: first taxon in alignment
            taxon_root = False
            if taxon_root:
                root=21
                iqtree_command += ["-o",str(root)]  # argv entries must be strings
            subprocess.run(args=iqtree_command,stderr=sys.stderr, stdout=sys.stdout)
            # Best-effort clean up of the auxiliary iqtree outputs; a missing file must not abort the run
            for suffix in (".log", ".bionj", ".ckp.gz", ".model.gz"):
                leftover = alignment_f + suffix
                if os.path.exists(leftover):
                    os.remove(leftover)
        # The .mldist file is whitespace separated, its first row is skipped and its first column holds the taxon names
        distance_matrix_cal = pd.read_csv(alignment_f+".mldist", sep="\\s+", skiprows=1, header=None)
        distance_matrix_cal.columns = ["rows"] + distance_matrix_cal.iloc[:,0].to_list()
        distance_matrix_cal.set_index("rows", inplace=True)
        distance_matrix_cal.index.name = ""
        distance_matrix_cal.to_csv("{}/{}_distance_matrix.csv".format(storage_folder,name))
        return TreeEte3(alignment_f+ ".treefile")
    elif method == "rapidnj":
        print("Using Rapidnj to build NOT rooted tree...")
        tree_file_name = tree_file_name if tree_file_name else "{}/{}.tree".format(storage_folder,name)
        alignment_f = alignment_file_name if alignment_file_name else "{}/{}.mafft".format(storage_folder,name)
        # rapidnj prints the tree to stdout; the with-block closes the file before ete3 reads it
        with open(tree_file_name, "w") as tree_file_out:
            subprocess.run(args=["rapidnj",alignment_f, "-i", "fa"], stdout=tree_file_out)
        return TreeEte3(tree_file_name)

    # Nothing matched (unknown method, or a distance-based method with 200 or more sequences): keep returning None,
    # but tell the caller why instead of failing later on a missing tree.
    warnings.warn("infer_tree: no tree was built for method={!r} with {} sequences. 'nj', 'nj_rooted' and 'upgma' "
                  "are only used with fewer than 200 sequences; otherwise use 'iqtree' or 'rapidnj'".format(method, len(alignment)))
    return None

In [None]:

def render_tree(tree,storage_folder,name_file):
    """Function to render an ete3 tree into an image, saved as ``{storage_folder}/return_{name_file}.png``.
    If rendering with the custom tree style fails, a warning is issued and the tree is rendered with the default style.
    :param ete3-tree tree: Ete3 tree object
    :param str storage_folder: path to folder where to store the results
    :param name_file: data set project name"""
    ts = TreeStyle()
    ns = NodeStyle()
    #Make thicker lines
    ns["vt_line_width"] = 5
    ns["hz_line_width"] = 5
    # Do not add leaf names automatically
    ts.show_leaf_name = False
    # Use my custom layout
    ts.layout_fn = my_layout
    #print the branch lengths
    ts.show_branch_length = True
    for n in tree.traverse():
        n.set_style(ns)
    image_path = "{}/return_{}.png".format(storage_folder,name_file)
    try:
        tree.render(image_path,w=1000, units="mm",tree_style=ts)
    except Exception as error:
        # Deliberate best-effort fallback, but say why the styled render failed and do not
        # swallow KeyboardInterrupt/SystemExit as a bare ``except:`` would.
        warnings.warn("Rendering with the custom tree style failed ({!r}); using the default style instead".format(error))
        tree.render(image_path, w=1000, units="mm")

In [None]:
def my_layout(node):
    """Ete3 layout that adds the nodes names (leaves in blue, internal nodes in red). It is a plug-in for rendering tree images
    :param ete3-node node: node from an ete3 tree"""
    # `faces` is not among the names imported from ete3 in the notebook's import cells; import it here so the
    # layout does not fail with a NameError when the tree is rendered.
    from ete3 import faces

    # Same font size for every node, only the colour tells leaves and internal nodes apart
    colour = "blue" if node.is_leaf() else "red"
    name_face = AttrFace("name", fsize=8, fgcolor=colour)
    # Adds the name face to the image at the preferred position
    faces.add_face_to_node(name_face, node, column=0, position="branch-right")

In [None]:
def calculate_patristic_distance(name_file,combined_dict,nodes_and_leafs_names,tree,tree_file, storage_folder):
    """Calculates the patristic distances or branch lengths across the nodes in a tree. It also saves the tree in different formats needed for benchmarking etc

    The tree is always written in four newick variants (.format8newick, .format6newick, .format7newick, .newick), next to
    ``tree_file`` if it is given, otherwise inside ``storage_folder``. Then:
      - more than 200 sequences: the patristic matrix is delegated to the R script ``Calculate_Patristic.R`` (no cladistic matrix);
      - otherwise: the patristic and cladistic matrices are computed with ete3 and stored as csv (upper triangle filled only).
    Nothing is recomputed if ``{storage_folder}/{name_file}_patristic_distance_matrix.csv`` already exists.

    :param str name_file: data set project name
    :param dict combined_dict: only its length (number of sequences) is used #TODO: Remove?
    :param list nodes_and_leafs_names: tree nodes in tree-level order stored in a list
    :param ete3-tree tree: ete3 object containing tree
    :param str tree_file: path to the stored tree file
    :param str storage_folder: folder where to store the results
    :raises subprocess.CalledProcessError: if the R script exits with an error
    """
    import subprocess  # only needed by the R fallback; not among the notebook's import cells

    n_seqs = len(combined_dict)
    #work_dir = os.path.dirname(os.path.abspath(__file__))
    work_dir = ""
    use_r_script = n_seqs > 200
    if use_r_script:
        print("Dataset larger than 200 sequences: Using R script for patristic distances (cladistic matrix is NOT available)!")
        warnings.warn("Dataset larger than 200 sequences: Requires R and the ape library")

    # NOTE(review): split(".")[0] cuts at the first dot of the whole path, not at the extension -- kept as is because
    # other code may rebuild these file names the same way.
    tree_stem = tree_file.split(".")[0] if tree_file else "{}/{}".format(storage_folder,name_file)
    # os.path.join ignores an empty work_dir, so relative paths stay relative (a hard-coded "/" prefix would
    # send them to the filesystem root).
    tree_stem = os.path.join(work_dir, tree_stem)
    new_tree = tree_stem + ".newick"

    tree.write(outfile=tree_stem + ".format8newick", format=8,format_root_node=True) # format 8 all nodes names
    tree.write(outfile=tree_stem + ".format6newick", format=6,format_root_node=True)
    tree.write(outfile=tree_stem + ".format7newick", format=7,format_root_node=True) #all nodes names + branch lengths
    tree.write(outfile=new_tree,format=1) #save the renamed tree, format 9 to not save the internal nodes names

    patristic_file = "{}/{}_patristic_distance_matrix.csv".format(storage_folder,name_file)

    if use_r_script:
        if not os.path.exists(patristic_file):
            # Rscript <script> <input tree> <output csv>
            subprocess.check_call(['Rscript',"Calculate_Patristic.R",new_tree,patristic_file])
        else:
            print("Patristic matrix already exists at {}, not calculated. Delete it otherwise".format(patristic_file))
        return

    if os.path.exists(patristic_file):
        print("Patristic matrix file already exists, not calculated")
        return

    n_elements = len(nodes_and_leafs_names)
    I = pd.Index(nodes_and_leafs_names, name="rows")
    C = pd.Index(nodes_and_leafs_names, name="columns")
    patristic_matrix = pd.DataFrame(data=np.zeros((n_elements, n_elements)), index=I, columns=C)
    cladistic_matrix = pd.DataFrame(data=np.zeros((n_elements, n_elements)), index=I, columns=C)
    # Each unordered pair is visited once: only the cells above the diagonal are filled, the rest stays 0
    for i, t1 in enumerate(nodes_and_leafs_names):
        for t2 in list(nodes_and_leafs_names)[i + 1:]:
            cladistic_matrix.loc[[t1], [t2]] = tree.get_distance(t1, t2, topology_only=True)
            patristic_matrix.loc[[t1], [t2]] = tree.get_distance(t1, t2, topology_only=False)
    cladistic_matrix.to_csv("{}/{}_cladistic_distance_matrix.csv".format(storage_folder,name_file))
    patristic_matrix.to_csv(patristic_file)

In [None]:
def calculate_closest_leaves(name,tree,storage_folder):
    """ Creates a dictionary that contains the closest leaves to each internal node {internal_node:[leaves]} and
    pickles it to ``{storage_folder}/{name}_Closest_leaves_dict.p``.
    If all the children of a node are leaves, all of them are stored; otherwise only the single closest leaf is stored.
    :param str name: data set project name
    :param ete3-tree tree: Ete3 tree class object
    :param str storage_folder: folder where to dump the output
    """
    closest_leaves_dict=defaultdict() #closest leave to an internal node
    for node in tree.traverse():
        if node.is_leaf():
            continue  # only internal nodes get an entry
        children = node.get_children()
        if all(child.is_leaf() for child in children):
            closest_leaves_dict[node.name] = [child.name for child in children]
        else:
            # get_closest_leaf() returns a sequence whose first element is the leaf node
            closest_leaves_dict[node.name] = [node.get_closest_leaf()[0].name]
    # The with-block closes the file handle once the dump is done
    with open('{}/{}_Closest_leaves_dict.p'.format(storage_folder,name), 'wb') as handle:
        pickle.dump(closest_leaves_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def calculate_directly_linked_nodes(name,tree,storage_folder):
    """Creates a dictionary that contains the children nodes directly linked to each node (not all the descendants of
    that node) {node:[children]} and pickles it to ``{storage_folder}/{name}_Closest_children_dict.p``.
    Leaves are included with an empty list.
    :param str name: data set project name
    :param ete3-tree tree: Ete3 tree class object
    :param str storage_folder: folder where to dump the output"""
    closest_children_dict=defaultdict()
    for node in tree.traverse():
        closest_children_dict[node.name] = [child.name for child in node.get_children()]
    # The with-block closes the file handle once the dump is done
    with open('{}/{}_Closest_children_dict.p'.format(storage_folder,name), 'wb') as handle:
        pickle.dump(closest_children_dict, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def calculate_descendants(name,tree,storage_folder):
    """Creates a dictionary that contains all the internal nodes and leaves that descend from each internal node
    {internal_node:{"internal":[names],"leaves":[names]}} and dumps it with dill to ``{storage_folder}/{name}_Descendants_dict.p``.
    The "internal" list starts with the node itself.
    :param str name: data set project name
    :param ete3-tree tree: Ete3 tree class object
    :param str storage_folder: folder where to dump the output"""
    closest_descendants_dict = defaultdict(lambda: defaultdict())
    for node in tree.traverse():
        if node.is_leaf():
            continue  # leaves have no descendants to record
        descendant_leaves = []
        descendant_internal = [node.name]
        for descendant in node.iter_descendants():
            if descendant.is_leaf():
                descendant_leaves.append(descendant.name)
            else:
                descendant_internal.append(descendant.name)
        closest_descendants_dict[node.name]["internal"] = descendant_internal
        closest_descendants_dict[node.name]["leaves"] = descendant_leaves
    # dill is used here (the default factory is a lambda); the with-block closes the file handle after the dump
    with open('{}/{}_Descendants_dict.p'.format(storage_folder,name), 'wb') as handle:
        dill.dump(closest_descendants_dict, handle)#,protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
def divide_into_monophyletic_clades(tree,storage_folder,name):
    """
    Divides the tree into monophyletic clades:
    See https://www.mun.ca/biology/scarr/Taxon_types.html
    Implementation based on: https://www.biostars.org/p/97409/

    An internal node is collapsed into a clade when the average distance from it to its leaves is lower than a
    threshold derived from the mean (and, for some data sets, the standard deviation) of the root distances of all the nodes.

    Two dictionaries are stored in ``storage_folder``:
      - ``{name}_Clades_dict_all.p`` (dill): {"Clade_i": {"leaves": set of names, "internal": list of names}}
      - ``{name}_Clades_dict_leaves.p`` (pickle): {"Clade_i": [leaves names]}

    WARNING: the given tree is modified in place (collapsed nodes are renamed and their children detached).

    :param ete3-tree tree: ete3 tree class
    :param str storage_folder: folder where to store the results
    :param str name: name of the data set project (also selects the clustering threshold)

    """

    def mean(array):
        """Calculates branch length average"""
        return sum(array) / float(len(array))

    def cache_distances(tree):
        """Precalculate distances of all nodes to the root"""
        node2rootdist = {tree: 0}
        # preorder guarantees that the parent's distance is known before its children are visited
        for node in tree.iter_descendants('preorder'):
            node2rootdist[node] = node.dist + node2rootdist[node.up]
        return node2rootdist

    def build_clades(tree,name):
        """When a clustering condition is met, it collapses the tree at that node to unify all the leaves in that cluster into 'one' leaves. After, we read
        the collapsed tree into a dictionary that contains {clade number:{"internal":[nodes numbers],"leaves":[node numbers]}}"""
        # cache the tip content of each node to reduce the number of times the tree is traversed
        node2tips = tree.get_cached_content()
        root_distance = cache_distances(tree)  # distances of each of the nodes to the root
        average_root_distance = mean(root_distance.values())
        std_root_distance = statistics.stdev(root_distance.values())
        n_leaves = len(tree.get_leaves())
        #TODO: automatize clustering condition
        if n_leaves >= 100 or name.endswith("_subtree"):
            if name in ["PF00096","PF00400"]:
                clustering_condition = average_root_distance #+ 0.3*std_root_distance
            else:
                clustering_condition = average_root_distance -2*std_root_distance
        elif name in ["Coral_all","Coral_Faviina","SH3_pf00018_larger_than_30aa"] or "calcitonin" in name:
            clustering_condition = average_root_distance - std_root_distance
        else:
            clustering_condition = average_root_distance

        for node in tree.get_descendants('preorder'):
            if not node.is_leaf():  # for internal nodes
                avg_distance_to_tips = mean([root_distance[tip] - root_distance[node] for tip in node2tips[node]])  # average distance from the internal node to all it's possible derived leaves
                if avg_distance_to_tips < clustering_condition:
                    # The clade content is encoded in the node name itself and parsed back below
                    node.name += ' COLLAPSED avg_d:%g leaves:{%s} internal:{%s}' % (avg_distance_to_tips, ','.join([tip.name for tip in node2tips[node]]),','.join([internal.name for internal in node.iter_descendants() if not internal.is_leaf()]))
                    node.add_features(collapsed=True)
                    node.img_style['draw_descendants'] = False

        # Detaching the children turns every collapsed node into a leaf of the remaining tree
        for n in tree.search_nodes(collapsed=True):
            for child in n.get_children():
                child.detach()
        #print(tree.get_ascii(show_internal=True))
        i = 0
        clade_dict_all = defaultdict(lambda: defaultdict())
        for n in tree.traverse():
                if n.is_leaf() and "COLLAPSED" in n.name: #collapsed leaf (it is a clade on it's own)
                    # the first "}" of the name closes the leaves:{...} group
                    clade_names_leaves = n.name[n.name.find("leaves:{") + 8:n.name.find("}")].split(",")
                    clade_names_internal = [n.name.split(" ")[0]]
                    clade_names_internal += n.name[n.name.find("internal:{") + 10:].strip("}").split(",")
                    clade_dict_all["Clade_{}".format(i)]["leaves"] = set(clade_names_leaves)  # remove duplicates
                    clade_dict_all["Clade_{}".format(i)]["internal"] = list(filter(None,set(clade_names_internal))) #sometimes the  node strings are empty
                    i += 1
                elif not n.is_leaf(): #if the node is internal
                    clade_names_leaves = []
                    clade_names_internal = []
                    clade_names_internal += [n.name]
                    for descendant in n.iter_descendants():
                        if descendant.is_leaf():
                            if "{" not in descendant.name: #it was a pure leaf
                                clade_names_leaves += [descendant.name]
                            else: #is a collapsed leave
                                clade_names_leaves += descendant.name[descendant.name.find("leaves:{")+8:descendant.name.find("}")].split(",")
                                clade_names_internal += [descendant.name.split(" ")[0]]
                                clade_names_internal += descendant.name[descendant.name.find("internal:{")+10:].strip("}").split(",")
                        else: #add the internal node also to it's clade
                            clade_names_internal += [descendant.name]

                    clade_dict_all["Clade_{}".format(i)]["leaves"] = set(clade_names_leaves) #remove duplicates
                    clade_dict_all["Clade_{}".format(i)]["internal"] = list(filter(None,set(clade_names_internal))) #sometimes the  node strings are empty
                    i += 1
                else:#Non collapsed leaves
                    pass
        clade_dict_leaves = defaultdict()
        i = 0
        # One entry per leaf of the collapsed tree: a pure leaf on its own, or all the leaves of a collapsed clade
        for n in tree.traverse("preorder"):
            if n.is_leaf():
                if "{" not in n.name:
                    clade_names_leaves = [n.name]
                else:
                    clade_names_leaves = n.name[n.name.find("leaves:{") + 8:n.name.find("}")].split(",")
                clade_dict_leaves["Clade_{}".format(i)] = clade_names_leaves
                i += 1

        return clade_dict_leaves,clade_dict_all

    clade_dict_leaves,clade_dict_all = build_clades(tree,name)
    #Highlight: clades_dict_all contains each clade's internal and leaves nodes, clades_dict_leaves only contains the leaves of each clade

    # The with-blocks close both file handles once each dump is done
    with open('{}/{}_Clades_dict_all.p'.format(storage_folder,name), 'wb') as handle_all:
        dill.dump(clade_dict_all, handle_all)#,protocol=pickle.HIGHEST_PROTOCOL)
    with open('{}/{}_Clades_dict_leaves.p'.format(storage_folder,name), 'wb') as handle_leaves:
        pickle.dump(clade_dict_leaves, handle_leaves,protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# !cd /usr/local/lib/python3.11/dist-packages/PyQt5 && ldd *.so | grep 'not found'| sort | uniq

In [None]:
class Test:
    """Bare-bones stand-in for the ``args`` object handed to ``create_draupnir_dataset``; it only carries ``one_hot_encoded``."""

    def __init__(self) -> None:
        # Read as ``args.one_hot_encoded`` when the settings are built
        self.one_hot_encoded: bool = False

In [None]:
ls /content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/data/

[0m[01;34mbenchmark_randall_original_naming[0m/


In [None]:
%cd /content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/data/

/content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/data


In [None]:
# `!export` runs in a throw-away subshell and never reaches the kernel process, so set the variable
# through os.environ instead (presumably needed by the Qt-based ete3 rendering -- it must be set before Qt loads).
os.environ["QT_QPA_PLATFORM"] = "offscreen"

In [None]:
# Set the Qt platform in the kernel process itself (a `!export` is lost with its subshell) and echo it back,
# so the printed value is the one the Python code will actually see.
os.environ["QT_QPA_PLATFORM"] = "offscreen"
print(os.environ["QT_QPA_PLATFORM"])

offscreen


In [None]:
# create_draupnir_dataset(name,use_custom,script_dir,args,build=False,fasta_file=None,tree_file=None,alignment_file=None):
test = create_draupnir_dataset("benchmark_randall_original_naming", False, ".", args=Test(), build=True)



Analyzing alignment...
Reading given alignment file ...
Using given tree file...
Building patristic and cladistic matrices ...
Patristic matrix file already exists, not calculated
Ready and saved!
Creating not aligned dataset...


In [None]:
pprint.pprint(test)

(BuildConfig(alignment_file='/content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/data/benchmark_randall_original_naming/benchmark_randall_original_naming.mafft', use_ancestral=True, n_test=0, build_graph=True, aa_prob=21, triTSNE=False, leaves_testing=False, script_dir='.', no_testing=False),
 SettingsConfig(one_hot_encoding=False, model_design='GP_VAE', aligned_seq=True, data_folder='/content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/data/benchmark_randall_original_naming', full_name="Randall's Coral fluorescent proteins (CFP) benchmark dataset", tree_file='/content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/data/benchmark_randall_original_naming/RandallBenchmarkTree_OriginalNaming.tree'),
 None)


In [None]:
def available_datasets(print_dict = False):
    """Displays the available default data sets shown in the paper
    :param bool print_dict: if True, pretty-prints the data sets dictionary
    :returns: tuple (datasets, datasets_full_names); both dictionaries share the same keys (data set project names).
        ``datasets`` maps to a short sequence label (None for the non simulated data sets) and
        ``datasets_full_names`` maps to a human readable description
    """
    datasets = {"simulations_blactamase_1": "BetaLactamase_seq",# EvolveAGene4 Betalactamase simulation # 32 leaves
                "simulations_calcitonin_1": "Calcitonin_seq",# EvolveAGene4 Calcitonin simulation #50 leaves
                "simulations_src_sh3_1": "SRC_SH3",# EvolveAGene4 SRC SH3 domain simulation 1 #100 leaves
                "simulations_sirtuins_1": "Sirtuin_seq",# EvolveAGene4 Sirtuin simulation #150 leaves
                "simulations_src_sh3_3": "SRC_SH3",# EvolveAGene4 SRC SH3 domain simulation 3 #200 leaves
                "simulations_PIGBOS_1": "PIGBOS_seq",# EvolveAGene4 PIGBOS simulation #300 leaves
                "simulations_insulin_2": "Insulin_seq",# EvolveAGene4 Insulin simulation #400 leaves
                "simulations_src_sh3_2":"SRC_SH3",# EvolveAGene4 SRC SH3 domain simulation 2 #800 leaves
                "simulations_jj_1": "jj1",
                "simulations_jj_2": "jj2",
                "benchmark_randall_original_naming": None,# uses the original tree and it's original node naming
                "SH3_pf00018_larger_than_30aa":  None,# SRC kinases domain SH3 ---> Leaves and angles testing
                "Coral_Faviina":  None,  # Faviina clade from coral sequences # 35 leaves
                "Coral_all": None,# All Coral sequences (includes Faviina clade and additional sequences) #71 leaves
                "PF00400": None, # 125 real sequences
                "PF00400_beta":None,#TEST DATA
                "aminopeptidase":  None, #another real sequences example
                "PF00096": None} #another real sequences example
    if print_dict:
        pprint.pprint(datasets)
    datasets_full_names = {"benchmark_randall_original_naming":"Randall's Coral fluorescent proteins (CFP) benchmark dataset",  # uses the original tree and it's original node naming
                "SH3_pf00018_larger_than_30aa":"PF00018 Pfam family of Protein Tyrosine Kinases SH3 domains",  # SRC kinases domain SH3 ---> Leaves and angles testing
                "simulations_blactamase_1":"32 leaves Simulation Beta-Lactamase",  # EvolveAGene4 Betalactamase simulation
                "simulations_src_sh3_1":"100 leaves Simulation SRC-Kinase SH3 domain",  # EvolveAGene4 SRC SH3 domain simulation
                "simulations_src_sh3_2": "800 leaves Simulation SRC-Kinase SH3 domain",
                "simulations_src_sh3_3": "200 leaves Simulation SRC-Kinase SH3 domain",
                "simulations_sirtuins_1": "150 leaves Simulation Sirtuin 1",
                "simulations_insulin_2": "400 leaves Simulation Insulin Growth Factor",
                "simulations_calcitonin_1": "50 leaves Simulation Calcitonin peptide",
                # A stray parser.add_argument(...) fragment had been pasted into this name; restored to match the other simulations
                "simulations_PIGBOS_1": "300 leaves Simulation PIGB Opposite Strand regulator",
                "simulations_jj_1": "jj1",
                "simulations_jj_2": "jj2",
                "Coral_Faviina":"Coral fluorescent proteins (CFP) Faviina clade",  # Faviina clade from coral sequences
                "Coral_all":"Coral fluorescent proteins (CFP) clade",  # All Coral sequences (includes Faviina clade and additional sequences)
                "PF00400":"WD40 125 sequences",
                "PF00400_beta": "WD40 125 sequences", #TODO:Remove
                "aminopeptidase":"Amino Peptidase",
                "PF00096":"PF00096 protein kinases"}  # NOTE(review): confirm this description against the Pfam entry for PF00096
    return datasets,datasets_full_names

In [None]:
import sys #for version checker
import os #for restart routine

if '3.9' in sys.version:
  print('You already have 3.9')
else:
  #install python 3.9 and dev utils
  #you may not need all the dev libraries, but I haven't tested which aren't necessary.
  !sudo apt-get update -y
  !sudo apt-get install python3.9 python3.9-dev python3.9-distutils libpython3.9-dev
  !sudo apt-get install python3.9-venv binfmt-support #recommended in install logs of the command above

  #change alternatives
  !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.11 1
  !sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 2

  # install pip
  !curl -sS https://bootstrap.pypa.io/get-pip.py | python3.9
  !python3 get-pip.py --force-reinstall

  #install colab's dependencies
  !python3 -m pip install setuptools ipython ipython_genutils ipykernel jupyter_console prompt_toolkit httplib2 astor

  #minor cleanup
  !sudo apt autoremove

  #link to the old google package
  !ln -s /usr/local/lib/python3.11/dist-packages/google /usr/local/lib/python3.9/dist-packages/google
  #this is just to verify if 3.9 folder was indeed created
  !ls /usr/local/lib/python3.9/

shell-init: error retrieving current directory: getcwd: cannot access parent directories: Transport endpoint is not connected
Hit:1 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:2 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:3 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:4 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:5 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:6 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:7 https://r2u.stat.illinois.edu/ubuntu jammy InRelease
0% [Waiting for headers]

In [None]:
!sudo apt-get update -y
!sudo apt-get install python3.9 python3.9-dev python3.9-distutils libpython3.9-dev
!sudo update-alternatives --install /usr/bin/python3 python3 /usr/bin/python3.9 1
!sudo update-alternatives --config python3

In [None]:
%cd /content/drive/MyDrive/bedford_lab/code/
!python3.9 get-pip.py

In [None]:
!python --version
!pip --version

In [None]:
%cd /content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/draupnir
!pip install .

In [None]:
!pip install biopython; pip install pyro-ppl; pip install ete3; pip install dgl -f https://data.dgl.ai/wheels/torch-2.4/cu124/repo.html; pip install torchda

In [None]:
!python --version

In [None]:
import sys
print(sys.version)

In [None]:
# sys.path.append("/content/drive/MyDrive/bedford_lab/code/DRAUPNIR_ASR/draupnir/src/")
# sys.path.append("/content/drive/MyDrive/bedford_lab/code")
import draupnir

In [None]:
!nvcc --version

In [None]:
print(dir(draupnir))

In [None]:
draupnir.create_draupnir_dataset("benchmark_randall_original_naming", False, "./DRAUPNIR_ASR")