# Overview
This file contains an example workflow to create a customized TIPP reference package, using some example genes included under `example_data/` with their necessary files:
1. the alignment,
2. the taxid of sequences,
3. the mapping between each sequence and its taxid.

----------------------
# Required Packages
```
taxtastic>=0.10.0
dendropy>=4.5.2
```

In [1]:
import os
import shutil
import subprocess
import sys
from subprocess import Popen, PIPE, STDOUT

# ---- workflow configuration (shared by every cell below) ----

# work directory; the example genes are expected in sub-directories of it
workdir = 'example_data'

# output directory the TIPP reference package is written to
refpkg_dir = 'custom_tipp_refpkg'

# thread count passed to raxml-ng and fasttree-2
num_threads = 12

# genes to process; each name is a sub-directory of the work directory
genes = [
    'RplA_COG0081',
    'RplB_COG0090',
    'RplD_COG0088',
    'RpsC_COG0092',
    'RpsM_COG0099',
]

# Needed files
In this example, we use the genes listed in `genes` above to create our customized TIPP reference package; each gene's files are included in its own sub-directory of the working directory (e.g., `example_data/RplA_COG0081`). Remember that there are 4 files that we need to prepare (one for global use, and three for each gene):
1. `ncbi_taxonomy.db  - (global) NCBI taxonomy file created with taxtastic`
2. `est.aln.nuc.fasta - (each gene) alignment file`
3. `species.txt       - (each gene) taxids of sequences in alignment, in the same order as in the alignment`
4. `species.mapping   - (each gene) mapping between sequences and taxids, in the same order as in the alignment`

The three per-gene files are included for each example gene, and the NCBI taxonomy database can be built below.

# Step 0: NCBI taxonomy database - `ncbi_taxonomy.db`

In [2]:
# Build the local NCBI taxonomy database with taxtastic.
# The whole step is skipped when {workdir}/ncbi_taxonomy.db already exists.
# Every external command is run with check=True so that a failed download or
# a failed taxit run stops the workflow here instead of being silently ignored.
url = "https://ftp.ncbi.nlm.nih.gov/pub/taxonomy/taxdmp.zip"
db_path = os.path.join(workdir, 'ncbi_taxonomy.db')

if not os.path.exists(db_path):
    # download the latest NCBI taxdmp.zip into the work directory
    subprocess.run(["wget", "-P", workdir, url], check=True)

    # run taxtastic to create a database locally from the downloaded archive
    zip_path = os.path.join(workdir, "taxdmp.zip")
    subprocess.run(["taxit", "new_database", "-z", zip_path], check=True)

    # move the created ncbi_taxonomy.db file (written to the current
    # directory by the command above) to workdir
    shutil.move("ncbi_taxonomy.db", db_path)

# Step 1: Update species taxids

In [3]:
# Refresh the taxids of every gene against the local NCBI taxonomy database.
db_path = os.path.join(workdir, 'ncbi_taxonomy.db')

# two files to update per gene: input file name -> updated output file name
old2new_map = {
    'species.txt': 'species.updated.txt',
    'species.mapping': 'species.updated.mapping',
}

for gene in genes:
    indir = os.path.join(workdir, gene)

    for infile, outfile in old2new_map.items():
        inpath = os.path.join(indir, infile)
        outpath = os.path.join(indir, outfile)
        # run taxit update_taxids; check=True raises CalledProcessError on a
        # non-zero exit status so later steps never run on missing/stale output
        subprocess.run(
            ["taxit", "update_taxids", inpath, db_path, "-o", outpath],
            check=True,
        )

# Step 2: Get taxonomy table from the included species

In [4]:
# Build, for every gene, the taxonomy table of its species and a cleaned copy.
# Both commands run with check=True so a failure stops the workflow at the
# gene that caused it instead of being silently ignored.
db_path = os.path.join(workdir, 'ncbi_taxonomy.db')

for gene in genes:
    indir = os.path.join(workdir, gene)

    # get taxonomy.table with taxit, from the updated species list
    species_path = os.path.join(indir, 'species.updated.txt')
    taxtable_path = os.path.join(indir, 'taxonomy.table')
    subprocess.run(
        ["taxit", "taxtable", db_path, "-i", species_path, "-o", taxtable_path],
        check=True,
    )

    # get a cleaned version of the taxonomy table by removing redundant columns
    alltaxon_path = os.path.join(indir, 'all_taxon.taxonomy')
    subprocess.run(
        ["python", "clean_taxonomy_table.py", taxtable_path, alltaxon_path],
        check=True,
    )

# Step 3: Initialize taxonomy tree

In [5]:
# Build, for every gene, the unrefined taxonomy tree and a copy of it renamed
# with the gene's sequence names. Both helper scripts run with check=True so a
# failure stops the workflow instead of being silently ignored.
for gene in genes:
    indir = os.path.join(workdir, gene)

    # run script build_taxonomic_tree.py
    species_path = os.path.join(indir, 'species.updated.txt')
    taxtable_path = os.path.join(indir, 'taxonomy.table')
    unrefined_path = os.path.join(indir, 'unrefined.taxonomy')
    subprocess.run(
        ["python", "build_taxonomic_tree.py",
         taxtable_path, species_path, unrefined_path],
        check=True,
    )

    # rename the taxonomy tree with sequence names from this gene
    # with script build_unrefined_tree.pl
    mapping_path = os.path.join(indir, 'species.updated.mapping')
    taxonomy_path = os.path.join(indir, 'unrefined.taxonomy.renamed')
    subprocess.run(
        ["perl", "build_unrefined_tree.pl",
         mapping_path, unrefined_path, taxonomy_path],
        check=True,
    )

# (for tipp3) Get RAxML-ng maximum likelihood tree and model

In [None]:
# For every gene: mask the alignment, prepare a starting tree, then run
# RAxML-ng constrained by the taxonomy tree, echoing its output line by line.
# All commands are passed as argument lists (no shell) and their exit status
# is checked, so a failing step stops the workflow at the gene that caused it.
for gene in genes:
    indir = os.path.join(workdir, gene)

    # mask the alignment first, using a gap threshold of 95%
    aln_path = os.path.join(indir, 'est.aln.nuc.fasta')
    masked_path = os.path.join(indir, 'est.aln.nuc.masked95.fasta')
    thres = 0.95
    subprocess.run(
        ["python", "mask_alignment.py", aln_path, masked_path, str(thres)],
        check=True,
    )

    # resolve polytomies in the taxonomy tree and remove internal node labels,
    # producing the starting tree to run RAxML-ng with
    taxonomy_path = os.path.join(indir, 'unrefined.taxonomy.renamed')
    starting_tree_path = os.path.join(indir, 'raxml.starting.tre')
    subprocess.run(
        ["python", "resolve_polytomies.py", taxonomy_path, starting_tree_path],
        check=True,
    )

    # Run RAxML-ng with the starting tree and the original taxonomy tree as the constraint
    cmd = ['raxml-ng', '--msa', masked_path, '--model', 'GTR+G',
           '--tree', starting_tree_path, '--tree-constraint', taxonomy_path,
           '--brlen', 'scaled',
           '--force', 'perf_threads', '--redo',
           '--prefix', f'{indir}/est',
           '--threads', str(num_threads)]
    # the context manager waits for RAxML-ng to exit and closes the pipe
    with Popen(cmd, stdout=PIPE, stderr=STDOUT, text=True) as p:
        for line in p.stdout:
            print(line.rstrip('\n'))
    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, cmd)


RAxML-NG v. 1.2.2-master released on 30.04.2024 by The Exelixis Lab.
Developed by: Alexey M. Kozlov and Alexandros Stamatakis.
Contributors: Diego Darriba, Tomas Flouri, Benoit Morel, Sarah Lutteropp, Ben Bettisworth, Julia Haag, Anastasis Togkousidis.
Latest version: https://github.com/amkozlov/raxml-ng
Questions/problems/suggestions? Please visit: https://groups.google.com/forum/#!forum/raxml

System: 11th Gen Intel(R) Core(TM) i5-11600K @ 3.90GHz, 6 cores, 31 GB RAM

RAxML-NG was called at 28-Feb-2025 15:59:25 as follows:

raxml-ng --msa example_data/RplA_COG0081/est.aln.nuc.masked95.fasta --model GTR+G --tree example_data/RplA_COG0081/raxml.starting.tre --tree-constraint example_data/RplA_COG0081/unrefined.taxonomy.renamed --brlen scaled --force perf_threads --redo --prefix example_data/RplA_COG0081/est --threads 12

Analysis options:
  run mode: ML tree search
  start tree(s): user
  topological constraint: example_data/RplA_COG0081/unrefined.taxonomy.renamed (algorithm: NEW)
  r

[00:00:01 -537341.840637] Model parameter optimization (eps = 10.000000)
[00:00:13 -527149.786874] AUTODETECT spr round 1 (radius: 5)
[00:00:44 -481571.705668] AUTODETECT spr round 2 (radius: 10)
[00:01:16 -479223.100688] AUTODETECT spr round 3 (radius: 15)
[00:01:47 -479182.838351] AUTODETECT spr round 4 (radius: 20)
[00:02:16 -479182.731816] AUTODETECT spr round 5 (radius: 25)
[00:02:42 -479182.669543] SPR radius for FAST iterations: 20 (autodetect)
[00:02:42 -479182.669543] Model parameter optimization (eps = 3.000000)
[00:03:03 -479104.805503] FAST spr round 1 (radius: 20)
[00:03:55 -476401.321478] FAST spr round 2 (radius: 20)
[00:04:40 -476231.082855] FAST spr round 3 (radius: 20)
[00:05:22 -476210.867910] FAST spr round 4 (radius: 20)


In [None]:
# add back the taxonomy information to the raxml output tree, and reroot the tree at taxid 131567
# The script's stdout is captured and printed per gene; check=True raises
# CalledProcessError when relabel_tree.py exits with a non-zero status.
for gene in genes:
    print('>>>>', gene)
    indir = os.path.join(workdir, gene)

    # reroot the raxml-ng bestTree to 131567 and relabel the tree
    root = 131567
    best_tree_path = os.path.join(indir, 'est.raxml.bestTree')
    taxonomy_path = os.path.join(indir, 'unrefined.taxonomy.renamed')
    rooted_tree_path = os.path.join(indir, 'est.raxml.bestTree.rooted')
    cmd = ["python", "relabel_tree.py",
           taxonomy_path, best_tree_path, rooted_tree_path, str(root)]
    # only stdout is captured (as with the previous os.popen call); stderr is
    # left attached to the parent process
    ret = subprocess.run(cmd, stdout=PIPE, text=True, check=True).stdout
    print(ret)

# (for tipp3-accurate) Get FastTree-2 maximum likelihood tree and log

In [None]:
# For every gene, run FastTree-2 (binary FastTreeMP) on the original alignment
# with the rooted RAxML-ng tree as the input tree, writing the resulting tree
# and log next to the gene's other files and echoing FastTree's stderr.

# enable multi-thread FastTree-2 (using binary FastTreeMP)
# The variable has to be placed in the environment of the FastTreeMP process
# itself: `os.system("export OMP_NUM_THREADS=...")` only sets it inside a
# short-lived shell and has no effect on processes started afterwards.
ft_env = dict(os.environ, OMP_NUM_THREADS=str(num_threads))

for gene in genes:
    indir = os.path.join(workdir, gene)

    # use the original alignment and raxml-ng tree to re-estimate FastTree-2 numeric parameters
    rooted_tree_path = os.path.join(indir, 'est.raxml.bestTree.rooted')
    alignment_path = os.path.join(indir, 'est.aln.nuc.fasta')
    ft_log_path = os.path.join(indir, 'est.fasttree.log')
    ft_tree_path = os.path.join(indir, 'est.fasttree.tre')

    # nucleotide alignment
    cmd = ['FastTreeMP', '-nosupport', '-gtr', '-gamma', '-nt']
    cmd.extend([
        '-mllen', '-nome',
        '-log', ft_log_path, '-intree', rooted_tree_path,
    ])

    # alignment is fed on stdin and the tree is collected from stdout; the
    # context managers close both files and wait for FastTreeMP even on error
    with open(alignment_path, 'r') as in_fptr, open(ft_tree_path, 'w') as out_fptr:
        with Popen(cmd, stdin=in_fptr, stdout=out_fptr, stderr=PIPE,
                   text=True, env=ft_env) as p:
            for line in p.stderr:
                print(line.rstrip('\n'))
    if p.returncode != 0:
        raise subprocess.CalledProcessError(p.returncode, cmd)

# Final step: create the reference package

In [None]:
# run the existing script create_tipp_refpkg.py, echoing its combined
# stdout/stderr line by line
db_path = os.path.join(workdir, 'ncbi_taxonomy.db')

cmd = ["python", "create_tipp_refpkg.py",
       workdir, db_path, refpkg_dir, str(num_threads)]
# the context manager waits for the script to exit and closes the pipe
with Popen(cmd, stdout=PIPE, stderr=STDOUT, text=True) as p:
    for line in p.stdout:
        print(line.rstrip('\n'))
# surface a failed run instead of silently finishing the notebook
if p.returncode != 0:
    raise subprocess.CalledProcessError(p.returncode, cmd)