In [None]:
## Sometimes Jupyter notebook doesn't retain your PATH environment variable -- this will mess up a number of things.
## We recommend specifying the environment variable manually here
import os
import ctypes
# NOTE: these are machine-specific absolute paths -- edit them to match your own installation.
os.environ["PATH"] = '/home/mattjones/bin:/home/mattjones/.local/bin:/home/mattjones/myapps/bin:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin:/usr/games:/usr/local/games:/snap/bin:/home/mattjones/opt/gurobi801/linux64/bin:/home/mattjones/software/bowtie2-2.3.4.2:/home/mattjones/emboss/EMBOSS-6.6.0/emboss'
# Entries are separated by a single ':'. An empty entry (as produced by '::') is treated by the
# dynamic linker as the current working directory, so no empty entry is included here.
os.environ['LD_LIBRARY_PATH'] = '/home/mattjones/lib/:/home/mattjones/opt/gurobi801/linux64/lib'

import Cassiopeia.TreeSolver as ts
from Cassiopeia.TreeSolver import data_pipeline
from Cassiopeia.TreeSolver import post_process_tree as ppt

import pandas as pd 
import numpy as np
import networkx as nx
import pickle as pic

import sys

In [None]:
# Inputs for this walkthrough: the tab-delimited allele table on disk and the
# lineage group that the following cells will reconstruct.
alleletable_fp = "lg_output/test.alleleTable.txt"
target_lg = 4

alleletable = pd.read_csv(alleletable_fp, delimiter="\t")

### Estimate Allele Proportions

In [None]:
# Estimate indel (allele) proportions from the full allele table. The result is
# passed as `mutation_map` when the character matrix is built in a later cell.
allele_props = data_pipeline.get_indel_props(alleletable)

### Create character matrix from a lineage group

The first step is to aggregate our observations into a character matrix -- a matrix whose dimensions are `n x m`, where we have `n` cells and `m` characters (total target sites, 3 times the number of integrations in our case). We can use the alleletable_to_character_matrix function here, which takes the following parameters:

- **at**: alleletable (subsetted for a particular lineage group) 
- **out_fp**: the output file path for the resulting character matrix
- **mutation_map**: the allele proportions, calculated as above. If nothing is provided, no prior probabilities will be used for later tree reconstructions.
- **old_r**: Use alleles without context (default = False).

This function will write three files: the character matrix; the indel proportions dictionary, specifying the mutation probabilities for each character to a particular state; and a dictionary translating a state-character pair to an observed allele. The last two files are saved as pickle files, and are only written if a mutation_map is provided.

In [None]:
# Keep only the allele-table rows that belong to the target lineage group.
lg = alleletable.loc[alleletable["lineageGrp"] == target_lg]

# Build the character matrix for that lineage group and write it to disk,
# supplying the allele proportions as the mutation map.
data_pipeline.alleletable_to_character_matrix(
    lg,
    "test_lg4_character_matrix.txt",
    mutation_map=allele_props,
)

### Reconstruct a lineage

You can now reconstruct lineages using the `reconstruct-lineage` command, which accepts different options depending on which of the lineage solvers you'd like to use. We provide Neighbor-Joining, Camin-Sokal (implemented with PHYLIP), greedy, Steiner-Tree/ILP, and Hybrid algorithms. Use the `-h` flag to see all possible parameters.

By using one of our algorithms (greedy, hybrid, or ILP), the output will be a networkx object pickle file and a newick text file. The output of the Neighbor-Joining and Camin-Sokal functions is a newick text file.

### Post-Processing a Tree

Here we'll work with a tree named `test_lg4_tree.pkl`, and we assume it was constructed with the greedy method. In this case, there are two things that need to be done:

1. Map terminal character states to the cell identifiers
2. Add "Redundant" leaves to the terminal leaves. This is necessary because if not every cell represents a unique state, then the final tree would only be over a subset of the cells originally in the character matrix. This is known as the Post-Processing Tree Step.

In the event of post-processing a tree constructed with the hybrid or ilp methods, we only perform step 2. 

There is also a command-line tool provided for this step; you can call it with `post-process-tree`. Use the `-h` flag to see options and usage.

#### Mapping Terminal states

In [None]:
# Load the pickled tree with the pickle module directly: `nx.read_gpickle` was removed
# in networkx 3.0, so this form works on both old and new networkx versions.
# NOTE: unpickling can execute arbitrary code -- only load tree files you trust.
with open("test_lg4_tree.pkl", "rb") as tree_file:
    g = pic.load(tree_file)

# Character matrix written earlier in this notebook; its first column is used as the index.
cm = pd.read_csv("test_lg4_character_matrix.txt", sep='\t', index_col = 0)

# Map the tree's terminal character states onto the samples in the character matrix.
g = ppt.assign_samples_to_charstrings(g, cm)

#### Post-Process Tree & Add Redundant Leaves

In [None]:
# Post-process the tree, then add the "redundant" leaves using the character matrix.
# NOTE(review): both statements rebind `g` from its previous value, so re-running this
# cell applies the processing again to an already-processed tree. Re-run the cell that
# loads the tree first if you need to start over.
g = ppt.post_process_tree(g)
g = ppt.add_redundant_leaves(g, cm)

In [None]:
## Now save the final tree: a pickle of the graph object plus a newick text file.
out_fp = "test_lg4_tree.processed.txt"
# Strip only the final extension. Unlike splitting on ".", splitext also handles paths
# with no extension or with dots in directory names.
stem = os.path.splitext(out_fp)[0]

# Use a context manager so the pickle file is flushed and closed deterministically.
with open(stem + ".pkl", "wb") as pkl_file:
    pic.dump(g, pkl_file)

newick = data_pipeline.convert_network_to_newick_format(g)

with open(out_fp, "w") as f:
    f.write(newick)