In [None]:
import glob
import os
import subprocess
import sys
import warnings

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# Suppress every warning category for the remainder of the session.
warnings.filterwarnings('ignore')

# sys.path.append('/Users/ryanschenck/Dropbox/Projects/Tools/Code/py/')
# from StringConverter import StringConverter
# from ccfcalc import CCFCalculator

# ccfcalc = CCFCalculator()

# Global matplotlib text settings: 8 pt font everywhere, and fonttype 42
# (TrueType) for PDF output.
font = dict(size=8)
plt.rc('font', **font)
plt.rcParams.update({'pdf.fonttype': 42})

# Single crypt phylogenetic reconstruction

## Load Data

In [None]:
# Read the single-crypt mutation table (comma-separated) and preview the first rows.
maf = pd.read_csv(
    '../data/single_crypt/FAP03_sgWGS_maf_all_SSNVs_consensus_filtered.csv',
    sep=',',
)
maf.head()

## Set up the PAUP NEXUS file with run options

In [None]:
class PAUPRunAllSams:
    """Build a binary mutation-presence matrix and write it into a PAUP NEXUS file.

    On construction the class
      1. creates ``<output>/<pt>_<wxs>_PAUPrun/`` (including missing parents),
      2. pivots ``ptmuts`` into a mutations x samples 0/1 matrix, optionally
         prepending an all-zero "Blood" column,
      3. optionally prunes mutations seen in at most one sample, and
      4. fills the NEXUS template placeholders (?MAT? ?NTAX? ?NCHAR? ?PATIENT?
         ?WXS?) and writes ``<pt>.<wxs>.nex`` plus a ``.dummy.nex`` copy in which
         ``begin paup;`` is renamed to ``begin __paup;``.

    Parameters
    ----------
    ptmuts : pandas.DataFrame
        Must contain the columns ``Mut_ID`` and ``Tumor_Sample_Barcode``; one row
        per (mutation, sample) observation.
    wxs : str
        Label used in directory/file names and substituted for ``?WXS?``.
    pt : str
        Label used in directory/file names and substituted for ``?PATIENT?``.
    output : str
        Parent directory under which the run directory is created.
    nexusTemplate : str
        Path to the NEXUS template containing the placeholders listed above.
    paupExe : str
        Path to the PAUP executable used by :meth:`executePAUP`.
    withBlood : bool
        If True, prepend an all-zero "Blood" sample as the first matrix column.
    prune : bool
        If True, drop mutations present in at most one sample.

    Attributes
    ----------
    mat : numpy.ndarray
        Integer matrix, rows = mutations (``snaSet``), columns = samples
        (``sampleIDSet``).
    fixedSampleIDs : list of str
        Sample IDs with ``_``, spaces and ``-`` removed, as written to the file.
    sampleDict : dict
        Maps each fixed sample ID back to its original ID.
    """

    def __init__(self, ptmuts, wxs='WGS', pt="ALL", output="./processed_data/", nexusTemplate="./PAUP_Run_Options.nex", paupExe="./paup4a168_osx", withBlood=True, prune=False):
        ## Variables
        self.pt = pt
        self.wxs = wxs
        self.path = output + "/%s_%s_PAUPrun/" %(self.pt, wxs)
        self.nexfile = self.path + "%s.%s.nex" %(self.pt, wxs)
        self.nexusRunOpts = nexusTemplate
        self.paupExe = paupExe

        ## Create directory for NEXUS run (parents included, so a missing
        ## `output` directory no longer aborts construction)
        os.makedirs(self.path, exist_ok=True)

        ## Get mutation information for patient to keep in class
        self.ptmuts = ptmuts

        ## Setup Matrix to house mutation presence
        self.withBlood = withBlood

        ## Get mutation matrix
        # mat is the matrix, snaSet is the mutation identifier, and sampleIDSet is the sample IDs
        self.mat, self.snaSet, self.sampleIDSet = self._assignMutsToMat()

        if prune:
            self._pruneMatrix()

        ## Get NEXUS File
        self.fixedSampleIDs = None # Sample IDs for those whose characters don't work with PAUP
        self.sampleDict = None
        self._writeNexusFile()

    def _pruneMatrix(self):
        '''
        Keep only mutations (rows) present in more than one sample.

        The row mask is computed once, before ``self.mat`` is replaced, so the
        matrix and ``self.snaSet`` are filtered with the same rows.
        '''
        keep = self.mat.sum(axis=1) > 1
        self.mat = self.mat[keep]
        self.snaSet = np.asarray(self.snaSet)[keep]

    def _writeNexusFile(self):
        '''
        Fill the template placeholders and write the NEXUS file and its dummy copy.
        '''
        template = self._loadNexusOpts()

        # Strip '_', ' ' and '-' from the sample IDs before writing them as taxon labels.
        # NOTE(review): two IDs that differ only by these characters collapse to the
        # same label, and sampleDict then keeps only the last one.
        self.fixedSampleIDs = [idval.replace('_','').replace(' ','').replace('-','') for idval in self.sampleIDSet]
        self.sampleDict = dict(zip( self.fixedSampleIDs , self.sampleIDSet ))

        # Substitution order is preserved (dicts keep insertion order).
        replacements = {
            '?MAT?': self._prettyPrintMat(),
            '?NTAX?': str(len(self.fixedSampleIDs)),
            '?NCHAR?': str(self.mat.shape[0]),
            '?PATIENT?': self.pt,
            '?WXS?': self.wxs,
        }
        for placeholder, value in replacements.items():
            template = template.replace(placeholder, value)

        with open(self.nexfile, "w") as outputFile:
            outputFile.write(template)

        # Swap only the trailing extension; a plain str.replace would also rewrite
        # any ".nex" that happens to appear earlier in the path.
        dummyFile = os.path.splitext(self.nexfile)[0] + ".dummy.nex"
        with open(dummyFile, "w") as outputFile:
            outputFile.write( template.replace("begin paup;", "begin __paup;") )

    def _loadNexusOpts(self):
        '''
        Return the NEXUS template text.

        To replace:
        ?MAT? ?NTAX? ?NCHAR? ?PATIENT? ?WXS?
        '''
        with open(self.nexusRunOpts, "r") as nexusFile:
            return nexusFile.read()

    def _assignMutsToMat(self):
        '''
        Assign SNA to PAUP ready matrix

        Returns (mat, snaSet, sampleIDSet): an integer 0/1 array with one row per
        Mut_ID and one column per Tumor_Sample_Barcode, the row labels, and the
        column labels. With ``withBlood`` an all-zero "Blood" column is prepended.
        '''
        # Work on a fresh frame (no in-place write to a slice of self.ptmuts) and
        # drop repeated (mutation, sample) rows, which would make pivot() raise.
        presence = (
            self.ptmuts[['Mut_ID', 'Tumor_Sample_Barcode']]
            .drop_duplicates()
            .assign(Presence=1)
        )

        wide = presence.pivot(columns="Tumor_Sample_Barcode", values="Presence", index="Mut_ID")

        sampleIDSet = wide.columns.tolist() # Pull info

        snaSet = wide.index.tolist() # Pull info

        # Missing (mutation, sample) combinations mean "absent" -> 0
        mat = wide.fillna(0).to_numpy().astype("int")

        if self.withBlood:
            sampleIDSet.insert(0 ,"Blood")
            bloodOutgroup = np.zeros((mat.shape[0] ,1), dtype="int")
            mat = np.concatenate((bloodOutgroup, mat), axis=1)

        return( mat, snaSet, sampleIDSet)

    def _prettyPrintMat(self):
        '''
        Render the matrix as text: for each sample, its fixed ID on one line
        followed by its 0/1 string wrapped at 1000 characters per line.
        '''
        width = 1000
        lines = []
        for col in range(self.mat.shape[1]):
            lines.append(self.fixedSampleIDs[col])
            bits = ''.join(str(v) for v in self.mat[:, col])
            wrapped = '\n'.join(bits[start:start + width] for start in range(0, len(bits), width))
            lines.append(wrapped)
        return '\n'.join(lines)

    def executePAUP(self):
        '''
        Run PAUP non-interactively on the generated NEXUS file.

        Prints the exit code (and stderr when the exit code is non-zero) and
        returns the exit code.
        '''
        cmd = [self.paupExe, self.nexfile, "--noninteractive"]
        # NOTE(review): the stdin payload is a placeholder string carried over
        # unchanged from the original code -- confirm PAUP does not act on it.
        proc = subprocess.run(
            cmd,
            input=b"input data that is passed to subprocess' stdin",
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
        )
        rc = proc.returncode
        print("Exit code %s: %s"%(self.pt , rc))
        if rc != 0 and proc.stderr:
            print(proc.stderr.decode(errors="replace"))
        return rc

In [None]:
paupExe = "./paup4a168_osx" # Path to PAUP executable
nexusTemplate = "./PAUP_Run_Options.nex" # Path to NEXUS template

# Make sure the parent output directory exists before the run directory is created inside it
os.makedirs('./output', exist_ok=True)

# Take only the columns needed for setting up the NEXUS file
muts = maf[['Tumor_Sample_Barcode','Mut_ID']]

# Pass the paths defined above explicitly so that editing them actually takes effect
# (previously they were defined but the constructor silently used its own defaults).
paupExe_single_crypts = PAUPRunAllSams(
    muts,
    'WGS',
    pt='single_crypts',
    output='./output',
    nexusTemplate=nexusTemplate,
    paupExe=paupExe,
    prune=False,
)

In [None]:
# Optionally we can save the output of run options to more easily track mutations in the output from PAUP
# Output all samples paupruns into a joblib
# The file is written to ./output/ so that it matches the load path used below
# (the previous target, ".PAUPRunAllSams_singlecrypts.joblib", was a hidden file in the working directory).
joblib.dump({'single_crypts':paupExe_single_crypts}, "./output/PAUPRunAllSams_singlecrypts.joblib")

# Load data
# paupExe_single_crypts = joblib.load("./output/PAUPRunAllSams_singlecrypts.joblib")

# sgWGS = paupExe_single_crypts['single_crypts']

# print('Mutations in SSNV set:',len(sgWGS.snaSet))

# Visualizing Outputs

The outputs from PAUP can be visualized in many different ways. The output includes a consensus phylogeny (50% majority rule after bootstrapping) and a `best tree` based on parsimony scores.

As an example, visualize the consensus tree using the following:

In [None]:
# NOTE(review): `Phylo` is not imported anywhere in the visible notebook; it looks like
# Biopython's `Bio.Phylo` (i.e. `from Bio import Phylo` in the imports cell) -- confirm and add.
# Until then, every iteration below reports a NameError instead of drawing a tree.
res_paths = sorted(glob.glob("./output/*"))

for path in res_paths:
    # Only PAUP run directories are of interest; skip plain files and anything ending in '.nex'
    if not os.path.isdir(path) or path.endswith('.nex'):
        continue

    print(path)

    try:
        contree = list(Phylo.parse(f"{path}/fileref.contree50.tre", "newick"))
        Phylo.draw(contree[0])
    except Exception as err:
        # Best-effort: keep going with the remaining directories, but say why this
        # one was skipped instead of silently swallowing the error.
        print(f"  Could not draw consensus tree for {path}: {err!r}")