# Incorporate isolate sequences into master tree of DADA2 seqs
* Align isolate seqs using SSU_align
* Mask sequences based on same mask as used for sequences from Exp10 
* Append sequences to create new fasta with both sets of sequences
* Create master tree using FastTree
* Use qiime environment and set kernel to Python 2

In [26]:
workDir = '/home/be68/Hyphosphere/Exp10/MasterTree/'
maskDir = '/home/be68/Hyphosphere/Exp10/DADA2/'
IsolateSeqDir = '/home/be68/Hyphosphere/Exp10/MasterTree/'
IsolateSeqFile = 'IsolateSeqs.fasta'


nprocs = 3

In [50]:
import os
import numpy as np
#import entrez.direct
from cogent.app.fasttree import build_tree_from_alignment
from cogent import DNA, LoadSeqs
from Bio import Entrez, SeqIO
from pandas import *
Entrez.email = "be68@cornell.edu"
#from IPython.display import display, Image, SVG
#from cogent3 import LoadSeqs, FastTree

In [16]:
if not os.path.isdir(workDir):
    os.mkdir(workDir)

In [17]:
%cd $workDir

/data/home/be68/Hyphosphere/Exp10/MasterTree


In [18]:
!cd $workDir; ln -f -s $IsolateSeqDir$IsolateSeqFile

ln: ‘/home/be68/Hyphosphere/Exp10/MasterTree/IsolateSeqs.fasta’ and ‘./IsolateSeqs.fasta’ are the same file


In [19]:
!cd $workDir; head $IsolateSeqFile

>BE26
AGCGGCGGACGGGTGAGTAACACGTGGGCAACCTGCCTGTAAGACTGGGATAACTTCGGGAAACCGAAGCTAATACCGGATAGGATCTTCTCCTTCATGGGAGATGATTGAAAGATGGTTTCGGCTATCACTTACAGATGGGCCCGCGGTGCATTAGCTAGTTGGTGAGGTAACGGCTCACCAAGGCAACGATGCATAGCCGACCTGAGAGGGTGATCGGCCACACTGGGACTGAGACACGGCCCAGACTCCTACGGGAGGCAGCAGTAGGGAATCTTCCGCAATGGACGAAAGTCTGACGGAGCAACGCCGCGTGAGTGATGAAGGCTTTCGGGTCGTAAAACTCTGTTGTTAGGGAAGAACAAGTACGAGAGTAACTGCTCGTACCTTGACGGTACCTAACCAGAAAGCCACGGCTAACTACGTGCCAGCAGCCGCGGTAATACGTAGGTGGCAAGCGTTATCCGGAATTATTGGGCGTAAAGCGCGCGCAGGCGGTTTCTTAAGTCTGATGTGAAAGCCCACGGCTCAACCGTGGAGGGTCATTGGAAACTGGGGAACTTGAGTGCAGAAGAGAAAAGCGGAATTCCACGTGTAGCGGTGAAATGCGTAGAGATGTGGAGGAACACCAGTGGCGAAGGCGGCTTTTTGGTCTGTAACTGACGCTGAGGCGCGAAAGCGTGGGGAGCAAACAGGATTAGATACCCTGGTAGTCCACGCCGTAAACGATGAGTGCTAAGTGTTAGAGGGTTTCCGCCCTTTAGTGCTGCAGCTAACGCATTAAGCACTCCGCCTGGGGAGTACGGTCGCAAGACTGAAACTCAAAGGAATTGACGGGGGCCCGCACAAGCGGTGGAGCATGTGGTTTAATTCGAAGCAACGCGAAGAACCTTACCAGGTCTTGACATCCTCTGACAACTCTAGAGATAGAGCGTTCCCCTTCGGGGGACAGAGTGACAGGTGGTGCATGGTTGTCGTCAGCTCGTGTCGTG

In [20]:
!printf "Number of OTUs in fasta: "
!cd $workDir; grep -c ">" $IsolateSeqFile

Number of OTUs in fasta: 170


## Using SSU-Align to align seqs and masking based on alignment posterior probabilities of Exp10 - ASV set.

In [24]:
!cd $workDir; ssu-prep -f -x -b 50 --rfonly --dna $IsolateSeqFile ssu_aln 3

# _ssu-prep :: prepare SSU rRNA sequences for parallel ssu-align jobs
# SSU-ALIGN 0.1.1 (Feb 2016)
# Copyright (C) 2016 Howard Hughes Medical Institute
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# command: _ssu-prep -x -f -b 50 --dna --rfonly IsolateSeqs.fasta ssu_aln 3
# date:    Fri Apr 19 15:40:32 2019
#
# Validating input sequence file ... done.
#
# Preparing 3 ssu-align jobs ...
# Partitioning seqs with goal of equalizing total number of nucleotides per job ...
#
# output file name      description                                        
# --------------------  ---------------------------------------------------
  ssu_aln/IsolateSeqs.fasta.1  partition 1 FASTA sequence file (55 seqs; 70726 nt)
  ssu_aln/IsolateSeqs.fasta.2  partition 2 FASTA sequence file (56 seqs; 70104 nt)
  ssu_aln/IsolateSeqs.fasta.3  partition 3 FASTA sequence file (54 seqs; 67718 nt)
  ssu_aln.ssu-align.sh  shell script th

In [25]:
!cd $workDir; ./ssu_aln.ssu-align.sh

# Executing: ssu-align -b 50 --dna --rfonly ssu_aln/IsolateSeqs.fasta.1 ssu_aln/ssu_aln.1 > /dev/null &
# Executing: ssu-align -b 50 --dna --rfonly ssu_aln/IsolateSeqs.fasta.2 ssu_aln/ssu_aln.2 > /dev/null &
# Executing: ssu-align --merge 3 -b 50 --dna --rfonly ssu_aln/IsolateSeqs.fasta.3 ssu_aln/ssu_aln.3
# _ssu-align :: align SSU rRNA sequences
# SSU-ALIGN 0.1.1 (Feb 2016)
# Copyright (C) 2016 Howard Hughes Medical Institute
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# command: _ssu-align --merge 3 -b 50 --dna --rfonly ssu_aln/IsolateSeqs.fasta.3 ssu_aln/ssu_aln.3
# date:    Fri Apr 19 15:40:58 2019
#
# Validating input sequence file ... done.
#
# Stage 1: Determining SSU start/end positions and best-matching models...
#
# output file name            description                                
# --------------------------  -------------------------------------------
  ssu_aln.3.tab               lo

In [27]:
!cd $workDir; ssu-mask -s $maskDir/ssu_aln/ssu_aln.bacteria.mask --dna --afa ssu_aln/

# _ssu-mask :: mask SSU rRNA alignments
# SSU-ALIGN 0.1.1 (Feb 2016)
# Copyright (C) 2016 Howard Hughes Medical Institute
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# command: _ssu-mask -s /home/be68/Hyphosphere/Exp10/DADA2//ssu_aln/ssu_aln.bacteria.mask --afa --dna ssu_aln/
# date:    Fri Apr 19 15:44:03 2019
#
# Masking alignments using pre-existing masks...
#
#                                                     mask    
#                                                 ------------
# file name                  in/out  type  #cols  incl.  excl.
# -------------------------  ------  ----  -----  -----  -----
  ssu_aln.bacteria.stk        input   aln   1582      -      -
  ssu_aln.bacteria.mask       input  mask   1582    248   1334
  ssu_aln.bacteria.mask.pdf  output   pdf   1582    248   1334
  ssu_aln.bacteria.mask.afa  output   aln    248      -      -
#
# All attempts to draw structure diagrams 

In [None]:
# Append masked alignments

In [28]:
!cd $workDir; cat $maskDir/ssu_aln/ssu_aln.bacteria.mask.afa ssu_aln/ssu_aln.bacteria.mask.afa > aln_for_tree.fasta

In [30]:
!tail aln_for_tree.fasta

TAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACATCTGATACTGGCAAGCTTGAG
TCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAA
TACCGGTGGCGAAGGCGGCCCCCTGGACGAACTGACGCTCAGGTGCGAAAGCGTGGGGAG
CAAACAGG
>BE277
TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGT
TAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAACATCTGATACTGGCAAGCTTGAG
TCTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAA
TACCGGTGGCGAAGGCGGCCCCCTGGACGAACTGACGCTCAGGTGCGAAAGCGTGGGGAG
CAAACAGG


## Inferring and rooting the tree

In [32]:
aln = LoadSeqs(os.path.join(workDir, 'aln_for_tree.fasta'), moltype=DNA)
t_unroot = build_tree_from_alignment(aln, moltype=DNA)

* Using sulfalobus sso_acc = "X90478" as the outgroup leads to an error downstream when calculating unifrac distances, instead I checked the tree with sulfalobus as root and identified ASV1721 as an outgroup.  Rooted the tree to ASV1721 instead and received no errors during the analysis

In [33]:
t_rooted = t_unroot.rootedWithTip('ASV1721')

In [34]:
t_rooted.writeToFile(os.path.join(workDir, 'Master.tree'))

In [None]:
## Subset tree with hyphal ASVs and top

In [None]:
## get tip names of isolates

In [154]:
file_to_open = os.path.join(IsolateSeqDir, IsolateSeqFile)
IsolateList = []

for record in SeqIO.parse(file_to_open, "fasta") :
       IsolateList.append(record.id)#+"\n"+str(record.seq)+"\n")

print(IsolateList)

['BE26', 'BE27', 'BE28', 'BE29', 'BE30', 'BE31', 'BE32', 'BE33', 'BE34', 'BE35', 'BE36', 'BE38', 'BE39', 'BE40', 'BE41', 'BE42', 'BE43', 'BE44', 'BE45', 'BE46', 'BE47', 'BE48', 'BE49', 'BE50', 'BE51', 'BE52', 'BE53', 'BE54', 'BE55', 'BE56', 'BE57', 'BE59', 'BE60', 'BE61', 'BE62', 'BE63', 'BE64', 'BE65', 'BE65B', 'BE66', 'BE68', 'BE69', 'BE70', 'BE71', 'BE72', 'BE74', 'BE75', 'BE76', 'BE79', 'BE80', 'BE81', 'BE82', 'BE83', 'BE84', 'BE85', 'BE88', 'BE91', 'BE92', 'BE93', 'BE94', 'BE95', 'BE96', 'BE97', 'BE98', 'BE100', 'BE101', 'BE102', 'BE103', 'BE104', 'BE105', 'BE107', 'BE108', 'BE109', 'BE110', 'BE112', 'BE113', 'BE114', 'BE115', 'BE116', 'BE117', 'BE118', 'BE120', 'BE121', 'BE122', 'BE123', 'BE124', 'BE125', 'BE126', 'BE127', 'BE128', 'BE153', 'BE154', 'BE156', 'BE157', 'BE159', 'BE163', 'BE164', 'BE166', 'BE167', 'BE168', 'BE170', 'BE171', 'BE172', 'BE173', 'BE177', 'BE178', 'BE179', 'BE180', 'BE181', 'BE182', 'BE183', 'BE184', 'BE186', 'BE187', 'BE190', 'BE191', 'BE193', 'BE194', 

## Import blast tablesb

In [52]:
!head Isolate-ASV_blast

BE26	ASV193	100.00	253	0	0	444	696	1	253	3e-132	  468
BE27	ASV193	100.00	253	0	0	450	702	1	253	3e-132	  468
BE28	ASV193	100.00	253	0	0	452	704	1	253	3e-132	  468
BE29	ASV39	100.00	253	0	0	433	685	1	253	3e-132	  468
BE30	ASV39	100.00	253	0	0	433	685	1	253	3e-132	  468
BE31	ASV81	100.00	253	0	0	417	669	1	253	3e-132	  468
BE32	ASV81	100.00	253	0	0	417	669	1	253	3e-132	  468
BE33	ASV3899	100.00	253	0	0	419	671	1	253	2e-132	  468
BE34	ASV81	100.00	253	0	0	418	670	1	253	3e-132	  468
BE35	ASV4861	98.81	253	3	0	417	669	1	253	2e-127	  451


In [58]:
df = pandas.read_table('Isolate-ASV_blast', sep = "\t", header=None)
df.columns = ['qseqid','sseqid',
'pident',
'length',
'mismatch',
'gapopen',
'qstart',
'qend',
'sstart',
'send',
'evalue',
'bitscore']
df.head()


Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,BE26,ASV193,100.0,253,0,0,444,696,1,253,3e-132,468
1,BE27,ASV193,100.0,253,0,0,450,702,1,253,3e-132,468
2,BE28,ASV193,100.0,253,0,0,452,704,1,253,3e-132,468
3,BE29,ASV39,100.0,253,0,0,433,685,1,253,3e-132,468
4,BE30,ASV39,100.0,253,0,0,433,685,1,253,3e-132,468


In [59]:
df

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,BE26,ASV193,100.00,253,0,0,444,696,1,253,3.000000e-132,468
1,BE27,ASV193,100.00,253,0,0,450,702,1,253,3.000000e-132,468
2,BE28,ASV193,100.00,253,0,0,452,704,1,253,3.000000e-132,468
3,BE29,ASV39,100.00,253,0,0,433,685,1,253,3.000000e-132,468
4,BE30,ASV39,100.00,253,0,0,433,685,1,253,3.000000e-132,468
5,BE31,ASV81,100.00,253,0,0,417,669,1,253,3.000000e-132,468
6,BE32,ASV81,100.00,253,0,0,417,669,1,253,3.000000e-132,468
7,BE33,ASV3899,100.00,253,0,0,419,671,1,253,2.000000e-132,468
8,BE34,ASV81,100.00,253,0,0,418,670,1,253,3.000000e-132,468
9,BE35,ASV4861,98.81,253,3,0,417,669,1,253,2.000000e-127,451


In [66]:
# function to get unique values 
def unique(list1): 
    x = np.array(list1) 
    return(np.unique(x)) 

In [70]:
IsolateMatches_list = df["sseqid"].tolist()
IsolateMatches_list = list(unique(IsolateMatches_list))
IsolateMatches_list

['ASV1030',
 'ASV1106',
 'ASV1114',
 'ASV12',
 'ASV13',
 'ASV1350',
 'ASV1356',
 'ASV1379',
 'ASV150',
 'ASV151',
 'ASV1592',
 'ASV1601',
 'ASV163',
 'ASV171',
 'ASV1811',
 'ASV188',
 'ASV19',
 'ASV1915',
 'ASV193',
 'ASV2066',
 'ASV2116',
 'ASV2203',
 'ASV2204',
 'ASV23',
 'ASV233',
 'ASV236',
 'ASV266',
 'ASV2669',
 'ASV270',
 'ASV2719',
 'ASV283',
 'ASV2888',
 'ASV296',
 'ASV302',
 'ASV321',
 'ASV3270',
 'ASV350',
 'ASV3528',
 'ASV3592',
 'ASV360',
 'ASV362',
 'ASV3772',
 'ASV3899',
 'ASV39',
 'ASV393',
 'ASV420',
 'ASV430',
 'ASV4308',
 'ASV452',
 'ASV4604',
 'ASV4621',
 'ASV477',
 'ASV481',
 'ASV4861',
 'ASV492',
 'ASV5115',
 'ASV522',
 'ASV5271',
 'ASV5417',
 'ASV5572',
 'ASV5578',
 'ASV58',
 'ASV600',
 'ASV6114',
 'ASV617',
 'ASV665',
 'ASV687',
 'ASV7471',
 'ASV765',
 'ASV77',
 'ASV792',
 'ASV8',
 'ASV81',
 'ASV891',
 'ASV90',
 'ASV91']

## Identify hyphal ASVs

* Too many RH ASVs to include
* Don't include for now, but use attributes to highlight any ASVs that make it into the tree otherwise

In [206]:
dfl2fc_CHCS = pandas.read_table('/home/be68/Hyphosphere/Exp10/DESeq2/dfl2fc-CHCS.txt', sep = "\t")
dfl2fc_CHBS = pandas.read_table('/home/be68/Hyphosphere/Exp10/DESeq2/dfl2fc-CHBS.txt', sep = "\t")
dfl2fc_RHCS = pandas.read_table('/home/be68/Hyphosphere/Exp10/DESeq2/dfl2fc-RHCS.txt', sep = "\t")
dfl2fc = dfl2fc_CHBS.append(dfl2fc_CHCS)
dfl2fc = dfl2fc.append(dfl2fc_RHCS)

sigtab = dfl2fc_CHBS.append(dfl2fc_CHCS)
sigtab.head()

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ASV,Test,TimePoint
1,0.0,,,,,,ASV11489,CH-BS,1
2,0.162168,1.45507,3.093253,0.470401,0.638068,,ASV11644,CH-BS,1
3,0.913138,-3.296519,3.0341,-1.08649,0.277262,,ASV3656,CH-BS,1
4,15.109037,-0.425264,2.894952,-0.146898,0.883212,0.972331,ASV562,CH-BS,1
5,0.0,,,,,,ASV2718,CH-BS,1


In [207]:

sigtab = sigtab.loc[(sigtab.padj < 0.05) & (sigtab.log2FoldChange > 0)]

HA = sigtab.ASV.unique()
HA = list(HA)
#HA.sort()
len(HA)



200

## Identify top 100 ASVs for context

In [126]:
ASVabund = pandas.read_table('ASV_meanAbundances.txt', sep = "\t")
ASVabund.head()

Unnamed: 0,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT
1,ASV100,Proteobacteria,Alphaproteobacteria,Xanthobacteraceae,,0.003115,0.004161,0.00249,0.003742,0.00023,0.0
2,ASV1000,Verrucomicrobia,Verrucomicrobiae,Chthoniobacteraceae,Candidatus_Udaeobacter,0.000364,0.000745,0.000158,0.000268,3.1e-05,0.0
3,ASV1002,Actinobacteria,Actinobacteria,Microbacteriaceae,Agromyces,0.000508,0.000406,0.000361,0.000407,0.000142,0.0
4,ASV1003,Gemmatimonadetes,Gemmatimonadetes,Gemmatimonadaceae,,0.000628,0.000274,8.7e-05,0.000534,4.6e-05,0.0
5,ASV10035,Actinobacteria,Thermoleophilia,Solirubrobacteraceae,,0.0,0.0,0.0,0.0,3.6e-05,1.3e-05


In [136]:
ASVabund['meanAbund'] = 0
ASVabund.head()
#np.mean(ASVabund["CS"], ASVabund["CH"]) #, ASVabund['RH'], ASVabund['CH'])

Unnamed: 0,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund
1,ASV100,Proteobacteria,Alphaproteobacteria,Xanthobacteraceae,,0.003115,0.004161,0.00249,0.003742,0.00023,0.0,0
2,ASV1000,Verrucomicrobia,Verrucomicrobiae,Chthoniobacteraceae,Candidatus_Udaeobacter,0.000364,0.000745,0.000158,0.000268,3.1e-05,0.0,0
3,ASV1002,Actinobacteria,Actinobacteria,Microbacteriaceae,Agromyces,0.000508,0.000406,0.000361,0.000407,0.000142,0.0,0
4,ASV1003,Gemmatimonadetes,Gemmatimonadetes,Gemmatimonadaceae,,0.000628,0.000274,8.7e-05,0.000534,4.6e-05,0.0,0
5,ASV10035,Actinobacteria,Thermoleophilia,Solirubrobacteraceae,,0.0,0.0,0.0,0.0,3.6e-05,1.3e-05,0


In [145]:
dfA = ASVabund
dfA['sumAbund'] = dfA.BS + dfA.CS + dfA.CH + dfA.RH
dfA = dfA.sort_values(['sumAbund'], ascending = False)[:100]
TopASVs = list(dfA.OTU)
TopASVs
#df.sort(['A', 'B'], ascending=[1, 0])


['ASV11',
 'ASV17',
 'ASV20',
 'ASV21',
 'ASV8',
 'ASV19',
 'ASV18',
 'ASV6',
 'ASV28',
 'ASV22',
 'ASV16',
 'ASV13',
 'ASV15',
 'ASV12',
 'ASV38',
 'ASV41',
 'ASV48',
 'ASV42',
 'ASV37',
 'ASV45',
 'ASV43',
 'ASV99',
 'ASV51',
 'ASV56',
 'ASV54',
 'ASV26',
 'ASV59',
 'ASV60',
 'ASV61',
 'ASV68',
 'ASV78',
 'ASV35',
 'ASV72',
 'ASV80',
 'ASV73',
 'ASV79',
 'ASV46',
 'ASV89',
 'ASV83',
 'ASV57',
 'ASV91',
 'ASV23',
 'ASV110',
 'ASV100',
 'ASV104',
 'ASV113',
 'ASV107',
 'ASV161',
 'ASV96',
 'ASV74',
 'ASV97',
 'ASV114',
 'ASV117',
 'ASV71',
 'ASV112',
 'ASV39',
 'ASV122',
 'ASV123',
 'ASV86',
 'ASV101',
 'ASV109',
 'ASV140',
 'ASV47',
 'ASV82',
 'ASV50',
 'ASV111',
 'ASV141',
 'ASV227',
 'ASV139',
 'ASV169',
 'ASV178',
 'ASV76',
 'ASV234',
 'ASV129',
 'ASV142',
 'ASV125',
 'ASV77',
 'ASV115',
 'ASV85',
 'ASV134',
 'ASV174',
 'ASV189',
 'ASV66',
 'ASV144',
 'ASV366',
 'ASV152',
 'ASV224',
 'ASV167',
 'ASV173',
 'ASV162',
 'ASV136',
 'ASV180',
 'ASV190',
 'ASV44',
 'ASV216',
 'ASV219',
 '

## Now join lists and prune tree
* Top ASVs = TopASVs
* Diff Abund asvs = HA
* Isolates and top blast hits = IsolateList + IsolateMatches_list

* master tree = t_rooted

In [156]:
IsolateList

['BE26',
 'BE27',
 'BE28',
 'BE29',
 'BE30',
 'BE31',
 'BE32',
 'BE33',
 'BE34',
 'BE35',
 'BE36',
 'BE38',
 'BE39',
 'BE40',
 'BE41',
 'BE42',
 'BE43',
 'BE44',
 'BE45',
 'BE46',
 'BE47',
 'BE48',
 'BE49',
 'BE50',
 'BE51',
 'BE52',
 'BE53',
 'BE54',
 'BE55',
 'BE56',
 'BE57',
 'BE59',
 'BE60',
 'BE61',
 'BE62',
 'BE63',
 'BE64',
 'BE65',
 'BE65B',
 'BE66',
 'BE68',
 'BE69',
 'BE70',
 'BE71',
 'BE72',
 'BE74',
 'BE75',
 'BE76',
 'BE79',
 'BE80',
 'BE81',
 'BE82',
 'BE83',
 'BE84',
 'BE85',
 'BE88',
 'BE91',
 'BE92',
 'BE93',
 'BE94',
 'BE95',
 'BE96',
 'BE97',
 'BE98',
 'BE100',
 'BE101',
 'BE102',
 'BE103',
 'BE104',
 'BE105',
 'BE107',
 'BE108',
 'BE109',
 'BE110',
 'BE112',
 'BE113',
 'BE114',
 'BE115',
 'BE116',
 'BE117',
 'BE118',
 'BE120',
 'BE121',
 'BE122',
 'BE123',
 'BE124',
 'BE125',
 'BE126',
 'BE127',
 'BE128',
 'BE153',
 'BE154',
 'BE156',
 'BE157',
 'BE159',
 'BE163',
 'BE164',
 'BE166',
 'BE167',
 'BE168',
 'BE170',
 'BE171',
 'BE172',
 'BE173',
 'BE177',
 'BE178',
 'B

In [208]:
MasterList = TopASVs + HA + IsolateList + IsolateMatches_list
#len(MasterList)
MasterList = list(set(MasterList))
len(MasterList)

496

## Prune tree with master list

In [209]:
mt_rooted = t_rooted.getSubTree(MasterList)

In [210]:
mt_rooted.writeToFile(os.path.join(workDir, 'SeqofInt.tree'))

# Generate dataframes for iTOL

In [228]:
dfl2fc.head()

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ASV,Test,TimePoint
1,0.0,,,,,,ASV11489,CH-BS,1
2,0.162168,1.45507,3.093253,0.470401,0.638068,,ASV11644,CH-BS,1
3,0.913138,-3.296519,3.0341,-1.08649,0.277262,,ASV3656,CH-BS,1
4,15.109037,-0.425264,2.894952,-0.146898,0.883212,0.972331,ASV562,CH-BS,1
5,0.0,,,,,,ASV2718,CH-BS,1


In [166]:
ASVabund.head()

Unnamed: 0,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund,sumAbund
1,ASV100,Proteobacteria,Alphaproteobacteria,Xanthobacteraceae,,0.003115,0.004161,0.00249,0.003742,0.00023,0.0,0.007276,0.010624
2,ASV1000,Verrucomicrobia,Verrucomicrobiae,Chthoniobacteraceae,Candidatus_Udaeobacter,0.000364,0.000745,0.000158,0.000268,3.1e-05,0.0,0.001109,0.001203
3,ASV1002,Actinobacteria,Actinobacteria,Microbacteriaceae,Agromyces,0.000508,0.000406,0.000361,0.000407,0.000142,0.0,0.000914,0.001317
4,ASV1003,Gemmatimonadetes,Gemmatimonadetes,Gemmatimonadaceae,,0.000628,0.000274,8.7e-05,0.000534,4.6e-05,0.0,0.000902,0.00094
5,ASV10035,Actinobacteria,Thermoleophilia,Solirubrobacteraceae,,0.0,0.0,0.0,0.0,3.6e-05,1.3e-05,0.0,3.6e-05


## Process dataframes
* Create a dataframe with names from MasterList
* Left join abundance dataframe to sequences in Master List
* generate dfl2fc indicators annd add attributes to dataframe

In [174]:
dfM = pandas.DataFrame(MasterList, columns = ["ASV"])
dfM = pandas.merge(dfM, ASVabund, left_on='ASV', right_on="OTU", how='left')
dfM.head()

Unnamed: 0,ASV,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund,sumAbund
0,ASV2719,ASV2719,Proteobacteria,Alphaproteobacteria,Rhizobiaceae,Aminobacter,3.8e-05,0.000155,9.6e-05,2.5e-05,0.000177,1.3e-05,0.000193,0.000453
1,BE42,,,,,,,,,,,,,
2,ASV2717,ASV2717,Armatimonadetes,Fimbriimonadia,Fimbriimonadaceae,,0.000154,3e-05,0.000187,4e-05,2.9e-05,1.1e-05,0.000184,0.000286
3,BE41,,,,,,,,,,,,,
4,BE47,,,,,,,,,,,,,


In [229]:
sigtab = dfl2fc.loc[(dfl2fc.padj < 0.05) & (dfl2fc.log2FoldChange > 0)]
sigtab = sigtab.groupby(['ASV', 'Test'], as_index=False).agg({"padj": "min"})
sigtab = sigtab.pivot(index = 'ASV', columns = "Test", values = "padj")




sigtab.head()

Test,CH-BS,CH-CS,RH-CS
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ASV101,0.02995563,0.034742,8.431885e-11
ASV1018,1.699569e-05,0.000863,
ASV1019,2.721585e-13,,
ASV102,,,0.002708662
ASV1022,,,0.001102364


In [232]:
#Note that this could be a good place to insert color codes into iTOL formated sheet
mask = sigtab['CH-BS'] < 0.05
column_name = 'CH-BS'
sigtab.loc[mask, column_name] = 1

mask = sigtab['CH-CS'] < 0.05
column_name = 'CH-CS'
sigtab.loc[mask, column_name] = 1

mask = sigtab['RH-CS'] < 0.05
column_name = 'RH-CS'
sigtab.loc[mask, column_name] = 1

sigtab.head()

Test,CH-BS,CH-CS,RH-CS
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ASV101,1.0,1.0,1.0
ASV1018,1.0,1.0,
ASV1019,1.0,,
ASV102,,,1.0
ASV1022,,,1.0


In [234]:
dfM = pandas.merge(dfM, sigtab, on = 'ASV', how='left')
dfM.head()

Unnamed: 0,ASV,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund,sumAbund,CH-BS,CH-CS,RH-CS
0,ASV2719,ASV2719,Proteobacteria,Alphaproteobacteria,Rhizobiaceae,Aminobacter,3.8e-05,0.000155,9.6e-05,2.5e-05,0.000177,1.3e-05,0.000193,0.000453,,,
1,BE42,,,,,,,,,,,,,,,,
2,ASV2717,ASV2717,Armatimonadetes,Fimbriimonadia,Fimbriimonadaceae,,0.000154,3e-05,0.000187,4e-05,2.9e-05,1.1e-05,0.000184,0.000286,1.0,,
3,BE41,,,,,,,,,,,,,,,,
4,BE47,,,,,,,,,,,,,,,,


### overall attribute table

In [236]:
dfM.to_csv(os.path.join(workDir, 'SeqofInt_attributes.txt'), sep='\t', index=False)

In [None]:
### Individual tables for iTOL templates

In [238]:
dfM['Taxonomy'] = dfM.Phylum + " " + dfM.Class + " " + dfM.Family

In [241]:
dfT = dfM[['ASV', 'Taxonomy']]

dfT.head()

Unnamed: 0,ASV,Taxonomy
0,ASV2719,Proteobacteria Alphaproteobacteria Rhizobiaceae
1,BE42,
2,ASV2717,Armatimonadetes Fimbriimonadia Fimbriimonadaceae
3,BE41,
4,BE47,


In [243]:
dfT.to_csv(os.path.join(workDir, 'SeqofInt_leaflabels.txt'), sep='\t', index=False)