# Incorporate isolate sequences into master tree of DADA2 seqs
* Align isolate seqs using SSU_align
* Mask sequences based on same mask as used for sequences from Exp10 
* Append sequences to create new fasta with both sets of sequences
* Create master tree using FastTree
* Use qiime environment and set kernel to Python 2

In [33]:
workDir = '/home/be68/Hyphosphere/data/3Exp/MasterTree/'
maskDir = '/home/be68/Hyphosphere/data/3Exp/Fasttree'
IsolateSeqDir = '/home/be68/Hyphosphere/data/3Exp/'
IsolateSeqFile = 'IsolateSeqs.fasta'


nprocs = 3

In [14]:
import os
import numpy as np
#import entrez.direct
from cogent.app.fasttree import build_tree_from_alignment
from cogent import DNA, LoadSeqs
from Bio import Entrez, SeqIO
from pandas import *
Entrez.email = "be68@cornell.edu"
#from IPython.display import display, Image, SVG
#from cogent3 import LoadSeqs, FastTree

In [15]:
if not os.path.isdir(workDir):
    os.mkdir(workDir)

In [16]:
%cd $workDir

/data/home/be68/Hyphosphere/data/3Exp/MasterTree


In [26]:
!cd $workDir; ls

In [28]:
!cd $workDir; ln -f -s $IsolateSeqDir$IsolateSeqFile

In [34]:
!printf "Number of OTUs in fasta: "
!cd $workDir; grep -c ">" $IsolateSeqFile

Number of OTUs in fasta: 165


## Using SSU-Align to align seqs and masking based on alignment posterior probabilities of 3Exp - ASV set

In [30]:
!cd $workDir; ssu-prep -f -x -b 50 --rfonly --dna $IsolateSeqFile ssu_aln 3

# _ssu-prep :: prepare SSU rRNA sequences for parallel ssu-align jobs
# SSU-ALIGN 0.1.1 (Feb 2016)
# Copyright (C) 2016 Howard Hughes Medical Institute
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# command: _ssu-prep -x -f -b 50 --dna --rfonly IsolateSeqs.fasta ssu_aln 3
# date:    Sat Oct 26 11:56:03 2019
#
# Validating input sequence file ... done.
#
# Preparing 3 ssu-align jobs ...
# Partitioning seqs with goal of equalizing total number of nucleotides per job ...
#
# output file name      description                                        
# --------------------  ---------------------------------------------------
  ssu_aln/IsolateSeqs.fasta.1  partition 1 FASTA sequence file (55 seqs; 70726 nt)
  ssu_aln/IsolateSeqs.fasta.2  partition 2 FASTA sequence file (56 seqs; 70104 nt)
  ssu_aln/IsolateSeqs.fasta.3  partition 3 FASTA sequence file (54 seqs; 67718 nt)
  ssu_aln.ssu-align.sh  shell script th

In [31]:
!cd $workDir; ./ssu_aln.ssu-align.sh

# Executing: ssu-align -b 50 --dna --rfonly ssu_aln/IsolateSeqs.fasta.1 ssu_aln/ssu_aln.1 > /dev/null &
# Executing: ssu-align -b 50 --dna --rfonly ssu_aln/IsolateSeqs.fasta.2 ssu_aln/ssu_aln.2 > /dev/null &
# Executing: ssu-align --merge 3 -b 50 --dna --rfonly ssu_aln/IsolateSeqs.fasta.3 ssu_aln/ssu_aln.3
# _ssu-align :: align SSU rRNA sequences
# SSU-ALIGN 0.1.1 (Feb 2016)
# Copyright (C) 2016 Howard Hughes Medical Institute
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# command: _ssu-align --merge 3 -b 50 --dna --rfonly ssu_aln/IsolateSeqs.fasta.3 ssu_aln/ssu_aln.3
# date:    Sat Oct 26 11:56:11 2019
#
# Validating input sequence file ... done.
#
# Stage 1: Determining SSU start/end positions and best-matching models...
#
# output file name            description                                
# --------------------------  -------------------------------------------
  ssu_aln.3.tab               lo

## mask alignments based on 3Exp ASVs

In [38]:
!cd $workDir; ssu-mask -s $maskDir/ssu_aln/ssu_aln.bacteria.mask --dna --afa ssu_aln/

# _ssu-mask :: mask SSU rRNA alignments
# SSU-ALIGN 0.1.1 (Feb 2016)
# Copyright (C) 2016 Howard Hughes Medical Institute
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# command: _ssu-mask -s /home/be68/Hyphosphere/data/3Exp/Fasttree//ssu_aln/ssu_aln.bacteria.mask --afa --dna ssu_aln/
# date:    Sat Oct 26 12:07:01 2019
#
# Masking alignments using pre-existing masks...
#
#                                                     mask    
#                                                 ------------
# file name                  in/out  type  #cols  incl.  excl.
# -------------------------  ------  ----  -----  -----  -----
  ssu_aln.bacteria.stk        input   aln   1582      -      -
  ssu_aln.bacteria.mask       input  mask   1582    247   1335
  ssu_aln.bacteria.mask.pdf  output   pdf   1582    247   1335
  ssu_aln.bacteria.mask.afa  output   aln    247      -      -
#
# All attempts to draw structure di

# Append masked alignments and include outgroup from notebook #4

In [42]:
!cd $workDir; cat $maskDir/ssu_aln/ssu_aln.bacteria.mask.afa $maskDir/sso_aln/sso_aln.bacteria.mask.afa ssu_aln/ssu_aln.bacteria.mask.afa > aln_for_tree.fasta

In [43]:
!head aln_for_tree.fasta
!tail aln_for_tree.fasta

>ASV1
TACGAAGGGGGCTAGCGTTGTTCGGATTTACTGGGCGTAAAGCGCACGTAGGCGGACTAT
TAAGTCAGGGGTGAAATCCCGGGGCTCAACCCCGGAACTTTGATACTGGTAGTCTTGAGT
TCGAGAGAGGTGAGTGGAATTCCGAGTGTAGAGGTGAAATTCGTAGATATTCGGAGGAAC
ACCAGTGGCGAAGGCGGCTCACTGGCTCGACTGACGCTGAGGTGCGAAAGCGTGGGGAGC
AAACAGG
>ASV2
-ACATAGGTGGCAAACATTATCCGGAATTATTGGGCGTAAAGGGTGCGTAGGCGGCATGA
TAAGTTGCTGGTGGGAAATCAAGGCTCAACCTTGTGGTAGCAATACTGTCAAGCTAGAGG
GCAGAAGAGGTTAACGGAACTCTATGTGGAGCGGTAAAATGTGTAGATATATAGAAGAAC
TAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAAATCTGATACTGGCAAGCTTGAGT
CTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAAT
ACCGGTGGCGAAGGCGGCCCCCTGGACGAACTGACGCTCAGGTGCGAAAGCGTGGGGAGC
AAACAGG
>BE277
TACGGAGGGTGCAAGCGTTAATCGGAATTACTGGGCGTAAAGCGCACGCAGGCGGTTTGT
TAAGTCAGATGTGAAATCCCCGGGCTCAACCTGGGAAATCTGATACTGGCAAGCTTGAGT
CTCGTAGAGGGGGGTAGAATTCCAGGTGTAGCGGTGAAATGCGTAGAGATCTGGAGGAAT
ACCGGTGGCGAAGGCGGCCCCCTGGACGAACTGACGCTCAGGTGCGAAAGCGTGGGGAGC
AAACAGG


## Inferring and rooting the tree

In [45]:
aln = LoadSeqs(os.path.join(workDir, 'aln_for_tree.fasta'), moltype=DNA)
t_unroot = build_tree_from_alignment(aln, moltype=DNA)

In [46]:
t_rooted = t_unroot.rootedWithTip('X90478')

In [47]:
t_rooted.writeToFile(os.path.join(workDir, 'Master.tree'))

## Subset tree with hyphal ASVs and top

## get tip names of isolates

In [48]:
file_to_open = os.path.join(IsolateSeqDir, IsolateSeqFile)
IsolateList = []

for record in SeqIO.parse(file_to_open, "fasta") :
       IsolateList.append(record.id)#+"\n"+str(record.seq)+"\n")

print(IsolateList)

['BE26', 'BE27', 'BE28', 'BE29', 'BE30', 'BE31', 'BE32', 'BE33', 'BE34', 'BE35', 'BE36', 'BE38', 'BE39', 'BE40', 'BE41', 'BE42', 'BE43', 'BE44', 'BE45', 'BE46', 'BE47', 'BE48', 'BE49', 'BE50', 'BE51', 'BE52', 'BE53', 'BE54', 'BE55', 'BE56', 'BE57', 'BE59', 'BE60', 'BE61', 'BE62', 'BE63', 'BE64', 'BE65', 'BE65B', 'BE66', 'BE68', 'BE69', 'BE70', 'BE71', 'BE72', 'BE74', 'BE75', 'BE76', 'BE79', 'BE80', 'BE81', 'BE82', 'BE83', 'BE84', 'BE85', 'BE88', 'BE91', 'BE92', 'BE93', 'BE94', 'BE95', 'BE96', 'BE97', 'BE98', 'BE100', 'BE101', 'BE102', 'BE103', 'BE104', 'BE105', 'BE107', 'BE108', 'BE109', 'BE110', 'BE112', 'BE113', 'BE114', 'BE115', 'BE116', 'BE117', 'BE118', 'BE120', 'BE121', 'BE122', 'BE123', 'BE124', 'BE125', 'BE126', 'BE127', 'BE128', 'BE153', 'BE154', 'BE156', 'BE157', 'BE159', 'BE163', 'BE164', 'BE166', 'BE167', 'BE168', 'BE170', 'BE171', 'BE172', 'BE173', 'BE177', 'BE178', 'BE179', 'BE180', 'BE181', 'BE182', 'BE183', 'BE184', 'BE186', 'BE187', 'BE190', 'BE191', 'BE193', 'BE194', 

## Import blast tables
* blast database created from seqs_thresh.fasta with the command "makeblastdb -in seqs_thresh.fasta -parse_seqids -dbtype nucl"
* blast table created with: 'blastn -db phyloseq/seqs_thresh.fasta -query IsolateSeqs.fasta -out Isolate_ASV_blast.out -outfmt 6 -max_target_seqs 1'

In [52]:
!head ../Isolate_ASV_blast.out

BE26	ASV97	100.00	253	0	0	444	696	1	253	5e-132	  468
BE27	ASV97	100.00	253	0	0	450	702	1	253	5e-132	  468
BE28	ASV97	100.00	253	0	0	452	704	1	253	5e-132	  468
BE29	ASV34	100.00	253	0	0	433	685	1	253	5e-132	  468
BE30	ASV34	100.00	253	0	0	433	685	1	253	5e-132	  468
BE31	ASV120	100.00	253	0	0	417	669	1	253	5e-132	  468
BE32	ASV120	100.00	253	0	0	417	669	1	253	5e-132	  468
BE33	ASV4392	100.00	253	0	0	419	671	1	253	4e-132	  468
BE34	ASV120	100.00	253	0	0	418	670	1	253	5e-132	  468
BE35	ASV636	100.00	253	0	0	417	669	1	253	4e-132	  468


In [55]:
df = pandas.read_table('/home/be68/Hyphosphere/data/3Exp/Isolate_ASV_blast.out', sep = "\t", header=None)
df.columns = ['qseqid','sseqid',
'pident',
'length',
'mismatch',
'gapopen',
'qstart',
'qend',
'sstart',
'send',
'evalue',
'bitscore']
df.head()


Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,BE26,ASV97,100.0,253,0,0,444,696,1,253,5e-132,468
1,BE27,ASV97,100.0,253,0,0,450,702,1,253,5e-132,468
2,BE28,ASV97,100.0,253,0,0,452,704,1,253,5e-132,468
3,BE29,ASV34,100.0,253,0,0,433,685,1,253,5e-132,468
4,BE30,ASV34,100.0,253,0,0,433,685,1,253,5e-132,468


In [56]:
df.tail()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
159,BE268,ASV298,100.0,253,0,0,385,637,1,253,5e-132,468
160,BE270,ASV173,100.0,253,0,0,383,635,1,253,3e-132,468
161,BE275,ASV681,100.0,253,0,0,443,695,1,253,5e-132,468
162,BE276,ASV681,100.0,253,0,0,441,693,1,253,5e-132,468
163,BE277,ASV681,100.0,253,0,0,441,693,1,253,5e-132,468


In [57]:
# function to get unique values 
def unique(list1): 
    x = np.array(list1) 
    return(np.unique(x)) 

In [58]:
IsolateMatches_list = df["sseqid"].tolist()
IsolateMatches_list = list(unique(IsolateMatches_list))
IsolateMatches_list

['ASV1012',
 'ASV103',
 'ASV105',
 'ASV120',
 'ASV12433',
 'ASV1296',
 'ASV131',
 'ASV1492',
 'ASV15',
 'ASV1578',
 'ASV158',
 'ASV159',
 'ASV1622',
 'ASV17',
 'ASV173',
 'ASV183',
 'ASV1967',
 'ASV206',
 'ASV2102',
 'ASV2103',
 'ASV2123',
 'ASV2155',
 'ASV2227',
 'ASV2250',
 'ASV2258',
 'ASV231',
 'ASV242',
 'ASV2449',
 'ASV248',
 'ASV275',
 'ASV2857',
 'ASV2948',
 'ASV296',
 'ASV298',
 'ASV3069',
 'ASV3108',
 'ASV320',
 'ASV3225',
 'ASV34',
 'ASV3446',
 'ASV3647',
 'ASV371',
 'ASV395',
 'ASV404',
 'ASV407',
 'ASV43',
 'ASV4392',
 'ASV44',
 'ASV443',
 'ASV447',
 'ASV490',
 'ASV4965',
 'ASV5',
 'ASV514',
 'ASV523',
 'ASV627',
 'ASV636',
 'ASV64',
 'ASV664',
 'ASV673',
 'ASV681',
 'ASV6820',
 'ASV70',
 'ASV7019',
 'ASV744',
 'ASV773',
 'ASV8',
 'ASV8301',
 'ASV85',
 'ASV864',
 'ASV9039',
 'ASV9045',
 'ASV9153',
 'ASV93',
 'ASV953',
 'ASV97',
 'ASV9759',
 'ASV9844',
 'ASV99']

## Identify hyphal ASVs

* Stick with CH_CS and BH_BS for now

In [118]:
dfl2fc_1 = pandas.read_table('/home/be68/Hyphosphere/data/3Exp/DESeq/dfl2fc-Exp1.txt', sep = "\t")
sigtab1 = dfl2fc_1.loc[(dfl2fc_1.padj < 0.05) & (dfl2fc_1.log2FoldChange > 0) & (dfl2fc_1.Contrast == 'CH_CS') & (dfl2fc_1.Kingdom == "Bacteria")]
del dfl2fc_1 

dfl2fc_2 = pandas.read_table('/home/be68/Hyphosphere/data/3Exp/DESeq/dfl2fc-Exp2.txt', sep = "\t")
sigtab2 = dfl2fc_2.loc[(dfl2fc_2.padj < 0.05) & (dfl2fc_2.log2FoldChange > 0) & (dfl2fc_2.Contrast == 'CH_CS') * (dfl2fc_2.Kingdom == "Bacteria")]
del dfl2fc_2

dfl2fc_3 = pandas.read_table('/home/be68/Hyphosphere/data/3Exp/DESeq/dfl2fc-Exp3.txt', sep = "\t")
sigtab3 = dfl2fc_3.loc[(dfl2fc_3.padj < 0.05) & (dfl2fc_3.log2FoldChange > 0) & (dfl2fc_3.Contrast == 'CH_CS') & (dfl2fc_3.Kingdom == "Bacteria")]
del dfl2fc_3

# dfl2fc = dfl2fc.append(dfl2fc_CHCS)
# dfl2fc = dfl2fc.append(dfl2fc_RHCS)

# sigtab = dfl2fc_CHBS.append(dfl2fc_CHCS)
# sigtab.head()

  .format(op=op_str, alt_op=unsupported[op_str]))


In [119]:

sigtab = sigtab1.append(sigtab2) 
sigtab = sigtab.append(sigtab3)

HA = sigtab.OTU.unique()
HA = list(HA)
#HA.sort()
len(HA)



536

## Identify top 200 ASVs for context

In [114]:
dfl2fc_1 = pandas.read_table('/home/be68/Hyphosphere/data/3Exp/DESeq/dfl2fc-Exp1.txt', sep = "\t")
Abund1 = dfl2fc_1.loc[(dfl2fc_1.Contrast == 'CH_CS') & (dfl2fc_1.Kingdom == "Bacteria")]
del dfl2fc_1 

dfl2fc_2 = pandas.read_table('/home/be68/Hyphosphere/data/3Exp/DESeq/dfl2fc-Exp2.txt', sep = "\t")
Abund2 = dfl2fc_2.loc[(dfl2fc_2.Contrast == 'CH_CS') & (dfl2fc_2.Kingdom == "Bacteria")]
del dfl2fc_2

dfl2fc_3 = pandas.read_table('/home/be68/Hyphosphere/data/3Exp/DESeq/dfl2fc-Exp3.txt', sep = "\t")
Abund3 = dfl2fc_3.loc[(dfl2fc_3.Contrast == 'CH_CS') & (dfl2fc_3.Kingdom == "Bacteria")]
del dfl2fc_3

In [115]:
Abund = Abund1.append(Abund2)
Abund = Abund2.append(Abund3)

Abund = Abund.groupby('OTU').agg({'baseMean': 'mean'})
Abund = Abund.sort_values(by='baseMean', ascending=False)[0:199]
TopASVs = list(Abund.index.values)

## Now join lists and prune tree
* Top ASVs = TopASVs
* Diff Abund asvs = HA
* Isolates and top blast hits = IsolateList + IsolateMatches_list

* master tree = t_rooted

In [94]:
IsolateList

['BE26',
 'BE27',
 'BE28',
 'BE29',
 'BE30',
 'BE31',
 'BE32',
 'BE33',
 'BE34',
 'BE35',
 'BE36',
 'BE38',
 'BE39',
 'BE40',
 'BE41',
 'BE42',
 'BE43',
 'BE44',
 'BE45',
 'BE46',
 'BE47',
 'BE48',
 'BE49',
 'BE50',
 'BE51',
 'BE52',
 'BE53',
 'BE54',
 'BE55',
 'BE56',
 'BE57',
 'BE59',
 'BE60',
 'BE61',
 'BE62',
 'BE63',
 'BE64',
 'BE65',
 'BE65B',
 'BE66',
 'BE68',
 'BE69',
 'BE70',
 'BE71',
 'BE72',
 'BE74',
 'BE75',
 'BE76',
 'BE79',
 'BE80',
 'BE81',
 'BE82',
 'BE83',
 'BE84',
 'BE85',
 'BE88',
 'BE91',
 'BE92',
 'BE93',
 'BE94',
 'BE95',
 'BE96',
 'BE97',
 'BE98',
 'BE100',
 'BE101',
 'BE102',
 'BE103',
 'BE104',
 'BE105',
 'BE107',
 'BE108',
 'BE109',
 'BE110',
 'BE112',
 'BE113',
 'BE114',
 'BE115',
 'BE116',
 'BE117',
 'BE118',
 'BE120',
 'BE121',
 'BE122',
 'BE123',
 'BE124',
 'BE125',
 'BE126',
 'BE127',
 'BE128',
 'BE153',
 'BE154',
 'BE156',
 'BE157',
 'BE159',
 'BE163',
 'BE164',
 'BE166',
 'BE167',
 'BE168',
 'BE170',
 'BE171',
 'BE172',
 'BE173',
 'BE177',
 'BE178',
 'B

In [120]:
MasterList = TopASVs + HA + IsolateList + IsolateMatches_list
#len(MasterList)
MasterList = list(set(MasterList))
len(MasterList)

866

In [121]:
mt_rooted = t_rooted.getSubTree(MasterList)

In [122]:
mt_rooted.writeToFile(os.path.join(workDir, 'SeqofInt.tree'))

# Generate dataframes for iTOL

In [228]:
dfl2fc.head()

Unnamed: 0,baseMean,log2FoldChange,lfcSE,stat,pvalue,padj,ASV,Test,TimePoint
1,0.0,,,,,,ASV11489,CH-BS,1
2,0.162168,1.45507,3.093253,0.470401,0.638068,,ASV11644,CH-BS,1
3,0.913138,-3.296519,3.0341,-1.08649,0.277262,,ASV3656,CH-BS,1
4,15.109037,-0.425264,2.894952,-0.146898,0.883212,0.972331,ASV562,CH-BS,1
5,0.0,,,,,,ASV2718,CH-BS,1


In [166]:
ASVabund.head()

Unnamed: 0,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund,sumAbund
1,ASV100,Proteobacteria,Alphaproteobacteria,Xanthobacteraceae,,0.003115,0.004161,0.00249,0.003742,0.00023,0.0,0.007276,0.010624
2,ASV1000,Verrucomicrobia,Verrucomicrobiae,Chthoniobacteraceae,Candidatus_Udaeobacter,0.000364,0.000745,0.000158,0.000268,3.1e-05,0.0,0.001109,0.001203
3,ASV1002,Actinobacteria,Actinobacteria,Microbacteriaceae,Agromyces,0.000508,0.000406,0.000361,0.000407,0.000142,0.0,0.000914,0.001317
4,ASV1003,Gemmatimonadetes,Gemmatimonadetes,Gemmatimonadaceae,,0.000628,0.000274,8.7e-05,0.000534,4.6e-05,0.0,0.000902,0.00094
5,ASV10035,Actinobacteria,Thermoleophilia,Solirubrobacteraceae,,0.0,0.0,0.0,0.0,3.6e-05,1.3e-05,0.0,3.6e-05


## Process dataframes
* Create a dataframe with names from MasterList
* Left join abundance dataframe to sequences in Master List
* generate dfl2fc indicators annd add attributes to dataframe

In [293]:
dfM = pandas.DataFrame(MasterList, columns = ["ASV"])
dfM = pandas.merge(dfM, ASVabund, left_on='ASV', right_on="OTU", how='left')
dfM.head()

Unnamed: 0,ASV,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund,sumAbund
0,ASV2719,ASV2719,Proteobacteria,Alphaproteobacteria,Rhizobiaceae,Aminobacter,3.8e-05,0.000155,9.6e-05,2.5e-05,0.000177,1.3e-05,0.000193,0.000453
1,BE42,,,,,,,,,,,,,
2,ASV2717,ASV2717,Armatimonadetes,Fimbriimonadia,Fimbriimonadaceae,,0.000154,3e-05,0.000187,4e-05,2.9e-05,1.1e-05,0.000184,0.000286
3,BE41,,,,,,,,,,,,,
4,ASV2481,ASV2481,Proteobacteria,Deltaproteobacteria,Haliangiaceae,Haliangium,0.000176,0.0,0.000325,1.4e-05,0.0,0.0,0.000176,0.000339


In [294]:
sigtab = dfl2fc.loc[(dfl2fc.padj < 0.05) & (dfl2fc.log2FoldChange > 0)]
sigtab = sigtab.groupby(['ASV', 'Test'], as_index=False).agg({"padj": "min"})
sigtab = sigtab.pivot(index = 'ASV', columns = "Test", values = "padj")




sigtab.head()

Test,CH-BS,CH-CS,RH-CS
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ASV101,0.02995563,0.034742,8.431885e-11
ASV1018,1.699569e-05,0.000863,
ASV1019,2.721585e-13,,
ASV102,,,0.002708662
ASV1022,,,0.001102364


In [295]:
#Note that this could be a good place to insert color codes into iTOL formated sheet
mask = sigtab['CH-BS'] < 0.05
column_name = 'CH-BS'
sigtab.loc[mask, column_name] = 1

mask = sigtab['CH-CS'] < 0.05
column_name = 'CH-CS'
sigtab.loc[mask, column_name] = 1

mask = sigtab['RH-CS'] < 0.05
column_name = 'RH-CS'
sigtab.loc[mask, column_name] = 1

sigtab.head()

Test,CH-BS,CH-CS,RH-CS
ASV,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
ASV101,1.0,1.0,1.0
ASV1018,1.0,1.0,
ASV1019,1.0,,
ASV102,,,1.0
ASV1022,,,1.0


In [296]:
dfM = pandas.merge(dfM, sigtab, on = 'ASV', how='left')
dfM.head()

Unnamed: 0,ASV,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund,sumAbund,CH-BS,CH-CS,RH-CS
0,ASV2719,ASV2719,Proteobacteria,Alphaproteobacteria,Rhizobiaceae,Aminobacter,3.8e-05,0.000155,9.6e-05,2.5e-05,0.000177,1.3e-05,0.000193,0.000453,,,
1,BE42,,,,,,,,,,,,,,,,
2,ASV2717,ASV2717,Armatimonadetes,Fimbriimonadia,Fimbriimonadaceae,,0.000154,3e-05,0.000187,4e-05,2.9e-05,1.1e-05,0.000184,0.000286,1.0,,
3,BE41,,,,,,,,,,,,,,,,
4,ASV2481,ASV2481,Proteobacteria,Deltaproteobacteria,Haliangiaceae,Haliangium,0.000176,0.0,0.000325,1.4e-05,0.0,0.0,0.000176,0.000339,,1.0,


### overall attribute table

In [297]:
dfM.to_csv(os.path.join(workDir, 'SeqofInt_attributes.txt'), sep='\t', index=False)

In [None]:
### Individual tables for iTOL templates

In [238]:
dfM['Taxonomy'] = dfM.Phylum + " " + dfM.Class + " " + dfM.Family

In [241]:
dfT = dfM[['ASV', 'Taxonomy']]

dfT.head()

Unnamed: 0,ASV,Taxonomy
0,ASV2719,Proteobacteria Alphaproteobacteria Rhizobiaceae
1,BE42,
2,ASV2717,Armatimonadetes Fimbriimonadia Fimbriimonadaceae
3,BE41,
4,BE47,


In [243]:
dfT.to_csv(os.path.join(workDir, 'SeqofInt_leaflabels.txt'), sep='\t', index=False)

### Responder color strips

In [324]:
TreeDir = 'iTOL'
if not os.path.isdir('TreeDir'):
    os.mkdir(TreeDir)

* CHCS 

In [325]:
#Replace responder value '1.0' with colorstrip RGB
dfR = dfM[['ASV', 'CH-CS']]
mask = dfR['CH-CS'] == 1.0
column_name = 'CH-CS'
dfR.loc[mask, column_name] = '#1b9e77'
dfR.head()

#Replace NaN values with white RGB
dfR['CH-CS'] = dfR['CH-CS'].fillna('#ffffff')

#dfR = dfR[['ASV', 'CH-CS']]
dfR.head()


dfR.to_csv(os.path.join(TreeDir, 'CHCS_colorstrip.txt'), sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


* CHBS

In [326]:
#Replace responder value '1.0' with colorstrip RGB
dfR = dfM[['ASV', 'CH-BS']]
mask = dfR['CH-BS'] == 1.0
column_name = 'CH-BS'
dfR.loc[mask, column_name] = '#7570b3'
dfR.head()

#Replace NaN values with white RGB
dfR['CH-BS'] = dfR['CH-BS'].fillna('#ffffff')

#dfR = dfR[['ASV', 'CH-CS']]
dfR.head()


dfR.to_csv(os.path.join(TreeDir, 'CHBS_colorstrip.txt'), sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


In [327]:
#Replace responder value '1.0' with colorstrip RGB
dfR = dfM[['ASV', 'RH-CS']]
mask = dfR['RH-CS'] == 1.0
column_name = 'RH-CS'
dfR.loc[mask, column_name] = '#d95f02'
dfR.head()

#Replace NaN values with white RGB
dfR['RH-CS'] = dfR['RH-CS'].fillna('#ffffff')

#dfR = dfR[['ASV', 'CH-CS']]
#dfR.head()


dfR.to_csv(os.path.join(TreeDir, 'RHCS_colorstrip.txt'), sep='\t', index=False)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':


## Identify isolates closely related to responders
* Perhaps one color for CHBS or CHCS and another color for isolates only close to RHCS

In [386]:
df = pandas.read_table('ASV-Isolate_blast', sep = "\t", header=None)
df.columns = ['qseqid','sseqid',
'pident',
'length',
'mismatch',
'gapopen',
'qstart',
'qend',
'sstart',
'send',
'evalue',
'bitscore']
df.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,ASV1,BE256,95.29,255,8,4,1,253,381,633,5e-114,401.0
1,ASV3,BE215,99.21,253,2,0,1,253,384,636,9.999999999999999e-131,457.0
2,ASV6,BE96,90.91,253,23,0,1,253,434,686,1e-95,340.0
3,ASV8,BE183,100.0,253,0,0,1,253,382,634,4.999999999999999e-134,468.0
4,ASV11,BE193,97.63,253,6,0,1,253,429,681,4.9999999999999995e-124,435.0


In [401]:
df = pandas.read_table('ASV-Isolate_blast', sep = "\t", header=None)
df.columns = ['qseqid','sseqid',
'pident',
'length',
'mismatch',
'gapopen',
'qstart',
'qend',
'sstart',
'send',
'evalue',
'bitscore']
df = df.loc[(df.sseqid == 'BE127')& (df.pident > 97)]
df

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
220,ASV261,BE127,98.81,253,3,0,1,253,435,687,5e-129,451.0
222,ASV263,BE127,98.02,253,5,0,1,253,435,687,1.0000000000000001e-125,440.0
302,ASV361,BE127,98.02,253,5,0,1,253,435,687,1.0000000000000001e-125,440.0
477,ASV582,BE127,99.6,253,1,0,1,253,435,687,2e-132,462.0
854,ASV1069,BE127,98.81,253,3,0,1,253,435,687,5e-129,451.0
1043,ASV1320,BE127,98.42,253,4,0,1,253,435,687,2e-127,446.0
1397,ASV1796,BE127,97.63,253,6,0,1,253,435,687,4.9999999999999995e-124,435.0
1638,ASV2116,BE127,100.0,253,0,0,1,253,435,687,4.999999999999999e-134,468.0
2227,ASV2910,BE127,97.63,253,6,0,1,253,435,687,4.9999999999999995e-124,435.0
2253,ASV2943,BE127,98.42,253,4,0,1,253,435,687,2e-127,446.0


In [418]:
tmp = dfM[dfM.ASV.isin(list(sigCHCS.ASV))]
tmp

Unnamed: 0,ASV,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund,sumAbund,CH-BS,CH-CS,RH-CS
4,ASV2481,ASV2481,Proteobacteria,Deltaproteobacteria,Haliangiaceae,Haliangium,0.000176,0.000000,0.000325,0.000014,0.000000,0.000000,0.000176,0.000339,,1.0,
7,ASV1404,ASV1404,Proteobacteria,Alphaproteobacteria,Dongiaceae,Dongia,0.000412,0.000037,0.000301,0.000051,0.000078,0.000000,0.000449,0.000467,1.0,1.0,
11,ASV231,ASV231,Proteobacteria,Deltaproteobacteria,Haliangiaceae,Haliangium,0.000073,0.000007,0.005937,0.000103,0.000301,0.000095,0.000081,0.006348,1.0,1.0,
13,ASV480,ASV480,Proteobacteria,Gammaproteobacteria,Burkholderiaceae,Leptothrix,0.000544,0.000037,0.003459,0.000012,0.000149,0.000061,0.000581,0.003657,1.0,1.0,
17,ASV897,ASV897,Proteobacteria,Gammaproteobacteria,Burkholderiaceae,,0.000000,0.000000,0.001839,0.000145,0.000000,0.000000,0.000000,0.001985,1.0,1.0,
18,ASV2621,ASV2621,Proteobacteria,Deltaproteobacteria,Haliangiaceae,Haliangium,0.000000,0.000000,0.000638,0.000000,0.000000,0.000000,0.000000,0.000638,1.0,1.0,
28,ASV4705,ASV4705,Proteobacteria,Deltaproteobacteria,Haliangiaceae,Haliangium,0.000000,0.000000,0.000290,0.000000,0.000000,0.000000,0.000000,0.000290,,1.0,
32,ASV153,ASV153,Proteobacteria,Gammaproteobacteria,Unknown_Family,Acidibacter,0.004421,0.000623,0.002689,0.000266,0.000401,0.000000,0.005045,0.003980,,1.0,
33,ASV150,ASV150,Proteobacteria,Alphaproteobacteria,Sphingomonadaceae,Sphingomonas,0.000089,0.000000,0.000445,0.000009,0.000542,0.000000,0.000089,0.000996,1.0,1.0,
37,ASV1975,ASV1975,Bacteroidetes,Bacteroidia,Microscillaceae,Ohtaekwangia,0.000079,0.000045,0.000625,0.000039,0.000038,0.000000,0.000125,0.000747,1.0,1.0,


In [349]:
df = df.loc[df.pident > 97]
#iTOL code for bolding ASV names requires
#node 9606 will have its label displayed in blue with bold italic font, and with yellow background
#9606,label,node,#0000ff,1,bold-italic,#ffff00
#ASV,label,node,#000000,1,bold,#ffffff
df = df[['qseqid']]
df['label'] = 'label'
df['node'] = 'node'
df['font_color'] = '#000000' #black
df['number'] = '1'
df['font_type'] = 'bold'
df['background_colr'] = '#ffff00'#yellow background

#mask = dfR['RH-CS'] == 1.0
#df[['ASV','label',]]
df.head()
df.to_csv(os.path.join(TreeDir, 'labelStyle.txt'), sep='\t', index=False)

## Explore

In [354]:
dfM.loc[dfM.Family == "Paenibacillaceae"]

Unnamed: 0,ASV,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund,sumAbund,CH-BS,CH-CS,RH-CS
49,ASV5271,ASV5271,Firmicutes,Bacilli,Paenibacillaceae,Brevibacillus,0.0,0.0,0.0,0.0,6.8e-05,5.6e-05,0.0,6.8e-05,,,
67,ASV7471,ASV7471,Firmicutes,Bacilli,Paenibacillaceae,Paenibacillus,0.0,4.6e-05,2.6e-05,0.0,3.1e-05,0.0,4.6e-05,0.000103,,,
435,ASV302,ASV302,Firmicutes,Bacilli,Paenibacillaceae,Paenibacillus,0.00037,2.7e-05,0.000165,0.000106,0.000147,0.000134,0.000397,0.000444,,,


In [359]:
dfM.head()

Unnamed: 0,ASV,OTU,Phylum,Class,Family,Genus,BP,BS,CH,CS,RH,RT,meanAbund,sumAbund,CH-BS,CH-CS,RH-CS
0,ASV2719,ASV2719,Proteobacteria,Alphaproteobacteria,Rhizobiaceae,Aminobacter,3.8e-05,0.000155,9.6e-05,2.5e-05,0.000177,1.3e-05,0.000193,0.000453,,,
1,BE42,,,,,,,,,,,,,,,,
2,ASV2717,ASV2717,Armatimonadetes,Fimbriimonadia,Fimbriimonadaceae,,0.000154,3e-05,0.000187,4e-05,2.9e-05,1.1e-05,0.000184,0.000286,1.0,,
3,BE41,,,,,,,,,,,,,,,,
4,ASV2481,ASV2481,Proteobacteria,Deltaproteobacteria,Haliangiaceae,Haliangium,0.000176,0.0,0.000325,1.4e-05,0.0,0.0,0.000176,0.000339,,1.0,


# closest hits to responders

In [362]:
df = pandas.read_table('Isolate-ASV_blast', sep = "\t", header=None)
df.columns = ['qseqid','sseqid',
'pident',
'length',
'mismatch',
'gapopen',
'qstart',
'qend',
'sstart',
'send',
'evalue',
'bitscore']
df.head()

Unnamed: 0,qseqid,sseqid,pident,length,mismatch,gapopen,qstart,qend,sstart,send,evalue,bitscore
0,BE26,ASV193,100.0,253,0,0,444,696,1,253,3e-132,468
1,BE27,ASV193,100.0,253,0,0,450,702,1,253,3e-132,468
2,BE28,ASV193,100.0,253,0,0,452,704,1,253,3e-132,468
3,BE29,ASV39,100.0,253,0,0,433,685,1,253,3e-132,468
4,BE30,ASV39,100.0,253,0,0,433,685,1,253,3e-132,468


In [370]:
IsolateMatches = list(df.sseqid)

### Check closest isolate match against CH-CS responders

In [381]:
sigtab = dfl2fc.loc[(dfl2fc.padj < 0.05) & (dfl2fc.log2FoldChange > 0)]
sigCHCS = sigtab.loc[sigtab.Test == 'CH-CS']
matchCHCS = sigCHCS.loc[sigCHCS.ASV.isin(IsolateMatches)]
matchCHCS = set(list(matchCHCS.ASV))
matchCHCS

{'ASV12', 'ASV150', 'ASV1811', 'ASV452', 'ASV58', 'ASV8'}

### Check closest isolate match against CH-BS responders

In [382]:
sigtab = dfl2fc.loc[(dfl2fc.padj < 0.05) & (dfl2fc.log2FoldChange > 0)]
sigCHBS = sigtab.loc[sigtab.Test == 'CH-BS']
#sigCHBS.head()
matchCHBS = sigCHBS.loc[sigCHBS.ASV.isin(IsolateMatches)]
matchCHBS = set(list(matchCHBS.ASV))
matchCHBS

{'ASV1030',
 'ASV12',
 'ASV13',
 'ASV150',
 'ASV1811',
 'ASV2669',
 'ASV360',
 'ASV452',
 'ASV8'}

In [383]:
sigtab = dfl2fc.loc[(dfl2fc.padj < 0.05) & (dfl2fc.log2FoldChange > 0)]
sigRHCS = sigtab.loc[sigtab.Test == 'RH-CS']
#sigCHBS.head()
matchRHCS = sigRHCS.loc[sigRHCS.ASV.isin(IsolateMatches)]
matchRHCS = set(list(matchRHCS.ASV))
matchRHCS


{'ASV12',
 'ASV13',
 'ASV171',
 'ASV188',
 'ASV23',
 'ASV233',
 'ASV236',
 'ASV296',
 'ASV350',
 'ASV362',
 'ASV39',
 'ASV430',
 'ASV452',
 'ASV477',
 'ASV481',
 'ASV522',
 'ASV58',
 'ASV687',
 'ASV765',
 'ASV77',
 'ASV792',
 'ASV8',
 'ASV81',
 'ASV891',
 'ASV90'}

In [385]:
len(matchRHCS)

25