# Description

 Code for RhizCG microbial community analysis to be presented at ESA, August 12th, 2015


# Setting variables

In [11]:
workDir = '/home/bryan/RhizCG/data/MiSeq_amplicon/MergedRuns/PlantPhylo'
biomFileDir = '/home/bryan/RhizCG/data/MiSeq_amplicon/MergedRuns/OTU_binning/'

biomFile = '/home/bryan/RhizCG/data/MiSeq_amplicon/MergedRuns/OTU_binning/otu_table_wtax.biom'
metadataFile = '/home/bryan/RhizCG/data/MiSeq_amplicon/metadata_RhizCG_merged.txt'
treeFile = '/home/bryan/RhizCG/data/MiSeq_amplicon/MergedRuns/fasttree/otusn.tree'
log2foldFile = '/home/bryan/RhizCG/data/MiSeq_amplicon/MergedRuns/DeSeq2/bare-plant.csv'

# Init

In [2]:
import os
from pandas import DataFrame
from IPython.display import FileLink

In [3]:
%matplotlib inline
%load_ext rpy2.ipython

In [29]:
%%R
library(picante)
library(phyloseq)
library(ggplot2)
library(foreach)
library(doParallel)
library(gridExtra)
library(dplyr)
library(tidyr)
library(vegan)
library(DESeq2)


  res = super(Function, self).__call__(*new_args, **new_kwargs)

  res = super(Function, self).__call__(*new_args, **new_kwargs)
Attaching package: ‘nlme’


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    collapse


  res = super(Function, self).__call__(*new_args, **new_kwargs)

    collapse


  res = super(Function, self).__call__(*new_args, **new_kwargs)


In [5]:
if not os.path.isdir(workDir):
    os.mkdir(workDir)

# Loading/editting OTU table

In [6]:
%%R -i biomFile -i metadataFile -i workDir -i treeFile
# loading
setwd(workDir)

## biom file
physeq = import_biom(biomFile, treeFile)

## loading sample metadata
sample.data = import_qiime_sample_data(metadataFile)
physeq = merge_phyloseq(physeq,sample.data)

#apply gsub across columns to replace ___ w/ "", reload to phyloseq object
t = tax_table(physeq)
t = apply(t, 2, function(y) gsub("__", "", y))
tax_table(physeq) = t

In [7]:
%%R
plantData = read.table('/home/bryan/RhizCG/data/MiSeq_amplicon/PlotData.txt', header = TRUE)

s = sample_data(physeq)
PD = plantData[,c(1,18, 20,21, 23:29, 44:47, 52:59)]
s2 = left_join(s, PD, by = c("X.sample" = "Unique"))
rownames(s2) = s2$X.sample
sample_data(physeq) = s2
sample_data(physeq) %>% head

Sample Data:        [6 samples by 43 sample variables]:
                           X.sample Library Primer_Plate Primer_Well_ID
sample2-1.06_Npos sample2-1.06_Npos       1            1             H9
sample6-4.27_Npos sample6-4.27_Npos       1            2            F11
sample4-4.27_Nneg sample4-4.27_Nneg       1            1            B12
sample4-1.06_Nneg sample4-1.06_Nneg       1            1            E11
sample2-1.06_Nneg sample2-1.06_Nneg       1            1             G9
sample6-3.07_Nneg sample6-3.07_Nneg       1            2            C11
                  primerFR_ID_byPlate primerFR_ID_total barcode_ID    PlotID
sample2-1.06_Npos                  72                72         72 1.06_Npos
sample6-4.27_Npos                  86               182        182 4.27_Npos
sample4-4.27_Nneg                  90                90         90 4.27_Nneg
sample4-1.06_Nneg                  85                85         85 1.06_Nneg
sample2-1.06_Nneg                  71                71

In [8]:
%%R
#subset out positive and negative controls
physeq = subset_samples(physeq, !is.na(DAP))
#set DAP to factor
#sample_data(physeq)$DAP = factor(sample_data(physeq)$DAP)
print(physeq)

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 11246 taxa and 240 samples ]
sample_data() Sample Data:       [ 240 samples by 43 sample variables ]
tax_table()   Taxonomy Table:    [ 11246 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 11246 tips and 11245 internal nodes ]


In [9]:
%%R 
# filtering/normalizing
physeq.snorm = prune_samples(sample_sums(physeq) >= 10, physeq)
physeq.snorm = transform_sample_counts(physeq.snorm, function(x) x/sum(x))
  
physeq.thresh = filter_taxa(physeq.snorm, function(x) sum(x > 0) > (0.1 * length(x)), TRUE)
print(physeq.snorm)
#physeq.thresh = physeq.snorm

print(physeq.thresh)

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 11246 taxa and 240 samples ]
sample_data() Sample Data:       [ 240 samples by 43 sample variables ]
tax_table()   Taxonomy Table:    [ 11246 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 11246 tips and 11245 internal nodes ]
phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 6736 taxa and 240 samples ]
sample_data() Sample Data:       [ 240 samples by 43 sample variables ]
tax_table()   Taxonomy Table:    [ 6736 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 6736 tips and 6735 internal nodes ]


# Compare DeSeq output to plant phylogenetic distance

### Read DeSeq output

In [18]:
%%R -i log2foldFile
l2f = read.csv(log2foldFile, header = TRUE)
colnames(l2f)

 [1] "log2FoldChange" "p"              "Rank1"          "Rank2"         
 [5] "Rank3"          "Rank4"          "Rank5"          "Rank6"         
 [9] "Rank7"          "Rank8"          "OTU"            "Day"           
[13] "Plant"          "p.adj"         


In [24]:
%%R
l2f$OTU %>% unique %>% length

[1] 7049


### Spread by plant to make matrix of log2fold change in each OTU by plant combo

In [21]:
%%R
df = l2f[,c(1,11,13)]
l2f_m = spread(df, Plant, log2FoldChange, fill = 0, drop = TRUE)
l2f_m %>% head

        OTU     75-062      ABUTH      AMAPO        B73        B97      ECHCG
1    OTU.10  3.7523433  3.5494178  3.9076702  4.7268592  3.9815247  3.6698594
2   OTU.100  0.2596795  0.1158978 -0.6560064 -0.3820701 -0.2671950  0.1329242
3  OTU.1000  1.4402601  0.3949173  0.4435041  0.2573087  0.6151993  0.6697786
4 OTU.10000  0.0000000  0.0000000  0.0000000  0.0000000  1.6438161  0.0000000
5 OTU.10004 -0.4835025 -0.2445230 -0.2615567  0.3824627  1.1055433 -0.4398923
6 OTU.10009  0.0000000  0.0000000  0.0000000  5.1824890  0.0000000  0.0000000
        ELCOR      ERATE      FAGES       GLYMA        H99      HELAN
1  3.24272186  3.9915833  2.2250560  4.48740202  2.4067471  2.8786942
2 -0.02328959  0.2387057  1.0417471 -1.42186719 -0.8572358 -2.2450542
3  1.47498794  1.4759667  1.9412585  0.16666145  0.4474753  0.0000000
4  0.00000000  0.0000000  0.0000000  0.00000000  0.0000000  0.0000000
5  0.37905805 -0.3615203 -0.7816264 -0.04939975  0.8477534  0.9908883
6  6.73701928  5.7646992  0.000000

In [23]:
%%R
dim(l2f_m)

[1] 7049   23


### recode all negative log2fold changes with 0

In [28]:
%%R
l2f_me = l2f_m
l2f_me[l2f_me < 0] = 0 
head(l2f_me)

        OTU    75-062     ABUTH     AMAPO       B73       B97     ECHCG
1    OTU.10 3.7523433 3.5494178 3.9076702 4.7268592 3.9815247 3.6698594
2   OTU.100 0.2596795 0.1158978 0.0000000 0.0000000 0.0000000 0.1329242
3  OTU.1000 1.4402601 0.3949173 0.4435041 0.2573087 0.6151993 0.6697786
4 OTU.10000 0.0000000 0.0000000 0.0000000 0.0000000 1.6438161 0.0000000
5 OTU.10004 0.0000000 0.0000000 0.0000000 0.3824627 1.1055433 0.0000000
6 OTU.10009 0.0000000 0.0000000 0.0000000 5.1824890 0.0000000 0.0000000
     ELCOR     ERATE    FAGES     GLYMA       H99     HELAN     Hp301     Il14H
1 3.242722 3.9915833 2.225056 4.4874020 2.4067471 2.8786942 4.3976331 4.3466981
2 0.000000 0.2387057 1.041747 0.0000000 0.0000000 0.0000000 0.0000000 0.2930102
3 1.474988 1.4759667 1.941258 0.1666615 0.4474753 0.0000000 0.7054947 0.8511066
4 0.000000 0.0000000 0.000000 0.0000000 0.0000000 0.0000000 0.0000000 0.0000000
5 0.379058 0.0000000 0.000000 0.0000000 0.8477534 0.9908883 0.0000000 0.0000000
6 6.737019 5.764

## Calculate pairwise weighted Unifrac distances between samples

need to figure out how to run unifrac measure outside of phyloseq

In [37]:
%%R -i treeFile
Tree = read.tree(treeFile)
str(Tree)
l2f_wunif = picante::unifrac(as(t(l2f_me), "matrix"), Tree)

List of 5
 $ edge       : int [1:25135, 1:2] 12570 12570 12570 12571 12572 12573 12573 12572 12574 12574 ...
 $ Nnode      : int 12567
 $ tip.label  : chr [1:12569] "X90478" "OTU.5899" "OTU.11339" "OTU.12624" ...
 $ edge.length: num [1:25135] 0.4631 0.0343 0.0501 0.0495 0.0295 ...
 $ node.label : chr [1:12567] "" "0.606.5" "0.960" "0.910" ...
 - attr(*, "class")= chr "phylo"
 - attr(*, "order")= chr "cladewise"
Error in picante::unifrac(as(t(l2f_me), "matrix"), Tree) : 
  Rooted phylogeny required for UniFrac calculation


In [30]:
%%R
otu_table(physeq.thresh) %>% head

OTU Table:          [6 taxa and 240 samples]
                     taxa are rows
         sample2-1.06_Npos sample6-4.27_Npos sample4-4.27_Nneg
OTU.5899      0.000000e+00      8.832284e-06      0.000000e+00
OTU.8680      1.329704e-04      1.413165e-04      8.043887e-05
OTU.2666      3.128715e-05      0.000000e+00      0.000000e+00
OTU.323       5.397034e-04      7.330796e-04      4.343699e-04
OTU.7636      9.386146e-05      1.324843e-04      3.217555e-05
OTU.3540      7.821788e-06      2.649685e-05      0.000000e+00
         sample4-1.06_Nneg sample2-1.06_Nneg sample6-3.07_Nneg
OTU.5899      1.655382e-05      4.908120e-05      3.229766e-05
OTU.8680      1.655382e-04      1.079786e-04      2.422324e-04
OTU.2666      1.655382e-05      4.908120e-05      0.000000e+00
OTU.323       9.270142e-04      7.264018e-04      1.098120e-03
OTU.7636      2.151997e-04      5.889744e-05      9.689297e-05
OTU.3540      3.310765e-05      2.944872e-05      0.000000e+00
         sample5-3.07_Nneg sample6-3.0

In [None]:
## 

In [None]:
%%R
registerDoParallel(cores=28)
full.wunif.dist = phyloseq::distance(physeq.thresh, 
                      method = "unifrac", 
                      weighted = TRUE,
                      fast = TRUE, 
                      parallel = TRUE, 
                      normalized = FALSE)