## Description:

    Use DESeq2 to identify OTUs that are differentially abundant between the rhizosphere of each hybrid and bare soil
    
    Format the table of rhizosphere responders for the supplemental material
   


# Setting variables

In [2]:
# Root directory of this sequencing run; both paths below live under it.
runDir = '/home/bryan/ERA/data/MiSeq/20170417_run1'

# Directory where the DESeq2 outputs of this notebook are written.
workDir = runDir + '/DESeq2'

# phyloseq object that was already thresholded for sparsity
# (original note: "sparsity of greater than 3 in 3 samples").
physeqFile = runDir + '/phyloseq/ERA_thresh.rds'

# Init

In [3]:
%load_ext rpy2.ipython

In [4]:
%%R
# Packages this notebook depends on.
reqpkg <- c(
    "plyr", "BiocParallel", "doParallel", "DESeq2", "foreach", "ggplot2",
    "gridExtra", "scales", "phyloseq", "dplyr", "tidyr",
    "reshape2", "vegan", "RColorBrewer"
)

# For each package: report its name and installed version, then attach it
# without start-up messages or masking warnings.
for (pkg in reqpkg) {
    print(pkg)
    print(packageVersion(pkg))
    suppressMessages(
        library(pkg, character.only = TRUE, quietly = TRUE,
                verbose = FALSE, warn.conflicts = FALSE)
    )
}

[1] "plyr"
[1] ‘1.8.4’
[1] "BiocParallel"
[1] ‘1.8.2’
[1] "doParallel"
[1] ‘1.0.10’
[1] "DESeq2"
[1] ‘1.14.1’
[1] "foreach"
[1] ‘1.4.3’
[1] "ggplot2"
[1] ‘2.2.1’
[1] "gridExtra"
[1] ‘2.2.1’
[1] "scales"
[1] ‘0.4.1’
[1] "phyloseq"
[1] ‘1.19.1’
[1] "dplyr"
[1] ‘0.5.0’
[1] "tidyr"
[1] ‘0.6.0’
[1] "reshape2"
[1] ‘1.4.2’
[1] "vegan"
[1] ‘2.4.3’
[1] "RColorBrewer"
[1] ‘1.1.2’


In [5]:
%%R
# Register a 20-worker multicore BiocParallel backend; the DESeq2 calls below
# that pass parallel = TRUE run on the registered backend.
bp_param <- MulticoreParam(workers = 20)
register(bp_param)

# Loading phyloseq object

In [6]:
%%R -i physeqFile

# Read the serialized phyloseq object from the path passed in from Python
# (-i physeqFile), then echo its summary as the cell output.
physeq.Full <- readRDS(file = physeqFile)
physeq.Full

phyloseq-class experiment-level object
otu_table()   OTU Table:         [ 4592 taxa and 243 samples ]
sample_data() Sample Data:       [ 243 samples by 46 sample variables ]
tax_table()   Taxonomy Table:    [ 4592 taxa by 8 taxonomic ranks ]
phy_tree()    Phylogenetic Tree: [ 4592 tips and 4591 internal nodes ]


In [7]:
%%R



# Treat Rep and R_Year as categorical variables rather than numbers.
md <- sample_data(physeq.Full)
md$Rep <- factor(md$Rep)
md$R_Year <- factor(md$R_Year)
sample_data(physeq.Full) <- md

# Levels of Plant before subsetting (interactive check; this value is not the
# last expression of the cell).
levels(sample_data(physeq.Full)$Plant)

# Keep the bare-soil controls plus samples grown from treated seed; samples
# from untreated seed are dropped.
phy <- subset_samples(physeq.Full, Genotype == "Bare" | SeedTreatment == "Treated")

# Show the Plant levels that remain after subsetting (nothing is re-ordered
# here; "Bare" is made the reference level later, per comparison).
levels(sample_data(phy)$Plant)


 [1] "1936" "1939" "1942" "1953" "1961" "1971" "1975" "1984" "1991" "2001"
[11] "2003" "2011" "Bare"


## Functions for generating differential abundance tables

In [32]:
%%R

# Convert a vector to numeric by way of its character representation, so that
# a factor yields the numbers written in its labels rather than its integer
# level codes.
asNumeric <- function(x) {
    labels_chr <- as.character(x)
    as.numeric(labels_chr)
}

# Fit DESeq2 with design ~Plant and compute a manual one-sided p-value.
#
# physeq.obj: phyloseq object whose sample data contains a Plant column.
#
# Returns the DESeq2 results restricted to two columns: log2FoldChange and p,
# where p is the upper-tail normal probability of the estimated log2 fold
# change under a null centred at 1 with standard deviation lfcSE.
get_Ps <- function(physeq.obj) {
    lfc_null <- 1

    dds <- phyloseq_to_deseq2(physeq.obj, ~Plant)
    dds <- DESeq(dds, quiet = TRUE, parallel = TRUE)

    res <- results(dds)
    res$p <- pnorm(res$log2FoldChange, lfc_null, res$lfcSE, lower.tail = FALSE)

    res[, c("log2FoldChange", "p")]
}

# Fit DESeq2 with design ~Plant and let DESeq2 itself run the thresholded
# test, instead of the manual pnorm() calculation done in get_Ps().
#
# physeq.obj: phyloseq object whose sample data contains a Plant column.
#
# Returns the full DESeq2 results table, computed with independent filtering
# switched on and the hypothesis |log2 fold change| > 1
# (lfcThreshold = 1, altHypothesis = "greaterAbs").
get_Psv2 <- function(physeq.obj) {
    dds <- DESeq(phyloseq_to_deseq2(physeq.obj, ~Plant),
                 quiet = TRUE, parallel = TRUE)

    res <- results(dds,
                   independentFiltering = TRUE,
                   parallel = TRUE,
                   lfcThreshold = 1,
                   altHypothesis = "greaterAbs")

    return(res)
}

# Restrict a phyloseq object to the samples used in one DESeq2 comparison.
#
# physeq.obj: phyloseq object with Plant and TimePoint in its sample data.
# plant:      Plant value to compare against bare soil.
# day:        TimePoint value to keep.
#
# Keeps samples whose Plant is "Bare" or `plant` and whose TimePoint equals
# `day`, then removes taxa that have no positive count in the kept samples.
deseq_prune <- function(physeq.obj, plant, day) {
    meta <- sample_data(physeq.obj)

    in_comparison <- meta$Plant %in% c("Bare", plant)
    on_day <- meta$TimePoint == day
    subset.obj <- prune_samples(in_comparison & on_day, physeq.obj)

    # A taxon stays only if at least one kept sample has a count above zero.
    seen_in_subset <- function(x) sum(x > 0) > 0
    return(filter_taxa(subset.obj, seen_in_subset, TRUE))
}


## Loop through sampling dates and plants to compare to bare soil

In [33]:
%%R
# Compare each hybrid's rhizosphere with bare soil, fitting one DESeq2 model
# per (sampling date, hybrid) combination, and stack all result tables.
#
# Produces:
#   df_plant / df_all: one row per OTU per comparison, holding the DESeq2
#   result columns, the taxonomy columns, and OTU, TimePoint and Plant labels.
Sd = sample_data(phy) %>% as.data.frame()
# factor() makes this work whether or not TimePoint is already a factor.
days = levels(factor(Sd$TimePoint))

# Per-comparison tables are collected here and bound once after the loops,
# rather than growing a data frame with rbind() on every iteration.
results_list = list()

for (day in days) {
    # Non-bare samples taken on this date. The trailing comma makes this an
    # explicit row subset, so it stays correct if Sd is a plain data.frame.
    Sd.d = Sd[Sd$TimePoint == day & Sd$Plant != "Bare", ]

    # Only the hybrids actually sampled on this date; unused levels are
    # dropped explicitly instead of relying on the container to do it.
    Plants = levels(factor(Sd.d$Plant))
    Plants %>% print

    for (plant in Plants) {
        # Trim the phyloseq object to this day-plant combination plus bare soil
        physeq.plant = deseq_prune(phy, plant, day)

        # Make bare soil the reference level so fold changes are plant vs bare
        sample_data(physeq.plant)$Plant = relevel(sample_data(physeq.plant)$Plant, "Bare")

        # get_Psv2() uses DESeq2's own thresholded test; get_Ps() is the manual
        # calculation kept for comparison.
        DF = get_Psv2(physeq.plant)

        supp = tax_table(physeq.plant)
        if (!setequal(row.names(DF), row.names(supp))) {
            mismatched = union(setdiff(row.names(DF), row.names(supp)),
                               setdiff(row.names(supp), row.names(DF)))
            stop("DESeq2 results and taxonomy table differ on the following OTUs:\n",
                 paste(mismatched, collapse = ", "), call. = FALSE)
        }

        # Align taxonomy rows to the result rows by OTU name; data.frame()
        # binds columns by position, not by name.
        supp = supp[row.names(DF), ]

        DF = data.frame(DF, supp)
        DF$OTU = rownames(DF)
        DF$TimePoint = day
        DF$Plant = plant
        results_list[[length(results_list) + 1]] = DF
    }
}

# Fall back to an empty data frame if no comparison was run.
df_plant = if (length(results_list) > 0) do.call(rbind, results_list) else data.frame()
df_all = df_plant


[1] "2011"
[1] "1939" "1953" "1971" "1984" "2001" "2011"
 [1] "1936" "1939" "1942" "1953" "1961" "1971" "1975" "1984" "1991" "2001"
[11] "2003" "2011"
[1] "1939" "1953" "1971" "1984" "2001" "2011"


In [34]:
%%R
# Working copy of the stacked plant-vs-bare results.
BareHybrid <- df_all

In [35]:
%%R
# Benjamini-Hochberg correction applied once across every comparison in the
# table, stored alongside the per-comparison DESeq2 columns.
BareHybrid[["p.adj"]] <- p.adjust(BareHybrid[["pvalue"]], method = "BH")

In [36]:
%%R
# Rhizosphere responders: OTUs with a positive log2 fold change versus bare
# soil that remain significant (p.adj < 0.05) after the correction across all
# comparisons. Counts are numbers of distinct OTUs.
sig.up <- filter(BareHybrid, p.adj < 0.05, log2FoldChange > 0)
print(paste0("OTUs significantly enriched in the rhizosphere of any genotype: ",
             length(unique(sig.up$OTU))))

# Same criteria, broken down by sampling time point.
sig.up.t1 <- filter(sig.up, TimePoint == "T1")
print(paste0("OTUs significantly enriched in the rhizosphere at T1: ",
             length(unique(sig.up.t1$OTU))))

sig.up.t2 <- filter(sig.up, TimePoint == "T2")
print(paste0("OTUs significantly enriched in the rhizosphere at T2: ",
             length(unique(sig.up.t2$OTU))))

sig.up.t3 <- filter(sig.up, TimePoint == "T3")
print(paste0("OTUs significantly enriched in the rhizosphere at T3: ",
             length(unique(sig.up.t3$OTU))))

sig.up.t0 <- filter(sig.up, TimePoint == "T0")
print(paste0("OTUs significantly enriched in the rhizosphere at T0: ",
             length(unique(sig.up.t0$OTU))))

[1] "OTUs significantly enriched in the rhizosphere of any genotype: 284"
[1] "OTUs significantly enriched in the rhizosphere at T1: 125"
[1] "OTUs significantly enriched in the rhizosphere at T2: 78"
[1] "OTUs significantly enriched in the rhizosphere at T3: 242"
[1] "OTUs significantly enriched in the rhizosphere at T0: 0"


In [37]:
%%R -i workDir


# Save the full plant-vs-bare table into the DESeq2 working directory passed
# in from Python (-i workDir). write.csv() always writes comma-separated
# output and ignores a `sep` argument with a warning, so none is given.
out_file = file.path(workDir, "BarevsHybrid.csv")
write.csv(BareHybrid, file = out_file, row.names = FALSE)

In [38]:
%%R
# Record the R version, platform and attached package versions used for this run.
sessionInfo()

R version 3.3.2 (2016-10-31)
Platform: x86_64-pc-linux-gnu (64-bit)
Running under: Ubuntu 16.04.3 LTS

locale:
 [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
 [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
 [5] LC_MONETARY=en_US.UTF-8    LC_MESSAGES=en_US.UTF-8   
 [7] LC_PAPER=en_US.UTF-8       LC_NAME=C                 
 [9] LC_ADDRESS=C               LC_TELEPHONE=C            
[11] LC_MEASUREMENT=en_US.UTF-8 LC_IDENTIFICATION=C       

attached base packages:
 [1] stats4    parallel  tools     stats     graphics  grDevices utils    
 [8] datasets  methods   base     

other attached packages:
 [1] RColorBrewer_1.1-2         vegan_2.4-3               
 [3] lattice_0.20-35            permute_0.9-4             
 [5] reshape2_1.4.2             tidyr_0.6.0               
 [7] dplyr_0.5.0                phyloseq_1.19.1           
 [9] scales_0.4.1               gridExtra_2.2.1           
[11] ggplot2_2.2.1              DESeq2_1.14.1             
[13] SummarizedExper