Notebook that takes the relative abundances of species into account when calculating the average number of gene copies across species of the same genus or family.
- data too large to be included in this Git repository 
- data available on demand: albane.ruaud@tuebingen.mpg.de

# Preamble

In [1]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(qs)
library(stringr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


qs v0.23.4.



# Taxa

In [2]:
# Taxa used in the model: read the unfiltered taxa table, drop its `Sample`
# column, then keep one row per remaining column with that column's mean.
feats <- select(qread('../data/taxa_not_flt.qs'), -Sample)
feats <- data.frame(
    Feature = names(feats),
    Abundance_avg = colMeans(feats)
)

In [3]:
# Keep only the taxa whose mean abundance is strictly positive
# (rows where the comparison is NA are dropped as well).
feats <- feats[which(feats$Abundance_avg > 0), , drop = FALSE]
dim(feats)

In [4]:
# Preview the first two retained taxa.
head(feats, 2)

Unnamed: 0_level_0,Feature,Abundance_avg
Unnamed: 0_level_1,<chr>,<dbl>
s_1_14_0_10_34_76_sp002763075,s_1_14_0_10_34_76_sp002763075,6.1734e-07
s_1_19_sp001922585,s_1_19_sp001922585,4.584657e-07


# llcds results

In [5]:
# hyddb: comma-separated results without a header row; columns V6-V8 are
# not needed and are dropped.
hyddb <- select(
    fread('/ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/Bacteria_marker_genes/data/hydDB/hyddb-results.csv',
          sep = ',', header = FALSE),
    -V6, -V7, -V8
)
# Strip trailing spaces from V3 (the column joined against `sseqid` below).
hyddb$V3 <- str_replace(hyddb$V3, pattern = ' +$', replacement = '')

In [6]:
# Re-load the table of all diamond hits (gzip-compressed TSV).
hits <- fread(
    file = '/ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/Bacteria_marker_genes/data/llcds_hyd/annotate/dmnd_hits_all.tsv.gz'
)

In [7]:
# Filtering: keep hits with pident of at least 50 and a length / qlen ratio
# above 0.8 (presumably alignment length over query length -- confirm against
# the column definitions of the hits table).
hits_f <- subset(hits, (length / qlen) > 0.8 & pident >= 50)

In [8]:
# Table size before and after filtering.
dim(hits)
dim(hits_f)

In [9]:
# Attach the HydDB columns to each hit: `sseqid` in the hits matches `V3` in hyddb.
hits_f <- left_join(hits_f, hyddb, by = c('sseqid' = 'V3'))

In [10]:
# Row count should match the pre-join count unless the join key is duplicated in hyddb.
dim(hits_f)

# Add taxonomic level

In [11]:
# Sample sheet: one row per genome with its taxonomy; `Sample` is renamed to
# `Genome` so that it can be joined with the hits table.
aa_files_f <- read.table(
    file = '/ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/Bacteria_marker_genes/data/samples.tsv',
    header = TRUE,
    sep = '\t'
)
aa_files_f <- select(aa_files_f, Genome = Sample, Phylum, Class, Order, Family, Genus, Species)

In [12]:
# Start from every genome so that taxa lacking hits are kept: they get a
# single row with NA in the hit columns.
hits_f <- left_join(aa_files_f, hits_f, by = 'Genome')
# Label missing hydrogenase categories as 'None'.
# NOTE(review): a hit whose sseqid had no match in hyddb also has NA in V5 and
# is relabelled here -- confirm that this cannot happen with the current data.
hits_f$V5 <- replace(hits_f$V5, is.na(hits_f$V5), 'None')

In [13]:
# List the columns available after the joins.
colnames(hits_f)

In [14]:
# Keep the taxonomy, the hydrogenase category (V5) and pident, restricted to
# species that are present in `feats`.
hits_tax <- subset(
    select(hits_f, Genome, Phylum, Class, Order, Family, Genus, Species, V5, pident),
    Species %in% feats$Feature
)

# Per taxonomic level

## species

In [15]:
# Per species and hydrogenase category: number of rows (n_copies) and mean
# pident, ignoring missing pident values.
pp <- summarise(
    group_by(hits_tax, Phylum, Class, Order, Family, Genus, Species, V5),
    n_copies = n(),
    pident = mean(pident, na.rm = TRUE)
)

`summarise()` regrouping output by 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species' (override with `.groups` argument)



## add all missing categories for each species

In [16]:
# Distinct species and distinct hydrogenase categories observed in `pp`.
sp <- unique(pp$Species)
hctg <- unique(pp$V5)

In [17]:
# Create all combinations of species and hydrogenase categories needed.
# stringsAsFactors = FALSE keeps both columns as character, matching `pp`.
to_add <- expand.grid(Species = sp, V5 = hctg, stringsAsFactors = FALSE)

# One taxonomy row per species. `pp` holds one row per (Species, V5), so the
# lookup must be de-duplicated: joining on the raw table would repeat every
# missing combination once per category already observed for that species,
# and the repeated zero rows would then be over-weighted in the genus/family
# weighted means computed from `pp`.
sp_taxonomy <- pp %>%
    ungroup %>%
    distinct(Phylum, Class, Order, Family, Genus, Species)

# Keep only the combinations missing from `pp` and add the taxonomic levels.
to_add <- to_add %>%
    anti_join(pp %>% ungroup %>% select(Species, V5), by = c('Species', 'V5')) %>%
    left_join(sp_taxonomy, by = 'Species')

# Missing combinations get a count of 0 and an undefined pident.
to_add$n_copies <- 0
to_add$pident <- NaN

# Add them to the dataset.
pp <- rbind(pp, to_add)

In [18]:
# Inspect the last two rows (these are the zero-count rows appended above).
tail(pp, 2)

Phylum,Class,Order,Family,Genus,Species,V5,n_copies,pident
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>
p_Verrucomicrobiota,c_Verrucomicrobiae,o_Verrucomicrobiales,f_Akkermansiaceae,g_Akkermansia,s_Akkermansia_sp001580195,[NiFe] Group 3c,0,
p_Verrucomicrobiota,c_Verrucomicrobiae,o_Verrucomicrobiales,f_Akkermansiaceae,g_Akkermansia,s_Akkermansia_sp002358905,[NiFe] Group 3c,0,


## genera

In [19]:
# Genus level: average n_copies and pident across the species of each genus,
# weighting every species by its mean abundance from `feats`.
gen <- left_join(pp, feats, by = c('Species' = 'Feature'))
gen <- group_by(gen, Phylum, Class, Order, Family, Genus, V5)
gen <- summarise(
    gen,
    n_copies = weighted.mean(x = n_copies, w = Abundance_avg, na.rm = TRUE),
    pident = weighted.mean(x = pident, w = Abundance_avg, na.rm = TRUE)
)
# The genus name is the feature identifier at this level.
gen$Feature <- gen$Genus

`summarise()` regrouping output by 'Phylum', 'Class', 'Order', 'Family', 'Genus' (override with `.groups` argument)



## family

In [20]:
# Family level: same abundance-weighted averages as for genera, grouped one
# taxonomic rank higher.
fam <- left_join(pp, feats, by = c('Species' = 'Feature'))
fam <- group_by(fam, Phylum, Class, Order, Family, V5)
fam <- summarise(
    fam,
    n_copies = weighted.mean(x = n_copies, w = Abundance_avg, na.rm = TRUE),
    pident = weighted.mean(x = pident, w = Abundance_avg, na.rm = TRUE)
)
# The family name is the feature identifier at this level.
fam$Feature <- fam$Family

`summarise()` regrouping output by 'Phylum', 'Class', 'Order', 'Family' (override with `.groups` argument)



## merge

In [21]:
# Compare the column sets of the three tables before merging them.
names(pp)
names(gen)
names(fam)

In [22]:
# Give the three tables the same set of columns: species rows use the species
# name as Feature, and ranks below the aggregation level are filled with NA.
pp[['Feature']] <- pp[['Species']]
gen[['Species']] <- NA
fam[['Genus']] <- NA
fam[['Species']] <- NA

In [23]:
# Row counts of the three tables before merging.
NROW(pp)
NROW(gen)
NROW(fam)

In [24]:
# Stack species, genus and family rows into one table; union() also drops
# exact duplicate rows.
pp <- union(union(pp, gen), fam)

In [25]:
# Row count after merging.
NROW(pp)

In [26]:
# Preview the merged table.
head(pp)

Phylum,Class,Order,Family,Genus,Species,V5,n_copies,pident,Feature
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
p_Acidobacteriota,c_Acidobacteriae,o_Acidobacteriales,f_Koribacteraceae,g_QIAA01,s_QIAA01_sp003224905,,1,,s_QIAA01_sp003224905
p_Acidobacteriota,c_Blastocatellia,o_Pyrinomonadales,f_Pyrinomonadaceae,g_OLB17,s_OLB17_sp002360555,,1,,s_OLB17_sp002360555
p_Actinobacteriota,c_Actinobacteria,o_Actinomycetales,f_Actinomycetaceae,g_Actinomyces,s_Actinomyces_dentalis,,1,,s_Actinomyces_dentalis
p_Actinobacteriota,c_Actinobacteria,o_Actinomycetales,f_Actinomycetaceae,g_Actinomyces,s_Actinomyces_gerencseriae,,1,,s_Actinomyces_gerencseriae
p_Actinobacteriota,c_Actinobacteria,o_Actinomycetales,f_Actinomycetaceae,g_Actinomyces,s_Actinomyces_graevenitzii,,1,,s_Actinomyces_graevenitzii
p_Actinobacteriota,c_Actinobacteria,o_Actinomycetales,f_Actinomycetaceae,g_Actinomyces,s_Actinomyces_israelii,,1,,s_Actinomyces_israelii


In [27]:
# Sanity check: number of distinct features in `pp` that are absent from `feats`.
sum(!(unique(pp$Feature) %in% feats$Feature))

In [28]:
# Optional export of the merged table as a tab-separated text file
# (disabled; uncomment the two lines below to write it).
#write.table(pp, file = '../data/llcds_hyd/hyd-annot_byPredFeat_byHyd_weightedRA.txt'
#            , sep = '\t', quote = FALSE)

# Formatting

In [29]:
# Give the hydrogenase category column (V5) a descriptive name.
names(pp) <- replace(names(pp), names(pp) == 'V5', 'hyd_ctg')

In [30]:
# Load the variable-importance table and attach it to the per-feature
# hydrogenase table; the taxonomy columns are dropped because `Feature`
# already identifies each row's taxon.
feat_imp <- qread('../tmp/Variable_importance.qs')
hyd_pergene <- left_join(
    select(ungroup(pp), -Phylum, -Class, -Order, -Family, -Genus, -Species),
    feat_imp,
    by = 'Feature'
)

## add descriptions

In [31]:
# Curated description table for the hydrogenase categories (tab-separated, with header).
hyd_description <- read.table(file = 'hydDB_curated.txt', header = TRUE, sep = '\t')

In [32]:
# Show the categories whose predicted function is bifurcating or bidirectional.
subset(hyd_description, pred_function %in% c('Bifurcating', 'Bidirectional'))

Unnamed: 0_level_0,hyd_ctg,pred_function,electron_donor_acceptor,O2,comment
Unnamed: 0_level_1,<chr>,<chr>,<chr>,<chr>,<chr>
1,[Fe],Bidirectional,Hmd,tolerant,Reduction of Hmd during hydrogenotrophic methanogenesis
4,[FeFe] Group A3,Bifurcating,"NAD, ferredoxin",labile,
26,[NiFe] Group 3a,Bidirectional,F420,tolerant,Reduction of F420 during hydrogenotrophic methanogenesis
27,[NiFe] Group 3b,Bidirectional,NADP,tolerant,Directly couples oxidation of NADPH to fermentative evolution of H2. Diverse soil and aquatic environments.
28,[NiFe] Group 3c,Bifurcating,heterodisulfide and ferredoxin,tolerant,In methanogens
29,[NiFe] Group 3d,Bidirectional,NAD,tolerant,


In [33]:
# Attach the curated descriptions (without the free-text comment column) by category.
hyd_pergene <- left_join(hyd_pergene, select(hyd_description, -comment), by = 'hyd_ctg')

In [34]:
# Drop the 'None' category for features whose name starts with f_ or g_
# (family and genus rows); every other row is kept.
hyd_pergene <- subset(
    hyd_pergene,
    !str_detect(Feature, pattern = '^(f|g)\\_') | hyd_ctg != 'None'
)

In [35]:
# Labels of interest; every other value (including missing ones) is grouped as 'Other'.
tmp <- c('Ferredoxin', 'NAD', 'NAD, ferredoxin', 'NADP', 'Formate', 'Sulfate and others')
hyd_pergene <- hyd_pergene %>%
    mutate(
        # simplify electron donor/acceptor variable: Sulfate and others if sulfate listed
        electron_donor_acceptor = ifelse(
            str_detect(electron_donor_acceptor, pattern = 'Sulfate'),
            'Sulfate and others',
            electron_donor_acceptor
        ),
        # keep only the interesting labels (in tmp) and group the others
        electron_donor_acceptor = ifelse(
            electron_donor_acceptor %in% tmp,
            electron_donor_acceptor,
            'Other'
        )
    )

In [36]:
# Coarser functional label: bifurcating and evolving hydrogenases are pooled
# as 'H2 producing'; any other predicted function is kept unchanged.
hyd_pergene <- mutate(
    hyd_pergene,
    pred_general = ifelse(
        pred_function %in% c('Bifurcating', 'Evolving'),
        'H2 producing',
        pred_function
    )
)

In [37]:
# [NiFe] Group 3c is bifurcating but actually involved in H2 uptake, so its
# general label is overridden.
hyd_pergene$pred_general <- replace(
    hyd_pergene$pred_general,
    which(hyd_pergene$hyd_ctg == '[NiFe] Group 3c'),
    'H2uptake'
)

In [38]:
# Replace missing values in a vector with 0, leaving other values unchanged.
tmpFun <- function(x) {
    ifelse(is.na(x), 0, x)
}
# Features absent from the importance table have NA after the join; give them
# an importance of 0. across() replaces the superseded mutate_at()/vars().
hyd_pergene <- hyd_pergene %>% mutate(across(c(Gini, importance), tmpFun))

In [39]:
# Rename: the first 'importance' in each column name becomes 'endoR'.
names(hyd_pergene) <- sub(pattern = 'importance', replacement = 'endoR', x = names(hyd_pergene))

In [40]:
# Save the formatted table in qs format in the working directory.
qsave(x = hyd_pergene, file = 'HydDB_res.qs')