Notebook that takes the relative abundances of species into account when calculating the average number of gene copies across species of the same genus / family.
- data too large to be included in this Git repository 
- data available on demand: albane.ruaud@tuebingen.mpg.de

# Preamble

In [1]:
library(dplyr)
library(tidyr)
library(ggplot2)
library(data.table)
library(qs)
library(stringr)


Attaching package: ‘dplyr’


The following objects are masked from ‘package:stats’:

    filter, lag


The following objects are masked from ‘package:base’:

    intersect, setdiff, setequal, union



Attaching package: ‘data.table’


The following objects are masked from ‘package:dplyr’:

    between, first, last


qs v0.23.4.



# Taxa

In [2]:
# Taxa used in the model: drop the Sample column, take the mean of every
# remaining column, and store it as one row per taxon (Feature, Abundance_avg).
feats <- colMeans(select(qread(file = '../data/taxa_not_flt.qs'), -Sample))
feats <- data.frame(Feature = names(feats), Abundance_avg = feats)

In [3]:
# Keep only taxa with a strictly positive mean abundance, then show the table size
feats <- subset(feats, Abundance_avg > 0)
dim(feats)

In [4]:
# Preview the first two taxa
head(feats, n = 2)

Unnamed: 0_level_0,Feature,Abundance_avg
Unnamed: 0_level_1,<chr>,<dbl>
s_1_14_0_10_34_76_sp002763075,s_1_14_0_10_34_76_sp002763075,6.1734e-07
s_1_19_sp001922585,s_1_19_sp001922585,4.584657e-07


# llcds results

In [5]:
# Load the table of hits (gzip-compressed TSV) produced by the llcds run
hits <- fread(
    input = '/ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/Bacteria_marker_genes/data/llcds_acetobase/annotate/dmnd_hits_all.tsv.gz'
)

In [6]:
# filtering: keep hits with pident >= 50 and a length / qlen ratio above 0.8
# (presumably alignment length over query length -- confirm against the hit table)
hits_f <- subset(hits, (length / qlen > 0.8) & (pident >= 50))
# every retained hit is labelled with the gene category 'fhs'
hits_f$V5 <- 'fhs'

In [7]:
# Table size before and after filtering
dim(hits)
dim(hits_f)

# Add taxonomic level

In [8]:
# Genome table with its taxonomic ranks; the Sample column is kept under the
# name Genome so that it can serve as join key with the hits.
aa_files_f <- read.table(
    '/ebio/abt3_projects/Methanogen_SCFA/Metagenomes_methanogen/Bacteria_marker_genes/data/samples.tsv',
    sep = '\t',
    header = TRUE
) %>%
    select(Genome = Sample, Phylum, Class, Order, Family, Genus, Species)

In [9]:
# Start from the full genome table so that genomes without any retained hit
# are kept as well; their V5 is NA after the join and is relabelled 'None'.
hits_f <- left_join(aa_files_f, hits_f, by = 'Genome')
hits_f[is.na(hits_f$V5), 'V5'] <- 'None'

In [10]:
# Columns available after the join
colnames(hits_f)

In [11]:
# Restrict to species that are model features and keep only the taxonomy,
# gene category (V5) and percent identity columns.
hits_tax <- subset(
    hits_f,
    subset = Species %in% feats$Feature,
    select = c(Genome, Phylum, Class, Order, Family, Genus, Species, V5, pident)
)

# Per taxonomic level

## species

In [12]:
# Per-species summary for each gene category (V5):
#   n_copies = number of rows in the group (for the 'None' category these are
#              the genomes that had no retained hit)
#   pident   = mean percent identity (NaN when the group has no pident value)
# .groups = 'drop' returns an ungrouped table, so no grouping leaks into the
# joins / row-binding done in later cells, and it silences the regrouping message.
pp <- hits_tax %>% 
    group_by(Phylum, Class, Order, Family, Genus, Species, V5) %>% 
    summarise(
        n_copies = n(),
        pident = mean(pident, na.rm = TRUE),
        .groups = 'drop'
    )

`summarise()` regrouping output by 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species' (override with `.groups` argument)



## add all missing categories for each species

In [13]:
# Distinct species and distinct gene categories present in the species table
sp <- unique(pp$Species)
hctg <- unique(pp$V5)

In [14]:
# Create all combinations of species and acetobase gene category, so that a
# category a species lacks is represented explicitly with zero copies.
# stringsAsFactors = FALSE keeps both keys as character, like in `pp`.
to_add <- expand.grid(Species = sp, V5 = hctg, stringsAsFactors = FALSE)

# One taxonomy row per species. `pp` holds one row per species x category, so
# the lookup is de-duplicated to keep the join below from multiplying rows.
sp_taxonomy <- pp %>% 
    ungroup %>% 
    distinct(Phylum, Class, Order, Family, Genus, Species)

# Keep only the combinations missing from `pp` and give them a 0 copy number
# and NaN pident (mutate also works when nothing is missing, i.e. zero rows).
to_add <- to_add %>% 
    anti_join(pp, by = c('Species', 'V5')) %>% 
    left_join(sp_taxonomy, by = 'Species') %>% 
    mutate(n_copies = 0, pident = NaN)

# Add them to the dataset. `pp` is ungrouped first and bind_rows matches
# columns by name, so the result does not depend on how rbind() dispatches for
# a grouped tibble combined with a plain data.frame.
pp <- bind_rows(ungroup(pp), to_add)

## genera

In [15]:
# Genus-level values: abundance-weighted means over the species of each genus,
# computed separately per gene category (V5). The weights are the species'
# mean abundances (Abundance_avg from feats); with na.rm = TRUE, species whose
# value is missing are left out of the corresponding mean.
gen <- pp %>% 
    left_join(feats, by = c('Species' = 'Feature')) %>% 
    group_by(Phylum, Class, Order, Family, Genus, V5) %>% 
    summarise(
        n_copies = weighted.mean(n_copies, Abundance_avg, na.rm = TRUE),
        pident = weighted.mean(pident, Abundance_avg, na.rm = TRUE)
    ) %>% 
    # genus rows are identified by the genus name
    mutate(Feature = Genus)

`summarise()` regrouping output by 'Phylum', 'Class', 'Order', 'Family', 'Genus' (override with `.groups` argument)



## family

In [16]:
# Family-level values: abundance-weighted means over the species of each
# family, computed separately per gene category (V5). The weights are the
# species' mean abundances (Abundance_avg from feats); with na.rm = TRUE,
# species whose value is missing are left out of the corresponding mean.
fam <- pp %>% 
    left_join(feats, by = c('Species' = 'Feature')) %>% 
    group_by(Phylum, Class, Order, Family, V5) %>% 
    summarise(
        n_copies = weighted.mean(n_copies, Abundance_avg, na.rm = TRUE),
        pident = weighted.mean(pident, Abundance_avg, na.rm = TRUE)
    ) %>% 
    # family rows are identified by the family name
    mutate(Feature = Family)

`summarise()` regrouping output by 'Phylum', 'Class', 'Order', 'Family' (override with `.groups` argument)



## merge

In [17]:
# Compare the columns of the three tables before merging them
names(pp)
names(gen)
names(fam)

In [18]:
# Pad the coarser tables with the rank columns they lack, so that all three
# tables share the same columns. NA_character_ keeps these columns character,
# like the matching columns of pp, instead of creating logical columns.
gen$Species <- NA_character_
fam$Genus <- NA_character_
fam$Species <- NA_character_
# species-level rows are identified by the species name
pp$Feature <- pp$Species

In [19]:
# Number of rows in each table before merging
NROW(pp)
NROW(gen)
NROW(fam)

In [20]:
# Sanity check at genus level: cross-tabulate the gene category with whether
# pident is missing and whether any copy was found
count(ungroup(gen), V5, is.na(pident), n_copies > 0)

V5,is.na(pident),n_copies > 0,n
<chr>,<lgl>,<lgl>,<int>
fhs,False,True,695
fhs,True,False,246
,True,False,642
,True,True,299


In [21]:
# Stack species, genus and family rows into one table. union() is the dplyr
# version here (it masks base::union, see the package start-up messages above).
pp <- pp %>% 
    union(gen) %>% 
    union(fam)

In [22]:
# Number of rows after merging the three levels
NROW(pp)

In [23]:
# Preview the merged table
head(pp)

Phylum,Class,Order,Family,Genus,Species,V5,n_copies,pident,Feature
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<dbl>,<dbl>,<chr>
p_Acidobacteriota,c_Acidobacteriae,o_Acidobacteriales,f_Koribacteraceae,g_QIAA01,s_QIAA01_sp003224905,,1,,s_QIAA01_sp003224905
p_Acidobacteriota,c_Blastocatellia,o_Pyrinomonadales,f_Pyrinomonadaceae,g_OLB17,s_OLB17_sp002360555,fhs,10,93.85,s_OLB17_sp002360555
p_Actinobacteriota,c_Actinobacteria,o_Actinomycetales,f_Actinomycetaceae,g_Actinomyces,s_Actinomyces_dentalis,,1,,s_Actinomyces_dentalis
p_Actinobacteriota,c_Actinobacteria,o_Actinomycetales,f_Actinomycetaceae,g_Actinomyces,s_Actinomyces_gerencseriae,fhs,10,77.18,s_Actinomyces_gerencseriae
p_Actinobacteriota,c_Actinobacteria,o_Actinomycetales,f_Actinomycetaceae,g_Actinomyces,s_Actinomyces_graevenitzii,fhs,10,77.52,s_Actinomyces_graevenitzii
p_Actinobacteriota,c_Actinobacteria,o_Actinomycetales,f_Actinomycetaceae,g_Actinomyces,s_Actinomyces_israelii,fhs,10,79.61,s_Actinomyces_israelii


In [24]:
# Number of distinct features in the merged table that are absent from feats
sum(!(unique(pp$Feature) %in% feats$Feature))

# Formatting

In [25]:
# Load the variable importance table that is joined onto the results below
feat_imp <- qread(file = '../tmp/Variable_importance.qs')

In [26]:
# Final table: drop the 'None' placeholder rows and the taxonomy columns
# (Feature stays as identifier), then attach the variable importances.
pp <- ungroup(pp) %>% 
    subset(V5 != 'None') %>% 
    select(-c(Phylum, Class, Order, Family, Genus, Species)) %>% 
    left_join(feat_imp, by = 'Feature')

In [27]:
# Write the formatted results to the working directory
qsave(x = pp, file = 'Acetobase_res.qs')