In [None]:
knitr::opts_knit$set(root.dir = '/tmp')

###Load libraries

In [75]:
library(tidyverse)
library(stringr)
library(conflicted)
library(phyloseq)

In [3]:
conflict_prefer("filter","dplyr")
conflict_prefer("rename","dplyr")
conflict_prefer("mutate","dplyr")

[conflicted] Will prefer [34mdplyr::filter[39m over any other package
[conflicted] Will prefer [34mdplyr::rename[39m over any other package
[conflicted] Will prefer [34mdplyr::mutate[39m over any other package


# Create features tables for each dataset

# 1. Muehlbauer samples

## 1.1. Import tables

In [132]:
setwd("/tmp/global2/aborbon/shortbred_quantify/quantify_muehlbauer/results_R/")
tmp.muehl=list.files(path = "/tmp/global2/aborbon/shortbred_quantify/quantify_muehlbauer/results_R/" ,pattern = "*.txt")
myfiles.muehl = lapply(tmp.muehl, read.delim)

## 1.2. Sample names

In [102]:
sample.names.muehl <- as_tibble(sapply(strsplit(basename(tmp.muehl), "quantify"), `[`, 2))%>%
  filter(!is.na(value))

In [134]:
filenamesMuehl=read_tsv("/tmp/global2/aborbon/shortbred_quantify/quantify_muehlbauer/results_R/filenames_muehl.tsv",col_names=F)%>%
  cbind(sample.names.muehl)%>%
  mutate(Sample=str_remove_all(value,"(?<=\\d).txt"),.keep="unused")

[1mRows: [22m[34m19[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [96]:
head(myfiles.muehl[[5]])
head(filenamesMuehl[5,2])

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>
1,AAA26278_1,0,0,242
2,AAA62843_1,0,0,300
3,AAA62844_1,0,0,300
4,AAB33952_1,0,0,33
5,AAB82613_1,0,0,300
6,AAC38200_1,0,0,300


## 1.3. Add a column with sample name and study name to each table in tmp list

In [137]:
#Define list where lists with sample name will be stored. This vector is empty, will be filled with the loop in next step.
vector.muehl = 1:length(myfiles.muehl)
myfiles2.muehl = vector("list",length(myfiles.muehl)) 

#Create loop to add sample name to each list
for (i in vector.muehl) {
  myfiles2.muehl[[i]]=mutate(myfiles.muehl[[i]],Sample=filenamesMuehl[i,2]) #En cada posicion de myfiles2 aplicar esta function
}

head(myfiles2.muehl[[1]]) #checkpoint

#Create another empty list to store the study name to the above lists
muehlSamples = vector("list",length(myfiles2.muehl))

#Loop to add Study name
for (i in vector.muehl) {
  muehlSamples[[i]]=mutate(myfiles2.muehl[[i]],StudyName="Muehlbauer_2020")
  
}
head(muehlSamples[[1]])

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>
1,AAA26278_1,0,0,242,100UNCULTURED_S7
2,AAA62843_1,0,0,300,100UNCULTURED_S7
3,AAA62844_1,0,0,300,100UNCULTURED_S7
4,AAB33952_1,0,0,33,100UNCULTURED_S7
5,AAB82613_1,0,0,300,100UNCULTURED_S7
6,AAC38200_1,0,0,300,100UNCULTURED_S7


Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample,StudyName
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>,<chr>
1,AAA26278_1,0,0,242,100UNCULTURED_S7,Muehlbauer_2020
2,AAA62843_1,0,0,300,100UNCULTURED_S7,Muehlbauer_2020
3,AAA62844_1,0,0,300,100UNCULTURED_S7,Muehlbauer_2020
4,AAB33952_1,0,0,33,100UNCULTURED_S7,Muehlbauer_2020
5,AAB82613_1,0,0,300,100UNCULTURED_S7,Muehlbauer_2020
6,AAC38200_1,0,0,300,100UNCULTURED_S7,Muehlbauer_2020


In [138]:
#Store all tables in one table
muehlSamples2=bind_rows(muehlSamples)

In [162]:
head(muehlSamples2)

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample,StudyName
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>,<chr>
1,AAA26278_1,0,0,242,100UNCULTURED_S7,Muehlbauer_2020
2,AAA62843_1,0,0,300,100UNCULTURED_S7,Muehlbauer_2020
3,AAA62844_1,0,0,300,100UNCULTURED_S7,Muehlbauer_2020
4,AAB33952_1,0,0,33,100UNCULTURED_S7,Muehlbauer_2020
5,AAB82613_1,0,0,300,100UNCULTURED_S7,Muehlbauer_2020
6,AAC38200_1,0,0,300,100UNCULTURED_S7,Muehlbauer_2020


## 1.4. Create the features table 

In [140]:
muehlSamples3 = pivot_wider(data=muehlSamples2,id_cols=Family,names_from=Sample,values_from=Count)%>%
  mutate(Family=str_replace_all(Family,"(?<=\\d)_",".")) # This regex "(?<=\\d)_" means every underscore (_) preceded by any digit (\\d)

In [None]:
head(muehlSamples3)


# 2. Amato dataset

## 2.1. Import lists

In [11]:
tmp.amato=list.files(path = "/tmp/global2/aborbon/shortbred_quantify/quantify_amato/pcLen90/results_R/" ,pattern = "*.txt")
myfiles = lapply(tmp.amato, read.delim)

## 2.2. Sample names

In [16]:
sample.names <- as_tibble(sapply(strsplit(basename(tmp.amato), "pcLen30quantify"), `[`, 2))%>%
  filter(!is.na(value))

In [17]:
filenamesAmato=read_tsv("/tmp/global2/aborbon/shortbred_quantify/quantify_amato/pcLen90/results_R/filenames.tsv",col_names=F)%>%
  cbind(sample.names)%>%
  mutate(Sample=str_remove_all(value,"(?<=\\d).txt"),.keep="unused")

[1mRows: [22m[34m95[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [19]:
head(filenamesAmato)

Unnamed: 0_level_0,X1,Sample
Unnamed: 0_level_1,<chr>,<chr>
1,pcLen30quantifyAC622.txt,AC622
2,pcLen30quantifyAC660.txt,AC660
3,pcLen30quantifyAC665.txt,AC665
4,pcLen30quantifyAC667.txt,AC667
5,pcLen30quantifyAC668.txt,AC668
6,pcLen30quantifyAPAL36.txt,Ahyb1000


#### Checkpoint: Testing that order in both lists match the same samples

In [21]:
head(myfiles[[5]])
head(filenamesAmato[5,2])

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>
1,AAA26278_1,0,0,242
2,AAA62843_1,0,0,300
3,AAA62844_1,0,0,300
4,AAB33952_1,0,0,33
5,AAB82613_1,0,0,300
6,AAC38200_1,0,0,300


## 2.3. Add a column with sample name and study name to each table in tmp list

In [26]:
#Define list where lists with sample name will be stored. This vector is empty, will be filled with the loop in next step.
vector=1:length(myfiles)
myfiles2 = vector("list",length(myfiles)) 

#Create loop to add sample name to each list
for (i in vector) {
  myfiles2[[i]]=mutate(myfiles[[i]],Sample=filenamesAmato[i,2]) #En cada posicion de myfiles2 aplicar esta function
}

head(myfiles2[[5]]) #checkpoint

#Create another empty list to store the study name to the above lists
amatoSamples = vector("list",length(myfiles2))

#Loop to add Study name
for (i in vector) {
  amatoSamples[[i]]=mutate(myfiles2[[i]],StudyName="Amato_2019")
  
}


Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>
1,AAA26278_1,0,0,242,AC668
2,AAA62843_1,0,0,300,AC668
3,AAA62844_1,0,0,300,AC668
4,AAB33952_1,0,0,33,AC668
5,AAB82613_1,0,0,300,AC668
6,AAC38200_1,0,0,300,AC668


In [27]:
#Store all tables in one big table
amatoSamples2=bind_rows(amatoSamples)

In [28]:
head(amatoSamples2)

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample,StudyName
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>,<chr>
1,AAA26278_1,0,0,242,AC622,Amato_2019
2,AAA62843_1,0,0,300,AC622,Amato_2019
3,AAA62844_1,0,0,300,AC622,Amato_2019
4,AAB33952_1,0,0,33,AC622,Amato_2019
5,AAB82613_1,0,0,300,AC622,Amato_2019
6,AAC38200_1,0,0,300,AC622,Amato_2019


## 2.4. Create the features table
Protein family (rows) and samples (columns)

In [30]:
amatoSamples3 = pivot_wider(data=amatoSamples2,id_cols=Family,names_from=Sample,values_from=Count)%>%
  mutate(Family=str_replace_all(Family,"(?<=\\d)_",".")) # This regex "(?<=\\d)_" means every underscore (_)

In [32]:
head(amatoSamples3)

Family,AC622,AC660,AC665,AC667,AC668,Ahyb1000,Ahyb135,Ahyb257,Ahyb401,⋯,RC2036,RC2068,RC2071,RC2096,RC2098,RT2013,RT2017,RT2019,RT2021,RT2048
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AAA26278.1,0,0,0,0.0,0,0.0,0,0,0,⋯,0,0,0.0,0.0,0.0,0,0,0,0,0
AAA62843.1,0,0,0,0.0,0,0.0,0,0,0,⋯,0,0,0.0,0.0,0.0,0,0,0,0,0
AAA62844.1,0,0,0,0.05403324,0,0.05203338,0,0,0,⋯,0,0,0.1159795,0.08850892,0.0,0,0,0,0,0
AAB33952.1,0,0,0,0.0,0,0.0,0,0,0,⋯,0,0,0.0,0.0,0.0,0,0,0,0,0
AAB82613.1,0,0,0,0.0,0,0.0,0,0,0,⋯,0,0,0.0,0.0,0.1362072,0,0,0,0,0
AAC38200.1,0,0,0,0.0,0,0.0,0,0,0,⋯,0,0,0.0,0.0,0.0,0,0,0,0,0


# 3. Vertebrates dataset 
    

## 3.1. Import lists

In [33]:
setwd("/tmp/global2/aborbon/shortbred_quantify/quantify_vertebrates/pcLen30/results_R")
tmp.vert=list.files(path = "/tmp/global2/aborbon/shortbred_quantify/quantify_vertebrates/pcLen30/results_R" ,pattern = "*.txt")
myfiles.vert = lapply(tmp.vert, read.delim)

## 3.2. Sample names

In [35]:
sample.names.vert <- as_tibble(sapply(strsplit(basename(tmp.vert), "quantify"), `[`, 2))%>%
  filter(!is.na(value))

In [36]:
filenamesVert=read_tsv("/tmp/global2/aborbon/shortbred_quantify/quantify_vertebrates/pcLen30/results_R/filenames_vert.tsv",col_names=F)%>%
  cbind(sample.names.vert)%>%
  mutate(Sample=str_remove_all(value,".txt"),.keep="unused")

[1mRows: [22m[34m323[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [37]:
head(filenamesVert)

Unnamed: 0_level_0,X1,Sample
Unnamed: 0_level_1,<chr>,<chr>
1,quantifyF141_Wood_Sandpiper.txt,F14_Common_Bream
2,quantifyF144_Wood_Sandpiper.txt,F141_Wood_Sandpiper
3,quantifyF14_Common_Bream.txt,F144_Wood_Sandpiper
4,quantifyF157a_European_Toad.txt,F157a_European_Toad
5,quantifyF157b_European_Toad.txt,F157b_European_Toad
6,quantifyF241_Great_Cormorant.txt,F241_Great_Cormorant


#### Checkpoint: Testing that order in both lists match the same samples

In [39]:
head(myfiles.vert[[5]])

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>
1,AAA26278_1,0,0,242
2,AAA62843_1,0,0,300
3,AAA62844_1,0,0,300
4,AAB33952_1,0,0,33
5,AAB82613_1,0,0,300
6,AAC38200_1,0,0,300


In [40]:
head(filenamesVert[5,2])

## 3.3. Add a column with sample name and study name to each table in 'tmp' list

In [42]:
#Define list where lists with samples name will be stored. This is an empty vector.
vector.vert=1:length(myfiles.vert)
myfiles2.vert = vector("list",length(myfiles.vert)) 

#Create loop to add sample name to each list
for (i in vector.vert) {
  myfiles2.vert[[i]]=mutate(myfiles.vert[[i]],Sample=filenamesVert[i,2]) #En cada posicion de myfiles2 aplicar esta function
}

head(myfiles2.vert[[1]]) #checkpoint

#Create another empty list to store the study name to the above lists
VertSamples = vector("list",length(myfiles2.vert))

#Loop to add Study name
for (i in vector.vert) {
  VertSamples[[i]]=mutate(myfiles2.vert[[i]],StudyName="Vertebrates")
  
}
head(VertSamples[[1]])

head(myfiles2.vert[[1]])

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>
1,AAA26278_1,0.0,0,242,F14_Common_Bream
2,AAA62843_1,0.0,0,300,F14_Common_Bream
3,AAA62844_1,0.01420288,2,300,F14_Common_Bream
4,AAB33952_1,0.0,0,33,F14_Common_Bream
5,AAB82613_1,0.0,0,300,F14_Common_Bream
6,AAC38200_1,0.0,0,300,F14_Common_Bream


Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample,StudyName
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>,<chr>
1,AAA26278_1,0.0,0,242,F14_Common_Bream,Vertebrates
2,AAA62843_1,0.0,0,300,F14_Common_Bream,Vertebrates
3,AAA62844_1,0.01420288,2,300,F14_Common_Bream,Vertebrates
4,AAB33952_1,0.0,0,33,F14_Common_Bream,Vertebrates
5,AAB82613_1,0.0,0,300,F14_Common_Bream,Vertebrates
6,AAC38200_1,0.0,0,300,F14_Common_Bream,Vertebrates


Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>
1,AAA26278_1,0.0,0,242,F14_Common_Bream
2,AAA62843_1,0.0,0,300,F14_Common_Bream
3,AAA62844_1,0.01420288,2,300,F14_Common_Bream
4,AAB33952_1,0.0,0,33,F14_Common_Bream
5,AAB82613_1,0.0,0,300,F14_Common_Bream
6,AAC38200_1,0.0,0,300,F14_Common_Bream


## 3.4. Create the features table
Protein family (rows) and samples (columns)

In [54]:
#Store all tables in one big table
VertSamples2=bind_rows(VertSamples)

#Create the observations matrix with the protein family (rows) and samples (columns)
VertSamples3 = pivot_wider(data=VertSamples2,id_cols=Family,names_from=Sample,values_from=Count)%>%
  mutate(Family=str_replace_all(Family,"(?<=\\d)_",".")) # This regex "(?<=\\d)_" means every underscore (_)

In [55]:
head(VertSamples3)

Family,F14_Common_Bream,F141_Wood_Sandpiper,F144_Wood_Sandpiper,F157a_European_Toad,F157b_European_Toad,F241_Great_Cormorant,F300_Asp,F302_Northern_Pike,F315_Northern_Pike,⋯,X87_Garden_Dormouse,X89_Fat_Dormouse,X90_Water_Rail,X91_Dunnock,X92_Lesser_White_toothed_Shrew,X93_Red_Sheep,X94_Mangalica,X95_Meadow_Viper,X96_European_Hare,X98_Eurasian_Red_Squirrel
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AAA26278.1,0.0,0,0,0,0,0,0,0,0,⋯,0,0,0,0.0,0,0,0,0,0,0
AAA62843.1,0.0,0,0,0,0,0,0,0,0,⋯,0,0,0,0.0,0,0,0,0,0,0
AAA62844.1,0.01420288,0,0,0,0,0,0,0,0,⋯,0,0,0,0.07660891,0,0,0,0,0,0
AAB33952.1,0.0,0,0,0,0,0,0,0,0,⋯,0,0,0,0.0,0,0,0,0,0,0
AAB82613.1,0.0,0,0,0,0,0,0,0,0,⋯,0,0,0,0.0,0,0,0,0,0,0
AAC38200.1,0.0,0,0,0,0,0,0,0,0,⋯,0,0,0,0.0,0,0,0,0,0,0


# 4. Plants datasets

## 4.1. Import lists

In [43]:
setwd("/tmp/global2/aborbon/shortbred_quantify/quantify_plants/pcLen30/results_R")
tmp.plants=list.files(path = "/tmp/global2/aborbon/shortbred_quantify/quantify_plants/pcLen30/results_R" ,pattern = "*.txt")
myfiles.plants = lapply(tmp.plants, read.delim)

## 4.2. Sample names

In [44]:
sample.names.plants <- as_tibble(sapply(strsplit(basename(tmp.plants), "quantify"), `[`, 2))%>%
  filter(!is.na(value))

In [45]:
filenamesPlants=read_tsv("/tmp/global2/aborbon/shortbred_quantify/quantify_plants/pcLen30/results_R/filenames_plants.tsv",col_names=F)%>%
  cbind(sample.names.plants)%>%
  mutate(Sample=str_remove_all(value,"(?<=\\d).txt"),.keep="unused")

[1mRows: [22m[34m104[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


#### Checkpoint: Testing that order in both lists match the same samples

In [46]:
head(myfiles.plants[[5]])
head(filenamesPlants[5,2])

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>
1,AAA26278_1,0,1,242
2,AAA62843_1,0,0,300
3,AAA62844_1,0,0,300
4,AAB33952_1,0,0,33
5,AAB82613_1,0,0,300
6,AAC38200_1,0,0,300


## 4.3. Add a column with sample name and study name to each table in tmp list

In [47]:
#Define list where lists with sample name will be stored. This is an empty vector
vector.plants=1:length(myfiles.plants)
myfiles2.plants = vector("list",length(myfiles.plants)) 

#Create loop to add sample name to each list
for (i in vector.plants) {
  myfiles2.plants[[i]]=mutate(myfiles.plants[[i]],Sample=filenamesPlants[i,2]) #En cada posicion de myfiles2 aplicar esta function
}

head(myfiles2.plants[[5]]) #checkpoint

#Create another empty list to store the study name to the above lists
PlantsSamples = vector("list",length(myfiles2.plants))

#Loop to add Study name
for (i in vector.plants) {
  PlantsSamples[[i]]=mutate(myfiles2.plants[[i]],StudyName="Plants")
  
}


Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>
1,AAA26278_1,0,1,242,ERR2114808
2,AAA62843_1,0,0,300,ERR2114808
3,AAA62844_1,0,0,300,ERR2114808
4,AAB33952_1,0,0,33,ERR2114808
5,AAB82613_1,0,0,300,ERR2114808
6,AAC38200_1,0,0,300,ERR2114808


## 4.4. Create features table
Protein family (rows) and samples (columns)

In [48]:
#Store all tables in one big table
PlantsSamples2=bind_rows(PlantsSamples)
head(PlantsSamples2)

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample,StudyName
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>,<chr>
1,AAA26278_1,0,0,242,ERR2114804,Plants
2,AAA62843_1,0,0,300,ERR2114804,Plants
3,AAA62844_1,0,0,300,ERR2114804,Plants
4,AAB33952_1,0,0,33,ERR2114804,Plants
5,AAB82613_1,0,0,300,ERR2114804,Plants
6,AAC38200_1,0,0,300,ERR2114804,Plants


In [50]:
#Create the observations matrix with the protein family (rows) and samples (columns)
PlantsSamples3 = pivot_wider(data=PlantsSamples2,id_cols=Family,names_from=Sample,values_from=Count)%>%
  mutate(Family=str_replace_all(Family,"(?<=\\d)_",".")) # This regex "(?<=\\d)_" means every underscore (_)

In [53]:
head(PlantsSamples3)

Family,ERR2114804,ERR2114805,ERR2114806,ERR2114807,ERR2114808,ERR2114809,ERR2114810,ERR2114811,ERR2114812,⋯,SRR6436017,SRR6436020,SRR6436021,SRR6489859,SRR6942504,SRR6942514,SRR6942518,SRR6942519,SRR6982567,SRR6982699
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AAA26278.1,0,0,0,0,0,0.0,0,0,0,⋯,0.0,0.0,0.0,0.0,0,0,0,0,0,0
AAA62843.1,0,0,0,0,0,0.0,0,0,0,⋯,0.0,0.0,0.0,0.0,0,0,0,0,0,0
AAA62844.1,0,0,0,0,0,0.05395261,0,0,0,⋯,0.06688848,0.04865381,0.05925485,0.039522,0,0,0,0,0,0
AAB33952.1,0,0,0,0,0,0.0,0,0,0,⋯,0.0,0.0,0.0,0.0,0,0,0,0,0,0
AAB82613.1,0,0,0,0,0,0.0,0,0,0,⋯,0.0,0.0,0.0,0.0,0,0,0,0,0,0
AAC38200.1,0,0,0,0,0,0.0,0,0,0,⋯,0.0,0.0,0.0,0.0,0,0,0,0,0,0


# 5. MGnify datasets

##  5.1. Import lists

In [58]:
setwd("/tmp/global2/aborbon/shortbred_quantify/quantify_mgnify/results_R/")
tmp.mg=list.files(path = "/tmp/global2/aborbon/shortbred_quantify/quantify_mgnify/results_R/" ,pattern = "*.txt")
myfiles.mg = lapply(tmp.mg, read.delim)

## 5.2. Sample names

In [59]:
sample.names.mg <- as_tibble(sapply(strsplit(basename(tmp.mg), "quantify"), `[`, 2))%>%
  filter(!is.na(value))

In [60]:
filenamesMg=read_tsv("/tmp/global2/aborbon/shortbred_quantify/quantify_mgnify/filenames_mgnify.tsv",col_names=F)%>%
  cbind(sample.names.mg)%>%
  mutate(Sample=str_remove_all(value,"(?<=\\d).txt"),.keep="unused")

[1mRows: [22m[34m286[39m [1mColumns: [22m[34m1[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (1): X1

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


#### SANITY CHECK: Testing that order in both lists match the same samples

In [61]:
head(myfiles.mg[[5]])
head(filenamesMg[5,2])

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>
1,AAA26278_1,0,0,242
2,AAA62843_1,0,0,300
3,AAA62844_1,0,0,300
4,AAB33952_1,0,0,33
5,AAB82613_1,0,0,300
6,AAC38200_1,0,0,300


## 5.3. Add a column with sample name and study name to each table in tmp list

In [62]:
#Define list where lists with sample name will be stored. This vector is empty, will be filled with the loop in next step.
vector.mg=1:length(myfiles.mg)
myfiles2.mg = vector("list",length(myfiles.mg)) 

#Create loop to add sample name to each list
for (i in vector.mg) {
  myfiles2.mg[[i]]=mutate(myfiles.mg[[i]],Sample=filenamesMg[i,2]) #En cada posicion de myfiles2 aplicar esta function
}

head(myfiles2.mg[[5]]) #checkpoint

#Create another empty list to store the study name to the above lists
MgSamples = vector("list",length(myfiles2.mg))

#Loop to add Study name
for (i in vector.mg) {
  MgSamples[[i]]=mutate(myfiles2.mg[[i]],StudyName="Mgnify")
  
}

head(MgSamples[[96]])

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>
1,AAA26278_1,0,0,242,DRS034404
2,AAA62843_1,0,0,300,DRS034404
3,AAA62844_1,0,0,300,DRS034404
4,AAB33952_1,0,0,33,DRS034404
5,AAB82613_1,0,0,300,DRS034404
6,AAC38200_1,0,0,300,DRS034404


Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample,StudyName
Unnamed: 0_level_1,<chr>,<dbl>,<int>,<int>,<chr>,<chr>
1,AAA26278_1,0,0,242,ERS1579941,Mgnify
2,AAA62843_1,0,0,300,ERS1579941,Mgnify
3,AAA62844_1,0,0,300,ERS1579941,Mgnify
4,AAB33952_1,0,0,33,ERS1579941,Mgnify
5,AAB82613_1,0,0,300,ERS1579941,Mgnify
6,AAC38200_1,0,0,300,ERS1579941,Mgnify


## 5.4. Create features table
Protein family (rows) and samples (columns)

In [63]:
#Store all tables in one big table
MgSamples2=bind_rows(MgSamples)

In [64]:
#Create the observations matrix with the protein family (rows) and samples (columns)
MgSamples3 = pivot_wider(data=MgSamples2,id_cols=Family,names_from=Sample,values_from=Count)%>%
  mutate(Family=str_replace_all(Family,"(?<=\\d)_",".")) # This regex "(?<=\\d)_" means every underscore (_)

In [65]:
head(MgSamples3)

Family,DRS034400,DRS034401,DRS034402,DRS034403,DRS034404,DRS034405,ERS1030529,ERS1030531,ERS1030532,⋯,SRS697339,SRS697340,SRS697341,SRS697342,SRS697346,SRS697347,SRS697348,SRS963313,SRS963594,SRS963627
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AAA26278.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0
AAA62843.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0
AAA62844.1,0,0,0,0,0,0,0.1333889,0.04741833,0.1484504,⋯,0.0290513,0.02566416,0.07860821,0.06148927,0.03277108,0.06399751,0.0362305,0,0,0.03910346
AAB33952.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0
AAB82613.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0
AAC38200.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,0,0.0


# 6. Create final features table
This table includes all datasets

In [141]:
AllSamples2=bind_rows(MgSamples2,VertSamples2,amatoSamples2,PlantsSamples2,MuehlSamples2)
AllSamples2[is.na(AllSamples2)] <- 0 #Replace NAs wth 0 
#AllSamples2_sub=distinct(AllSamples2,Sample,.keep_all=T)

In [142]:
head(AllSamples2)

Unnamed: 0_level_0,Family,Count,Hits,TotMarkerLength,Sample,StudyName
Unnamed: 0_level_1,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<chr>
1,AAA26278_1,0,0,242,DRS034400,Mgnify
2,AAA62843_1,0,0,300,DRS034400,Mgnify
3,AAA62844_1,0,0,300,DRS034400,Mgnify
4,AAB33952_1,0,0,33,DRS034400,Mgnify
5,AAB82613_1,0,0,300,DRS034400,Mgnify
6,AAC38200_1,0,0,300,DRS034400,Mgnify


## 6.1. Final features table - RPKM (dbl)

In [72]:
AllSamples3=pivot_wider(data=AllSamples2,id_cols=Family,names_from=Sample,values_from=Count)%>%
  mutate(Family=str_replace_all(Family,"(?<=\\d)_",".")) # This regex "(?<=\\d)_" means every underscore (_)

In [73]:
head(AllSamples3)

Family,DRS034400,DRS034401,DRS034402,DRS034403,DRS034404,DRS034405,ERS1030529,ERS1030531,ERS1030532,⋯,36UNCULTURED_S10,400UNCULTURED_S11,40UNCULTURED_S15,4UNCULTURED_S2,500UNCULTURED_S13,8UNCULTURED_S5,lucaAR31_S16,lucaAR32_S17,lucaAR33_S18,lucaAR34_S19
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AAA26278.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0
AAA62843.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0
AAA62844.1,0,0,0,0,0,0,0.1333889,0.04741833,0.1484504,⋯,0,0.05441838,0,0,0,0,0.02146873,0,0,0
AAB33952.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0
AAB82613.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0
AAC38200.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0


In [149]:
write_tsv(AllSamples3,"/ebio/abt3_projects2/Flagellin_Diversity/code/notebooks/shortbred/matrixAll.tsv")

## 6.2. Final features table - Counts (int)

In [144]:
AllSamples3_count=pivot_wider(data=AllSamples2,id_cols=Family,names_from=Sample,values_from=Hits)%>%
  mutate(Family=str_replace_all(Family,"(?<=\\d)_",".")) # This regex "(?<=\\d)_" means every underscore (_)

In [145]:
head(AllSamples3_count)

Family,DRS034400,DRS034401,DRS034402,DRS034403,DRS034404,DRS034405,ERS1030529,ERS1030531,ERS1030532,⋯,36UNCULTURED_S10,400UNCULTURED_S11,40UNCULTURED_S15,4UNCULTURED_S2,500UNCULTURED_S13,8UNCULTURED_S5,lucaAR31_S16,lucaAR32_S17,lucaAR33_S18,lucaAR34_S19
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AAA26278.1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AAA62843.1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AAA62844.1,0,0,0,0,0,0,3,1,3,⋯,0,2,0,0,0,0,1,0,0,0
AAB33952.1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AAB82613.1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0
AAC38200.1,0,0,0,0,0,0,0,0,0,⋯,0,0,0,0,0,0,0,0,0,0


In [148]:
write_tsv(AllSamples3_count,"/ebio/abt3_projects2/Flagellin_Diversity/code/notebooks/shortbred/matrixAll_count.tsv")

# 7. Phyloseq objects for diversity analyses
## 7.1. Create OTU table.  

In [77]:
otuAllSamples = AllSamples3 %>%
  rename(Accession="Family")
head(otuAllSamples)

Accession,DRS034400,DRS034401,DRS034402,DRS034403,DRS034404,DRS034405,ERS1030529,ERS1030531,ERS1030532,⋯,36UNCULTURED_S10,400UNCULTURED_S11,40UNCULTURED_S15,4UNCULTURED_S2,500UNCULTURED_S13,8UNCULTURED_S5,lucaAR31_S16,lucaAR32_S17,lucaAR33_S18,lucaAR34_S19
<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,⋯,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
AAA26278.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0
AAA62843.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0
AAA62844.1,0,0,0,0,0,0,0.1333889,0.04741833,0.1484504,⋯,0,0.05441838,0,0,0,0,0.02146873,0,0,0
AAB33952.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0
AAB82613.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0
AAC38200.1,0,0,0,0,0,0,0.0,0.0,0.0,⋯,0,0.0,0,0,0,0,0.0,0,0,0


## 7.2. Create samples tables

### 7.2.1. Primates datasets

In [78]:
mappingPrimates=read_tsv("/ebio/abt3_projects/small_projects/aborbon/TLR5/Primates_Amanda/R_Analyses/primateDatasets_metadata.txt",col_names=T)%>%
  mutate(biome_0="root")%>%
  mutate(biome_1="Host-associated")%>%
  mutate(biome_2="Mammalia")%>%
  mutate(biome_3="Primates")%>%
  mutate(biome_4="fecal")%>%
  rename(Dataset="StudyName")%>%
  rename(family="Family")%>%
  select(Sample,Dataset,biome_0,biome_1,biome_2,biome_3,biome_4,family) #n=114

[1mRows: [22m[34m114[39m [1mColumns: [22m[34m25[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (23): Sample, StudyName, body_habitat, body_product, body_site, collecti...
[32mdbl[39m  (1): host_taxid
[33mlgl[39m  (1): Wild

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


### 7.2.2. Vertebrates datasets

In [79]:
mappingVert=read_tsv("/ebio/abt3_projects/Georg_animal_feces/data/mapping/unified_metadata_complete_200429.tsv",col_names=T) %>% #n=323
  rename(Sample="SampleID")%>%
  inner_join(count(VertSamples2,Sample),by="Sample")%>%
  mutate(Dataset="Vertebrata")%>%
  mutate(biome_0="root")%>%
  mutate(biome_1="Host-associated")%>%
  rename(biome_2="class")%>%
  rename(biome_3="order")%>%
  rename(biome_4="sample_type")%>%
  rename(family="family")%>%
  select(Sample,Dataset,biome_0,biome_1,biome_2,biome_3,biome_4,family)

[1mRows: [22m[34m475[39m [1mColumns: [22m[34m72[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (62): SampleID, host_subject_id, host_german_name, host_common_name, sci...
[32mdbl[39m  (9): host_taxid, age, No_individuals, indiv_coreset, duplicates_group, ...
[33mlgl[39m  (1): host_body_temp

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


### 7.2.3. MGnify datasets

In [80]:
mappingMg=read_tsv("/tmp/global2/aborbon/public_metagenomes.txt",col_names=T)%>% #n=311
  rename(Dataset="MGnify_Study_ID")%>%
  rename(Sample="ID")%>%
  select(Sample,Dataset,biome_0,biome_1,biome_2,biome_3,biome_4,family)%>%
  filter(biome_2!="Plants")

[1mRows: [22m[34m516[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (14): ID, SRA_accession, Biosample, Accession, MGnify_Study_ID, biome_0,...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


### 7.2.4. Plants datasets

In [81]:
mappingPlants=read_tsv("/tmp/global2/aborbon/public_metagenomes.txt",col_names=T)%>% #n=104
  rename(Sample="ID")%>%
  inner_join(count(PlantsSamples2,Sample),by="Sample")%>%
  rename(Dataset="MGnify_Study_ID")%>%
  select(Sample,Dataset,biome_0,biome_1,biome_2,biome_3,biome_4,family)

[1mRows: [22m[34m516[39m [1mColumns: [22m[34m14[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m "\t"
[31mchr[39m (14): ID, SRA_accession, Biosample, Accession, MGnify_Study_ID, biome_0,...

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


### 7.2.5. Final samples tables - all datasets

In [82]:
mappingAll=rbind(mappingMg,mappingVert,mappingPrimates,mappingPlants)%>%
  mutate(biome_2=str_replace(biome_2,"Mammals","Mammalia"))%>%
  mutate(biome_2=str_replace(biome_2,"Fish","Actinopterygii"))

In [84]:
head(mappingAll)
nrow(mappingAll)

Sample,Dataset,biome_0,biome_1,biome_2,biome_3,biome_4,family
<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>,<chr>
ERS233527,MGYS00000320,root,Environmental,Aquatic,Marine,Intertidal zone,
ERS233526,MGYS00000320,root,Environmental,Aquatic,Marine,Intertidal zone,
ERS233524,MGYS00000320,root,Environmental,Aquatic,Marine,Intertidal zone,
ERS233525,MGYS00000320,root,Environmental,Aquatic,Marine,Intertidal zone,
ERS612894,MGYS00000447,root,Environmental,Aquatic,Marine,Intertidal zone,
ERS612888,MGYS00000447,root,Environmental,Aquatic,Marine,Intertidal zone,


In [None]:
write_tsv(mappingAll,"/tmp/global2/aborbon/metadataAllSamples.tsv")

In [154]:
count(subset(mappingAll,biome_1=="Environmental"),biome_3)

biome_3,n
<chr>,<int>
Freshwater,28
Marine,75
Non-marine Saline and Alkaline,11
Soil,52
