## Load libraries

In [None]:
library(tidyverse)
library(stringr)
library(tidyr)
library(dplyr)
library(readr)
library(taxonomizr)

## Import accessions of the flagellin database

In [None]:
# Read the accession list of the flagellin database. The file has no header
# row, so the first column arrives as V1 and is renamed to `Accession`.
accn.full.db <- read.delim(
  file = "/ebio/abt3_projects/small_projects/aborbon/TLR5/Dalong_Flagellin/FinalDatabase/accessions_curatedFlagellin_pfam_panther_noflgl.txt",
  header = FALSE
) %>%
  rename(Accession = V1)

## Import GTDB metadata and taxonomy files

In [None]:
# GTDB version 202: bacterial metadata table (read with its header row).
gtdb_bac_metadata_v202 <- read.delim(
  file = "/ebio/abt3_projects/small_projects/aborbon/TLR5/Dalong_Flagellin/gtdb_v202/bac120_metadata_r202.tsv"
)

# GTDB version 202: bacterial taxonomy table. The file has no header; the
# first column is the genome accession and the second a ";"-delimited lineage
# string, which is split into one column per rank.
gtdb_bac.tax_v202 <- read.delim(
  file = "/ebio/abt3_projects/small_projects/aborbon/TLR5/Dalong_Flagellin/gtdb_v202/bac120_taxonomy_r202.tsv",
  header = FALSE
) %>%
  rename(accession = V1) %>%
  separate(
    col = V2,
    into = c("Domain", "Phylum", "Class", "Order", "Family", "Genus", "Species"),
    sep = ";"
  )

# Merge taxonomy and metadata files.
#
# The archaeal taxonomy table (gtdb_arc.tax_v202) is not imported anywhere in
# the visible part of this notebook. Referencing it unconditionally makes the
# cell fail before the taxid lookup table below is created, so it is only
# appended when it actually exists in the session.
gtdb_taxonomy.v202 <- if (exists("gtdb_arc.tax_v202")) {
  rbind(gtdb_bac.tax_v202, gtdb_arc.tax_v202) # n=258406
} else {
  gtdb_bac.tax_v202
}

# Lookup table: GTDB lineage + NCBI taxid for every bacterial GTDB genome.
# The join key is spelled out instead of relying on the implicit natural join.
# NOTE(review): assumes `accession` is the only column shared by the taxonomy
# and metadata tables (standard GTDB metadata layout) - confirm.
taxid.to.gtdb_taxonomy.v202 <- gtdb_bac.tax_v202 %>%
  inner_join(gtdb_bac_metadata_v202, by = "accession") %>%
  select(
    Domain, Phylum, Class, Order, Family, Genus, Species,
    ncbi_taxid, accession
  )


## Obtain NCBI taxonomic annotation for the flagellin database - taxonomizr

In [None]:
#1. Get taxids with taxonomizr
#n=33051
# NOTE(review): this lookup reads the SQLite database under "abt3_projects2",
# while the taxonomy lookup below reads it under "abt3_projects". Both paths
# are kept exactly as they were - confirm they point to the same database.
taxids.full.db <- accessionToTaxa(
  as.character(accn.full.db$Accession),
  "/ebio/abt3_projects2/databases_no-backup/NCBI_accession2taxid/accessionTaxa.sql",
  version = "version"
)

#2. Assign taxonomy
#Output: 29036
# Fetch all ranks with a single getTaxonomy() call instead of one database
# query per rank. getTaxonomy() returns a character matrix with one column per
# requested rank; converting it to plain columns avoids storing one-column
# matrices inside the data frame.
ncbi_ranks <- c("phylum", "class", "order", "family", "genus", "species")
ncbi_lineage <- getTaxonomy(
  taxids.full.db,
  "/ebio/abt3_projects/databases_no-backup/NCBI_accession2taxid/accessionTaxa.sql",
  desiredTaxa = ncbi_ranks
)
# Row names are the (possibly repeated) taxids; drop them so the conversion
# to a data frame does not have to deal with duplicated row names.
rownames(ncbi_lineage) <- NULL
colnames(ncbi_lineage) <- paste0("ncbi_", ncbi_ranks)

taxonomy.ncbi.full.db <- accn.full.db %>%
  mutate(ncbi_taxid = taxids.full.db) %>%
  bind_cols(as.data.frame(ncbi_lineage, stringsAsFactors = FALSE)) %>%
  # Keep only accessions that resolved to at least a phylum and a genus.
  filter(!is.na(ncbi_phylum) & !is.na(ncbi_genus))


## Obtain GTDB taxonomic annotation for the flagellin database

In [None]:

#1. Generate a big annotation file that includes NCBI and matching GTDB taxonomy for each accession
#Accessions without a matching GTDB taxonomy will be discarded
#
# The join is on the NCBI taxid. When several GTDB genomes carry the same
# taxid, the join produces one row per genome; distinct() then keeps the first
# GTDB row for every (ncbi_taxid, Accession) pair. Listing both columns in
# distinct() gives the same rows as grouping by ncbi_taxid first, but does not
# leave the result grouped for the downstream steps.
taxonomy.gtdb.full.db <- taxonomy.ncbi.full.db %>%
  inner_join(taxid.to.gtdb_taxonomy.v202, by = "ncbi_taxid") %>%
  distinct(ncbi_taxid, Accession, .keep_all = TRUE) %>%
  filter(!is.na(accession))

nrow(taxonomy.gtdb.full.db) #n=24915

#Export accession numbers to filter the fasta file
# The output file is passed positionally: the `path` argument name is
# deprecated in newer readr releases in favour of `file`, and the positional
# form works with both.
write_lines(
  taxonomy.gtdb.full.db$Accession,
  "/ebio/abt3_projects/small_projects/aborbon/TLR5/Dalong_Flagellin/flic_flab_search/accessions_fulldb_taxonomy_gtdb.txt"
)


## Create metadata table of flagellins

In [None]:
# Attach the GTDB metadata of the matched genome to every flagellin accession.
# The original call referenced `gtdb_metadata.v202`, which is not defined in
# the visible part of this notebook; the metadata table that is actually
# loaded is `gtdb_bac_metadata_v202`.
# Both inputs carry an `ncbi_taxid` column, so the result holds it twice with
# the default `.x` / `.y` suffixes.
metadata.gtdb.full.db <- inner_join(
  taxonomy.gtdb.full.db,
  gtdb_bac_metadata_v202,
  by = "accession"
)
# NOTE(review): the count below was recorded in the original notebook; re-check
# it after re-running.
nrow(metadata.gtdb.full.db) #n=27207

In [None]:
# Flagellin taxonomy table extended with the GTDB metadata columns.
# Rows are matched on the NCBI taxid and every flagellin row is kept, whether
# or not a metadata row matches.
taxonomy.full.db.gtdb <- taxonomy.gtdb.full.db %>%
  left_join(gtdb_bac_metadata_v202, by = "ncbi_taxid")

# Number of rows after the join
nrow(taxonomy.full.db.gtdb)


## Taxonomy count per phylum

In [None]:

# Number of flagellin accessions per GTDB phylum, most frequent phylum first.
count_phylum.full.db <- taxonomy.gtdb.full.db %>%
  group_by(Phylum) %>%
  count(sort = TRUE)

# Display the table
count_phylum.full.db

  

## Sources

In [None]:
# Keep the accessions that are also present in the metadata-extended table.
# The comparison must be made against the `Accession` column: `%in%` against
# the whole data frame matches against its columns as list elements and
# therefore never matches, which silently returned zero rows.
taxonomy.gtdb.full.db %>%
  filter(Accession %in% taxonomy.full.db.gtdb$Accession)

## Plot taxonomy by phylum

In [None]:
# The original cell started by assigning an unfinished ggplot (mapping a
# `Family` column that does not exist in the phylum count table) to
# `taxonomy.gtdb.full.db`, which replaced the taxonomy table with a plot
# object. That stray assignment is removed so the table stays intact.
#
# The colour scales come from the ggsci package, which is not attached in the
# library cell, so they are called with an explicit namespace.

# Pie chart: share of flagellin accessions per phylum.
# The legend title is blanked by theme(), so no separate fill label is set.
# NOTE(review): the ggsci palettes have a limited number of colours - confirm
# they cover the number of phyla in the table.
ggplot(
  data = count_phylum.full.db,
  aes(x = "", y = n, fill = reorder(Phylum, -n))
) +
  geom_bar(stat = "identity", width = 1, color = "white") +
  coord_polar("y", start = 0) +
  ggsci::scale_fill_npg(name = "Phylum") +
  theme_void() +
  theme(legend.title = element_blank())

# Phylum counts ordered from most to least frequent.
top.fla <- count_phylum.full.db %>%
  arrange(desc(n))

# Lollipop chart: one point per phylum, joined to the axis by a segment.
ggplot(data = top.fla, aes(x = n, y = reorder(Phylum, -n), color = Phylum)) +
  geom_point() +
  geom_segment(aes(x = 0, xend = n, yend = Phylum, y = Phylum), size = 1) +
  labs(x = "Count", y = "Phylum") +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, hjust = 1, size = 11),
    axis.text.y = element_text(size = 8)
  ) +
  ggsci::scale_color_lancet()
  