This notebook searches GTDB-Tk reference genomes for gyrase B (gyrB) sequences so that species-level taxonomy can be assigned to gyrB amplicon data. Genomes are annotated with Prodigal, and gyrB genes are then identified with hmmsearch using the TIGR01059 HMM profile from the GTDB-Tk `individual_hmms` folder. The nucleotide and protein sequences of the best hits are then written to FASTA files. These FASTA files will be used as reference databases for IDTaxa. 

Inputs: genome fastas, TIGR01059 gyrb HMM profile

Outputs: a BLAST database of full-length gyrB sequences used to assign taxonomy, and
additional FASTA files containing either only Bacteroidales sequences or Bacteroidales sequences plus outgroups 

In [3]:
import os
import shutil

import pandas as pd
from Bio import Seq
from Bio import SeqIO

# All paths below are relative to the project root, so switch into it first.
# NOTE: this is a machine-specific absolute path; adjust it when running elsewhere.
os.chdir('/Volumes/AHN/captive_ape_microbiome')

#define input/output folders (relative to the project root)
indir = 'data/gyrb/ref_gyrb_gtdbtk/GTDBTK_db'
outdir = 'results/gyrb/processing/Bacteroidetes_GTDBTK_ref'

# Create the output sub-folders. os.makedirs(exist_ok=True) is used instead of
# shelling out to `mkdir -pv`: it needs no shell, copes with any characters in
# the path, and raises an OSError on failure instead of returning an exit code
# that nobody checks.
for subfolder in ('hmm_out', 'prodigal', 'gyrb_seqs', 'gyrb_fastas'):
    os.makedirs(f'{outdir}/{subfolder}', exist_ok=True)

0

In [4]:
# Load the GTDB taxonomy table: one genome accession and one lineage string per row
taxonomy_path = f'{indir}/taxonomy/gtdb_taxonomy.tsv'
tax = pd.read_csv(taxonomy_path, sep='\t', header=None)
tax.columns = ["gtdbtk_genome", "taxonomy"]

# The NCBI accession is everything after the first underscore of the GTDB-Tk name
accession_parts = tax['gtdbtk_genome'].str.split('_', n=1, expand=True)
tax['ncbi_genome'] = accession_parts.loc[:, 1]

# Lookup tables: GTDB-Tk name -> NCBI accession, NCBI accession -> lineage string
gtdbtk_to_ncbi = dict(zip(tax['gtdbtk_genome'], tax['ncbi_genome']))
ncbi_to_tax = dict(zip(tax['ncbi_genome'], tax['taxonomy']))


def _split_lineage(lineage):
    """Split a semicolon-delimited lineage string into one value per rank."""
    return pd.Series(str(lineage).split(";"))


# One column per taxonomic rank
rank_names = ['Domain', 'Phylum', 'Class', 'Order', 'Family', 'Genus', 'Species']
tax[rank_names] = tax['taxonomy'].apply(_split_lineage)


In [5]:
# Flag the rows whose lineage string mentions Bacteroidota
is_bacteroidota = tax['taxonomy'].str.contains('Bacteroidota')

# Bacteroidetes genomes
Bt = tax[is_bacteroidota]
print(len(Bt),'Bacteroidetes genomes in GTDBTK database')

# All remaining genomes, reduced to the first listed genome of each Order
not_Bt = tax[~is_bacteroidota]
not_Bt_reps = not_Bt.groupby('Order').head(1)
print(len(not_Bt_reps),'Other bacterial orders in GTDBTK database')

# Reference set = Bacteroidetes genomes + one representative per other Order
Bt_plusreps = pd.concat([Bt, not_Bt_reps])
print(len(Bt_plusreps),'Total genomes in GTDBTK database added to assign taxonomy')

2843 Bacteroidetes genomes in GTDBTK database
903 Other bacterial orders in GTDBTK database
3746 Total genomes in GTDBTK database added to assign taxonomy


### Annotate GTDB-Tk Bacteroidota genomes and representative genomes from other orders

In [5]:
# Accessions of the genome FASTAs present in the GTDB-Tk fastani database folder
genome_files = []
for fname in os.listdir(f'{indir}/fastani/database/'):
    genome_files.append(fname.split('_genomic.fna.gz')[0])

# Keep only the selected genomes that actually have a FASTA
# (several reps listed in the metadata have no genome in the GTDBTK folder)
has_fasta = Bt_plusreps['ncbi_genome'].isin(genome_files)
Bt_plusreps = Bt_plusreps[has_fasta]
print(len(Bt_plusreps),'genomes')

# A genome counts as annotated once its Prodigal protein file (.faa) exists
annotated_genomes = []
for fname in os.listdir(f'{outdir}/prodigal'):
    if fname.endswith('.faa'):
        annotated_genomes.append(fname.split('_genomic')[0])
print(len(annotated_genomes),'genomes already annotated')

# Whatever is selected but not yet annotated still has to be processed
genomes_to_do = set(Bt_plusreps['ncbi_genome']).difference(annotated_genomes)
print(len(genomes_to_do),'genomes to annotate')

3738 genomes
3738 genomes already annotated
0 genomes to annotate


In [6]:
def prodigal(genome):
    """Annotate one genome with Prodigal and search its proteins for gyrB.

    The gzipped genome FASTA ``{indir}/fastani/database/{genome}_genomic.fna.gz``
    is decompressed in place, Prodigal writes the predicted genes to
    ``{outdir}/prodigal`` (nucleotide ``.fna`` and protein ``.faa``), hmmsearch
    scans the proteins with the TIGR01059 profile and writes its table to
    ``{outdir}/hmm_out/{genome}.txt``, and the genome FASTA is compressed again.

    Args:
        genome: file-name stem of the genome, i.e. the part before
            ``_genomic.fna.gz``.

    Returns:
        None. Failures of the external tools are reported with a printed
        warning rather than an exception, so a batch loop keeps going.
    """
    genome_fna = f'{indir}/fastani/database/{genome}_genomic.fna'
    genes_fna = f'{outdir}/prodigal/{genome}_genomic.fna'
    proteins_faa = f'{outdir}/prodigal/{genome}_genomic.faa'

    os.system(f'gunzip {genome_fna}.gz')
    # gunzip's exit status is deliberately not used: the FASTA may already be
    # decompressed, in which case gunzip fails but the run can still proceed.
    # What matters is whether the uncompressed file is there.
    if not os.path.exists(genome_fna):
        print(f'WARNING: {genome_fna} not found, skipping {genome}')
        return

    status = os.system(f'prodigal -i {genome_fna} -d {genes_fna} -a {proteins_faa}')
    if status == 0:
        status = os.system(f'hmmsearch --tblout {outdir}/hmm_out/{genome}.txt {indir}/markers/tigrfam/individual_hmms/TIGR01059.HMM {proteins_faa}')
        if status != 0:
            print(f'WARNING: hmmsearch failed for {genome} (exit status {status})')
    else:
        print(f'WARNING: prodigal failed for {genome} (exit status {status})')
        # Remove partial outputs: the notebook treats an existing .faa as
        # "already annotated", so a leftover file would prevent a retry.
        for partial_output in (genes_fna, proteins_faa):
            if os.path.exists(partial_output):
                os.remove(partial_output)

    # Always restore the compressed genome, whatever happened above
    os.system(f'gzip {genome_fna}')

In [7]:
# Run Prodigal + hmmsearch on every genome that has no annotation yet
for pending_genome in genomes_to_do:
    prodigal(pending_genome)

### Identify best gyrb hit and output fasta

In [8]:
def get_besthit(genome, hmm_dir=None):
    """Return the top hmmsearch hit recorded for a genome.

    Reads the hmmsearch ``--tblout`` table ``{hmm_dir}/{genome}.txt`` and takes
    its first data row as the best hit.

    Args:
        genome: file-name stem of the hmmsearch table (without ``.txt``).
        hmm_dir: folder holding the hmmsearch tables. Defaults to
            ``{outdir}/hmm_out``.

    Returns:
        pd.Series ``[name, evalue]`` taken from columns 0 and 4 of the first
        row, or ``['no_hit', 'NA']`` when the table is missing, contains no
        data rows, or cannot be parsed.
    """
    if hmm_dir is None:
        hmm_dir = f'{outdir}/hmm_out'
    try:
        # sep=r'\s+' is the supported spelling of the deprecated
        # delim_whitespace=True; '#' lines (and trailing '#' text) are ignored.
        df = pd.read_csv(f'{hmm_dir}/{genome}.txt', sep=r'\s+', header=None, comment='#')
        besthit = df.iloc[0]
        name = besthit[0]
        evalue = besthit[4]
        return pd.Series([name, evalue])
    except (OSError, pd.errors.EmptyDataError, pd.errors.ParserError, IndexError, KeyError):
        # Only the expected "no usable table" failures are mapped to no_hit.
        # A bare except would also hide KeyboardInterrupt and programming
        # errors (e.g. an undefined name) and report every genome as no_hit.
        return pd.Series(['no_hit', 'NA'])
# Smoke test: parse the hmmsearch table of one known genome and show the result
print('test parsing hmmsearch file')
smoke_test_hit = get_besthit('GCF_002849695.1')
print(smoke_test_hit)
   
def write_fasta(genome, besthit, min_len=1800):
    """Write a genome's best gyrB hit to its own FASTA file if it passes QC.

    The record with id ``besthit`` is looked up in the Prodigal gene file
    ``{outdir}/prodigal/{genome}_genomic.fna``, relabelled with the genome
    accession as id and the lineage from ``ncbi_to_tax`` as description, and
    written to ``{outdir}/gyrb_seqs/{genome}.fasta`` only when it passes both
    checks below.

    Args:
        genome: genome accession; also the key into ``ncbi_to_tax``.
        besthit: id of the gene record to extract.
        min_len: minimum nucleotide length for a hit to count as full length
            (default 1800).

    Returns:
        'partial_len' if the sequence is shorter than ``min_len``,
        'ambig_char' if it contains an 'N',
        'passing_hit' if it was written to disk.

    Raises:
        KeyError: if no record with id ``besthit`` exists in the gene file, or
            ``genome`` is not in ``ncbi_to_tax``.
    """
    genes_path = f"{outdir}/prodigal/{genome}_genomic.fna"

    # Stream the file and stop at the wanted record instead of loading every
    # predicted gene of the genome into a dict just to fetch one of them.
    with open(genes_path) as handle:
        besthit_record = next(
            (record for record in SeqIO.parse(handle, "fasta") if record.id == besthit),
            None,
        )
    if besthit_record is None:
        raise KeyError(f'{besthit} not found in {genes_path}')

    besthit_record.id = genome
    # Named `lineage` so the module-level `tax` DataFrame is not shadowed
    lineage = ncbi_to_tax[genome]
    besthit_record.description = lineage

    if len(besthit_record.seq) < min_len: #ensures gene is full length
        return 'partial_len'
    if 'N' in str(besthit_record.seq):
        return 'ambig_char'

    SeqIO.write(besthit_record, f"{outdir}/gyrb_seqs/{genome}.fasta", "fasta")
    return 'passing_hit'
# Smoke test: extract one known hit (the file it writes is removed again by the
# clean-up of gyrb_seqs further down)
write_fasta('GCF_002849695.1','NZ_CP018937.1_3992')

# Best hmmsearch hit per genome. get_besthit returns a two-element Series, so
# apply() yields a frame with columns 0 (hit name) and 1 (e-value).
# assign() builds a new frame instead of writing columns into a filtered slice,
# which avoids pandas' SettingWithCopyWarning and keeps the cell re-runnable.
besthits = Bt_plusreps['ncbi_genome'].apply(get_besthit)
Bt_plusreps = Bt_plusreps.assign(best_hit=besthits[0], evalue=besthits[1])

#filter out seqs with no hit
Bt_plusreps_hits = Bt_plusreps[Bt_plusreps['best_hit']!='no_hit']
print(len(Bt_plusreps_hits),'gyrB seqs found')

# keep only hits below the e-value threshold; .copy() gives an independent
# frame so the 'records' column can be added below without a warning
Bt_plusreps_hits = Bt_plusreps_hits[Bt_plusreps_hits['evalue'].astype(float)<float(1e-250)].copy()
print(len(Bt_plusreps_hits),'gyrB seqs passing eval threshold')

# Start from an empty gyrb_seqs folder so FASTAs left over from earlier runs
# (or from the smoke test above) are not picked up when the folder is concatenated
shutil.rmtree(f"{outdir}/gyrb_seqs", ignore_errors=True)
os.makedirs(f"{outdir}/gyrb_seqs", exist_ok=True)

# Write one FASTA per passing hit and record the QC outcome for every hit
Bt_plusreps_hits['records'] = Bt_plusreps_hits.apply(
                                lambda row: write_fasta(row['ncbi_genome'],row['best_hit']),
                                axis = 1)
print('summary of gyrb hits')
print(Bt_plusreps_hits['records'].value_counts())
print('summary of Bt gyrb hits')
print(Bt_plusreps_hits['records'][Bt_plusreps_hits['Order']=='o__Bacteroidales'].value_counts())

Bt_plusreps_hits.to_csv(f'{outdir}/Bt_plusreps_hits.txt',sep='\t',index=False)

0    NZ_CP018937.1_3992
1              3.5e-287
dtype: object
3636 gyrB seqs found
3277 gyrB seqs passing eval threshold
summary of gyrb hits
passing_hit    3172
ambig_char       91
partial_len      14
Name: records, dtype: int64
summary of Bt gyrb hits
passing_hit    891
ambig_char       7
partial_len      3
Name: records, dtype: int64


### concat gyrb fastas and translate

In [15]:
%%bash
# Stop at the first failing command: without this the cell's exit status is
# only that of the last command, so a failed cd or cat would go unnoticed
set -euo pipefail
cd results/gyrb/processing/Bacteroidetes_GTDBTK_ref

#concatenate the per-genome gyrb seqs into one nucleotide fasta
cat gyrb_seqs/*.fasta > gyrb_fastas/gtdbtk_gyrb.fasta

#translate the nucleotide seqs to protein seqs
transeq -sequence gyrb_fastas/gtdbtk_gyrb.fasta -outseq  gyrb_fastas/gtdbtk_gyrb.faa

Translate nucleic acid sequences


In [17]:
%%bash
# Stop at the first failing command so a failed copy cannot lead to a BLAST
# database being built from a stale ref_seqs/gtdbtk_gyrb.faa
set -euo pipefail

#cp nucleotide and protein fastas to ref seq folder
cp results/gyrb/processing/Bacteroidetes_GTDBTK_ref/gyrb_fastas/gtdbtk_gyrb.fasta ref_seqs/gtdbtk_gyrb.fasta
cp results/gyrb/processing/Bacteroidetes_GTDBTK_ref/gyrb_fastas/gtdbtk_gyrb.faa ref_seqs/gtdbtk_gyrb.faa

#build a protein BLAST database from the translated gyrb seqs
makeblastdb -in ref_seqs/gtdbtk_gyrb.faa -dbtype prot



Building a new DB, current time: 09/30/2020 12:13:58
New DB name:   /Volumes/AHN/captive_ape_microbiome/ref_seqs/gtdbtk_gyrb.faa
New DB title:  ref_seqs/gtdbtk_gyrb.faa
Sequence type: Protein
Deleted existing Protein BLAST database named /Volumes/AHN/captive_ape_microbiome/ref_seqs/gtdbtk_gyrb.faa
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 3172 sequences in 0.139764 seconds.
