The purpose of this code is to search GTDB-Tk reference genome for gyrase B sequences to be able to assign species-level taxonomy to gyraseB amplicon data. Genomes are annotated using Prodigal, then gyraseB sequences are filtered out using a HMM profile of gyrase B from the TIGR01059 profile in the individual hmms folder. Then nucleotide and protein sequences are output to fastas. These fastas will be used as reference databases for local blast searches used to assign taxonomy. 

In [2]:
import os
import pandas as pd
from Bio import SeqIO
from Bio import Seq

os.getcwd()
os.chdir('/Volumes/AHN/captive_ape_microbiome')


#create output folders
indir =  'data/gyrb/assign_taxonomy_gtdbtk/GTDBTK_db'
outdir = 'results/gyrb/processing/assign_taxonomy_gtdbtk'

os.system('mkdir -pv '+outdir+'/hmm_out')
os.system('mkdir -pv '+outdir+'/prodigal')
os.system('mkdir -pv '+outdir+'/gyrb_seqs')

0

In [3]:
#Read in taxonomy file
tax = pd.read_csv(f'{indir}/taxonomy/gtdb_taxonomy.tsv',sep='\t',header=None) 
tax.columns = ["gtdbtk_genome", "taxonomy"]
tax['ncbi_genome'] = tax['gtdbtk_genome'].str.split('_',n=1,expand=True).loc[:,1]
gtdbtk_to_ncbi = tax.set_index('gtdbtk_genome')['ncbi_genome'].to_dict()
ncbi_to_tax = tax.set_index('ncbi_genome')['taxonomy'].to_dict()
tax[['Domain','Phylum','Class','Order','Family','Genus','Species']] = tax.taxonomy.apply( 
   lambda x: pd.Series(str(x).split(";"))) 


In [4]:
#Select only Bacteroidales genomes
Bt = tax[tax['taxonomy'].str.contains('Bacteroidales')]
print(len(Bt),'Bacteroidales genomes in GTDBTK database')
#write out taxonomy file
Bt.to_csv(outdir+'/Bt_taxonomy.txt',index=False,sep='\t')


1029 Bacteroidales genomes in GTDBTK database


In [5]:
#Select representatives, one from each other Order
not_Bt = tax[~tax['taxonomy'].str.contains('Bacteroidales')]
not_Bt_reps = not_Bt.groupby('Order').head(1)
print(len(not_Bt_reps),'Other bacterial orders in GTDBTK database')
#write out taxonomy file
Bt_plusreps = pd.concat([Bt, not_Bt_reps])
Bt_plusreps.to_csv(outdir+'/Bt_plusreps_taxonomy.txt',index=False,sep='\t')
print(len(Bt_plusreps),'Total genomes in GTDBTK database added to assign taxonomy')

924 Other bacterial orders in GTDBTK database
1953 Total genomes in GTDBTK database added to assign taxonomy


### Annotate GTDB-Tk Bacteroidales and other taxonomic representative genomes

In [6]:
def prodigal(genome):
    #Annotate genomes using prodigal and identifies gyrb gene using a gtdbtk hmmprofile 
    os.system(f'gunzip {indir}/fastani/database/{genome}_genomic.fna.gz')
    os.system(f'prodigal -i {indir}/fastani/database/{genome}_genomic.fna -d {outdir}/prodigal/{genome}_genomic.fna -a {outdir}/prodigal/{genome}_genomic.faa')
    os.system(f'hmmsearch --tblout {outdir}/hmm_out/{genome}.txt {indir}/markers/tigrfam/individual_hmms/TIGR01059.HMM {outdir}/prodigal/{genome}_genomic.faa')
    os.system(f'gzip ref_seqs/GTDBTK_db/fastani/database/{genome}_genomic.fna')   
    
prodigal('GCF_002849695.1')

In [7]:
#list genomes with fastas
genome_files = [f.split('_genomic.fna.gz')[0] for f in os.listdir(f'{indir}/fastani/database/')]
#select only those genomes in Bt_plusreps
Bt_plusreps = Bt_plusreps[Bt_plusreps['ncbi_genome'].isin(genome_files)]
print(len(Bt_plusreps),'genomes') #there are several reps listed in metadata without genomes in the GTDBTK folder
#how many have already been annotated
annotated_genomes = [f.split('_genomic')[0] for f in os.listdir(f'{outdir}/prodigal') if f.endswith('.faa')]
# test run
#annotated_genomes = annotated_genomes[:-5]
print(len(annotated_genomes),'genomes already annotated')

genomes_to_do = set(Bt_plusreps['ncbi_genome'])-set(annotated_genomes) 
print(len(genomes_to_do),'genomes to annotate')
 

1944 genomes
1945 genomes already annotated
0 genomes to annotate


In [7]:
#Annotate genomes that haven't been annotated
for genome in genomes_to_do:
    prodigal(genome,outdir)             

### Identify best gyrb hit and output fasta

In [8]:
def get_besthit(genome):
    #parse hmmsearch result to return sequence header and evalue
    try:
        df = pd.read_csv(f'{outdir}/hmm_out/{genome}.txt',delim_whitespace=True,header=None,comment='#')
        besthit = df.iloc[0]
        name=besthit[0]
        evalue=besthit[4]
        return(pd.Series([name,evalue]))
    except:
        return(pd.Series(['no_hit','NA']))
#test 
print(get_besthit('GCF_002849695.1'))
   
def write_fasta(genome,besthit):
    record_dict = SeqIO.to_dict(SeqIO.parse(f"{outdir}/prodigal/{genome}_genomic.fna", "fasta"))
    besthit_record = record_dict[besthit]
    besthit_record.id = genome
    besthit_record.description=ncbi_to_tax[genome]
    if len(besthit_record.seq) < 1800:
        return('partial_len')
    elif 'N' in str(besthit_record.seq): #ensures gene is fulllength
        return('ambig_char')
    else:
        SeqIO.write(besthit_record, f"{outdir}/gyrb_seqs/{genome}.fasta", "fasta")
        return('passing_hit')
#test
write_fasta('GCF_002849695.1','NZ_CP018937.1_3992')  

Bt_plusreps[['best_hit','evalue']] = Bt_plusreps['ncbi_genome'].apply(get_besthit)
#filter out seqs with no hit
Bt_plusreps_hits = Bt_plusreps[Bt_plusreps['best_hit']!='no_hit']
print(len(Bt_plusreps_hits),'gyrB seqs found')
Bt_plusreps_hits = Bt_plusreps_hits[Bt_plusreps_hits['evalue'].astype(float)<float(1e-250)]
print(len(Bt_plusreps_hits),'gyrB seqs passing eval threshold')

#remove fastas
os.system(f"rm -r {outdir}/gyrb_seqs/")
os.system(f"mkdir -pv {outdir}/gyrb_seqs/")

Bt_plusreps_hits['records'] = Bt_plusreps_hits.apply(
                                lambda row: write_fasta(row['ncbi_genome'],row['best_hit']),
                                axis = 1)
print('summary of gyrb hits')
print(Bt_plusreps_hits['records'].value_counts())
print('summary of Bt gyrb hits')
print(Bt_plusreps_hits['records'][Bt_plusreps_hits['Order']=='o__Bacteroidales'].value_counts())

Bt_plusreps_hits.to_csv(f'{outdir}/Bt_plusreps_hits.txt',sep='\t',index=False)

0    NZ_CP018937.1_3992
1              3.5e-287
dtype: object
1859 gyrB seqs found
1621 gyrB seqs passing eval threshold
summary of gyrb hits
passing_hit    1561
ambig_char       50
partial_len      10
Name: records, dtype: int64
summary of Bt gyrb hits
passing_hit    890
ambig_char       7
partial_len      3
Name: records, dtype: int64


### Align fasta, select only Bacteroidales/amplicon region, make blastdb

In [9]:
%%bash

#concatenate gyrb seqs to file
mkdir -pv results/gyrb_bt_gtdbtk_ref/alignment
cat results/gyrb_bt_gtdbtk_ref/gyrb_seqs/*.fasta > results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta

#translate, align aa, and uses a guide for nucleotide seqs
transeq -sequence results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta -outseq results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.faa
mafft --auto --quiet results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.faa > results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.faa.aln
tranalign -asequence  results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta -bsequence results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.faa.aln -outseq results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta.aln



Translate nucleic acid sequences
Generate an alignment of nucleic coding regions from aligned proteins


In [10]:
#select only Bacteroidales seqs and realign
with open('results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta') as original: 
    with open(f'results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.fasta', 'w') as Bt_only:
        records = SeqIO.parse(original, 'fasta')
        for record in records:
            if 'o__Bacteroidales' in record.description or 'p__Gemmatimonadota' in record.description or 'c__Chlorobia' in record.description:
                description = record.description.split(';')
                phylum = description[1]
                order = description[3]
                family = description[4]
                genus_sp = description[6].replace(' ','_')
                record.id = record.id + phylum + order + family + genus_sp
                record.description = ''
                if 'o__Bacteroidales' not in record.description:
                    print(record.id)
                SeqIO.write(record, Bt_only, 'fasta')

!transeq -sequence results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.fasta -outseq results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.faa
!mafft --auto --quiet results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.faa > results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.faa.aln
!tranalign -asequence  results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.fasta -bsequence results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.faa.aln -outseq results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.fasta.aln


GCA_000210075.1p__Bacteroidotao__Bacteroidalesf__Bacteroidaceaes__Bacteroides_xylanisolvens
GCA_000431015.1p__Bacteroidotao__Bacteroidalesf__UBA932s__RC9_sp000431015
GCA_000431155.1p__Bacteroidotao__Bacteroidalesf__Muribaculaceaes__Paramuribaculum_sp000431155
GCA_000431215.1p__Bacteroidotao__Bacteroidalesf__Muribaculaceaes__CAG-1031_sp000431215
GCA_000431275.1p__Bacteroidotao__Bacteroidalesf__Bacteroidaceaes__CAG-617_sp000431275
GCA_000432515.1p__Bacteroidotao__Bacteroidalesf__UBA932s__RC9_sp000432515
GCA_000432655.1p__Bacteroidotao__Bacteroidalesf__UBA932s__RC9_sp000432655
GCA_000432735.1p__Bacteroidotao__Bacteroidalesf__Bacteroidaceaes__Bacteroides_A_sp000432735
GCA_000432775.1p__Bacteroidotao__Bacteroidalesf__UBA932s__CAG-831_sp000432775
GCA_000433175.1p__Bacteroidotao__Bacteroidalesf__Bacteroidaceaes__Prevotella_sp000433175
GCA_000433355.1p__Bacteroidotao__Bacteroidalesf__UBA932s__RC9_sp000433355
GCA_000433715.1p__Bacteroidotao__Bacteroidalesf__Bacteroidaceaes__Bacteroides_A_plebei

Generate an alignment of nucleic coding regions from aligned proteins


In [17]:
#trim to amplicon region
def trim_aln_to_amp(original_fasta,trim_fasta,start_pos,length):
    with open(original_fasta) as original_fasta:
        with open(trim_fasta, 'w') as trim_fasta:
            records = SeqIO.parse(original_fasta, 'fasta')
            for record in records:
                record.seq = record.seq[start_pos:] #trim at position indicated by cutadapt
                record.seq = [ch for ch in record.seq if ch != '-']
                record.seq = ''.join(record.seq[:length])
                record.seq = Seq.Seq(record.seq)
                SeqIO.write(record, trim_fasta, 'fasta')

#CGGAGGTAARTTCGAYAAAGG
#GGKFDKG

trim_aln_to_amp('results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta.aln',
                'results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.fasta',
                1307,
                250)
trim_aln_to_amp('results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.faa.aln',
                'results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.faa',
                440,
                83)
trim_aln_to_amp('results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.fasta.aln',
                'results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt_amplicon.fasta',
                707,
                250)
trim_aln_to_amp('results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt.faa.aln',
                'results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_Bt_amplicon.faa',
                236,
                83)

In [18]:
%%bash 
cd results/gyrb_bt_gtdbtk_ref/alignment
mafft --auto --quiet gtdbtk_gyrb_Bt_amplicon.faa > gtdbtk_gyrb_Bt_amplicon.faa.aln
tranalign -asequence  gtdbtk_gyrb_Bt_amplicon.fasta -bsequence gtdbtk_gyrb_Bt_amplicon.faa.aln -outseq gtdbtk_gyrb_Bt_amplicon.fasta.aln


mafft --auto --quiet gtdbtk_gyrb_amplicon.faa > gtdbtk_gyrb_amplicon.faa.aln
tranalign -asequence  gtdbtk_gyrb_amplicon.fasta -bsequence gtdbtk_gyrb_amplicon.faa.aln -outseq gtdbtk_gyrb_amplicon.fasta.aln


Generate an alignment of nucleic coding regions from aligned proteins
Generate an alignment of nucleic coding regions from aligned proteins
Error: Guide protein sequence GCA_001784555.1_1 not found in nucleic sequence GCA_001784555.1
Error: Guide protein sequence GCA_002280485.1_1 not found in nucleic sequence GCA_002280485.1
Error: Guide protein sequence GCF_000192575.1_1 not found in nucleic sequence GCF_000192575.1
Error: Guide protein sequence GCF_000212395.1_1 not found in nucleic sequence GCF_000212395.1


In [19]:
%%bash
mkdir results/gyrb_bt_gtdbtk_ref/blast_db
cp results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.fasta results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta
cp results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.faa results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa
makeblastdb -in results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta -dbtype nucl
makeblastdb -in results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa -dbtype prot



Building a new DB, current time: 06/09/2020 11:32:44
New DB name:   /Volumes/AHN/captive_ape_microbiome/results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta
New DB title:  results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Volumes/AHN/captive_ape_microbiome/results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 1562 sequences in 0.0552599 seconds.


Building a new DB, current time: 06/09/2020 11:32:44
New DB name:   /Volumes/AHN/captive_ape_microbiome/results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa
New DB title:  results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa
Sequence type: Protein
Deleted existing Protein BLAST database named /Volumes/AHN/captive_ape_microbiome/results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa
Keep MBits: T
Maximum file size: 1000000000B
Adding s

mkdir: results/gyrb_bt_gtdbtk_ref/blast_db: File exists
