The purpose of this code is to search GTDB-Tk reference genome for gyrase B sequences to be able to assign species-level taxonomy to gyraseB amplicon data. Genomes are annotated using Prodigal, then gyraseB sequences are filtered out using a HMM profile of gyrase B from the TIGR01059 profile in the individual hmms folder. Then nucleotide and protein sequences are output to fastas. These fastas will be used as reference databases for local blast searches used to assign taxonomy. 

In [2]:
import os
import pandas as pd
from Bio import SeqIO
from Bio import Seq

os.getcwd()
os.chdir('/Volumes/AHN/captive_ape_microbiome')
#create output folders
outdir= 'results/gyrb_bt_gtdbtk_ref'
os.system('mkdir -pv '+outdir+'/hmm_out')
os.system('mkdir -pv '+outdir+'/prodigal')
os.system('mkdir -pv '+outdir+'/gyrb_seqs')

0

In [3]:
#Read in taxonomy file
tax = pd.read_csv('ref_seqs/GTDBTK_db/taxonomy/gtdb_taxonomy.tsv',sep='\t',header=None) 
tax.columns = ["gtdbtk_genome", "taxonomy"]
tax['ncbi_genome'] = tax['gtdbtk_genome'].str.split('_',n=1,expand=True).loc[:,1]
gtdbtk_to_ncbi = tax.set_index('gtdbtk_genome')['ncbi_genome'].to_dict()
ncbi_to_tax = tax.set_index('ncbi_genome')['taxonomy'].to_dict()
tax[['Domain','Phylum','Class','Order','Family','Genus','Species']] = tax.taxonomy.apply( 
   lambda x: pd.Series(str(x).split(";"))) 
len(tax)

24706

In [3]:
#Select only Bacteroidales genomes
Bt = tax[tax['taxonomy'].str.contains('Bacteroidales')]
print(len(Bt),'Bacteroidales genomes in GTDBTK database')
#write out taxonomy file
Bt.to_csv(outdir+'/Bt_taxonomy.txt',index=False,sep='\t')


1029 Bacteroidales genomes in GTDBTK database


### Annotate GTDB-Tk Bacteroidales genomes

In [5]:
def prodigal(genome,outdir):
    #Annotate genomes using prodigal and identifies gyrb gene using a gtdbtk hmmprofile 
    os.system(f'gunzip ref_seqs/GTDBTK_db/fastani/database/{genome}_genomic.fna.gz')
    os.system(f'prodigal -i ref_seqs/GTDBTK_db/fastani/database/{genome}_genomic.fna -d {outdir}/prodigal/{genome}_genomic.fna -a {outdir}/prodigal/{genome}_genomic.faa')
    os.system(f'hmmsearch --tblout {outdir}/hmm_out/{genome}.txt ref_seqs/GTDBTK_db/markers/tigrfam/individual_hmms/TIGR01059.HMM {outdir}/prodigal/{genome}_genomic.faa')
    os.system(f'gzip ref_seqs/GTDBTK_db/fastani/database/{genome}_genomic.fna')   
    
prodigal('GCF_002849695.1',outdir)

In [4]:
#Annotate genomes that haven't been annotated
annotated_genomes = [f.split('_genomic')[0] for f in os.listdir(outdir+'/prodigal') if f.endswith('.faa')]

print(len(annotated_genomes),'genomes already annotated')

print('looping through remaining genomes to annotate',(set(Bt['ncbi_genome']) - set(annotated_genomes)))

for genome in Bt['ncbi_genome']:
    if os.path.exists('ref_seqs/GTDBTK_db/fastani/database/'+genome+'_genomic.fna.gz'):
        if genome not in annotated_genomes:
            print(genome)
            #prodigal(genome,outdir)          
    else:
        print(genome,'not found in fastani folder')       

1029 genomes already annotated
looping through remaining genomes to annotate set()


### Identify best gyrb hit and output fasta

In [39]:
def get_besthit(genome):
    #parse hmmsearch result to return sequence header and evalue
    try:
        df = pd.read_csv(f'{outdir}/hmm_out/{genome}.txt')
    except:
        return(genome,'file_unopened','NA')
        
    try:
        df = pd.read_csv(f'{outdir}/hmm_out/{genome}.txt',delim_whitespace=True,header=None,comment='#')
        besthit = df.iloc[0]
        name=besthit[0]
        evalue=besthit[4]
        return([genome,name,evalue])
    except:
        return([genome,'no_hit','NA'])


print(get_besthit('GCF_002849695.1'))
print(get_besthit('GCF_000687715.1')) 
print(get_besthit('GCA_002353975.1'))

['GCF_002849695.1', 'NZ_CP018937.1_3992', 3.499999999999998e-287]
['GCF_000687715.1', 'NZ_JHXD01000001.1_223', 1.399999999999999e-280]
['GCA_002353975.1', 'DEKQ01000149.1_30', 1.1999999999999993e-276]


In [40]:
besthits = Bt['ncbi_genome'].apply(get_besthit)
besthits_df = pd.DataFrame(besthits.values.tolist(),columns=['genome','besthit','evalue'])
besthits_df.to_csv(f'{outdir}/hmm_out/all_summary.txt',sep='\t',index=None)
print(len(besthits_df[besthits_df['besthit']=='no_hit']),'genomes with no hit')
print(len(besthits_df[besthits_df['besthit']=='file_unopened']),'genome files not found')

12 genomes with no hit
0 genome files not found


In [44]:
def write_fasta(genome,besthit,evalue):
    if besthit!='no_hit':
        if float(evalue) < float(1e-150): 
            record_dict = SeqIO.to_dict(SeqIO.parse(f"{outdir}/prodigal/{genome}_genomic.fna", "fasta"))
            besthit_record = record_dict[besthit]
            besthit_record.id = genome
            besthit_record.description=ncbi_to_tax[genome]
            if len(besthit_record.seq) > 1800: #ensures gene is fulllength
                if besthit_record.seq.count('N')<1:
                    SeqIO.write(besthit_record, f"{outdir}/gyrb_seqs/{genome}.fasta", "fasta")
                    return(ncbi_to_tax[genome])
        else:
            return(genome,'didnt meet cutoff')

            
print(write_fasta('GCA_900320445.1', 'ONSG01000024.1_10', '1.499999999999999e-280'))
print(write_fasta('GCF_000687715.1', 'NZ_JHXD01000001.1_223', '1.399999999999999e-280'))
print(write_fasta('GCF_002849695.1', 'NZ_CP018937.1_3992', '3.499999999999998e-287'))

d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella sp900320445
d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella ruminicola_B
d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Bacteroides;s__Bacteroides fragilis_A


In [45]:
for i in range(1,len(besthits_df)):
    try:
        write_fasta(besthits_df.loc[i]['genome'],besthits_df.loc[i]['besthit'],besthits_df.loc[i][2])
    except:
        print(besthits_df.loc[i]['genome'])

In [8]:
%%bash

#concatenate gyrb seqs to file
mkdir -pv results/gyrb_bt_gtdbtk_ref/alignment
cat results/gyrb_bt_gtdbtk_ref/gyrb_seqs/*.fasta > results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta

#translate, align aa, and uses a guide for nucleotide seqs
transeq -sequence results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta -outseq results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.faa
mafft --auto --quiet results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.faa > results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.faa.aln
tranalign -asequence  results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta -bsequence results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.faa.aln -outseq results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta.aln

#run cutadapt to see where to trim alignment
cutadapt --discard-untrimmed -e .1 -g CGGAGGTAARTTCGAYAAAGG  --overlap 20 -o results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta.aln.cutadapt results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta.aln
rm results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta.aln.cutadapt

This is cutadapt 2.5 with Python 3.7.4
Command line parameters: --discard-untrimmed -e .1 -g CGGAGGTAARTTCGAYAAAGG --overlap 20 -o results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta.aln.cutadapt results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb.fasta.aln
Processing reads on 1 core in single-end mode ...
Finished in 0.10 s (110 us/read; 0.55 M reads/minute).

=== Summary ===

Total reads processed:                     892
Reads with adapters:                       404 (45.3%)
Reads written (passing filters):           404 (45.3%)

Total basepairs processed:     2,606,424 bp
Total written (filtered):        903,481 bp (34.7%)

=== Adapter 1 ===

Sequence: CGGAGGTAARTTCGAYAAAGG; Type: regular 5'; Length: 21; Trimmed: 404 times.

No. of allowed errors:
0-9 bp: 0; 10-19 bp: 1; 20-21 bp: 2

Overview of removed sequences
length	count	expect	max.err	error counts
685	137	0.0	2	0 23 114
686	267	0.0	2	45 112 110


Translate nucleic acid sequences
Generate an alignment of nucleic coding regions from aligned proteins


In [12]:
with open(f'{outdir}/alignment/gtdbtk_gyrb.fasta.aln') as original, open(f'{outdir}/alignment/gtdbtk_gyrb_amplicon.fasta', 'w') as corrected:
    records = SeqIO.parse(original, 'fasta')
    for record in records:
        record.seq = record.seq[686:] #trim at position indicated by cutadapt
        record.seq = [ch for ch in record.seq if ch != '-']
        record.seq = ''.join(record.seq[:250])
        record.seq = Seq.Seq(record.seq)
        SeqIO.write(record, corrected, 'fasta')

In [15]:
%%bash
transeq -frame 2 -sequence results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.fasta -outseq results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.faa
mafft --auto --quiet results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.faa > results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.faa.aln
tranalign -asequence  results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.fasta  -bsequence results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.faa.aln -outseq results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.fasta.aln


Translate nucleic acid sequences
Generate an alignment of nucleic coding regions from aligned proteins


In [16]:
%%bash
mkdir results/gyrb_bt_gtdbtk_ref/blast_db
cp results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.fasta results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta
cp results/gyrb_bt_gtdbtk_ref/alignment/gtdbtk_gyrb_amplicon.faa results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa
makeblastdb -in results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta -dbtype nucl
makeblastdb -in results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa -dbtype prot



Building a new DB, current time: 05/19/2020 14:14:08
New DB name:   /Volumes/AHN/captive_ape_microbiome/results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta
New DB title:  results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta
Sequence type: Nucleotide
Deleted existing Nucleotide BLAST database named /Volumes/AHN/captive_ape_microbiome/results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.fasta
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 892 sequences in 0.029567 seconds.


Building a new DB, current time: 05/19/2020 14:14:08
New DB name:   /Volumes/AHN/captive_ape_microbiome/results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa
New DB title:  results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa
Sequence type: Protein
Deleted existing Protein BLAST database named /Volumes/AHN/captive_ape_microbiome/results/gyrb_bt_gtdbtk_ref/blast_db/gtdbtk_gyrb_amplicon.faa
Keep MBits: T
Maximum file size: 1000000000B
Adding seq

mkdir: results/gyrb_bt_gtdbtk_ref/blast_db: File exists
