In [16]:
import os
import shutil
import json
import zlib
import traceback
import urllib.request

import pandas as pd
from Bio import Entrez
from Bio.SearchIO.HmmerIO.hmmer3_text import Hmmer3TextParser

from eaglib.alignment import SeqsProfileInfo, SeqsProfile

In [7]:
! pip uninstall -y EAGLE
! pip install ../../dist/EAGLE-0.0.1-py3-none-any.whl

Found existing installation: EAGLE 0.0.1
Uninstalling EAGLE-0.0.1:
  Successfully uninstalled EAGLE-0.0.1
Processing /media/olga/Data/Denis/EAGLE/dist/EAGLE-0.0.1-py3-none-any.whl
Installing collected packages: EAGLE
Successfully installed EAGLE-0.0.1


In [14]:
def get_taxonomy(tax_id):
    """Fetch the ranked lineage of one NCBI taxonomy record.

    Queries the Entrez ``taxonomy`` database for ``tax_id`` and collects, from
    the record's ``LineageEx`` entries, the scientific names of the ranks
    listed in ``tax_keys``. The record's own ``ScientificName`` is used as the
    initial ``species`` value; a ``species``-rank lineage entry, if present,
    overrides it. When a rank occurs several times in the lineage the last
    occurrence wins.

    Parameters
    ----------
    tax_id
        Taxonomy identifier, passed as ``id`` to ``Entrez.efetch``.

    Returns
    -------
    list
        Names ordered as superkingdom, phylum, clade, class, order, family,
        genus, species; ``None`` for ranks missing from the lineage.

    Raises
    ------
    IndexError
        If the parsed Entrez response contains no record.
    """
    tax_keys = ["superkingdom", "phylum", "clade", "class", "order", "family", "genus", "species"]
    tax_dict = dict.fromkeys(tax_keys)

    handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode='xml')
    try:
        tax_info = Entrez.read(handle)[0]
    finally:
        # Entrez.read does not close a caller-supplied handle; release the
        # network connection explicitly even when parsing fails.
        handle.close()
    tax_dict["species"] = tax_info['ScientificName']

    # A record may come without a lineage; treat that as "no ranks known".
    for lin_tax in tax_info.get('LineageEx', []):
        if lin_tax['Rank'] in tax_dict:
            tax_dict[lin_tax['Rank']] = lin_tax['ScientificName']

    return [tax_dict[tax_key] for tax_key in tax_keys]

In [None]:
assembly_summary_path = "archaea_assembly_summary.txt"
Entrez.email = "moshenskydenis@gmail.com"
db_dir = "archaea"


def _download_gunzipped(url, dest_path):
    """Download the gzip-compressed file at ``url`` and write it decompressed to ``dest_path``.

    The payload is fetched and decompressed completely before ``dest_path`` is
    opened, so a failed download or a corrupt archive never leaves an empty or
    truncated file behind.
    """
    with urllib.request.urlopen(url) as response:
        compressed = response.read()
    # wbits = 15 + 32: maximum window size with automatic gzip/zlib header detection.
    data = zlib.decompress(compressed, 15+32)
    with open(dest_path, 'wb') as dest_f:
        dest_f.write(data)


# The output directory is not guaranteed to exist on a fresh checkout.
os.makedirs(db_dir, exist_ok=True)

processed_ac = list()
arch_df = pd.read_csv(assembly_summary_path, sep="\t")
# Alternative filter that skips representative genomes:
# arch_df.query("assembly_level=='Complete Genome' & refseq_category!='representative genome'")
for _, row in arch_df.query("assembly_level=='Complete Genome'").iterrows():
    ac = row['assembly_accession']  # id field in genomes_table
    asm = row['asm_name']
    name = row['organism_name'] + ("" if pd.isna(row['infraspecific_name']) else " " + row['infraspecific_name'])
    fna_seq = [os.path.join(db_dir, ac+"_genomic.fna")]

    try:
        # Files inside an assembly directory are prefixed with the directory
        # name itself. Reusing it (instead of rebuilding ac + "_" + asm) also
        # works when asm_name contains characters that NCBI rewrites in paths
        # (e.g. spaces -> underscores).
        ftp_dir = row['ftp_path'].rstrip("/")
        ftp_prefix = ftp_dir + "/" + ftp_dir.rsplit("/", 1)[-1]

        # Kept inside the try so that one failing Entrez request skips this
        # genome instead of aborting the whole run.
        taxonomy = get_taxonomy(row['species_taxid'])

        _download_gunzipped(ftp_prefix+"_genomic.fna.gz", fna_seq[0])
        _download_gunzipped(ftp_prefix+"_rna_from_genomic.fna.gz", os.path.join(db_dir, ac+"_rna_from_genomic.fna"))
        _download_gunzipped(ftp_prefix+"_translated_cds.faa.gz", os.path.join(db_dir, ac+"_translated_cds.faa"))
        processed_ac.append({"id": ac, "name": name, "taxonomy": taxonomy, "btc_seqs": [], "fna_seq": fna_seq})
    except Exception:
        # Best effort: report the failure and continue with the next genome.
        # `Exception` (not a bare except) lets KeyboardInterrupt stop the loop.
        print(traceback.format_exc())
    print(ac, name)

GCF_000762265.1 Methanobacterium formicicum strain=BRM9
GCF_001458655.1 Methanobacterium formicicum
GCF_002813085.1 Methanobrevibacter smithii strain=KB11
GCF_022846155.1 Methanobrevibacter smithii strain=CE91-St67
GCF_022846175.1 Methanobrevibacter smithii strain=CE91-St68
GCF_001889405.1 Methanohalophilus halophilus strain=Z-7982
GCF_019263745.1 Methanospirillum hungatei strain=GP1
GCF_010706455.1 Methanosarcina mazei strain=zm-15
GCF_019669945.1 Methanosarcina mazei strain=TMA
GCF_004799605.1 Halobacterium salinarum strain=91-R6
GCF_009729015.1 Acidianus ambivalens strain=LEI 10
GCF_002215405.1 Sulfolobus acidocaldarius strain=Y14 18-5
GCF_002215445.1 Sulfolobus acidocaldarius strain=Y14 20-20
GCF_002215485.1 Sulfolobus acidocaldarius strain=Y14 16-22
GCF_002215525.1 Sulfolobus acidocaldarius strain=Y14 13-1
GCF_002215565.1 Sulfolobus acidocaldarius strain=DG1
GCF_019175305.1 Saccharolobus shibatae strain=S38A
GCF_019175325.1 Saccharolobus shibatae strain=BEU9
GCF_000968355.2 Saccha

GCF_000007345.1 Methanosarcina acetivorans C2A strain=C2A
GCF_000007185.1 Methanopyrus kandleri AV19 strain=AV19
GCF_000007065.1 Methanosarcina mazei Go1 strain=Go1
GCF_000585495.1 Thermococcus nautili strain=30-1
GCF_000970205.1 Methanosarcina mazei S-6 strain=S-6
GCF_021233435.1 Halobacterium noricense strain=JCM 15102
GCF_000008665.1 Archaeoglobus fulgidus DSM 4304 strain=DSM 4304
GCF_000404165.1 Methanobrevibacter sp. AbM4 strain=AbM4
GCF_001477655.1 Methanobrevibacter millerae strain=SM9
GCF_000091665.1 Methanocaldococcus jannaschii DSM 2661 strain=DSM 2661
GCF_000151205.2 Thermococcus sp. AM4 strain=AM4
GCF_000013725.1 Methanococcoides burtonii DSM 6242 strain=DSM 6242
GCF_000008265.1 Picrophilus torridus DSM 9790 strain=DSM 9790
GCF_000011585.1 Methanococcus maripaludis S2 strain=S2
GCF_000591055.1 Natronomonas moolapensis 8.8.11 strain=8.8.11
GCF_000011125.1 Aeropyrum pernix K1 strain=K1
GCF_000011085.1 Haloarcula marismortui ATCC 43049 strain=ATCC 43049
GCF_005310945.1 Haloarc

In [98]:
# Show the records collected by the download loop: one dict per genome with the
# keys "id", "name", "taxonomy", "btc_seqs" and "fna_seq".
processed_ac

['GCF_000762265.1', 'GCF_001458655.1', 'GCF_002813085.1']

In [3]:
# Build the archaeal 16S rRNA profile from the RF01959 alignment with Infernal.
# noss=True presumably maps to cmbuild's "ignore secondary structure" option
# (the build log reports it as enabled) - confirm against eaglib.
arch_16s_profile = SeqsProfile.build(
    mult_aln="RF01959.fasta",
    name="16S_rRNA_archaea",
    method="infernal",
    seqs_type="nucl",
    noss=True,
)

# cmbuild :: covariance model construction from multiple sequence alignments
# INFERNAL 1.1.4 (Dec 2020)
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# CM file:                                            16S_rRNA_archaea.cm
# alignment file:                                     RF01959.fasta
# ignore secondary structure, if any:                 yes
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#                                                                      rel entropy
#                                                                      -----------
# idx    name                     nseq eff_nseq   alen  clen  bps bifs    CM   HMM description
# ------ -------------------- -------- -------- ------ ----- ---- ---- ----- ----- -----------
       1 RF01959                    86     1.86   1958  1478    0    0 0.380 0.38

In [3]:
# Re-create the profile object from its stored description instead of
# rebuilding it; 'path' points at the covariance model file named in the
# cmbuild log of the build cell.
arch_16s_profile = SeqsProfile(
    SeqsProfileInfo.load_from_dict(
        dict(
            name='16S_rRNA_archaea',
            path='16S_rRNA_archaea.cm',
            type='nucl',
            weight=1.0,
            method='infernal',
        )
    )
)

In [7]:
# Run the 16S profile against the RNA sequences of one downloaded genome.
psr = arch_16s_profile.search(
    seqdb="archaea/GCF_000762265.1_rna_from_genomic.fna",
    threads=4,
)

In [13]:
# Parse a saved search report and print every query result it contains.
# NOTE(review): the report file name is hard-coded and looks auto-generated, so
# it will not exist after a fresh run - presumably it should be derived from
# the search cell's result; verify.
# NOTE(review): the profile uses method "infernal", while this is the HMMER3
# text parser - confirm the report format is compatible. The recorded run of
# this cell ended in KeyboardInterrupt.
with open("16S_rRNA_archaea_out_IhcB3eBE5D.psr") as out_f:
    for query in Hmmer3TextParser(out_f):
        print(query)

KeyboardInterrupt: 