In [1]:
import os
import shutil
import json
import zlib
import traceback
import urllib.request

import pandas as pd
from Bio import Entrez
from Bio import SearchIO
from Bio.SearchIO.HmmerIO.hmmer3_text import Hmmer3TextParser

from eaglib.alignment import SeqsProfileInfo, SeqsProfile

In [5]:
# Reinstall the locally built EAGLE wheel.
# `%pip` (instead of `! pip`) runs pip for the interpreter backing this kernel,
# so the wheel lands in the environment the notebook actually imports from.
# NOTE: `eaglib` is imported in an earlier cell; restart the kernel after this
# cell so the freshly installed version is the one that gets imported.
%pip uninstall -y EAGLE
%pip install ../../dist/EAGLE-0.0.1-py3-none-any.whl

Found existing installation: EAGLE 0.0.1
Uninstalling EAGLE-0.0.1:
  Successfully uninstalled EAGLE-0.0.1
Processing /media/olga/Data/Denis/EAGLE/dist/EAGLE-0.0.1-py3-none-any.whl
Installing collected packages: EAGLE
Successfully installed EAGLE-0.0.1


In [2]:
def get_taxonomy(tax_id):
    """Fetch the ranked lineage of an NCBI taxon as a fixed-order list.

    Queries the ``taxonomy`` database through ``Entrez.efetch`` (XML mode) and
    uses the first record of the parsed response.

    Args:
        tax_id: taxonomy identifier, passed to ``Entrez.efetch`` as ``id``.

    Returns:
        list: names for the ranks superkingdom, phylum, clade, class, order,
        family, genus and species, in that order. A rank that does not occur
        in the record's ``LineageEx`` stays ``None``. The species slot is
        pre-filled with the record's own ``ScientificName``.

    Raises:
        IndexError: if the response contains no taxonomy record.
    """
    tax_keys = ["superkingdom", "phylum", "clade", "class", "order", "family", "genus", "species"]
    tax_dict = dict.fromkeys(tax_keys)

    handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode='xml')
    try:
        records = Entrez.read(handle)
    finally:
        # Always close the response handle, even when parsing fails.
        handle.close()

    if not records:
        raise IndexError("no taxonomy record returned for tax_id=%r" % (tax_id,))
    tax_info = records[0]
    tax_dict["species"] = tax_info['ScientificName']

    # Tolerate a record without a 'LineageEx' entry instead of raising KeyError.
    # When a rank repeats in the lineage, the last occurrence wins.
    for lin_tax in tax_info.get('LineageEx', []):
        if lin_tax['Rank'] in tax_dict:
            tax_dict[lin_tax['Rank']] = lin_tax['ScientificName']

    return [tax_dict[tax_key] for tax_key in tax_keys]

In [17]:
assembly_summary_path = "archaea_assembly_summary.txt"
Entrez.email = "moshenskydenis@gmail.com"
db_dir = "archaea"

# Build one genomes-table record per complete-genome assembly listed in the
# assembly summary. Each record holds the accession, a display name, the
# taxonomy from get_taxonomy() and the URL of the gzipped genomic FASTA.
processed_ac = list()
arch_df = pd.read_csv(assembly_summary_path, sep="\t")
# arch_df.query("assembly_level=='Complete Genome' & refseq_category!='representative genome'")
for _, row in arch_df.query("assembly_level=='Complete Genome'").iterrows():
    ac = row['assembly_accession']  # id field in genomes_table
    asm = row['asm_name']
    # Append the infraspecific name (e.g. "strain=...") only when it is present.
    name = row['organism_name'] + ("" if pd.isna(row['infraspecific_name']) else " " + row['infraspecific_name'])
    ftp_prefix = row['ftp_path'] + "/" + ac + "_" + asm
    fna_seq = [ftp_prefix+"_genomic.fna.gz"]
    # fna_seq = [os.path.join(db_dir, ac+"_genomic.fna")] #

    try:
        # Optional download variant, kept for reference: it writes the
        # decompressed files into db_dir.
        #     with open(fna_seq[0], 'wb') as gfna_f: #
        #         gfna_f.write(zlib.decompress(urllib.request.urlopen(ftp_prefix+"_genomic.fna.gz").read(), 15+32)) #
        #     with open(os.path.join(db_dir, ac+"_rna_from_genomic.fna"), 'wb') as rna_f:
        #         rna_f.write(zlib.decompress(urllib.request.urlopen(ftp_prefix+"_rna_from_genomic.fna.gz").read(), 15+32))
        #     with open(os.path.join(db_dir, ac+"_translated_cds.faa"), 'wb') as tcds_f:
        #         tcds_f.write(zlib.decompress(urllib.request.urlopen(ftp_prefix+"_translated_cds.faa.gz").read(), 15+32))

        # The taxonomy lookup is the network call that can fail, so it lives
        # inside the try: one failed lookup is reported and that accession is
        # skipped instead of aborting the whole run.
        taxonomy = get_taxonomy(row['species_taxid'])
        processed_ac.append({"id": ac, "name": name, "taxonomy": taxonomy, "btc_seqs": [], "fna_seq": fna_seq})
    except Exception:
        # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
        # SystemExit through, so interrupting the kernel really stops the loop.
        print(traceback.format_exc())
    print(ac, name)

GCF_000762265.1 Methanobacterium formicicum strain=BRM9
GCF_001458655.1 Methanobacterium formicicum
GCF_002813085.1 Methanobrevibacter smithii strain=KB11
GCF_022846155.1 Methanobrevibacter smithii strain=CE91-St67
GCF_022846175.1 Methanobrevibacter smithii strain=CE91-St68
GCF_001889405.1 Methanohalophilus halophilus strain=Z-7982
GCF_019263745.1 Methanospirillum hungatei strain=GP1
GCF_010706455.1 Methanosarcina mazei strain=zm-15
GCF_019669945.1 Methanosarcina mazei strain=TMA
GCF_004799605.1 Halobacterium salinarum strain=91-R6
GCF_009729015.1 Acidianus ambivalens strain=LEI 10
GCF_002215405.1 Sulfolobus acidocaldarius strain=Y14 18-5
GCF_002215445.1 Sulfolobus acidocaldarius strain=Y14 20-20
GCF_002215485.1 Sulfolobus acidocaldarius strain=Y14 16-22
GCF_002215525.1 Sulfolobus acidocaldarius strain=Y14 13-1
GCF_002215565.1 Sulfolobus acidocaldarius strain=DG1
GCF_019175305.1 Saccharolobus shibatae strain=S38A
GCF_019175325.1 Saccharolobus shibatae strain=BEU9
GCF_000968355.2 Saccha

GCF_000007345.1 Methanosarcina acetivorans C2A strain=C2A
GCF_000007185.1 Methanopyrus kandleri AV19 strain=AV19
GCF_000007065.1 Methanosarcina mazei Go1 strain=Go1
GCF_000585495.1 Thermococcus nautili strain=30-1
GCF_000970205.1 Methanosarcina mazei S-6 strain=S-6
GCF_021233435.1 Halobacterium noricense strain=JCM 15102
GCF_000008665.1 Archaeoglobus fulgidus DSM 4304 strain=DSM 4304
GCF_000404165.1 Methanobrevibacter sp. AbM4 strain=AbM4
GCF_001477655.1 Methanobrevibacter millerae strain=SM9
GCF_000091665.1 Methanocaldococcus jannaschii DSM 2661 strain=DSM 2661
GCF_000151205.2 Thermococcus sp. AM4 strain=AM4
GCF_000013725.1 Methanococcoides burtonii DSM 6242 strain=DSM 6242
GCF_000008265.1 Picrophilus torridus DSM 9790 strain=DSM 9790
GCF_000011585.1 Methanococcus maripaludis S2 strain=S2
GCF_000591055.1 Natronomonas moolapensis 8.8.11 strain=8.8.11
GCF_000011125.1 Aeropyrum pernix K1 strain=K1
GCF_000011085.1 Haloarcula marismortui ATCC 43049 strain=ATCC 43049
GCF_005310945.1 Haloarc

GCF_000214725.1 Methanobacterium paludis strain=SWAN1
GCF_013415905.1 Halosimplex pelagicum strain=R2
GCF_013415885.1 Halosimplex rubrum strain=R27
GCF_000191585.1 Methanobacterium lacus strain=AL-21
GCF_000214415.1 Methanotorris igneus Kol 5 strain=Kol 5
GCF_000698785.1 Nitrososphaera viennensis EN76 strain=EN76
GCF_000189575.1 Sulfolobus islandicus HVE10/4 strain=HVE10/4
GCF_000189555.1 Sulfolobus islandicus REY15A strain=REY15A
GCF_000213215.1 Acidianus hospitalis W1 strain=W1
GCF_001481685.1 Ignicoccus islandicus DSM 13165 strain=DSM 13165
GCF_000190315.1 Vulcanisaeta moutnovskia 768-28 strain=768-28
GCF_000204415.1 Methanothrix soehngenii GP6 strain=GP6
GCF_000193375.1 Thermoproteus uzoniensis 768-20 strain=768-20
GCF_000204925.1 Metallosphaera cuprina Ar-4 strain=Ar-4
GCF_020700235.1 Haladaptatus pallidirubidus strain=YIM 93656
GCF_019823105.1 Halobaculum magnesiiphilum strain=NBRC 109044
GCF_000340315.1 Sulfolobus acidocaldarius N8 strain=N8
GCF_000338775.1 Sulfolobus acidocalda

GCF_001484685.1 Thermococcus sp. 2319x1 strain=2319x1
GCF_902813195.1 Thermococcus sp. 2319x1 strain=Essen
GCF_003058365.1 Halococcoides cellulosivorans strain=HArcel1T
GCF_013402815.2 Natrinema halophilum strain=YPL8
GCF_020886315.1 Saccharolobus caldissimus strain=JCM 32116
GCF_013456555.2 Natrinema sp. YPL30 strain=YPL30
GCF_013391105.1 Natronomonas salina strain=YPL13
GCF_001647085.1 Thermococcus piezophilus strain=CDGS
GCF_001304615.2 Methanosarcina flavescens strain=E03.2
GCF_003268005.1 Methanosphaera sp. BMS strain=BMS
Traceback (most recent call last):
  File "/tmp/ipykernel_83358/2651093835.py", line 19, in <cell line: 8>
    gfna_f.write(zlib.decompress(urllib.request.urlopen(ftp_prefix+"_genomic.fna.gz").read(), 15+32)) #
  File "/home/olga/miniconda3/envs/eagle/lib/python3.8/urllib/request.py", line 222, in urlopen
    return opener.open(url, data, timeout)
  File "/home/olga/miniconda3/envs/eagle/lib/python3.8/urllib/request.py", line 525, in open
    response = self._ope

GCF_017094465.1 Halapricum desulfuricans strain=HSR-Est
GCF_017094505.1 Halapricum desulfuricans strain=HSR12-1
GCF_017094525.1 Halapricum desulfuricans strain=HSR12-2
GCF_019693215.1 Salinarchaeum sp. IM2453 strain=IM2453
GCF_023746595.1 Thermococcus sp. IOH2 strain=IOH2
GCF_023617305.1 Methanobrevibacter sp. TLL-48-HuF1 strain=TLL-48-HuF1
GCF_019879105.1 Natrinema sp. SYSU A 869 strain=SYSU A 869
GCF_019880225.1 Halobaculum rubrum strain=C46
GCF_020885915.1 Candidatus Methanoliparum sp. LAM-1 strain=LAM-1
GCF_020150815.1 Haloprofundus sp. Gai1-5 strain=Gai1-5
GCF_020097815.1 Haloprofundus sp. SQT7-1 strain=SQT7-1
GCF_020097835.1 Haloprofundus sp. SEDH52 strain=SEDH52
GCF_020405185.1 Halomicrobium salinisoli strain=LT50
GCF_020405245.1 Halomicrobium salinisoli strain=TH30
GCF_020405205.1 Natrinema salinisoli strain=SLN56
GCF_020405225.1 Natrinema sp. DC36 strain=DC36
GCF_020618475.1 Haladaptatus sp. PSR5 strain=PSR5
GCF_020614375.1 Haladaptatus sp. PSR8 strain=PSR8
GCF_021655615.1 Aci

In [98]:
# Preview the first few collected records instead of dumping the whole list
# (it holds one entry per complete-genome assembly).
processed_ac[:3]

['GCF_000762265.1', 'GCF_001458655.1', 'GCF_002813085.1']

In [5]:
# Build the "16S_rRNA_archaea_ss" profile from the RF01959.stockholm
# alignment with the infernal method, with noss=False.
# NOTE(review): the recorded run of this cell ends with "Killed" - presumably
# the cmbuild process was terminated before finishing; confirm the resulting
# .cm file is complete before using it.
arch_16s_profile_ss = SeqsProfile.build(
    mult_aln="RF01959.stockholm",
    name="16S_rRNA_archaea_ss",
    method="infernal",
    seqs_type="rna",
    noss=False,
)

# cmbuild :: covariance model construction from multiple sequence alignments
# INFERNAL 1.1.4 (Dec 2020)
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# CM file:                                            16S_rRNA_archaea_ss.cm
# alignment file:                                     RF01959.stockholm
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
#                                                                      rel entropy
#                                                                      -----------
# idx    name                     nseq eff_nseq   alen  clen  bps bifs    CM   HMM description
# ------ -------------------- -------- -------- ------ ----- ---- ---- ----- ----- -----------
       1 SSU_rRNA_archaea           86     1.06   1958  1478  457   30 0.589 0.302 Archaeal small subunit ribosomal RNA
#
# CPU time

Killed


In [4]:
# Describe the 16S rRNA profile stored at '16S_rRNA_archaea.cm' (rna type,
# infernal method) and wrap that description in a SeqsProfile.
arch_16s_profile_info = SeqsProfileInfo.load_from_dict(
    {
        'name': '16S_rRNA_archaea',
        'path': '16S_rRNA_archaea.cm',
        'type': 'rna',
        'weight': 1.0,
        'method': 'infernal',
    }
)
arch_16s_profile = SeqsProfile(arch_16s_profile_info)

In [5]:
# Search the 16S rRNA profile against the RNA sequences of assembly
# GCF_000762265.1 using 4 threads; the result is kept in `psr`.
psr = arch_16s_profile.search(
    seqdb="archaea/GCF_000762265.1_rna_from_genomic.fna",
    threads=4,
)

# cmsearch :: search CM(s) against a sequence database
# INFERNAL 1.1.4 (Dec 2020)
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query CM file:                         16S_rRNA_archaea.cm
# target sequence database:              archaea/GCF_000762265.1_rna_from_genomic.fna
# tabular output of hits:                16S_rRNA_archaea_out_J3o4Z3o0wS.psr
# number of worker threads:              4 [--cpu]
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Query:       RF01959  [CLEN=1478]
Hit scores:
 rank     E-value  score  bias  sequence                   start    end   mdl trunc   gc  description
 ----   --------- ------ -----  ------------------------- ------ ------   --- ----- ----  -----------
  (1) !         0 1293.4  12.7  lcl|NZ_CP006933.1_rrna_4       3   1475 + hmm     - 0.56  [locus_tag=BRM9_RS01355] [db_xref=RFAM:RF01

In [6]:
# Display the hits returned by the 16S rRNA profile search.
psr

Unnamed: 0,target name,accession,query name,accession.1,mdl,mdl from,mdl to,seq from,seq to,strand,trunc,pass,gc,bias,score,E-value,inc,description of target
0,lcl|NZ_CP006933.1_rrna_4,-,RF01959,-,hmm,3,1477,3,1475,+,-,6,0.56,12.7,1293.4,0,!,[locus_tag=BRM9_RS01355] [db_xref=RFAM:RF01959...
1,lcl|NZ_CP006933.1_rrna_25,-,RF01959,-,hmm,3,1477,3,1475,+,-,6,0.56,12.7,1293.4,0,!,[locus_tag=BRM9_RS03745] [db_xref=RFAM:RF01959...


In [3]:
# Describe the HSP70 profile stored at 'HSP70.hmm' (protein type, hmmer
# method) and wrap that description in a SeqsProfile.
hsp70_profile_info = SeqsProfileInfo.load_from_dict(
    {
        'name': 'HSP70',
        'path': 'HSP70.hmm',
        'type': 'protein',
        'weight': 1.0,
        'method': 'hmmer',
    }
)
hsp70_profile = SeqsProfile(hsp70_profile_info)

In [4]:
# Search the HSP70 profile against the translated CDS of assembly
# GCF_000762265.1 using 4 threads; the result is kept in `psr1`.
psr1 = hsp70_profile.search(
    seqdb="archaea/GCF_000762265.1_translated_cds.faa",
    threads=4,
)

# hmmsearch :: search profile(s) against a sequence database
# HMMER 3.3.2 (Nov 2020); http://hmmer.org/
# Copyright (C) 2020 Howard Hughes Medical Institute.
# Freely distributed under the BSD open source license.
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -
# query HMM file:                  HSP70.hmm
# target sequence database:        archaea/GCF_000762265.1_translated_cds.faa
# per-dom hits tabular output:     HSP70_out_iR5bRhCqfP.psr
# number of worker threads:        4
# - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - -

Query:       HSP70  [M=599]
Accession:   PF00012.23
Description: Hsp70 protein
Scores for complete sequences (score includes all domains):
   --- full sequence ---   --- best 1 domain ---    -#dom-
    E-value  score  bias    E-value  score  bias    exp  N  Sequence                                   Description
    ------- ------ -----    ------- ------ -----   ---- --  --------                                   

In [5]:
# Display the hits returned by the HSP70 profile search.
psr1

Unnamed: 0,target name,accession,tlen,query name,accession.1,qlen,E-value,seq_score,seq_bias,#,...,domain_score,domain_bias,hmm_from,hmm_to,ali_from,ali_to,env_from,env_to,acc,description of target
0,lcl|NZ_CP006933.1_prot_WP_048085785.1_2239,-,619,HSP70,PF00012.23,599,1.4e-229,760.4,15.6,1,...,73.0,0.1,1,74,7,85,7,88,0.93,[gene=dnaK] [locus_tag=BRM9_RS11300] [db_xref=...
1,lcl|NZ_CP006933.1_prot_WP_048085785.1_2239,-,619,HSP70,PF00012.23,599,1.4e-229,760.4,15.6,2,...,690.1,12.8,100,599,85,584,82,584,0.99,[gene=dnaK] [locus_tag=BRM9_RS11300] [db_xref=...
2,lcl|NZ_CP006933.1_prot_WP_048085701.1_2093,-,357,HSP70,PF00012.23,599,1.2e-14,50.6,10.0,1,...,4.4,0.0,2,53,22,70,21,82,0.69,[locus_tag=BRM9_RS10565] [db_xref=GeneID:24793...
3,lcl|NZ_CP006933.1_prot_WP_048085701.1_2093,-,357,HSP70,PF00012.23,599,1.2e-14,50.6,10.0,2,...,45.3,8.4,131,373,112,342,93,356,0.77,[locus_tag=BRM9_RS10565] [db_xref=GeneID:24793...
