In [1]:
import os
import shutil
import json
import zlib
import urllib.request

import pandas as pd
from Bio import Entrez

from eaglib.alignment import SeqsProfileInfo

In [3]:
! pip uninstall -y EAGLE
! pip install ../../dist/EAGLE-0.0.1-py3-none-any.whl

Found existing installation: wget 3.2
Uninstalling wget-3.2:
  Successfully uninstalled wget-3.2


In [2]:
def get_taxonomy(tax_id):
    """Return the ranked lineage of an NCBI Taxonomy entry.

    The record for ``tax_id`` is fetched from the Entrez ``taxonomy`` database
    as XML and reduced to a fixed set of ranks.

    Args:
        tax_id: taxonomy identifier passed straight to ``Entrez.efetch``.

    Returns:
        A list of eight scientific names ordered as
        ``[superkingdom, phylum, clade, class, order, family, genus, species]``.
        Ranks missing from the record's lineage are ``None``.

    Raises:
        IndexError: if Entrez returns no record for ``tax_id``.
    """
    tax_keys = ["superkingdom", "phylum", "clade", "class", "order", "family", "genus", "species"]
    tax_dict = {tax_key: None for tax_key in tax_keys}

    handle = Entrez.efetch(db="taxonomy", id=tax_id, retmode='xml')
    try:
        tax_info = Entrez.read(handle)[0]
    finally:
        # Always release the network handle, even when parsing fails or the
        # result is empty; otherwise every call leaks an open connection.
        handle.close()

    # Default the species to the record's own name; a 'species' entry in the
    # lineage (if present) overwrites it in the loop below.
    tax_dict["species"] = tax_info['ScientificName']

    for lin_tax in tax_info['LineageEx']:
        # Ranks outside tax_keys (e.g. unranked nodes) are ignored.
        if lin_tax['Rank'] in tax_dict:
            tax_dict[lin_tax['Rank']] = lin_tax['ScientificName']

    return [tax_dict[tax_key] for tax_key in tax_keys]

In [6]:
assembly_summary_path = "archaea_assembly_summary.txt"
# Entrez requests carry a contact e-mail; ENTREZ_EMAIL overrides the notebook default.
Entrez.email = os.environ.get("ENTREZ_EMAIL", "moshenskydenis@gmail.com")
db_dir = "archaea"
# Smoke-test limit on the number of genomes fetched; set to None to process
# every complete genome listed in the assembly summary.
max_genomes = 3


def _download_gunzip(url, dest_path):
    """Download the gzip-compressed resource at `url` and write it decompressed to `dest_path`."""
    with urllib.request.urlopen(url) as response:
        compressed = response.read()
    # wbits = MAX_WBITS | 32 (i.e. 15+32) makes zlib auto-detect a gzip or zlib header.
    payload = zlib.decompress(compressed, zlib.MAX_WBITS | 32)
    # The file is only created once the download and decompression succeeded,
    # so a failed request does not leave an empty file behind.
    with open(dest_path, 'wb') as out_f:
        out_f.write(payload)


os.makedirs(db_dir, exist_ok=True)

processed_ac = list()
arch_df = pd.read_csv(assembly_summary_path, sep="\t")
# arch_df.query("assembly_level=='Complete Genome' & refseq_category!='representative genome'")
complete_genomes = arch_df.query("assembly_level=='Complete Genome'")
if max_genomes is not None:
    # Limit by row count: index labels are not consecutive after filtering,
    # so they cannot be used as a counter.
    complete_genomes = complete_genomes.head(max_genomes)

for _, row in complete_genomes.iterrows():
    ac = row['assembly_accession']  # id field in genomes_table
    asm = row['asm_name']
    taxonomy = get_taxonomy(row['species_taxid'])
    name = row['organism_name'] + ("" if pd.isna(row['infraspecific_name']) else " " + row['infraspecific_name'])
    ftp_prefix = row['ftp_path'] + "/" + ac + "_" + asm
    # The genome FASTA is only referenced by its remote URL, not downloaded;
    # _download_gunzip(fna_seq[0], os.path.join(db_dir, ac+"_genomic.fna")) would fetch it.
    fna_seq = [ftp_prefix+"_genomic.fna.gz"]
    _download_gunzip(ftp_prefix+"_rna_from_genomic.fna.gz", os.path.join(db_dir, ac+"_rna_from_genomic.fna"))
    _download_gunzip(ftp_prefix+"_translated_cds.faa.gz", os.path.join(db_dir, ac+"_translated_cds.faa"))
    print(ac, name, taxonomy, fna_seq)
    processed_ac.append(ac)

GCF_000762265.1 Methanobacterium formicicum strain=BRM9 ['Archaea', 'Euryarchaeota', 'Methanomada group', 'Methanobacteria', 'Methanobacteriales', 'Methanobacteriaceae', 'Methanobacterium', 'Methanobacterium formicicum'] ['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/762/265/GCF_000762265.1_ASM76226v1/GCF_000762265.1_ASM76226v1_genomic.fna.gz']
GCF_001458655.1 Methanobacterium formicicum ['Archaea', 'Euryarchaeota', 'Methanomada group', 'Methanobacteria', 'Methanobacteriales', 'Methanobacteriaceae', 'Methanobacterium', 'Methanobacterium formicicum'] ['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/001/458/655/GCF_001458655.1_Mb9/GCF_001458655.1_Mb9_genomic.fna.gz']
GCF_002813085.1 Methanobrevibacter smithii strain=KB11 ['Archaea', 'Euryarchaeota', 'Methanomada group', 'Methanobacteria', 'Methanobacteriales', 'Methanobacteriaceae', 'Methanobrevibacter', 'Methanobrevibacter smithii'] ['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/002/813/085/GCF_002813085.1_ASM281308v1/GCF_002813085.1_

In [98]:
# Display the assembly accessions collected by the download loop.
processed_ac

['GCF_000762265.1', 'GCF_001458655.1', 'GCF_002813085.1']

In [3]:
# Scratch check: for a name without a dot, splitext returns the name and an empty extension.
os.path.splitext("krymbr")

('krymbr', '')