In [1]:
from __future__ import print_function, division
%matplotlib inline
import matplotlib.pylab as plt
import os, sys
import numpy as np
import pandas as pd
import os, shutil, glob
import urllib
import wget
import subprocess
import logging
from ete3 import NCBITaxa
ncbi = NCBITaxa()
from collections import OrderedDict, Counter
import time

logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s [%(name)s] %(levelname)s : %(message)s')
logger = logging.getLogger(__name__)

#sys.path.append('/misc/workspace/amromics/amromics/')
#from  amromics.utils.bioseq import read_sequence_file


In [2]:
# Root directory of the local NCBI data mirror. The NCBI_ROOT environment
# variable overrides the default so the notebook can run on another machine
# without editing this cell.
ncbi_root = os.environ.get('NCBI_ROOT', '/misc/workspace/data/ncbi')
# Base URL of the NCBI FTP server.
ncbi_url = 'ftp://ftp.ncbi.nlm.nih.gov/'

In [3]:
def download_assembly(genome_id):
    """
    Download the genome FASTA file from PATRIC unless it is already on disk.

    Reads the module-level names ``patric_root`` (local base directory) and
    ``patric_url`` (remote base URL).
    NOTE(review): neither name is defined in the visible part of this
    notebook -- they must be defined before this function is called.

    :param genome_id: genome identifier, used as directory name and file stem
    :return: path of the local ``.fna`` file if it already existed, otherwise
             whatever ``wget.download`` returns for the new download
    """
    directory = os.path.join(patric_root, 'genomes', genome_id)
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs
    os.makedirs(directory, exist_ok=True)
    genome_file = os.path.join(directory, genome_id + '.fna')
    if os.path.isfile(genome_file):
        logger.info('Genome {} has been downloaded'.format(genome_id))
        return genome_file
    genome_url = patric_url + 'genomes/' + genome_id + '/' + genome_id + '.fna'
    # Download first and log afterwards, so that a download that raises is
    # never reported as "Downloaded".
    downloaded_file = wget.download(genome_url, genome_file)
    logger.info('Downloaded {}'.format(genome_id))
    return downloaded_file


def run_command(cmd, timing_log=None):
    """
    Run a shell command line and return its exit code.

    :param cmd: command line, interpreted by the shell
    :param timing_log: optional file path; when given, the command is run
        through ``/usr/bin/time --append -v -o <timing_log>`` so the timing
        report is appended to that file
    :return: exit code reported for the command (0 on success)
    """
    import shlex  # local import: shlex is not imported at the top of the notebook

    if timing_log is not None:
        # Quote both values so that a command containing double quotes, or a
        # log path containing spaces, survives the extra ``bash -c`` layer.
        cmd = '/usr/bin/time --append -v -o {} bash -c {}'.format(
            shlex.quote(str(timing_log)), shlex.quote(cmd))
    #logger.info('Running "{}'.format(cmd))
    # subprocess.call gives the real exit code; os.system returns the raw
    # wait status on POSIX (e.g. 256 for exit code 1).
    ret = subprocess.call(cmd, shell=True)
    if ret != 0:
        logger.error('Running "{}" returns {}'.format(cmd, ret))
    return ret


def get_linages(taxon_id, linage_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']):
    """
    Resolve the lineage of a taxon through ``ncbi`` and pick out the requested ranks.

    :param taxon_id: taxon ID whose lineage is looked up
    :param linage_ranks: ranks to report, in output order
    :return: tuple ``(ids, names)`` of two lists aligned with ``linage_ranks``;
             a position whose rank does not occur in the lineage holds ``None``
    """
    lineage = ncbi.get_lineage(taxon_id)
    name_of = ncbi.get_taxid_translator(lineage)

    # Invert taxid -> rank into rank -> taxid. When a rank occurs more than
    # once in the lineage, the taxid seen last wins.
    taxid_of = {}
    for taxid, rank in ncbi.get_rank(lineage).items():
        taxid_of[rank] = taxid

    ret_ids = [taxid_of[rank] if rank in taxid_of else None
               for rank in linage_ranks]
    ret_names = [name_of[taxid_of[rank]] if rank in taxid_of else None
                 for rank in linage_ranks]
    return ret_ids, ret_names
    

def download_patric_assembly_curl(genome_id):
    """
    Fetch a PATRIC genome with curl and keep it gzip-compressed on disk.

    Three situations are handled: the ``.fna.gz`` file already exists (nothing
    to do), only the plain ``.fna`` file exists (compress it), or neither
    exists (download with curl, then compress).

    NOTE(review): ``patric_root`` and ``patric_url`` are read as globals but
    are not defined in the visible part of this notebook.

    :param genome_id: genome identifier, used as directory name and file stem
    :return: path of the ``.fna.gz`` file, or ``None`` if curl or gzip
             returned a non-zero status
    """
    target_dir = os.path.join(patric_root, 'genomes', genome_id)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    fasta_path = os.path.join(target_dir, genome_id + '.fna')
    gz_path = os.path.join(target_dir, genome_id + '.fna.gz')

    # Compressed file already present: nothing left to do.
    if os.path.isfile(gz_path):
        logger.info('Previously done {}'.format(genome_id))
        return gz_path

    if os.path.isfile(fasta_path):
        # Downloaded earlier but never compressed.
        logger.info('Partly done {}'.format(genome_id))
    else:
        # logger.info('Download')
        fasta_url = patric_url + 'genomes/' + genome_id + '/' + genome_id + '.fna'
        time.sleep(3)  # pause before every download request
        if run_command('curl -R -o {} {}'.format(fasta_path, fasta_url)) != 0:
            return None

    # Both remaining cases end with compressing the plain FASTA file.
    if run_command('gzip ' + fasta_path) != 0:
        return None
    assert os.path.isfile(gz_path)
    return gz_path

def _curl_download(url, dest, retries):
    """
    Download ``url`` to ``dest`` with curl unless ``dest`` already exists.

    :param url: remote file to fetch
    :param dest: local file path to write
    :param retries: value passed to curl's ``--retry`` option
    :return: ``dest`` if the file is present afterwards, ``None`` if curl failed
    """
    import shlex  # local import: shlex is not imported at the top of the notebook

    if os.path.isfile(dest):
        return dest
    # --fail makes curl exit non-zero on an HTTP error instead of saving the
    # server's error page under the target file name.
    cmd = 'curl --fail --retry {} -R -o {} {}'.format(
        retries, shlex.quote(dest), shlex.quote(url))
    time.sleep(1)  # short pause before every download request
    ret = run_command(cmd)
    if ret != 0:
        logger.error('Error downloading {} return {}'.format(url, ret))
        # Drop a partial file, otherwise the next run would treat it as done.
        if os.path.isfile(dest):
            os.remove(dest)
        return None
    logger.info('Downloaded {}'.format(dest))
    return dest


def download_refseq_assembly_curl(refseq_dir):
    """
    Download the genome and annotation files of a RefSeq assembly if not there already.

    The files are stored under ``ncbi_root`` in a directory that mirrors the
    path component of ``refseq_dir``. The path is taken from the parsed URL,
    so ``ftp://`` and ``https://`` locations map to the same local directory.

    :param refseq_dir: URL of the assembly directory (e.g. an ``ftp_path``
        value of the assembly summary table); a trailing ``/`` is ignored
    :return: tuple ``(genome_file, annotation_file)`` with the local paths of
        ``<id>_genomic.fna.gz`` and ``<id>_genomic.gbff.gz``; an entry is
        ``None`` when its download failed, and both are ``None`` when
        ``refseq_dir`` is not a URL
    """
    from urllib.parse import urlparse  # `import urllib` alone does not load the submodule

    refseq_dir = refseq_dir.rstrip('/')
    parsed = urlparse(refseq_dir)
    path = parsed.path.lstrip('/')
    if not parsed.scheme or not path:
        logger.error('Not a valid assembly URL: {!r}'.format(refseq_dir))
        return None, None

    # make sure directory exist
    genome_path = os.path.join(ncbi_root, path)
    os.makedirs(genome_path, exist_ok=True)
    genome_id = refseq_dir.split('/')[-1]

    genome_file = _curl_download(
        refseq_dir + '/' + genome_id + '_genomic.fna.gz',
        os.path.join(genome_path, genome_id + '_genomic.fna.gz'),
        retries=10)
    annotation_file = _curl_download(
        refseq_dir + '/' + genome_id + '_genomic.gbff.gz',
        os.path.join(genome_path, genome_id + '_genomic.gbff.gz'),
        retries=3)
    return genome_file, annotation_file


# Data preprocessing

## Get lineage information

As the lineage information from PATRIC might be outdated, we refer to NCBI to update it. As it turns out, there have been changes to the NCBI taxonomy, including taxon IDs and taxon names. In fact, we
infer the lineage information directly from the taxon ID.



In [4]:
# #gl_df = pd.read_table(os.path.join(patric_root, 'RELEASE_NOTES', 'genome_lineage'),  dtype={'genome_id': str})


# gm_df = pd.read_table(os.path.join(patric_root, 'RELEASE_NOTES', 'genome_metadata'), dtype={'genome_id': str})
# #note that i decided to get the linage information instead of taking from gl
# linages = gm_df['taxon_id'].apply(lambda x:get_linages(x))
# linage_ids, linage_names  = zip(*linages)
# superkingdom_names, phylum_names, class_names, order_names, family_names, genus_names, species_names = zip(*linage_names)
# superkingdom_ids, phylum_ids, class_ids, order_ids, family_ids, genus_ids, species_ids = zip(*linage_ids)

# gm_df['species_name'] = species_names
# gm_df['genus_name'] = genus_names
# gm_df['species_id'] = species_ids
# gm_df['genus_id'] = genus_ids

# #gm_df['family'] = family_names
# #gm_df['order'] = order_names
# #gm_df['phylum'] = phylum_names
# #gm_df['domain'] = superkingdom_names

# #(gm_df['domain'] == 'Bacteria')
# #gm_df = gm_df[(gm_df['genome_status'] != 'Plasmid') & (gm_df['contigs'] < 500) ]
# print(len(gm_df))

We group the data by species and pick the relevant species. Note that many Salmonella taxa do not have a
species name, and hence we get those from the genus.

In [5]:
# Look at the list of most common species, and found:
# Counter(gm_df['species']).most_common()
# The trailing "# <number>)," on each entry is left over from that output.
# Entries that are commented out are not selected.
# NOTE(review): amr_species is assigned again further down in this cell, so
# this first list is not the one later cells see.
amr_species = [    
    # 'Mycobacterium tuberculosis', # 27993),
    'Escherichia coli', # 23309),
    'Streptococcus pneumoniae', # 20697),
    'Salmonella enterica', # 18169),
    'Staphylococcus aureus', # 12878),
    'Klebsiella pneumoniae', # 12246),
    'Acinetobacter baumannii', # 6936),
    # None, # 5303),
    'Pseudomonas aeruginosa', # 5263),
    'Neisseria gonorrhoeae', # 5096),
    # 'Listeria monocytogenes', # 4111),
    # 'Campylobacter jejuni', # 2973),
    # 'uncultured Pelagibacteraceae bacterium', # 2390),
    # 'Clostridioides difficile', # 2345),
     'Enterococcus faecium', # 2189),
    # 'Streptococcus pyogenes', # 2063),
    # 'Neisseria meningitidis', # 2001),
    # 'Mycobacteroides abscessus', # 1734),
    # 'Campylobacter coli', # 1724),
    # 'Helicobacter pylori', # 1644),
    # 'Burkholderia pseudomallei', # 1615),
    'Enterococcus faecalis', # 1564),
    # 'Pseudomonas viridiflava', # 1540),
    'Shigella sonnei', # 1532),
    # 'Vibrio parahaemolyticus', # 1474),
    # 'Streptococcus agalactiae', # 1355),
    'Vibrio cholerae', # 1350),
    'Streptococcus suis', # 1319),
    # 'Bacillus cereus', # 1252),
    'Enterobacter cloacae', # 1187)
    'Corynebacterium diphtheriae',
]
#gm_df = gm_df[(gm_df['genus_name'] == 'Salmonella') | (gm_df['species_name'].isin(amr_species))]


# Species actually selected: this assignment replaces the amr_species list
# defined earlier in this cell, leaving only the three uncommented entries.
# NOTE(review): the '#=#' and '###' prefixes appear to mark entries that were
# enabled in the earlier list and disabled here -- confirm with the author.
amr_species = [    
    # 'Mycobacterium tuberculosis', # 27993),
   'Escherichia coli', # 23309),
#=#    'Streptococcus pneumoniae', # 20697),
#=#     'Salmonella enterica', # 18169),
    'Staphylococcus aureus', # 12878),
    'Klebsiella pneumoniae', # 12246),
#=#     'Acinetobacter baumannii', # 6936),
    # None, # 5303),
#=#     'Pseudomonas aeruginosa', # 5263),
#=#     'Neisseria gonorrhoeae', # 5096),
    # 'Listeria monocytogenes', # 4111),
    # 'Campylobacter jejuni', # 2973),
    # 'uncultured Pelagibacteraceae bacterium', # 2390),
    # 'Clostridioides difficile', # 2345),
    # 'Enterococcus faecium', # 2189),
    # 'Streptococcus pyogenes', # 2063),
    # 'Neisseria meningitidis', # 2001),
    # 'Mycobacteroides abscessus', # 1734),
    # 'Campylobacter coli', # 1724),
    # 'Helicobacter pylori', # 1644),
    # 'Burkholderia pseudomallei', # 1615),
#=#     'Enterococcus faecalis', # 1564),
    # 'Pseudomonas viridiflava', # 1540),
#=#     'Shigella sonnei', # 1532),
    # 'Vibrio parahaemolyticus', # 1474),
    # 'Streptococcus agalactiae', # 1355),
    # 'Vibrio cholerae', # 1350),
#=#     'Streptococcus suis', # 1319),
    # 'Bacillus cereus', # 1252),
    # 'Enterobacter cloacae', # 1187)
###    'Corynebacterium diphtheriae',
]


In [6]:
refseq_assemly = os.path.join(ncbi_root, 'assembly_summary_refseq.txt')

# header=1: the column names are taken from the second line of the file. The
# first column is named '# assembly_accession' there, hence the rename.
refseq_df = pd.read_csv(refseq_assemly, sep='\t', header=1)
refseq_df = refseq_df.rename({'# assembly_accession':'assembly_accession'}, axis=1)

# Resolve every distinct species taxid once instead of once per assembly row.
all_species_ids = set(refseq_df['species_taxid'])
species_map = {}
unresolved_ids = []
for taxon_id in all_species_ids:
    try:
        # With the default rank order of get_linages, index 0 is the
        # superkingdom, index 5 the genus and index 6 the species.
        lineage_names = get_linages(taxon_id)[1]
        species_map[taxon_id] = (lineage_names[6], lineage_names[5], lineage_names[0])
    except Exception:
        # Best effort: a taxid that cannot be resolved gets empty names.
        # Unlike a bare `except:`, this lets KeyboardInterrupt/SystemExit
        # through, so the loop can still be interrupted.
        species_map[taxon_id] = (None, None, None)
        unresolved_ids.append(taxon_id)

if unresolved_ids:
    logger.warning('Could not resolve the lineage of {} of {} species taxids'.format(
        len(unresolved_ids), len(all_species_ids)))

refseq_df['species_name'] = refseq_df['species_taxid'].apply(lambda x:species_map[x][0])
refseq_df['genus_name']   = refseq_df['species_taxid'].apply(lambda x:species_map[x][1])
refseq_df['domain_name']  = refseq_df['species_taxid'].apply(lambda x:species_map[x][2])

#refseq_df = refseq_df[(refseq_df['genus_name'] == 'Salmonella') | (refseq_df['species_name'].isin(amr_species))].reset_index()
# Keep only the assemblies of the selected species; reset_index() keeps the
# original row label in a new 'index' column.
df = refseq_df[(refseq_df['species_name'].isin(amr_species))].reset_index()

# Number of assemblies per selected species
Counter(df['species_name'])

  exec(code_obj, self.user_global_ns, self.user_ns)








Counter({'Escherichia coli': 26392,
         'Staphylococcus aureus': 13556,
         'Klebsiella pneumoniae': 12123})

In [7]:
# Split the selected assemblies by assembly level.
complete_df = df.loc[df['assembly_level'] == 'Complete Genome']
not_complete_df = df.loc[df['assembly_level'] != 'Complete Genome']
#[['assembly_accession', 'biosample', 'infraspecific_name','seq_rel_date', 'asm_name', 'ftp_path']]#.to_csv(None, index=False)
# Of the non-complete assemblies, keep those whose biosample also appears
# among the complete genomes.
not_complete_df = not_complete_df.loc[
    not_complete_df['biosample'].isin(complete_df['biosample'])
]

not_complete_df

Unnamed: 0,index,assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,...,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,asm_not_live_date,species_name,genus_name,domain_name
121,1996,GCF_000166595.1,PRJNA224116,SAMN02436016,AELD00000000.1,na,913088,562,Escherichia coli TW11681,strain=TW11681,...,Institute for Genome Sciences,GCA_000166595.2,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Escherichia coli,Escherichia,Bacteria
2156,14702,GCF_000513035.1,PRJNA224116,SAMN02304041,AVCD00000000.1,na,1343836,562,Escherichia coli O157:H7 str. F8092B,strain=F8092B,...,USDA,GCA_000513035.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Escherichia coli,Escherichia,Bacteria
3989,17239,GCF_000567725.1,PRJNA224116,SAMN02138579,JCNV00000000.1,na,1328373,573,Klebsiella pneumoniae MGH 39,strain=MGH 39,...,Broad Institute,GCA_000567725.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella pneumoniae,Klebsiella,Bacteria
6651,24704,GCF_000735285.1,PRJNA224116,SAMN02768808,JPIQ00000000.1,na,573,573,Klebsiella pneumoniae,strain=IA565,...,University of Michigan,GCA_000735285.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella pneumoniae,Klebsiella,Bacteria
6744,25961,GCF_000770035.1,PRJNA224116,SAMN02951658,JPUT00000000.1,na,562,562,Escherichia coli,strain=2011EL-1370-2,...,Centers for Disease Control and Prevention,GCA_000770035.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Escherichia coli,Escherichia,Bacteria
9738,54076,GCF_001462715.1,PRJNA224116,SAMN04158281,LNGY00000000.1,na,562,562,Escherichia coli,strain=K71-77,...,Norwegian National Advisory Unit on Detection ...,GCA_001462715.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Escherichia coli,Escherichia,Bacteria
9739,54077,GCF_001462735.1,PRJNA224116,SAMN04158282,LNGZ00000000.1,na,573,573,Klebsiella pneumoniae,strain=K66-45,...,Norwegian National Advisory Unit on Detection ...,GCA_001462735.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella pneumoniae,Klebsiella,Bacteria
9748,54086,GCF_001462925.1,PRJNA224116,SAMN04158294,LNHL00000000.1,na,562,562,Escherichia coli,strain=50579417,...,Norwegian National Advisory Unit on Detection ...,GCA_001462925.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Escherichia coli,Escherichia,Bacteria
10949,64627,GCF_001728785.1,PRJNA224116,SAMN04191551,LYVB00000000.1,na,562,562,Escherichia coli,strain=FRIK2069,...,US Food and Drug Administration,GCA_001728785.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Escherichia coli,Escherichia,Bacteria
19094,101926,GCF_003008295.1,PRJNA224116,SAMN08637771,PVPK00000000.1,na,562,562,Escherichia coli,strain=BA22372,...,Christian Medical College,GCA_003008295.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Escherichia coli,Escherichia,Bacteria


In [14]:
# for i in range(1):
#     count = 0
#     for ii,row in complete_df.iterrows():    
#         genome_id = row['assembly_accession'] + '_' + row['asm_name']
#         genome_id = genome_id.replace(':','_')
#         genome_id = genome_id.replace(' ','_')
#         genome_id = genome_id.replace('/','_')
#         genome_id = genome_id.replace('#','_')

#         genome_file, annotation_file = download_refseq_assembly_curl(row['ftp_path'])
#         if (genome_file is None) or (annotation_file is None):        
#             break
#         # print(genome_id, row['species_name'])
#         count += 1
#     print(count)
#     #time.sleep(120)

470


In [42]:
#look at pyspoa notebook to see how to align a test sequence into the msa