In [1]:
from __future__ import print_function, division
# %matplotlib inline
# import matplotlib.pylab as plt
import os, sys
import numpy as np
import pandas as pd
import os, shutil, glob
import urllib
import wget
import subprocess
import logging
from ete3 import NCBITaxa
ncbi = NCBITaxa()
from collections import OrderedDict, Counter
import time

# Emit DEBUG-and-above records; every line carries a timestamp, the logger name and the level.
logging.basicConfig(level=logging.DEBUG,
                    format='%(asctime)s [%(name)s] %(levelname)s : %(message)s')
# Logger used by the helper functions defined in this notebook.
logger = logging.getLogger(__name__)

#sys.path.append('/misc/workspace/amromics/amromics/')
#from  amromics.utils.bioseq import read_sequence_file


In [2]:
# Local root directory for NCBI data: the assembly summary is read from here and
# downloaded genome/annotation files are written below it.
ncbi_root = '/data/hoan/amromics/data/ncbi/'
# URL prefix of the NCBI FTP site; used by the RefSeq download helper to turn a
# remote directory URL into a path relative to `ncbi_root`.
# NOTE(review): the `ftp_path` values displayed later in this notebook start with
# 'https://ftp.ncbi.nlm.nih.gov/', not with this 'ftp://' prefix -- confirm which
# scheme is intended.
ncbi_url = 'ftp://ftp.ncbi.nlm.nih.gov/'

In [3]:
def download_assembly(genome_id):
    """
    Download the genome FASTA file of ``genome_id`` from PATRIC unless it is already on disk.

    The file is stored as ``<patric_root>/genomes/<genome_id>/<genome_id>.fna``.

    NOTE(review): ``patric_root`` and ``patric_url`` are not defined in the visible
    part of this notebook; they must exist in the kernel namespace before this
    function is called.

    :param genome_id: genome identifier; used for the directory, the file name and the URL
    :return: path of the existing local file, or the value returned by ``wget.download``
    """
    # Make sure the target directory exists (no error when it is already there).
    directory = os.path.join(patric_root, 'genomes', genome_id)
    os.makedirs(directory, exist_ok=True)
    genome_file = os.path.join(directory, genome_id + '.fna')
    if os.path.isfile(genome_file):
        logger.info('Genome {} has been downloaded'.format(genome_id))
        return genome_file
    genome_url = patric_url + 'genomes/' + genome_id + '/' + genome_id + '.fna'
    downloaded_file = wget.download(genome_url, genome_file)
    # Log only after wget has returned, so the message is never emitted for a
    # download that raised.
    logger.info('Downloaded {}'.format(genome_id))
    return downloaded_file


def run_command(cmd, timing_log=None):
    """
    Run a shell command line and return its exit code.

    :param cmd: command line, interpreted by a shell
    :param timing_log: optional file path; when given, the command is run by bash
        under ``/usr/bin/time -v`` and the resource report is appended to this file
    :return: exit code of the command (0 on success, 127 if the timing wrapper
        could not be started)
    """
    if timing_log is not None:
        # Pass an argument vector instead of formatting cmd into 'bash -c "..."':
        # a command that itself contains double quotes would otherwise be cut
        # apart by the outer shell.
        args = ['/usr/bin/time', '--append', '-v', '-o', str(timing_log),
                'bash', '-c', cmd]
        try:
            ret = subprocess.call(args)
        except OSError as err:
            # e.g. /usr/bin/time is not installed; report it as a failed command
            # instead of raising, as callers only inspect the return value.
            logger.error('Running "{}" failed to start: {}'.format(cmd, err))
            return 127
    else:
        # subprocess.call returns the real exit code, whereas os.system returns
        # an encoded wait status on POSIX.
        ret = subprocess.call(cmd, shell=True)
    if ret != 0:
        logger.error('Running "{}" returns {}'.format(cmd, ret))
    return ret


def get_linages(taxon_id, linage_ranks = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']):
    """
    Look up the lineage of ``taxon_id`` in the NCBI taxonomy and pick out selected ranks.

    :param taxon_id: taxon id passed to ``ncbi.get_lineage``
    :param linage_ranks: rank names to report, in output order
    :return: tuple ``(ids, names)``; both lists are aligned with ``linage_ranks`` and
        hold ``None`` where the lineage has no entry of that rank
    """
    lineage = ncbi.get_lineage(taxon_id)
    name_of = ncbi.get_taxid_translator(lineage)
    # Invert taxid -> rank into rank -> taxid; a later taxid with the same rank wins.
    taxid_of_rank = {rank: taxid for taxid, rank in ncbi.get_rank(lineage).items()}

    ret_ids = [taxid_of_rank.get(rank) for rank in linage_ranks]
    ret_names = [None if taxid is None else name_of[taxid] for taxid in ret_ids]
    return ret_ids, ret_names
    

def download_patric_assembly_curl(genome_id):
    """
    Fetch the PATRIC genome FASTA of ``genome_id`` with curl and keep it gzip-compressed.

    Work already on disk is reused: an existing ``.fna.gz`` is returned as is, and
    an existing uncompressed ``.fna`` is only compressed.

    :param genome_id: genome identifier; used for the directory, the file name and the URL
    :return: path of the ``.fna.gz`` file, or ``None`` when curl or gzip failed
    """
    # Make sure the target directory exists.
    target_dir = os.path.join(patric_root, 'genomes', genome_id)
    if not os.path.exists(target_dir):
        os.makedirs(target_dir)
    fasta_path = os.path.join(target_dir, genome_id + '.fna')
    gzipped_path = os.path.join(target_dir, genome_id + '.fna.gz')

    # Already compressed: nothing left to do.
    if os.path.isfile(gzipped_path):
        logger.info('Previously done {}'.format(genome_id))
        return gzipped_path

    if os.path.isfile(fasta_path):
        # Downloaded earlier but not compressed yet.
        logger.info('Partly done {}'.format(genome_id))
    else:
        source_url = patric_url + 'genomes/' + genome_id + '/' + genome_id + '.fna'
        time.sleep(3)
        if run_command('curl -R -o {} {}'.format(fasta_path, source_url)) != 0:
            return None

    # Both remaining paths end with compressing the plain FASTA file.
    if run_command('gzip ' + fasta_path) != 0:
        return None
    assert os.path.isfile(gzipped_path)
    return gzipped_path

def download_refseq_assembly_curl(refseq_dir):
    """
    Download the genome (``_genomic.fna.gz``) and GenBank annotation
    (``_genomic.gbff.gz``) files of a RefSeq assembly with curl.

    Files are stored under ``ncbi_root`` in a directory that mirrors the remote
    path; a file that is already on disk is not downloaded again.

    :param refseq_dir: URL of the remote assembly directory; its last path
        component is used as the assembly id in the file names
    :return: tuple ``(genome_file, annotation_file)`` of local paths; an entry is
        ``None`` when its download failed
    """
    refseq_dir = refseq_dir.rstrip('/')  # a trailing '/' would give an empty assembly id
    if refseq_dir.startswith(ncbi_url):
        path = refseq_dir[len(ncbi_url):]
    elif '://' in refseq_dir:
        # Different scheme than ncbi_url (e.g. https:// instead of ftp://): drop
        # "<scheme>://<host>/" instead of blindly cutting len(ncbi_url) characters,
        # which would leave a stray prefix in the local path.
        path = refseq_dir.split('://', 1)[1].partition('/')[2]
    else:
        path = refseq_dir
    # lstrip: an absolute component would make os.path.join discard ncbi_root.
    genome_path = os.path.join(ncbi_root, path.lstrip('/'))
    os.makedirs(genome_path, exist_ok=True)
    genome_id = refseq_dir.split('/')[-1]

    def _fetch(url, local_file, retries):
        """Download url to local_file unless it exists; return the path or None on failure."""
        if os.path.isfile(local_file):
            return local_file
        # --fail: without it curl exits 0 on an HTTP error such as 404 and saves
        # the error page under the .gz name.
        cmd = 'curl --fail --retry {} -R -o {} {}'.format(retries, local_file, url)
        time.sleep(1)  # pause before each request (presumably to throttle; confirm)
        ret = run_command(cmd)
        if ret != 0:
            logger.error('Error downloading {} return {}'.format(url, ret))
            # Remove a partial file so that the next run does not mistake it
            # for a finished download.
            if os.path.isfile(local_file):
                os.remove(local_file)
            return None
        logger.info('Downloaded {}'.format(local_file))
        return local_file

    genome_file = _fetch(refseq_dir + '/' + genome_id + '_genomic.fna.gz',
                         os.path.join(genome_path, genome_id + '_genomic.fna.gz'),
                         10)
    annotation_file = _fetch(refseq_dir + '/' + genome_id + '_genomic.gbff.gz',
                             os.path.join(genome_path, genome_id + '_genomic.gbff.gz'),
                             3)
    return genome_file, annotation_file

In [4]:
def download_refseq_assembly_curl_gff(refseq_dir, sample_id=""):
    """
    Download the genome (``_genomic.fna.gz``) and GFF annotation
    (``_genomic.gff.gz``) files of a RefSeq assembly with curl.

    Both files are stored in ``<ncbi_root>/<sample_id>``; a file that is already
    on disk is not downloaded again.

    :param refseq_dir: URL of the remote assembly directory; its last path
        component is used as the assembly id in the file names
    :param sample_id: sub-directory of ``ncbi_root`` to store the files in; the
        default ``""`` stores them directly in ``ncbi_root``
    :return: tuple ``(genome_file, annotation_file)`` of local paths; an entry is
        ``None`` when its download failed
    """
    refseq_dir = refseq_dir.rstrip('/')  # a trailing '/' would give an empty assembly id
    genome_path = os.path.join(ncbi_root, sample_id)
    os.makedirs(genome_path, exist_ok=True)
    genome_id = refseq_dir.split('/')[-1]

    def _fetch(url, local_file, retries):
        """Download url to local_file unless it exists; return the path or None on failure."""
        if os.path.isfile(local_file):
            return local_file
        # --fail: without it curl exits 0 on an HTTP error such as 404 and saves
        # the error page under the .gz name.
        cmd = 'curl --fail --retry {} -R -o {} {}'.format(retries, local_file, url)
        time.sleep(1)  # pause before each request (presumably to throttle; confirm)
        ret = run_command(cmd)
        if ret != 0:
            logger.error('Error downloading {} return {}'.format(url, ret))
            # Remove a partial file so that the next run does not mistake it
            # for a finished download.
            if os.path.isfile(local_file):
                os.remove(local_file)
            return None
        logger.info('Downloaded {}'.format(local_file))
        return local_file

    genome_file = _fetch(refseq_dir + '/' + genome_id + '_genomic.fna.gz',
                         os.path.join(genome_path, genome_id + '_genomic.fna.gz'),
                         10)
    annotation_file = _fetch(refseq_dir + '/' + genome_id + '_genomic.gff.gz',
                             os.path.join(genome_path, genome_id + '_genomic.gff.gz'),
                             3)
    return genome_file, annotation_file

# Data preprocessing

## Get lineage information

As the lineage information from PATRIC may be outdated, we refer to NCBI to update it. As it turns out, there are changes to the NCBI taxonomy, including taxon IDs and taxon names. In fact, we
infer lineage information directly from the taxon ID.



In [5]:
# #gl_df = pd.read_table(os.path.join(patric_root, 'RELEASE_NOTES', 'genome_lineage'),  dtype={'genome_id': str})


# gm_df = pd.read_table(os.path.join(patric_root, 'RELEASE_NOTES', 'genome_metadata'), dtype={'genome_id': str})
# #note that i decided to get the linage information instead of taking from gl
# linages = gm_df['taxon_id'].apply(lambda x:get_linages(x))
# linage_ids, linage_names  = zip(*linages)
# superkingdom_names, phylum_names, class_names, order_names, family_names, genus_names, species_names = zip(*linage_names)
# superkingdom_ids, phylum_ids, class_ids, order_ids, family_ids, genus_ids, species_ids = zip(*linage_ids)

# gm_df['species_name'] = species_names
# gm_df['genus_name'] = genus_names
# gm_df['species_id'] = species_ids
# gm_df['genus_id'] = genus_ids

# #gm_df['family'] = family_names
# #gm_df['order'] = order_names
# #gm_df['phylum'] = phylum_names
# #gm_df['domain'] = superkingdom_names

# #(gm_df['domain'] == 'Bacteria')
# #gm_df = gm_df[(gm_df['genome_status'] != 'Plasmid') & (gm_df['contigs'] < 500) ]
# print(len(gm_df))

We group the data by species and pick the relevant species. Note that many Salmonella taxa do not have a
species name, and hence we select those by genus.

In [6]:
# Candidate species list. The trailing numbers look like per-species genome counts
# taken from the Counter call mentioned below -- TODO confirm.
# NOTE: this list is overwritten by the second `amr_species` assignment later in
# this cell, so it has no effect on the cells that follow.
#Look at the list of most common species, and found:
# Counter(gm_df['species']).most_common()
amr_species = [
    # 'Mycobacterium tuberculosis', # 27993),
    'Escherichia coli', # 23309),
    'Streptococcus pneumoniae', # 20697),
    'Salmonella enterica', # 18169),
    'Staphylococcus aureus', # 12878),
    'Klebsiella pneumoniae', # 12246),
    'Acinetobacter baumannii', # 6936),
    # None, # 5303),
    'Pseudomonas aeruginosa', # 5263),
    'Neisseria gonorrhoeae', # 5096),
    # 'Listeria monocytogenes', # 4111),
    # 'Campylobacter jejuni', # 2973),
    # 'uncultured Pelagibacteraceae bacterium', # 2390),
    # 'Clostridioides difficile', # 2345),
     'Enterococcus faecium', # 2189),
    # 'Streptococcus pyogenes', # 2063),
    # 'Neisseria meningitidis', # 2001),
    # 'Mycobacteroides abscessus', # 1734),
    # 'Campylobacter coli', # 1724),
    # 'Helicobacter pylori', # 1644),
    # 'Burkholderia pseudomallei', # 1615),
    'Enterococcus faecalis', # 1564),
    # 'Pseudomonas viridiflava', # 1540),
    'Shigella sonnei', # 1532),
    # 'Vibrio parahaemolyticus', # 1474),
    # 'Streptococcus agalactiae', # 1355),
    'Vibrio cholerae', # 1350),
    'Streptococcus suis', # 1319),
    # 'Bacillus cereus', # 1252),
    'Enterobacter cloacae', # 1187)
    'Corynebacterium diphtheriae',
]
#gm_df = gm_df[(gm_df['genus_name'] == 'Salmonella') | (gm_df['species_name'].isin(amr_species))]


# Active species selection: this assignment replaces any earlier `amr_species`
# value, and only the uncommented entries are used by the species filter below.
# Every entry except 'Klebsiella quasipneumoniae' is commented out.
amr_species = [
    # 'Mycobacterium tuberculosis', # 27993),
   # 'Escherichia coli', # 23309),
#=#    'Streptococcus pneumoniae', # 20697),
     # 'Salmonella enterica', # 18169),
    # 'Staphylococcus aureus', # 12878),
    # 'Klebsiella pneumoniae', # 12246),
#=#     'Acinetobacter baumannii', # 6936),
    # None, # 5303),
#=#     'Pseudomonas aeruginosa', # 5263),
#=#     'Neisseria gonorrhoeae', # 5096),
    # 'Listeria monocytogenes', # 4111),
    # 'Campylobacter jejuni', # 2973),
    # 'uncultured Pelagibacteraceae bacterium', # 2390),
    # 'Clostridioides difficile', # 2345),
    # 'Enterococcus faecium', # 2189),
    # 'Streptococcus pyogenes', # 2063),
    # 'Neisseria meningitidis', # 2001),
    # 'Mycobacteroides abscessus', # 1734),
    # 'Campylobacter coli', # 1724),
    # 'Helicobacter pylori', # 1644),
    # 'Burkholderia pseudomallei', # 1615),
#=#     'Enterococcus faecalis', # 1564),
    # 'Pseudomonas viridiflava', # 1540),
#=#     'Shigella sonnei', # 1532),
    # 'Vibrio parahaemolyticus', # 1474),
    # 'Streptococcus agalactiae', # 1355),
    # 'Vibrio cholerae', # 1350),
#=#     'Streptococcus suis', # 1319),
    # 'Bacillus cereus', # 1252),
    # 'Enterobacter cloacae', # 1187)
###    'Corynebacterium diphtheriae',
    # 'Cereibacter sphaeroides',
    # 'Staphylococcus aureus'
    # 'Campylobacter jejuni',
    # 'Campylobacter coli',
    'Klebsiella quasipneumoniae'
]


In [7]:
# Load the RefSeq assembly summary stored under ncbi_root.
refseq_assemly = os.path.join(ncbi_root, 'assembly_summary_refseq.txt')

# header=1: the column names are taken from the second line of the file.
refseq_df = pd.read_csv(refseq_assemly, sep='\t', header=1)
# Strip the leading '# ' from the first column name.
refseq_df = refseq_df.rename({'# assembly_accession': 'assembly_accession'}, axis=1)

# Resolve each distinct species taxid once and reuse the result for every row.
all_species_ids = set(refseq_df['species_taxid'])
species_map = {}
unresolved_taxids = []
for taxon_id in all_species_ids:
    try:
        _, lineage_names = get_linages(taxon_id)
        # Positions follow the default rank order of get_linages:
        # 0 = superkingdom, 5 = genus, 6 = species.
        species = lineage_names[6]
        genus = lineage_names[5]
        domain = lineage_names[0]
        species_map[taxon_id] = (species, genus, domain)
    except Exception:
        # Lookup failed for this taxid: keep the row with empty names, but
        # remember it so that failures are reported rather than silently hidden.
        # (A bare `except:` would also swallow KeyboardInterrupt.)
        unresolved_taxids.append(taxon_id)
        species_map[taxon_id] = (None, None, None)

if unresolved_taxids:
    logger.warning('Could not resolve the lineage of {} out of {} species taxids'.format(
        len(unresolved_taxids), len(all_species_ids)))

refseq_df['species_name'] = refseq_df['species_taxid'].apply(lambda x: species_map[x][0])
refseq_df['genus_name'] = refseq_df['species_taxid'].apply(lambda x: species_map[x][1])
refseq_df['domain_name'] = refseq_df['species_taxid'].apply(lambda x: species_map[x][2])

#refseq_df = refseq_df[(refseq_df['genus_name'] == 'Salmonella') | (refseq_df['species_name'].isin(amr_species))].reset_index()
# Keep only the assemblies of the selected species.
df = refseq_df[(refseq_df['species_name'].isin(amr_species))].reset_index()

# Number of assemblies per selected species.
Counter(df['species_name'])

  exec(code_obj, self.user_global_ns, self.user_ns)


Counter({'Klebsiella quasipneumoniae': 750})

In [8]:
# Split the selected assemblies by assembly level.
complete_df = df.loc[df['assembly_level'] == 'Complete Genome']
# Of the non-complete assemblies keep only those whose biosample also has a
# complete genome.
not_complete_df = df.loc[
    (df['assembly_level'] != 'Complete Genome')
    & df['biosample'].isin(complete_df['biosample'])
]

not_complete_df.head(3)

Unnamed: 0,index,assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,...,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,asm_not_live_date,species_name,genus_name,domain_name


In [9]:
complete_df.shape

(87, 27)

In [10]:
complete_df

Unnamed: 0,index,assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,...,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,asm_not_live_date,species_name,genus_name,domain_name
16,46567,GCF_001278905.1,PRJNA224116,SAMN03945398,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=HKUOPLA,...,University of Hong Kong,GCA_001278905.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
17,46661,GCF_001280925.1,PRJNA224116,SAMN03955386,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=HKUOPLC,...,University of Hong Kong,GCA_001280925.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
21,59203,GCF_001596075.2,PRJNA224116,SAMN04382091,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=ATCC 700603,...,The Unniversity of Queensland,GCA_001596075.2,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
40,80868,GCF_002187935.3,PRJNA224116,SAMN05960914,,na,1463164,1463165,Klebsiella quasipneumoniae subsp. similipneumo...,strain=G747,...,JCVI,GCA_002187935.3,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
46,83075,GCF_002239895.2,PRJNA224116,SAMN05960932,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=G4584,...,JCVI,GCA_002239895.2,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,237443,GCF_024734295.1,PRJNA224116,SAMN30333744,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=K18-45,...,Henan Agricultural University,GCA_024734295.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
514,238526,GCF_024917695.1,PRJNA224116,SAMN28174098,,na,1667327,1463165,Klebsiella quasipneumoniae subsp. quasipneumoniae,strain=5463,...,University of Texas MD Anderson Cancer Center,GCA_024917695.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
691,263163,GCF_900635925.1,PRJNA224116,SAMEA3251433,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=NCTC11357,...,SC,GCA_900635925.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
734,265867,GCF_901422025.1,PRJNA224116,SAMEA3368328,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=NCTC9170,...,SC,GCA_901422025.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria


In [25]:
# for i in range(1):
#     count = 0
#     for ii,row in complete_df.iterrows():    
#         genome_id = row['assembly_accession'] + '_' + row['asm_name']
#         genome_id = genome_id.replace(':','_')
#         genome_id = genome_id.replace(' ','_')
#         genome_id = genome_id.replace('/','_')
#         genome_id = genome_id.replace('#','_')

#         genome_file, annotation_file = complete_df(row['ftp_path'], 'complete/Kp30')
#         # if (genome_file is None) or (annotation_file is None):        
#         #     break
#         # print(genome_id, row['species_name'])
#         count += 1
#         print(count)
#         if count > 30:
#             break

In [26]:
# complete_df.shape
# for i in range(len(complete_df.index)):
#     genome_file, annotation_file = download_refseq_assembly_curl_gff(complete_df.iloc[i,:]['ftp_path'], 'Cereibacter_sphaeroides_full')

In [11]:
import random
random.seed(0)
# Randomly ordered row positions of complete_df. The sample size is derived from
# the frame instead of being hard-coded, so the cell keeps working when the
# number of complete genomes changes; pass a smaller size to sub-sample.
n_complete = len(complete_df.index)
rindex = random.sample(range(n_complete), n_complete)
print(rindex)

[49, 53, 5, 33, 65, 62, 51, 38, 61, 45, 74, 27, 64, 17, 36, 73, 12, 32, 68, 18, 39, 70, 9, 42, 30, 35, 6, 22, 75, 20, 66, 40, 13, 78, 81, 28, 83, 16, 3, 85, 0, 84, 25, 63, 55, 46, 56, 31, 21, 15, 57, 4, 82, 14, 37, 76, 44, 71, 50, 2, 58, 10, 86, 69, 48, 43, 29, 79, 59, 72, 52, 60, 34, 19, 8, 54, 47, 7, 1, 23, 26, 24, 80, 11, 41, 67, 77]


In [28]:
# count = 1
# for i in range(len(complete_df.index)):
#     for j in range(len(not_complete_df.index)):
#         if not_complete_df.iloc[j,3]==complete_df.iloc[i,3]:
#             # print(count, "/", i, ":",j, end="---")
#             # print (complete_df.iloc[i, 3],":", not_complete_df.iloc[j,3])
#             cml = 'python /data/hoan/amromics/panta/panta.py -p init -a /data/hoan/amromics/data/ncbi/complete/'+\
#             not_complete_df.iloc[j,3] +'/*.fna.gz -o /data/hoan/amromics/data/ncbi/complete/'+\
#             not_complete_df.iloc[j,3] + '/output -as -s'
#             # print(cml)
#             print (complete_df.iloc[i, 3],",") 
#             # run_command(cml)
#             # genome_file, annotation_file = download_refseq_assembly_curl_gff(complete_df.iloc[i,:]['ftp_path'], 'complete/'+complete_df.iloc[i, 3])
#             # genome_file, annotation_file = download_refseq_assembly_curl_gff(not_complete_df.iloc[j,:]['ftp_path'], 'complete/'+not_complete_df.iloc[j, 3])
#             genome_file, annotation_file = download_refseq_assembly_curl_gff(not_complete_df.iloc[j,:]['ftp_path'], 'incomplete')
#             count += 1
# # not_complete_df.iloc[22, 3]

In [12]:
# Fetch genome + GFF of every selected complete assembly into <ncbi_root>/Kq_full.
for i in rindex:
    ftp_path = complete_df['ftp_path'].iloc[i]
    genome_file, annotation_file = download_refseq_assembly_curl_gff(ftp_path, 'Kq_full')

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1612k  100 1612k    0     0  1045k      0  0:00:01  0:00:01 --:--:-- 1045k
2023-04-19 16:42:14,379 [__main__] INFO : Downloaded /data/hoan/amromics/data/ncbi/Kq_full/GCF_017743035.1_ASM1774303v1_genomic.fna.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  455k  100  455k    0     0   401k      0  0:00:01  0:00:01 --:--:--  402k
2023-04-19 16:42:16,546 [__main__] INFO : Downloaded /data/hoan/amromics/data/ncbi/Kq_full/GCF_017743035.1_ASM1774303v1_genomic.gff.gz
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100 1557k  100 1557k    0     0  1198k      0  0:00:01  0:00:01 --:--:-- 1197k
2023-04-19 16:42:18

In [13]:
complete_df

Unnamed: 0,index,assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,...,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,asm_not_live_date,species_name,genus_name,domain_name
16,46567,GCF_001278905.1,PRJNA224116,SAMN03945398,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=HKUOPLA,...,University of Hong Kong,GCA_001278905.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
17,46661,GCF_001280925.1,PRJNA224116,SAMN03955386,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=HKUOPLC,...,University of Hong Kong,GCA_001280925.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
21,59203,GCF_001596075.2,PRJNA224116,SAMN04382091,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=ATCC 700603,...,The Unniversity of Queensland,GCA_001596075.2,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
40,80868,GCF_002187935.3,PRJNA224116,SAMN05960914,,na,1463164,1463165,Klebsiella quasipneumoniae subsp. similipneumo...,strain=G747,...,JCVI,GCA_002187935.3,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
46,83075,GCF_002239895.2,PRJNA224116,SAMN05960932,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=G4584,...,JCVI,GCA_002239895.2,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
508,237443,GCF_024734295.1,PRJNA224116,SAMN30333744,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=K18-45,...,Henan Agricultural University,GCA_024734295.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
514,238526,GCF_024917695.1,PRJNA224116,SAMN28174098,,na,1667327,1463165,Klebsiella quasipneumoniae subsp. quasipneumoniae,strain=5463,...,University of Texas MD Anderson Cancer Center,GCA_024917695.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
691,263163,GCF_900635925.1,PRJNA224116,SAMEA3251433,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=NCTC11357,...,SC,GCA_900635925.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria
734,265867,GCF_901422025.1,PRJNA224116,SAMEA3368328,,na,1463165,1463165,Klebsiella quasipneumoniae,strain=NCTC9170,...,SC,GCA_901422025.1,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/9...,,,na,Klebsiella quasipneumoniae,Klebsiella,Bacteria


In [14]:
#look at pyspoa notebook to see how to align a test sequence into the msa

### Find sequences that have the same biosample: match samples so that the complete sequence can be used as a reference for the incomplete one.

In [None]:
# not_complete_df.iloc[3,:]['ftp_path']

In [None]:
# Pair every complete assembly with the non-complete assemblies of the same biosample.
# Columns are addressed by name (not by position 3) and pulled out once as plain
# lists, so the nested loop does not build a pandas row object per comparison.
shared_samples_list = []
complete_biosamples = complete_df['biosample'].tolist()
complete_paths = complete_df['ftp_path'].tolist()
incomplete_biosamples = not_complete_df['biosample'].tolist()
incomplete_paths = not_complete_df['ftp_path'].tolist()

for i, (biosample, complete_path) in enumerate(zip(complete_biosamples, complete_paths)):
    for j, (other_biosample, incomplete_path) in enumerate(zip(incomplete_biosamples, incomplete_paths)):
        if other_biosample != biosample:
            continue
        print(i, j, '-----------------------------------')
        # File-name stem of an assembly: last URL component + '_genomic'.
        complete_genome_id = complete_path.split('/')[-1] + '_genomic'
        incomplete_genome_id = incomplete_path.split('/')[-1] + '_genomic'
        shared_samples_list.append([biosample, complete_genome_id, incomplete_genome_id,
                                    complete_path, incomplete_path])
        print(complete_path)
        print(incomplete_path)

In [25]:
# Build the table directly from the list of rows. Going through np.array() is
# unnecessary and fails when no pair was found: an empty list becomes an array
# of shape (0,), which does not fit the five columns.
# Columns: biosample, complete/incomplete file stems, complete ('c') and
# incomplete ('in') remote paths.
df2 = pd.DataFrame(shared_samples_list,
                   columns=['biosample', 'complete', 'incomplete', 'c', 'in'])

In [35]:
df2.iloc[:,0].values

array(['SAMN04158282', 'SAMN02768808', 'SAMN02138579', 'SAMN21016516',
       'SAMN21016509', 'SAMN21016508', 'SAMN21016533', 'SAMN21016532',
       'SAMN21016531', 'SAMN21016529', 'SAMN21016528', 'SAMN21016527',
       'SAMN21016534', 'SAMN21016525', 'SAMN21016524', 'SAMN21016539',
       'SAMN21016535', 'SAMN21016518', 'SAMN21016517', 'SAMN21016540',
       'SAMN21016543', 'SAMN21016544', 'SAMN21016542', 'SAMN21016479',
       'SAMN21016545', 'SAMN19931439', 'SAMN19931475', 'SAMN19931468',
       'SAMN19931463', 'SAMN19931444'], dtype=object)

In [None]:
# df2.to_csv('/data/hoan/amromics/data/ncbi/Kp30_mix/sample_info.csv', index=False)

In [21]:
not_complete_df.iloc[7,:].values

array([212786, 'GCF_021136355.1', 'PRJNA224116', 'SAMN21016516',
       'JAIUAO000000000.1', 'na', 573, 573, 'Klebsiella pneumoniae',
       'strain=BD-41', nan, 'latest', 'Contig', 'Major', 'Full',
       '2021/12/09', 'ASM2113635v1', 'University of Bern',
       'GCA_021136355.1', 'identical',
       'https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/021/136/355/GCF_021136355.1_ASM2113635v1',
       nan, nan, 'na', 'Klebsiella pneumoniae', 'Klebsiella', 'Bacteria'],
      dtype=object)

In [14]:
# Look up the complete-genome row(s) of a given biosample accession.
complete_df[complete_df['biosample'].eq('SAMN02152539')]

Unnamed: 0,index,assembly_accession,bioproject,biosample,wgs_master,refseq_category,taxid,species_taxid,organism_name,infraspecific_name,...,submitter,gbrs_paired_asm,paired_asm_comp,ftp_path,excluded_from_refseq,relation_to_type_material,asm_not_live_date,species_name,genus_name,domain_name
46,9342,GCF_000364385.3,PRJNA224116,SAMN02152539,,na,573,573,Klebsiella pneumoniae,strain=ATCC BAA-2146,...,Sandia National Laboratories,GCA_000364385.3,identical,https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/0...,,,na,Klebsiella pneumoniae,Klebsiella,Bacteria


In [16]:
# Remote directory URL(s) of the complete genome(s) of one biosample.
sample_df = complete_df[complete_df['biosample'].eq('SAMN02152539')]
sample_df.loc[:, 'ftp_path'].values

array(['https://ftp.ncbi.nlm.nih.gov/genomes/all/GCF/000/364/385/GCF_000364385.3_ASM36438v3'],
      dtype=object)