In [1]:
%matplotlib inline

In [2]:
import pandas as pd
from Bio import SeqIO, Entrez
Entrez.email = 'adam.hockenberry@utexas.edu'  # Always tell NCBI who you are
import time
import glob

# First make the starting datatable better

In [3]:
df = pd.read_csv('../Data/NCBI_phage_db/all_complete_phage_info.csv')
print(df.shape)
df.head()

(17345, 18)


Unnamed: 0,Accession,Release_Date,Species,Genus,Family,Length,Sequence_Type,Nuc_Completeness,Genotype,Segment,Authors,Publications,Geo_Location,Host,Isolation_Source,Collection_Date,BioSample,GenBank_Title
0,NC_045425,2019-12-16T00:00:00Z,Thermus virus OH3,,Inoviridae,5688,RefSeq,complete,,,"Nagayoshi,Y., Kumagae,K., Mori,K., Tashiro,K.,...",26941711.0,Japan,Thermus thermophilus HB8,,,,"Thermus phage phiOH3 genomic DNA, complete genome"
1,NC_044940,2019-10-02T00:00:00Z,Pectinobacterium virus PEAT2,Peatvirus,Myoviridae,48659,RefSeq,complete,,,"Kalischuk,M., Hachey,J., Thomas,D., Kawchuk,L.",,,,,,,"Pectobacterium phage PEAT2, complete genome"
2,NC_043767,2019-07-20T00:00:00Z,Mycobacterium virus TA17a,Rosebushvirus,Siphoviridae,67324,RefSeq,complete,,,"Lunt,B.L., Payne,D.E., Fisher,J.N.B., Smith,K....",,"USA: Provo, UT",Mycolicibacterium smegmatis MC2 155,,2009-09-13,,"Mycobacterium virus TA17a, complete genome"
3,NC_043027,2019-06-28T00:00:00Z,Bacillus virus PBS1,,Myoviridae,252197,RefSeq,complete,,,"Russell,D.A., Jacobs-Sera,D., Duda,R., Hatfull...",,,Bacillus subtilis,,,,"Bacillus virus PBS1, complete genome"
4,NC_043028,2019-06-28T00:00:00Z,Xanthomonas virus Xf109,,Inoviridae,7190,RefSeq,complete,,,"Yeh,T.Y.",27743252.0,,Xanthomonas oryzae ATCC 35933,,,,"Xanthomonas phage Xf109, complete genome"


In [4]:
df = df[df['Host'].isnull()==False]
print(df.shape)

(12483, 18)


In [5]:
df['Host'].value_counts()

Mycolicibacterium smegmatis MC2 155        1727
Escherichia coli                            745
Gordonia terrae                             382
Escherichia coli C                          346
Lactococcus lactis                          323
                                           ... 
Streptococcus pseudoporcinus SPIN 20026       1
Streptococcus pneumoniae GA41317              1
Streptococcus uberis C5388                    1
Escherichia coli PA8                          1
Leptospira                                    1
Name: Host, Length: 967, dtype: int64

**Better treatment of host taxonomy**

In [23]:
taxonomies_to_fetch = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
problematic_hosts = []
for host in list(set(df['Host']))[:]:
    matching_indices = df[df['Host']==host].index
    if df.loc[matching_indices]['Host_superkingdom_id'].isnull().all()==False:
        print('Skipping')
        continue
    print(host)
    handle = Entrez.esearch(db='Taxonomy', term=host)
    record = Entrez.read(handle)
    if len(record['IdList']) != 1:
        problematic_hosts.append(host)
        continue
    record_id = record['IdList'][0]
    handle = Entrez.efetch(db="Taxonomy", id=record_id, retmode="xml")
    records = Entrez.read(handle)
    if len(records) != 1:
        problematic_hosts.append(host)
        continue        
    record = records[0]
    ###First get the genetic code right
    df.at[matching_indices, 'GeneticCode'] = record['GeneticCode']['GCId']
    ###Now possibly add the record itself
    if record ['Rank'] in taxonomies_to_fetch:
        df.at[matching_indices, 'Host_{}_id'.format(record['Rank'])] = record['TaxId']
        df.at[matching_indices, 'Host_{}_name'.format(record['Rank'])] = record['ScientificName']
    ###Finally go through the lineage
    lineage = record['LineageEx']
    for i in lineage:
        if i['Rank'] in taxonomies_to_fetch:
            df.at[matching_indices, 'Host_{}_id'.format(i['Rank'])] = i['TaxId']
            df.at[matching_indices, 'Host_{}_name'.format(i['Rank'])] = i['ScientificName']
    time.sleep(10)

Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Escherichia coli O157:H43 str. T22
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Escherichia coli O145:NM
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Nodularia
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Gordonia
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Skipping
Esc

Lysinibacillus sphaericus C3-41
Thermus sp. TC4
Lactococcus garvieae Lg2
Pseudomonas syringae
Morganella morganii
Microcystis aeruginosa
Enterococcus faecalis 62
Synechococcus sp. WH 8018
Fusobacterium nucleatum
Streptococcus intermedius
Vibrio cholerae O1
Streptococcus anginosus
Sphaerotilus natans
Idiomarinaceae bacterium N2-2
Vibrio metschnikovii
Streptococcus pneumoniae BS458
Acaryochloris marina
Rhizobium gallicum
Acinetobacter baumannii A118
Streptomyces toxytricini
Sodalis glossinidius
Mycolicibacterium aichiense
Escherichia coli C
Streptococcus pneumoniae ST556
Lactococcus lactis subsp. cremoris TIFN7
Streptococcus pneumoniae GA47033
Bacteroides intestinalis
Polaribacter
Streptomyces sp.
Lactobacillus
Epibacterium mobile
Microbacterium natoriense
Caulobacter vibrioides CB15
Streptococcus equinus
Trichormus variabilis
Plesiomonas sp.
Streptococcus parauberis KCTC 11980BP
Streptococcus suis ST1
Aeromonas rivipollensis
Pseudomonas aeruginosa CHA
Myxococcus xanthus
Sphingomonas pau

In [34]:
df['Host_species_name'].value_counts()

Mycolicibacterium smegmatis    1978
Escherichia coli               1187
Salmonella enterica             423
Pseudomonas aeruginosa          400
Gordonia terrae                 382
                               ... 
Salipiger profundus               1
Pseudomonas sp. 1-1-1b            1
Rothia dentocariosa               1
Phaeobacter inhibens              1
Microcystis elabens               1
Name: Host_species_name, Length: 549, dtype: int64

In [35]:
df['GeneticCode'].value_counts()

11    12021
1       270
4        15
Name: GeneticCode, dtype: int64

**Fix the problematic hosts**

In [29]:
print(len(set(problematic_hosts)))
problematic_hosts

27


['Escherichia coli O157:H43 str. T22',
 'Escherichia coli O145:NM',
 'Nodularia',
 'Gordonia',
 'Escherichia coli O104:H4',
 'Escherichia coli BL21(DE3)',
 'Bacillus',
 'Escherichia coli O157:H7 str. EDL933',
 '[Bacillus] clarkii',
 'Yersinia enterocolitica (type O:3)',
 'Streptomyces coelicolor A3(2)',
 'Edwardsiella',
 '[Brevibacterium] flavum',
 'Curvibacter sp. AEP1.3',
 'Streptococcus anginosus MAS624',
 'Morganella sp. (in: Bacteria)',
 'Clostridium saccharoperbutylacetonicum N1-4(HMT)',
 'Methylophilaceae bacterium IMCC19250',
 'Erwinia horticola',
 'Escherichia coli O157:H7',
 'Escherichia coli O111:H2',
 'Escherichia coli O103:H25 str. NIPH-11060424',
 'Escherichia coli DH5[alpha]',
 'Bacillus sp. (in: Bacteria)',
 'Yersinia',
 'Phormidium sp. MIS-PhA',
 'Paracoccus']

**Cases with special characters**

In [56]:
still_problematic = []
for host in problematic_hosts:
    matching_indices = df[df['Host']==host].index
    host = host.replace(':', ' ')
    host = host.replace('[', '').replace(']', '')
    host = host.replace('(', ' ').replace(')', ' ')
    if df.loc[matching_indices]['Host_superkingdom_id'].isnull().all()==False:
        print('Skipping')
        continue
    print(host)
    handle = Entrez.esearch(db='Taxonomy', term=host)
    record = Entrez.read(handle)
    if len(record['IdList']) != 1:
        still_problematic.append(host)
        continue
    record_id = record['IdList'][0]
    handle = Entrez.efetch(db="Taxonomy", id=record_id, retmode="xml")
    records = Entrez.read(handle)
    if len(records) != 1:
        still_problematic.append(host)
        continue        
    record = records[0]
    ###First get the genetic code right
    df.at[matching_indices, 'GeneticCode'] = record['GeneticCode']['GCId']
    ###Now possibly add the record itself
    if record ['Rank'] in taxonomies_to_fetch:
        df.at[matching_indices, 'Host_{}_id'.format(record['Rank'])] = record['TaxId']
        df.at[matching_indices, 'Host_{}_name'.format(record['Rank'])] = record['ScientificName']
    ###Finally go through the lineage
    lineage = record['LineageEx']
    for i in lineage:
        if i['Rank'] in taxonomies_to_fetch:
            df.at[matching_indices, 'Host_{}_id'.format(i['Rank'])] = i['TaxId']
            df.at[matching_indices, 'Host_{}_name'.format(i['Rank'])] = i['ScientificName']
    time.sleep(10)

Escherichia coli O157 H43 str. T22
Escherichia coli O145 NM
Nodularia
Gordonia
Escherichia coli O104 H4
Escherichia coli BL21 DE3 
Bacillus
Escherichia coli O157 H7 str. EDL933
Bacillus clarkii
Yersinia enterocolitica  type O 3 
Streptomyces coelicolor A3 2 
Edwardsiella
Brevibacterium flavum
Curvibacter sp. AEP1.3
Streptococcus anginosus MAS624
Morganella sp.  in  Bacteria 
Clostridium saccharoperbutylacetonicum N1-4 HMT 
Methylophilaceae bacterium IMCC19250
Erwinia horticola
Escherichia coli O157 H7
Escherichia coli O111 H2
Escherichia coli O103 H25 str. NIPH-11060424
Escherichia coli DH5alpha
Bacillus sp.  in  Bacteria 
Yersinia
Phormidium sp. MIS-PhA
Paracoccus


In [57]:
print(len(set(still_problematic)))
still_problematic

11


['Nodularia',
 'Gordonia',
 'Bacillus',
 'Edwardsiella',
 'Curvibacter sp. AEP1.3',
 'Streptococcus anginosus MAS624',
 'Methylophilaceae bacterium IMCC19250',
 'Erwinia horticola',
 'Yersinia',
 'Phormidium sp. MIS-PhA',
 'Paracoccus']

**Cases with annoying genuses**

In [60]:
really_problematic = []
for host in still_problematic:
    matching_indices = df[df['Host']==host].index
    host = host.replace(':', ' ')
    host = host.replace('[', '').replace(']', '')
    host = host.replace('(', ' ').replace(')', ' ')
    if df.loc[matching_indices]['Host_superkingdom_id'].isnull().all()==False:
        print('Skipping')
        continue
    print(host)
    handle = Entrez.esearch(db='Taxonomy', term=host)
    record = Entrez.read(handle)
    if len(record['IdList']) == 0:
        really_problematic.append(host)
        continue
    temp_ids = record['IdList']
    successful_ids = []
    for record_id in temp_ids:
        handle = Entrez.efetch(db="Taxonomy", id=record_id, retmode="xml")
        records = Entrez.read(handle)
        if len(records) != 1:
            really_problematic.append(host)
            continue        
        record = records[0]
        lineage = record['LineageEx']
        for i in lineage:
            if i['Rank']=='superkingdom' and i['ScientificName'] =='Bacteria':
                successful_ids.append(record_id)
    if len(successful_ids) == 1:
        record_id = successful_ids[0]
    else:
        really_problematic.append(host)
    
    handle = Entrez.efetch(db="Taxonomy", id=record_id, retmode="xml")
    records = Entrez.read(handle)
    if len(records) != 1:
        really_problematic.append(host)
        continue        
    record = records[0]
    ###First get the genetic code right
    df.at[matching_indices, 'GeneticCode'] = record['GeneticCode']['GCId']
    ###Now possibly add the record itself
    if record ['Rank'] in taxonomies_to_fetch:
        df.at[matching_indices, 'Host_{}_id'.format(record['Rank'])] = record['TaxId']
        df.at[matching_indices, 'Host_{}_name'.format(record['Rank'])] = record['ScientificName']
    ###Finally go through the lineage
    lineage = record['LineageEx']
    for i in lineage:
        if i['Rank'] in taxonomies_to_fetch:
            df.at[matching_indices, 'Host_{}_id'.format(i['Rank'])] = i['TaxId']
            df.at[matching_indices, 'Host_{}_name'.format(i['Rank'])] = i['ScientificName']
    time.sleep(5)

Nodularia
Gordonia
Bacillus
Edwardsiella
Curvibacter sp. AEP1.3
Streptococcus anginosus MAS624
Methylophilaceae bacterium IMCC19250
Erwinia horticola
Yersinia
Phormidium sp. MIS-PhA
Paracoccus


In [61]:
print(len(really_problematic))
print(really_problematic)

5
['Curvibacter sp. AEP1.3', 'Streptococcus anginosus MAS624', 'Methylophilaceae bacterium IMCC19250', 'Erwinia horticola', 'Phormidium sp. MIS-PhA']


In [72]:
answers = [2685271, 1328, 2030816, 551, 1199]

In [75]:
taxonomies_to_fetch = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
problematic_hosts = []
for host, answer in zip(really_problematic, answers):
    matching_indices = df[df['Host']==host].index
    if df.loc[matching_indices]['Host_superkingdom_id'].isnull().all()==False:
        print('Skipping')
        continue
    print(host)
    record_id = str(answer)
    handle = Entrez.efetch(db="Taxonomy", id=record_id, retmode="xml")
    records = Entrez.read(handle)    
    record = records[0]
    ###First get the genetic code right
    df.at[matching_indices, 'GeneticCode'] = record['GeneticCode']['GCId']
    ###Now possibly add the record itself
    if record ['Rank'] in taxonomies_to_fetch:
        df.at[matching_indices, 'Host_{}_id'.format(record['Rank'])] = record['TaxId']
        df.at[matching_indices, 'Host_{}_name'.format(record['Rank'])] = record['ScientificName']
    ###Finally go through the lineage
    lineage = record['LineageEx']
    for i in lineage:
        if i['Rank'] in taxonomies_to_fetch:
            df.at[matching_indices, 'Host_{}_id'.format(i['Rank'])] = i['TaxId']
            df.at[matching_indices, 'Host_{}_name'.format(i['Rank'])] = i['ScientificName']
    time.sleep(10)

Skipping
Skipping
Skipping
Skipping
Skipping


**Write the new and improved file**

In [82]:
df['Host_superkingdom_name'].value_counts()

Bacteria     12211
Eukaryota      270
Archaea          2
Name: Host_superkingdom_name, dtype: int64

In [83]:
df = df[df['Host_superkingdom_name']=='Bacteria']
print(df.shape)

(12211, 33)


In [89]:
df.to_csv('../Data/NCBI_phage_db/all_complete_phage_info_HOSTTAXONOMY_05152020.tsv', sep='\t', index=False)

# Select the rows that I care about

In [90]:
df = pd.read_csv('../Data/NCBI_phage_db/all_complete_phage_info_HOSTTAXONOMY_05152020.tsv', sep='\t')
print(df.shape)
df.head()

(12211, 33)


Unnamed: 0,Accession,Release_Date,Species,Genus,Family,Length,Sequence_Type,Nuc_Completeness,Genotype,Segment,...,Host_phylum_id,Host_phylum_name,Host_class_id,Host_class_name,Host_order_id,Host_order_name,Host_family_id,Host_family_name,Host_species_id,Host_species_name
0,NC_045425,2019-12-16T00:00:00Z,Thermus virus OH3,,Inoviridae,5688,RefSeq,complete,,,...,1297.0,Deinococcus-Thermus,188787.0,Deinococci,68933.0,Thermales,188786.0,Thermaceae,274.0,Thermus thermophilus
1,NC_043767,2019-07-20T00:00:00Z,Mycobacterium virus TA17a,Rosebushvirus,Siphoviridae,67324,RefSeq,complete,,,...,201174.0,Actinobacteria,1760.0,Actinobacteria,85007.0,Corynebacteriales,1762.0,Mycobacteriaceae,1772.0,Mycolicibacterium smegmatis
2,NC_043027,2019-06-28T00:00:00Z,Bacillus virus PBS1,,Myoviridae,252197,RefSeq,complete,,,...,1239.0,Firmicutes,91061.0,Bacilli,1385.0,Bacillales,186817.0,Bacillaceae,1423.0,Bacillus subtilis
3,NC_043028,2019-06-28T00:00:00Z,Xanthomonas virus Xf109,,Inoviridae,7190,RefSeq,complete,,,...,1224.0,Proteobacteria,1236.0,Gammaproteobacteria,135614.0,Xanthomonadales,32033.0,Xanthomonadaceae,347.0,Xanthomonas oryzae
4,NC_043029,2019-06-28T00:00:00Z,Stenotrophomonas virus SMA6,,Inoviridae,7648,RefSeq,complete,,,...,1224.0,Proteobacteria,1236.0,Gammaproteobacteria,135614.0,Xanthomonadales,32033.0,Xanthomonadaceae,40324.0,Stenotrophomonas maltophilia


In [102]:
df['Host_family_name'].value_counts()[:20]

Enterobacteriaceae      2453
Mycobacteriaceae        2143
Streptococcaceae        1446
Pseudomonadaceae         617
Gordoniaceae             452
Synechococcaceae         386
Bacillaceae              384
Staphylococcaceae        351
Vibrionaceae             326
Micrococcaceae           304
Streptomycetaceae        254
Microbacteriaceae        253
Propionibacteriaceae     211
Moraxellaceae            182
Flavobacteriaceae        181
Lactobacillaceae         128
Enterococcaceae          126
Burkholderiaceae         122
Pectobacteriaceae        105
Erwiniaceae              105
Name: Host_family_name, dtype: int64

In [None]:
selected_accession_numbers = ['NC_001604', 'NC_001416']
all_genomes = SeqIO.parse('../Data/NCBI_phage_db/all_complete_phage_nts.fasta', 'fasta')
for genome in all_genomes:
    if genome.id in selected_accession_numbers:
        with open('../Data/NCBI_phage_db/phage_genomes/{}.fasta'.format(genome.id), 'w') as outfile:
            SeqIO.write(genome, outfile, 'fasta')

# Split apart the combined genome `fasta` file to write the genomes that I selected

In [None]:
genome = SeqIO.read('../Data/NCBI_phage_db/phage_genomes/NC_001416.fasta', 'fasta')
# for genome_file in glob.glob('../Data/NCBI_phage_db/phage_genomes/*.fasta'):
#     genome = SeqIO.read(genome_file, 'fasta')

In [None]:
hits = []
all_cds = SeqIO.parse('../Data/NCBI_phage_db/all_complete_phage_CDS.fasta', 'fasta')
for cds in all_cds:
    if genome.id in cds.id.split(':')[0]:
        hits.append(cds)
print(len(hits))

In [None]:
print(len(hits))

In [None]:
for hit in hits:
    reverse_complement = False
    if 'join' in hit.id:
        continue
    if 'complement(' == hit.id[:11]:
        print('here')
        reverse_complement = True
#         hit.id = hit.id[11:-1]
    if len(hit.id.split(':')) == 2:
        region = hit.id.split(':')[-1]
        if len(region.split('..')) == 2:
            start = int(region.split('..')[0])
            stop = int(region.split('..')[-1])
    print(start, stop)
    print(genome.seq[start-1:stop]==hit.seq)

In [None]:
hit

In [None]:
hit

# Make CDS datatables for each of those genomes by merging the genome `fasta` file with the CDS `fasta` file

**et voila**