In [None]:
%matplotlib inline

In [None]:
import pandas as pd
from Bio import SeqIO, Entrez
Entrez.email = 'adam.hockenberry@utexas.edu'
import time
import glob

import os

# First read and place a few restrictions on the full data table

The relevant information was downloaded in November 2020 from: https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&Completeness_s=complete&VirusLineage_ss=Bacteriophage,%20all%20taxids&Proviral_s=include. This is equivalent to starting from https://www.ncbi.nlm.nih.gov/labs/virus/vssi/#/virus?SeqType_s=Nucleotide&Proviral_s=include and selecting "Bacteriophage, all taxids (53348)" from the "Virus" search box and "complete" from the "Nucleotide Completeness" section.

Raw data consists of a `.csv` containing relevant information about each sequence, a large `.fasta` file containing (nucleotide) genome sequences, and a large `.fasta` file containing coding sequences for each genome. I relocated and renamed these files in the `../../Data/NCBI_phage_db/` directory:

1. `all_complete_phage_info_11_2020.csv`
2. `all_complete_phage_NTs_11_2020.fasta`
3. `all_complete_phage_CDSs_11_2020.fasta`

All of the resulting code assumes that these are valid file paths.

**First, read in the info file**

In [None]:
# Read the per-sequence metadata table, report its dimensions and preview
# the first five rows.
df = pd.read_csv(
    filepath_or_buffer='../../Data/NCBI_phage_db/all_complete_phage_info_11_2020.csv',
)
print(df.shape)
df.head(n=5)

**Use this space for some basic data exploration**

In [None]:
# Tally how many records carry each distinct value of the `Family` column.
df['Family'].value_counts()

**Limit the dataset to only consider phages with explicitly defined hosts**

In [None]:
# Keep only the rows whose `Host` field is populated.
# `.copy()` detaches the result from the frame it was sliced from, so that
# columns can be assigned on `df` afterwards without pandas emitting a
# SettingWithCopyWarning.
df = df[df['Host'].notnull()].copy()
print(df.shape)

In [None]:
# Tally how many records carry each distinct `Host` string.
df['Host'].value_counts()

# Improve host taxonomy by querying strings to NCBI

This code takes some time and space, and is a bit finicky since we're basically trying to convert the messy string in the `Host` field to a meaningful string/number based on the NCBI taxonomy standards.

I'm surprised that this isn't done from the start, but such is the messiness of public databases...

In [None]:
###Look up every distinct `Host` string in the NCBI Taxonomy database and copy
###the result onto all rows that carry that string.
###  Columns written: `GeneticCode`, plus `Host_<rank>_id` / `Host_<rank>_name`
###  for each rank listed in `taxonomies_to_fetch`.
###  Host strings that do not resolve to exactly one taxonomy record are
###  collected in `problematic_hosts` and left untouched in `df`.

###Define the taxonomy levels that I care about
taxonomies_to_fetch = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']

problematic_hosts = []
###Iterate through the distinct host strings. `unique()` yields them in order of
###first appearance, so `problematic_hosts` is filled in a reproducible order
###(iterating a `set` of strings gives a different order in every session).
for host in df['Host'].unique():
    #Boolean mask of the rows that match this host string (to be used later)
    matching_rows = df['Host'] == host
    print(host)
    #Try searching for the string
    handle = Entrez.esearch(db='Taxonomy', term=host)
    search_result = Entrez.read(handle)
    handle.close()
    #Append anything with even a slight problem to an error bucket
    if len(search_result['IdList']) != 1:
        problematic_hosts.append(host)
        continue

    #Actually retrieve the full taxonomy results
    record_id = search_result['IdList'][0]
    handle = Entrez.efetch(db="Taxonomy", id=record_id, retmode="xml")
    records = Entrez.read(handle)
    handle.close()
    #Make sure no errors popped up again
    if len(records) != 1:
        problematic_hosts.append(host)
        continue
    record = records[0]

    #Several rows are assigned at once, which is a job for `.loc`; `.at` only
    #addresses a single row/column cell.
    ###First get the genetic code right
    df.loc[matching_rows, 'GeneticCode'] = record['GeneticCode']['GCId']
    ###Then record the taxon itself (if its rank is of interest) followed by
    ###every ancestor in its lineage
    for taxon in [record] + list(record['LineageEx']):
        rank = taxon['Rank']
        if rank in taxonomies_to_fetch:
            df.loc[matching_rows, 'Host_{}_id'.format(rank)] = taxon['TaxId']
            df.loc[matching_rows, 'Host_{}_name'.format(rank)] = taxon['ScientificName']
    time.sleep(6) ###Should probably sleep for some period of time

**Check out some basic summary stats from the resulting additions to `df`**

In [None]:
# Tally how many records were assigned to each resolved host species name.
df['Host_species_name'].value_counts()

In [None]:
# Tally how many records were assigned each `GeneticCode` value.
df['GeneticCode'].value_counts()

## Now clean up / fix some of the problematic cases

In [None]:
# Drop duplicate host strings while keeping them in first-seen order, then list them.
# `dict.fromkeys` is used instead of `set` because a set of strings comes back in
# a different order in every kernel session, and the lists derived from this one
# are later matched against hand-entered values by position.
problematic_hosts = list(dict.fromkeys(problematic_hosts))
print(len(problematic_hosts))
for annoyance in problematic_hosts:
    print(annoyance)

**Fix the cases with special characters**

In [None]:
###Second pass: retry each problematic host after stripping the characters
###: [ ] ( ) from the search term.
###  Hosts that still do not resolve to exactly one taxonomy record are collected
###  in `still_problematic` under their ORIGINAL `Host` string, so that later
###  cells can still locate the matching rows in `df`.
still_problematic = []
for host in problematic_hosts:
    matching_rows = df['Host'] == host
    #Build the cleaned search term separately; `host` must stay untouched
    #because it is the key used to find rows in `df`
    query = host.replace(':', ' ')
    query = query.replace('[', '').replace(']', '')
    query = query.replace('(', ' ').replace(')', ' ')
    print(query)
    handle = Entrez.esearch(db='Taxonomy', term=query)
    search_result = Entrez.read(handle)
    handle.close()
    if len(search_result['IdList']) != 1:
        still_problematic.append(host)
        continue
    record_id = search_result['IdList'][0]
    handle = Entrez.efetch(db="Taxonomy", id=record_id, retmode="xml")
    records = Entrez.read(handle)
    handle.close()
    if len(records) != 1:
        still_problematic.append(host)
        continue
    record = records[0]

    #Several rows are assigned at once, which is a job for `.loc`; `.at` only
    #addresses a single row/column cell.
    ###First get the genetic code right
    df.loc[matching_rows, 'GeneticCode'] = record['GeneticCode']['GCId']
    ###Then record the taxon itself (if its rank is of interest) followed by
    ###every ancestor in its lineage
    for taxon in [record] + list(record['LineageEx']):
        rank = taxon['Rank']
        if rank in taxonomies_to_fetch:
            df.loc[matching_rows, 'Host_{}_id'.format(rank)] = taxon['TaxId']
            df.loc[matching_rows, 'Host_{}_name'.format(rank)] = taxon['ScientificName']
    time.sleep(10)

In [None]:
# Print the number of distinct entries in `still_problematic`, then display the list.
print(len(set(still_problematic)))
still_problematic

**Cases with annoying genera**

In [None]:
###Third pass: for hosts whose search returns several taxonomy ids, fetch every
###candidate and keep the host only if exactly ONE candidate has the
###superkingdom 'Bacteria' in its lineage.
###  Anything else (no hits, no bacterial candidate, several bacterial
###  candidates, or a malformed fetch) goes into `really_problematic`, once per
###  host and under its ORIGINAL `Host` string, and its rows are left untouched.
really_problematic = []
for host in still_problematic:
    matching_rows = df['Host'] == host
    #Build the cleaned search term separately; `host` must stay untouched
    #because it is the key used to find rows in `df`
    query = host.replace(':', ' ')
    query = query.replace('[', '').replace(']', '')
    query = query.replace('(', ' ').replace(')', ' ')
    print(query)
    handle = Entrez.esearch(db='Taxonomy', term=query)
    search_result = Entrez.read(handle)
    handle.close()

    bacterial_records = []
    fetch_problem = False
    for record_id in search_result['IdList']:
        handle = Entrez.efetch(db="Taxonomy", id=record_id, retmode="xml")
        records = Entrez.read(handle)
        handle.close()
        if len(records) != 1:
            fetch_problem = True
            continue
        candidate = records[0]
        in_bacteria = any(
            i['Rank'] == 'superkingdom' and i['ScientificName'] == 'Bacteria'
            for i in candidate['LineageEx']
        )
        if in_bacteria:
            #Keep the parsed record so it does not have to be fetched again
            bacterial_records.append(candidate)

    #Without this `continue` an unresolved host would fall through and be
    #annotated with whichever candidate happened to be fetched last
    if fetch_problem or len(bacterial_records) != 1:
        really_problematic.append(host)
        continue
    record = bacterial_records[0]

    #Several rows are assigned at once, which is a job for `.loc`; `.at` only
    #addresses a single row/column cell.
    ###First get the genetic code right
    df.loc[matching_rows, 'GeneticCode'] = record['GeneticCode']['GCId']
    ###Then record the taxon itself (if its rank is of interest) followed by
    ###every ancestor in its lineage
    for taxon in [record] + list(record['LineageEx']):
        rank = taxon['Rank']
        if rank in taxonomies_to_fetch:
            df.loc[matching_rows, 'Host_{}_id'.format(rank)] = taxon['TaxId']
            df.loc[matching_rows, 'Host_{}_name'.format(rank)] = taxon['ScientificName']
    time.sleep(5)

In [None]:
# Print how many entries remain in `really_problematic`, then the entries themselves.
print(len(really_problematic))
print(really_problematic)

**And finally some manual inspection led to this kludge**

In [None]:
# Hand-entered ids that the next cell passes to Entrez.efetch(db="Taxonomy"),
# paired with the entries of `really_problematic` by position (zip).
# NOTE(review): because the pairing is positional, this list is only valid for
# the exact order and length of `really_problematic` it was written against --
# confirm against the printed list before re-running.
answers = [1199, 1531298, 1353243, 2030816, 29523, 551]

In [None]:
###Final pass: annotate the remaining hosts from the hand-entered taxonomy ids
###in `answers`, paired with `really_problematic` by position.
###  Hosts whose rows already carry a superkingdom id are skipped.
taxonomies_to_fetch = ['superkingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species']
problematic_hosts = []  #reset kept from the original cell; not used below

#`zip` stops at the shorter list without complaint, so flag a mismatch loudly
if len(really_problematic) != len(answers):
    print('WARNING: {} problematic hosts but {} manual answers; pairs beyond the '
          'shorter list are ignored and the pairing may be misaligned'.format(
              len(really_problematic), len(answers)))

for host, answer in zip(really_problematic, answers):
    matching_rows = df['Host'] == host
    #Skip hosts for which at least one matching row is already annotated
    if ('Host_superkingdom_id' in df.columns
            and df.loc[matching_rows, 'Host_superkingdom_id'].notnull().any()):
        print('Skipping')
        continue
    print(host)
    record_id = str(answer)
    handle = Entrez.efetch(db="Taxonomy", id=record_id, retmode="xml")
    records = Entrez.read(handle)
    handle.close()
    record = records[0]

    #Several rows are assigned at once, which is a job for `.loc`; `.at` only
    #addresses a single row/column cell.
    ###First get the genetic code right
    df.loc[matching_rows, 'GeneticCode'] = record['GeneticCode']['GCId']
    ###Then record the taxon itself (if its rank is of interest) followed by
    ###every ancestor in its lineage
    for taxon in [record] + list(record['LineageEx']):
        rank = taxon['Rank']
        if rank in taxonomies_to_fetch:
            df.loc[matching_rows, 'Host_{}_id'.format(rank)] = taxon['TaxId']
            df.loc[matching_rows, 'Host_{}_name'.format(rank)] = taxon['ScientificName']
    time.sleep(10)

**Write the new and improved file!**

Ideally, the above code should never need to be run again, except by someone trying to replicate this analysis on newer versions of the database.

In [None]:
# Tally how many records fall under each resolved host superkingdom name.
df['Host_superkingdom_name'].value_counts()

In [None]:
# Save the taxonomy-annotated table as a tab-separated file, omitting the row index.
df.to_csv('../../Data/NCBI_phage_db/all_complete_phage_info_HOSTTAXONOMY_11_2020.tsv', sep='\t', index=False)

# Further processing of the large `df` that includes taxonomy info

Re-read the `df` from the file I just wrote and select the rows that I care about for a more useful working `df`

In [None]:
# Load the taxonomy-annotated, tab-separated table back from disk, report its
# dimensions and preview the first five rows.
df = pd.read_csv(
    '../../Data/NCBI_phage_db/all_complete_phage_info_HOSTTAXONOMY_11_2020.tsv',
    delimiter='\t',
)
print(df.shape)
df.head(n=5)

**Who are these eukaryotes?**

In [None]:
# Show the raw host string and resolved species name for every row whose
# superkingdom is not 'Bacteria' (rows with a missing superkingdom also satisfy `!=`).
df[df['Host_superkingdom_name']!='Bacteria'][['Host','Host_species_name']]

**Get rid of 'em**

In [None]:
# Restrict the table to rows whose host superkingdom is exactly 'Bacteria'.
df = df.loc[df['Host_superkingdom_name'].eq('Bacteria')]
print(df.shape)

**Who avoided proper classification?**

In [None]:
# Tally the raw `Host` strings of the rows that have no species-level id.
# (The two commented-out lines are alternative views of the same gap at genus
# and species level.)
# df[df['Host_genus_id'].isnull()==True][['Host', 'Host_genus_name']]
# df[df['Host_species_id'].isnull()==True][['Host', 'Host_species_name']]
df[df['Host_species_id'].isnull()==True]['Host'].value_counts()

**Get rid of 'em**

In [None]:
# Drop the rows without a species-level host id, then store the ids as integers.
# `.copy()` detaches the filtered frame from its parent so that the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df = df[df['Host_species_id'].notnull()].copy()
print(df.shape)
df['Host_species_id'] = df['Host_species_id'].astype(int)
print(df.shape)

**Weird genetic codes?**

In [None]:
# Tally how many of the remaining records carry each `GeneticCode` value.
df['GeneticCode'].value_counts()

**Get rid of 'em**

In [None]:
# Restrict the table to rows whose `GeneticCode` equals 11
# (presumably NCBI translation table 11, the bacterial/archaeal code -- confirm).
df = df.loc[df['GeneticCode'].eq(11)]
print(df.shape)

**Write to file!**

In [None]:
# Save the filtered working table as a tab-separated file, omitting the row index.
df.to_csv('../../Data/NCBI_phage_db/paper_dataset_11_2020.tsv', sep='\t', index=False)

# Select the top host species and separate the genomes into individual directories

This will create a lot of different files! It will make directories for each of the `top_species` and populate these directories with all matching phage genomes in `.fasta` format.

In [None]:
# Count the records per host species id and keep the ids with at least 50 records,
# most frequent first (the order `value_counts` returns them in).
vc = df['Host_species_id'].value_counts()
top_species = [taxid for taxid, n_records in vc.items() if n_records >= 50]
print(len(top_species))
print(top_species[:10])

In [None]:
# Split the combined genome FASTA into one file per genome, grouped into one
# directory per host species in `top_species`.
#
# The FASTA is parsed once in total (not once per host species) and each record
# is routed through a dict keyed by accession. The printed output is unchanged:
# for every host species, the shape of its sub-table followed by the number of
# genome files written.
nts_dir = '../../Data/NCBI_phage_db/all_complete_phage_NTs_11_2020.fasta'

save_dir_for_host = {}
shape_for_host = {}
hosts_for_accession = {}
for host_taxid in top_species:
    save_dir = '../../Data/NCBI_phage_db/phage_genomes/{}_phage_genomes/'.format(host_taxid)
    # makedirs also creates the parent `phage_genomes/` directory when it is missing
    os.makedirs(save_dir, exist_ok=True)
    save_dir_for_host[host_taxid] = save_dir

    temp_df = df[df['Host_species_id']==host_taxid]
    shape_for_host[host_taxid] = temp_df.shape
    selected_accession_numbers = list(temp_df['Accession'])
    # Each accession may appear only once per host species
    assert len(set(selected_accession_numbers)) == len(selected_accession_numbers)
    for accession in selected_accession_numbers:
        hosts_for_accession.setdefault(accession, []).append(host_taxid)

found_for_host = {host_taxid: 0 for host_taxid in top_species}
for genome in SeqIO.parse(nts_dir, 'fasta'):
    # Use the part of the record id before the first '.' as the accession
    simple_id = genome.id.split('.')[0]
    for host_taxid in hosts_for_accession.get(simple_id, ()):
        with open(save_dir_for_host[host_taxid]+'{}.fasta'.format(simple_id), 'w') as outfile:
            SeqIO.write(genome, outfile, 'fasta')
        found_for_host[host_taxid] += 1

for host_taxid in top_species:
    print(shape_for_host[host_taxid])
    print(found_for_host[host_taxid])

In [None]:
# Show the 25 most frequent host species names in the final working table.
df['Host_species_name'].value_counts().head(n=25)

**et voila**