In [None]:
%matplotlib inline

# Imports

The only non-standard import here is `BACPHLIP`. See https://www.biorxiv.org/content/10.1101/2020.05.13.094805v1 for details and https://github.com/adamhockenberry/bacphlip for installation instructions.

In [None]:
import glob
import os
from collections import Counter

import bacphlip
import pandas as pd

# Run BACPHLIP lifestyle predictions

In [None]:
# Read the tab-delimited phage metadata table, then preview its first rows.
full_df = pd.read_table('../Data/NCBI_phage_db/paper_dataset_11_2020_with_clusters.tsv')
full_df.head()

In [None]:
# Unique host species IDs found in the metadata table. De-duplicating through
# a set means the resulting order is arbitrary.
# For a quick trial on a single host, override the list instead, e.g.:
# host_species_list = [562]
host_species_list = [*set(full_df['Host_species_id'])]
print(host_species_list)

**The following code block will take quite a while to run (perhaps a few hours), so be forewarned!**

In [None]:
# Root directory holding one `<host_id>_phage_genomes/` folder per host species.
phage_dir = '../Data/NCBI_phage_db/phage_genomes/'

# Run BACPHLIP on every cluster-representative genome of every host species.
# A genome whose `<fasta>.bacphlip` result file already exists is skipped, so
# an interrupted run can be resumed by re-executing this cell instead of
# redoing hours of finished work.
for host_id in host_species_list:
    species_df = full_df[(full_df['Host_species_id']==host_id) & (full_df['cluster_representative']==1)]
    for virus_id in species_df['Accession']:
        viral_fasta = phage_dir + '{}_phage_genomes/'.format(host_id) + '{}.fasta'.format(virus_id)
        if os.path.exists(viral_fasta + '.bacphlip'):
            # Result already on disk from an earlier run; nothing to do.
            print('{} (result exists, skipping)'.format(viral_fasta))
            continue
        print(viral_fasta)
        # NOTE(review): leftover intermediate files from an interrupted genome
        # may still need manual cleanup before BACPHLIP accepts it - confirm.
        bacphlip.run_pipeline(viral_fasta)

# Compile results into a dataframe

In [None]:
# Collect one (host_id, phage_id, lifestyle, probability gap) record for each
# cluster-representative phage, read from the `.bacphlip` file that sits next
# to its FASTA file.
full_listy = []
for host_id in host_species_list:
    species_df = full_df[(full_df['Host_species_id']==host_id) & (full_df['cluster_representative']==1)]
    for virus_id in species_df['Accession']:
        bacphlip_file = phage_dir + '{}_phage_genomes/'.format(host_id) + '{}.fasta.bacphlip'.format(virus_id)
        # `sep` has to be a keyword argument: pandas 2.0+ raises a TypeError
        # when anything other than the path is passed positionally.
        temp_df = pd.read_csv(bacphlip_file, sep='\t')
        # Only the first row of the result table is used.
        virulent_prob = temp_df.loc[0, 'Virulent']
        temperate_prob = temp_df.loc[0, 'Temperate']
        prob_diff = abs(virulent_prob - temperate_prob)
        # A tie (equal probabilities) is labelled 'Temperate'.
        lifestyle = 'Virulent' if virulent_prob > temperate_prob else 'Temperate'
        full_listy.append((host_id, virus_id, lifestyle, prob_diff))

In [None]:
# Assemble the per-phage records into a table and preview the first rows.
df = pd.DataFrame(
    data=full_listy,
    columns=[
        'host_id',
        'phage_id',
        'phage_lifestyle',
        'lifestyle_probability_difference',
    ],
)
df.head()

In [None]:
# Persist the compiled lifestyle table as a tab-separated file, without the
# DataFrame index column.
df.to_csv(
    '../Data/lifestyle_results_BACPHLIP.tsv',
    index=False,
    sep='\t',
)

**View aggregate results/stats**

In [None]:
# Lifestyle counts restricted to phages whose probability gap is at least 0.7.
df.loc[df['lifestyle_probability_difference'].ge(0.7), 'phage_lifestyle'].value_counts()

In [None]:
# For each host: print its ID, then the shape of all of its rows next to the
# shape of the rows whose probability gap is at least 0.95.
# `full_listy` is deliberately left alone here: resetting it would wipe the
# compiled records and make the DataFrame-building cell yield an empty table
# if it were re-run afterwards.
for host_id in host_species_list:
    host_df = df[df['host_id'] == host_id]
    print(host_id)
    print(host_df.shape,
          host_df[host_df['lifestyle_probability_difference'] >= 0.95].shape)