In [None]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Initial notes

This code assumes that you have run through the previous notebook `1-parse_NCBI_database.ipynb`

## Offline portion

I manually ran fastANI on the set of phage genomes (`.fasta` files) for each host.

For example:
```
find ./phage_genomes/562_phage_genomes/*.fasta -type f> file_listing.txt

fastANI --ql file_listing.txt --rl file_listing.txt --threads 10 --fragLen 300 --minFraction 0.8 -o 562_ANI.output
```

To run for all hosts, just replace both instances of `562` in the above code. Didn't seem worth writing a bash script so I manually ran this on a cluster.

# Add cluster information to the existing database

In [None]:
import pandas as pd
import glob
from Bio import SeqIO
import json

In [None]:
# Load the tab-separated phage table that the rest of this notebook annotates.
# (Presumably produced by the previous notebook -- confirm.)
full_df = pd.read_table('../Data/NCBI_phage_db/paper_dataset_11_2020.tsv')
# Report (rows, columns) before previewing the first records.
print(full_df.shape)
full_df.head()

In [None]:
# Count rows per host species id, then keep only the hosts that have at
# least 50 rows in the table.
vc = full_df['Host_species_id'].value_counts()
enough_genomes = vc >= 50
taxonomy_list = list(vc[enough_genomes].index)
# Number of retained hosts, followed by the ten most frequent ones.
print(len(taxonomy_list), taxonomy_list[:10], sep='\n')

# Looking at individual dataframes

In [None]:
# Inspect the raw fastANI output for a single host before processing it.
taxon_id = '562'
ani_path = '../Data/NCBI_phage_db/fastANI_results/{}_ANI.output'.format(taxon_id)
input_df = pd.read_csv(ani_path, sep='\t', header=None)
# Rows whose identity value (column 2) is present.
has_identity = input_df[2].notnull()
print('Shape:', input_df.shape)
print('Non-null shape:', input_df[has_identity].shape)
input_df.head()

In [None]:
# Columns 0 and 1 hold file paths; reduce each to the file name up to its
# first '.' so that comparisons are keyed by that short identifier.
for path_col, accession_col in ((0, 'query_seq'), (1, 'ref_seq')):
    file_names = input_df[path_col].str.split("/").str[-1]
    input_df[accession_col] = file_names.str.split('.').str[0]
input_df = input_df[['query_seq', 'ref_seq', 2, 3, 4]]

# How many comparisons report an identity of exactly 100?
perfect_hits = input_df[2] == 100.
print(input_df[perfect_hits].shape)

# Drop the self vs self comparisons.
is_self_hit = input_df['query_seq'] == input_df['ref_seq']
input_df = input_df[~is_self_hit]
print(input_df.shape)
input_df.head(n=20)

# Re-create the cd-hit-est algorithm for clustering and write clusters to a file

There's probably a faster way to do this but it seemed straightforward enough to quickly write up the greedy incremental clustering approach. For each genome, I'm writing entries for two new columns: `arbitrary_cluster_id` and `ranking_in_cluster`.

In [None]:
#Set sequence identity threshold: two genomes are linked when the identity
#value (column 2 of the fastANI output) is strictly greater than this.
thresh = 95


def load_ani_pairs(ani_path):
    """Read one fastANI output table and key each comparison by accession.

    Parameters
    ----------
    ani_path : str
        Path to a tab-separated, header-less fastANI output file.

    Returns
    -------
    pandas.DataFrame
        Columns ``query_seq``, ``ref_seq``, 2, 3 and 4, with the
        self vs self comparisons removed.
    """
    ani_df = pd.read_csv(ani_path, sep='\t', header=None)
    ###Columns 0 and 1 are file paths: keep the file name up to its first '.'
    ani_df['query_seq'] = ani_df[0].str.split("/").str[-1].str.split('.').str[0]
    ani_df['ref_seq'] = ani_df[1].str.split("/").str[-1].str.split('.').str[0]
    ani_df = ani_df[['query_seq', 'ref_seq', 2, 3, 4]]
    #Remove self vs self comparisons
    return ani_df[ani_df['query_seq'] != ani_df['ref_seq']]


def cluster_accessions(ani_df, seed_accessions, thresh):
    """Group genomes that are linked, directly or transitively, above ``thresh``.

    Each seed that is not already assigned starts a new cluster, which is then
    expanded until no further genome is linked to any member by a comparison
    whose identity (column 2) is strictly greater than ``thresh``. A seed with
    no such comparison forms a cluster of its own.

    Parameters
    ----------
    ani_df : pandas.DataFrame
        Pairwise comparisons with ``query_seq``, ``ref_seq`` and identity in
        column 2 (as returned by ``load_ani_pairs``).
    seed_accessions : iterable
        Accessions in the order in which they should seed clusters.
    thresh : float
        Identity threshold; comparisons must exceed it to link two genomes.

    Returns
    -------
    list of list
        One sorted list of accessions per cluster, in the order in which the
        clusters were seeded. Sorting the members keeps the output
        reproducible from run to run.
    """
    ###Index every above-threshold comparison once, in both directions
    linked = ani_df[ani_df[2] > thresh]
    neighbours = {}
    for query_seq, ref_seq in zip(linked['query_seq'], linked['ref_seq']):
        neighbours.setdefault(query_seq, set()).add(ref_seq)
        neighbours.setdefault(ref_seq, set()).add(query_seq)

    clusters = []
    found = set()
    for accession in seed_accessions:
        ###Already absorbed into an earlier cluster
        if accession in found:
            continue
        ###Expand from the seed until no new genome is reachable
        members = {accession}
        frontier = [accession]
        while frontier:
            for neighbour in neighbours.get(frontier.pop(), ()):
                if neighbour not in members:
                    members.add(neighbour)
                    frontier.append(neighbour)
        found.update(members)
        clusters.append(sorted(members))
    return clusters


# NOTE(review): this cell reads and writes under
# '../Data/Other_possible_dbs/NCBI_phage_db/' while other cells in this
# notebook use '../Data/NCBI_phage_db/' -- confirm which layout is current.
for taxon_id in taxonomy_list:
    print(taxon_id)
    ###Read in the ANI output
    input_df = load_ani_pairs('../Data/Other_possible_dbs/NCBI_phage_db/fastANI_results/{}_ANI.output'.format(taxon_id))
    ###Select the species I care about from the full dataframe, longest genome first
    species_df = full_df[full_df['Host_species_id']==taxon_id].sort_values('Length', ascending=False)
    ###Records are visited in sorted order, so the longest unassigned genome seeds each cluster
    clusters = cluster_accessions(input_df, species_df['Accession'], thresh)
    ###Number of clusters and total number of clustered accessions
    print(len(clusters), sum(len(cluster) for cluster in clusters))

    ###Add to the dataframe
    for i, cluster in enumerate(clusters):
        temp_df = species_df[species_df['Accession'].isin(cluster)]
        ###Sorting is done to percolate RefSeq entries to the top, followed by long genomes
        ###(relies on the Sequence_Type values ordering RefSeq first when sorted descending)
        temp_df = temp_df.sort_values(['Sequence_Type', 'Length'], ascending=[False, False])
        for j, index in enumerate(temp_df.index):
            full_df.at[index, 'arbitrary_cluster_id'] = i + 1
            full_df.at[index, 'ranking_in_cluster'] = j + 1

    with open('../Data/Other_possible_dbs/NCBI_phage_db/fastANI_results/{}_clusters.json'.format(taxon_id), 'w') as outfile:
        json.dump(clusters, outfile)

# Write to file

In [None]:
# Store both cluster columns as nullable integers so that rows without a
# cluster stay missing instead of forcing a float column.
for cluster_col in ('arbitrary_cluster_id', 'ranking_in_cluster'):
    full_df[cluster_col] = full_df[cluster_col].astype(pd.Int64Dtype())

# Keep only the hosts that were clustered.
host_was_clustered = full_df['Host_species_id'].isin(taxonomy_list)
full_df = full_df[host_was_clustered]

# Write the annotated table as tab-separated text without the index column.
full_df.to_csv(
    '../Data/Other_possible_dbs/NCBI_phage_db/'
    'paper_dataset_11_2020_with_clusters.tsv',
    sep='\t',
    index=False,
)

# Scratch from here on out...

## Clustering visualization

In [None]:
from matplotlib import pyplot as plt

In [None]:
# Load the fastANI table for one host and turn it into a query x reference
# identity matrix for plotting.
taxon_id = 562
ani_path = '../Data/NCBI_phage_db/fastANI_results/{}_ANI.output'.format(taxon_id)
input_df = pd.read_csv(ani_path, sep='\t', header=None)
print('Shape:', input_df.shape)
print('Non-null shape:', input_df[input_df[2].notnull()].shape)

# Columns 0 and 1 are file paths; keep the file name up to its first '.'.
for path_col, accession_col in ((0, 'query_seq'), (1, 'ref_seq')):
    file_names = input_df[path_col].str.split("/").str[-1]
    input_df[accession_col] = file_names.str.split('.').str[0]
input_df = input_df[['query_seq', 'ref_seq', 2, 3, 4]]
print(input_df.shape)

# Self vs self comparisons are kept here, unlike in the clustering cells.
sim_df = input_df.pivot(index='query_seq', columns='ref_seq', values=2)
sim_matrix = sim_df.values

input_df.head()

In [None]:
# Read back the clusters saved for host 562.
with open('../Data/NCBI_phage_db/fastANI_results/562_clusters.json', 'r') as infile:
    clusters = json.load(infile)

# Flatten the clusters, in stored order, keeping only accessions that appear
# as a row label of the similarity table.
temp = []
for members in clusters:
    for accession in members:
        if accession in sim_df.index:
            temp.append(str(accession))

# Reorder rows and columns so members of the same cluster sit next to each other.
ordered_sim_matrix = sim_df.loc[temp, temp].values

In [None]:
# Row count and column count of the pivoted similarity table; the two match
# when every genome appears both as a query and as a reference.
print(len(sim_df.index), len(sim_df.columns))

In [None]:
# Side-by-side heatmaps: the raw pivot order versus the cluster order.
fig, ax_arr = plt.subplots(nrows=1, ncols=2, figsize=(16,8))
panels = (
    (sim_matrix, 'Unsorted in any seemingly logical way'),
    (ordered_sim_matrix, 'After greedy algorithm'),
)
for axis, (matrix, title) in zip(ax_arr, panels):
    axis.matshow(matrix)
    axis.set_title(title)

In [None]:
# Histogram of column 2 (the pairwise identity values) of the table loaded above.
input_df[2].hist()

## Experiment with stricter sequence identity threshold

In [None]:
# Re-read the table from disk: this discards the cluster columns and the host
# filter that earlier cells applied to full_df.
full_df = pd.read_table('../Data/NCBI_phage_db/paper_dataset_11_2020.tsv')
# Report (rows, columns) before previewing the first records.
print(full_df.shape)
full_df.head()

In [None]:
#Set sequence identity threshold
# NOTE(review): the section heading says "stricter", yet this value is lower
# than the 95 used by the per-host clustering loop earlier in the notebook;
# the input file is the one named *_ANI_STRICT.output -- confirm the intent.
thresh = 80
taxon_id = 562

###Read in the ANI output
input_df = pd.read_csv('../Data/562_associated_data/{}_ANI_STRICT.output'.format(taxon_id), sep='\t', header=None)
###Processing some columns: reduce each file path to the file name up to its first '.'
input_df['query_seq'] = input_df[0].str.split("/").str[-1].str.split('.').str[0]
input_df['ref_seq'] = input_df[1].str.split("/").str[-1].str.split('.').str[0]
input_df = input_df[['query_seq', 'ref_seq', 2, 3, 4]]
#Remove self vs self comparisons
input_df = input_df[input_df['query_seq'] != input_df['ref_seq']]

###Select the species I care about from the full dataframe, longest genome first
species_df = full_df[full_df['Host_species_id']==taxon_id].sort_values('Length', ascending=False)

###Index every comparison whose identity exceeds the threshold, in both directions
linked_df = input_df[input_df[2] > thresh]
neighbours = {}
for query_seq, ref_seq in zip(linked_df['query_seq'], linked_df['ref_seq']):
    neighbours.setdefault(query_seq, set()).add(ref_seq)
    neighbours.setdefault(ref_seq, set()).add(query_seq)

#Establish buckets
clusters = []
found = set()
### Records should be sorted in order! (see above)
for temp_accession in species_df['Accession']:
    ###Already absorbed into an earlier cluster
    if temp_accession in found:
        continue
    ###Start from this record and keep expanding until nothing new is linked;
    ###a record with no hit above the threshold ends up alone in its cluster
    members = {temp_accession}
    frontier = [temp_accession]
    while frontier:
        for neighbour in neighbours.get(frontier.pop(), ()):
            if neighbour not in members:
                members.add(neighbour)
                frontier.append(neighbour)
    temp_cluster = sorted(members)
    clusters.append(temp_cluster)
    found.update(temp_cluster)
###Number of clusters and total number of clustered accessions
print(len(clusters), len(found))

###Add to the dataframe
for i, cluster in enumerate(clusters):
    temp_df = species_df[species_df['Accession'].isin(cluster)]
    ###Rank within the cluster by Sequence_Type, then Length, both descending
    temp_df = temp_df.sort_values(['Sequence_Type', 'Length'], ascending=[False, False])
    for j, index in enumerate(temp_df.index):
        full_df.at[index, 'arbitrary_cluster_id'] = i + 1
        full_df.at[index, 'ranking_in_cluster'] = j + 1

**Explore cluster numbers**

In [None]:
# Distribution of within-cluster rankings for host 562.
full_df.loc[full_df['Host_species_id'] == 562, 'ranking_in_cluster'].value_counts()

In [None]:
# Keep the top-ranked record of every cluster for host 562.
host_is_562 = full_df['Host_species_id'] == 562
is_top_ranked = full_df['ranking_in_cluster'] == 1.
testy_df = full_df[host_is_562 & is_top_ranked]

In [None]:
# Display the rows selected above (host 562, ranking_in_cluster == 1).
testy_df

In [None]:
# Tab-separated lifestyle table; the 'phage_id' and 'phage_lifestyle' columns
# are the ones read further down.
lifestyle_df = pd.read_table('../Data/lifestyle_results_BACPHLIP.tsv')

In [None]:
# Collect the lifestyle label of every selected phage that has exactly one
# row in the lifestyle table; phages with zero or several rows are skipped.
alltogethernow = []
for phage_id in testy_df['Accession']:
    matches = lifestyle_df[lifestyle_df['phage_id'] == phage_id]
    if len(matches) != 1:
        continue
    alltogethernow.append(matches.iloc[0]['phage_lifestyle'])

In [None]:
from collections import Counter
# Tally how many of the collected lifestyle labels fall into each category.
Counter(alltogethernow)