In [1]:
%matplotlib inline

# Offline portion

I manually ran fastANI on the set of phage genomes (`.fasta` files) for each host.

For example:
```
find ./phage_genomes/562_phage_genomes/*.fasta -type f> file_listing.txt

fastANI --ql file_listing.txt --rl file_listing.txt --threads 10 --fragLen 300 --minFraction 0.8 -o 562_ANI.output
```

To run this for another host, replace both instances of `562` in the commands above with that host's taxonomy ID. It didn't seem worth writing a bash script for this.

# Now using that information to add new columns to the database

In [2]:
import pandas as pd
import glob
from Bio import SeqIO
import json

In [3]:
# Load the tab-separated phage metadata table, report its dimensions, and preview the first rows.
dataset_path = '../Data/Other_possible_dbs/NCBI_phage_db/paper_dataset_11_2020.tsv'
full_df = pd.read_csv(dataset_path, sep='\t')
print(full_df.shape)
full_df.head()

(12876, 35)


Unnamed: 0,Accession,SRA_Accession,Release_Date,Species,Genus,Family,Length,Sequence_Type,Nuc_Completeness,Genotype,...,Host_phylum_id,Host_phylum_name,Host_class_id,Host_class_name,Host_order_id,Host_order_name,Host_family_id,Host_family_name,Host_genus_id,Host_genus_name
0,NC_050154,,2020-08-13T00:00:00Z,Escherichia phage D6,,Myoviridae,91159,RefSeq,complete,,...,1224.0,Proteobacteria,1236.0,Gammaproteobacteria,91347.0,Enterobacterales,543.0,Enterobacteriaceae,561.0,Escherichia
1,NC_050143,,2020-08-10T00:00:00Z,Pseudomonas phage datas,Pbunavirus,Myoviridae,60746,RefSeq,complete,,...,1224.0,Proteobacteria,1236.0,Gammaproteobacteria,72274.0,Pseudomonadales,135621.0,Pseudomonadaceae,286.0,Pseudomonas
2,NC_050144,,2020-08-10T00:00:00Z,Pseudomonas phage Epa14,Pbunavirus,Myoviridae,65797,RefSeq,complete,,...,1224.0,Proteobacteria,1236.0,Gammaproteobacteria,72274.0,Pseudomonadales,135621.0,Pseudomonadaceae,286.0,Pseudomonas
3,NC_050145,,2020-08-10T00:00:00Z,Pseudomonas phage PaGU11,Pbunavirus,Myoviridae,65554,RefSeq,complete,,...,1224.0,Proteobacteria,1236.0,Gammaproteobacteria,72274.0,Pseudomonadales,135621.0,Pseudomonadaceae,286.0,Pseudomonas
4,NC_050146,,2020-08-10T00:00:00Z,Pseudomonas phage Epa7,Pbunavirus,Myoviridae,65629,RefSeq,complete,,...,1224.0,Proteobacteria,1236.0,Gammaproteobacteria,72274.0,Pseudomonadales,135621.0,Pseudomonadaceae,286.0,Pseudomonas


In [4]:
# Count rows per Host_species_id; value_counts() returns them in descending order of count.
vc = full_df['Host_species_id'].value_counts()
# Keep only the host species IDs that occur at least 50 times.
has_enough_rows = vc >= 50
taxonomy_list = list(vc[has_enough_rows].index)
print(len(taxonomy_list))
print(taxonomy_list[:10])

42
[1772, 562, 28901, 2055, 1358, 287, 573, 104336, 1280, 1771959]


# Looking at individual dataframes

In [5]:
# Load the fastANI output for one host (taxon 562) to see what the table looks like.
taxon_id = '562'
ani_path = '../Data/Other_possible_dbs/NCBI_phage_db/fastANI_results/{}_ANI.output'.format(taxon_id)
# The file has no header row, so the columns are labelled 0..4.
input_df = pd.read_csv(ani_path, sep='\t', header=None)
print('Shape:', input_df.shape)
# Rows where column 2 is populated.
has_value = input_df[2].notnull()
print('Non-null shape:', input_df[has_value].shape)
input_df.head()

Shape: (148432, 5)
Non-null shape: (148432, 5)


Unnamed: 0,0,1,2,3,4
0,./phage_genomes/562_phage_genomes/AF063097.fasta,./phage_genomes/562_phage_genomes/AF063097.fasta,100.0,111,111
1,./phage_genomes/562_phage_genomes/AF063097.fasta,./phage_genomes/562_phage_genomes/NC_001895.fasta,100.0,111,111
2,./phage_genomes/562_phage_genomes/AP019559.fasta,./phage_genomes/562_phage_genomes/NC_048627.fasta,100.0,369,369
3,./phage_genomes/562_phage_genomes/AP019559.fasta,./phage_genomes/562_phage_genomes/AP019559.fasta,100.0,369,369
4,./phage_genomes/562_phage_genomes/AP019559.fasta,./phage_genomes/562_phage_genomes/KJ190157.fasta,95.7867,334,369


In [6]:
# Reduce the file paths in columns 0 and 1 to bare accession names:
# keep the last path component, then everything before its first '.'.
for path_col, name_col in [(0, 'query_seq'), (1, 'ref_seq')]:
    input_df[name_col] = input_df[path_col].str.split("/").str[-1].str.split('.').str[0]
input_df = input_df[['query_seq', 'ref_seq', 2, 3, 4]]
# Number of rows with a value of exactly 100 in column 2 (self-comparisons are still included here).
perfect_hits = input_df[input_df[2] == 100.]
print(perfect_hits.shape)
# Drop the self-vs-self comparisons.
is_self_pair = input_df['query_seq'] == input_df['ref_seq']
input_df = input_df[~is_self_pair]
print(input_df.shape)
input_df.head(n=20)

(4046, 5)
(146959, 5)


Unnamed: 0,query_seq,ref_seq,2,3,4
1,AF063097,NC_001895,100.0,111,111
2,AP019559,NC_048627,100.0,369,369
4,AP019559,KJ190157,95.7867,334,369
5,AP019559,NC_024139,95.7867,334,369
6,AP019559,KY677846,95.3717,319,369
7,AP019559,NC_047822,95.3717,319,369
8,AP019559,NC_048818,95.2013,320,369
9,AP019559,MN316588,95.2013,320,369
10,AP019559,NC_047835,95.097,326,369
11,AP019559,MF402939,95.097,326,369


# Re-create the cd-hit-est algorithm for clustering

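There's probably a faster way to do this, but it seemed straightforward enough to quickly write up a greedy incremental clustering approach: genomes are visited from longest to shortest, and each genome that has not yet been clustered seeds a new cluster, which is then expanded to every genome linked to it (directly or through other cluster members) by an ANI above the threshold.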

In [7]:
#Set sequence identity threshold
thresh = 95

#Directory holding the per-host fastANI outputs (read) and the cluster JSON files (written)
ani_dir = '../Data/Other_possible_dbs/NCBI_phage_db/fastANI_results'

#Make sure both output columns exist even if no host ends up being processed,
#so that the later cell casting them to Int64 cannot fail with a KeyError
for new_col in ['arbitrary_cluster_id', 'ranking_in_cluster']:
    if new_col not in full_df.columns:
        full_df[new_col] = float('nan')

for taxon_id in taxonomy_list[:]:
    print(taxon_id)
    ###Read in the ANI output (no header row, so columns are labelled 0..4)
    input_df = pd.read_csv('{}/{}_ANI.output'.format(ani_dir, taxon_id), sep='\t', header=None)
    ###Reduce the file paths in columns 0 and 1 to bare accession names
    ###(last path component, then everything before its first '.')
    input_df['query_seq'] = input_df[0].str.split("/").str[-1].str.split('.').str[0]
    input_df['ref_seq'] = input_df[1].str.split("/").str[-1].str.split('.').str[0]
    input_df = input_df[['query_seq', 'ref_seq', 2, 3, 4]]
    #Remove self vs self comparisons
    input_df = input_df[input_df['query_seq'] != input_df['ref_seq']]

    ###Working copy restricted to pairs above the identity threshold.
    ###Every lookup below only ever used rows with column 2 > thresh, so the
    ###filter is applied once here instead of on every loop iteration.
    input_df_copy = input_df[input_df[2] > thresh].copy(deep=True)
    ###Select the species I care about from the full dataframe, longest genome first
    species_df = full_df[full_df['Host_species_id']==taxon_id].sort_values('Length', ascending=False)
    #Establish buckets
    clusters = []
    found = []
    #Set mirror of `found` so the "already clustered?" check is O(1) instead of a list scan
    found_lookup = set()
    for temp_accession in species_df['Accession']:
        ### Records should be sorted in order! (see above)
        if temp_accession in found_lookup:
            continue
        ###Seed the cluster with every above-threshold pair involving this record
        seed_mask = (input_df_copy['query_seq']==temp_accession) | (input_df_copy['ref_seq']==temp_accession)
        temp_df = input_df_copy[seed_mask]
        temp_cluster = set(temp_df['query_seq']) | set(temp_df['ref_seq'])
        ###Now try to expand that cluster: pull in every record paired with a current
        ###member, and repeat until an iteration adds nobody
        while True:
            starting = len(temp_cluster)
            member_mask = (input_df_copy['query_seq'].isin(temp_cluster) |
                           input_df_copy['ref_seq'].isin(temp_cluster))
            temp_df = input_df_copy[member_mask]
            temp_cluster = set(temp_df['query_seq']) | set(temp_df['ref_seq'])
            if len(temp_cluster) == starting:
                break

        if temp_cluster:
            #Sort so the member order written to JSON is the same on every run
            #(iteration order of a set of strings is not stable between interpreter sessions)
            temp_cluster = sorted(temp_cluster)
        else:
            #No above-threshold partner: the record forms a cluster on its own
            temp_cluster = [temp_accession]

        clusters.append(temp_cluster)
        found.extend(temp_cluster)
        found_lookup.update(temp_cluster)
        ###Drop every pair touching this cluster so its members cannot join a later one
        touches_cluster = (input_df_copy['query_seq'].isin(temp_cluster) |
                           input_df_copy['ref_seq'].isin(temp_cluster))
        input_df_copy = input_df_copy[~touches_cluster]
    print(len(clusters), len(found))

    ###Add to the dataframe
    for i, cluster in enumerate(clusters):
        temp_df = species_df[species_df['Accession'].isin(cluster)]
        #Rank members within the cluster by Sequence_Type (descending), then Length (descending)
        temp_df = temp_df.sort_values(['Sequence_Type', 'Length'], ascending=[False, False])
        for j, index in enumerate(temp_df.index):
            #Cluster ids and rankings are 1-based; ids restart at 1 for every host
            full_df.at[index, 'arbitrary_cluster_id'] = i + 1
            full_df.at[index, 'ranking_in_cluster'] = j + 1

    ###Save this host's clusters (list of lists of accessions)
    with open('{}/{}_clusters.json'.format(ani_dir, taxon_id), 'w') as outfile:
        json.dump(clusters, outfile)

1772
253 2070
562
271 1473
28901
132 526
2055
153 457
1358
117 441
287
80 429
573
137 376
104336
70 281
1280
57 262
1771959
49 241
1308
55 200
1747
61 183
470
49 152
1313
43 141
1351
43 128
666
20 121
1428
39 115
1708203
1 115
1911
34 110
1314
36 104
599
19 91
1307
47 91
1665
49 88
552
31 88
1396
42 82
197
16 77
1464
11 74
317
33 68
1639
14 66
1423
32 66
996
2 65
670
30 63
623
25 62
644
30 62
1311
26 60
1334
33 58
305
29 58
29471
6 55
1833
11 55
36822
22 53
306
19 53
76594
18 52


# Write to file

In [17]:
# Cast both cluster columns to pandas' nullable integer dtype.
for cluster_col in ('arbitrary_cluster_id', 'ranking_in_cluster'):
    full_df[cluster_col] = full_df[cluster_col].astype(pd.Int64Dtype())

# Keep only the rows whose host species is in taxonomy_list.
in_selected_hosts = full_df['Host_species_id'].isin(taxonomy_list)
full_df = full_df[in_selected_hosts]

# Write the result as a tab-separated file, without the index column.
full_df.to_csv('../Data/Other_possible_dbs/NCBI_phage_db/'
               'paper_dataset_11_2020_with_clusters.tsv', sep='\t', index=False)

# Visualize what those results look like

In [None]:
# input_df = pd.read_csv('../Data/{}_fastANI.output'.format(taxon_id), sep='\t', header=None)
# print('Shape:', input_df.shape)
# print('Non-null shape:', input_df[input_df[2].isnull()==False].shape)
# input_df['temp'] = input_df[0].str.split("/").str[-1]
# input_df['query_seq'] = input_df['temp'].str.split('.').str[0]
# input_df['temp'] = input_df[1].str.split("/").str[-1]
# input_df['ref_seq'] = input_df['temp'].str.split('.').str[0]
# input_df = input_df[['query_seq', 'ref_seq', 2, 3, 4]]
# print(input_df.shape)

# sim_df = input_df.pivot(index='query_seq', columns='ref_seq', values=2)
# sim_matrix = sim_df.values

In [None]:
# temp = [str(item) for sublist in clusters for item in sublist if item in sim_df.index]
# ordered_sim_matrix = sim_df.loc[temp][temp].values

In [None]:
# print(len(sim_df.index), len(sim_df.index))

In [None]:
# from matplotlib import pyplot as plt

In [None]:
# fig, ax_arr = plt.subplots(nrows=1, ncols=2, figsize=(16,8))
# ax_arr[0].matshow(sim_matrix)
# ax_arr[0].set_title('Unsorted in any seemingly logical way')
# ax_arr[1].matshow(ordered_sim_matrix)
# ax_arr[1].set_title('After greedy algorithm')

In [None]:
# ordered_sim_matrix[150, 190]

In [None]:
# input_df[2].hist()

In [None]:
# `clusters` is reassigned for every taxon in the clustering loop, so this
# displays the clusters of the last taxon that loop processed.
clusters

In [None]:
# Rows of species_df whose accession belongs to the first cluster in `clusters`.
in_first_cluster = species_df['Accession'].isin(clusters[0])
temp_df = species_df[in_first_cluster]
temp_df


In [None]:
# Same ordering the clustering loop uses to assign ranking_in_cluster:
# Sequence_Type descending, then Length descending.
temp_df.sort_values(['Sequence_Type', 'Length'], ascending=[False, False])