In [1]:
%matplotlib inline

# Notes:

This notebook clusters FastANI output to create CD-HIT-EST-like results. FastANI was run with the following command:
    
    $fastANI --ql file_listing.txt --rl file_listing.txt --fragLen 300 --minFraction 0.8 -o first_pass_ANI.output
    
Where file_listing.txt is simply a list of all the nucleotide fastas to cluster, and this command performs an all-vs-all comparison. I had to run FastANI on an off-site cluster to get the program to complete, so full commands are not included here. Note, however, that "minFraction" refers to the minimum fraction of the smallest genome sequence that was aligned.

In [2]:
import pandas as pd
from Bio import SeqIO
import glob

In [3]:
def _path_to_seq_id(path_column):
    """Turn a column of file paths into ids: the file name text before its first '.'."""
    file_names = path_column.str.split("/").str[-1]
    return file_names.str.split('.').str[0]


# The FastANI table is tab-separated and has no header row, so columns are numbered 0..4.
raw_ani = pd.read_csv('../Data/fastANI_output/first_pass_ANI.output', header=None, sep='\t')

# Replace the two path columns (0 = query, 1 = reference) with short sequence ids
# and keep the three numeric columns under their original integer labels.
input_df = raw_ani.assign(
    query_seq=_path_to_seq_id(raw_ani[0]),
    ref_seq=_path_to_seq_id(raw_ani[1]),
)[['query_seq', 'ref_seq', 2, 3, 4]]

# Report how many rows have an identity of exactly 100 before any filtering.
perfect_hits = input_df[input_df[2] == 100.]
print(perfect_hits.shape)

# Drop rows that compare a sequence against itself.
is_cross_hit = input_df['query_seq'].ne(input_df['ref_seq'])
input_df = input_df[is_cross_hit]
print(input_df.shape)
input_df.head(20)

(1045, 5)
(9904, 5)


Unnamed: 0,query_seq,ref_seq,2,3,4
2,minerva,optimus,96.5329,309,366
3,minerva,wanda,96.4687,317,366
4,minerva,baka,96.2789,309,366
6,island3,brujita,99.6888,156,157
8,NC_004587,NC_003157,99.9971,139,139
10,NC_024122,NC_016073,97.6094,480,543
11,NC_024122,NC_016570,97.549,485,543
12,NC_024122,NC_019530,97.3815,473,543
13,NC_024122,NC_019452,97.0824,480,543
14,NC_024122,NC_027119,96.5699,457,543


**Now, follow the CD-HIT-EST greedy algorithm to perform clustering**

In [4]:
# Collect (sequence id, genome length) for every phage fasta file.
# The sequence id is the file name up to its first '.'.
fasta_pattern = '../Data/phage_data_nmicro2017/phage_fasta_files/*.fasta'
records = []
for infile in glob.glob(fasta_pattern):
    # SeqIO.read requires each file to contain exactly one sequence record.
    tempy = SeqIO.read(infile, 'fasta')
    records.append((infile.split('/')[-1].split('.')[0], len(tempy.seq)))

# Fail with a clear message instead of an IndexError in the print below.
if not records:
    raise FileNotFoundError('No fasta files matched ' + fasta_pattern)

###Critical step! Longest genome first. glob returns files in arbitrary order, so ties in
###length are broken by sequence id to make the ordering identical on every machine and run.
records = sorted(records, key=lambda x: (-x[1], x[0]))
print('First, last record sequence length:', records[0][1], ',', records[-1][1])
len_dict = dict(records) #useful for sorting later

First, last record sequence length: 358663 , 5833


In [6]:
def cluster_by_ani(ani_df, records, thresh):
    """Group sequences that are linked, directly or transitively, by hits above ``thresh``.

    Two sequences are linked when a row of ``ani_df`` names them as
    ``query_seq`` / ``ref_seq`` and the value in column ``2`` is strictly
    greater than ``thresh``. Starting from each not-yet-assigned record (in the
    order given by ``records``), every sequence reachable through such links is
    pulled into the same cluster, so each cluster is a connected component of
    the link graph. Records without any link above the threshold become
    clusters of one.

    Parameters
    ----------
    ani_df : pandas.DataFrame
        Pairwise table with columns ``'query_seq'``, ``'ref_seq'`` and ``2``.
        It is only read, never modified.
    records : list of (str, int)
        ``(sequence id, length)`` pairs. Must be sorted longest first, because
        the iteration order decides which sequence seeds each cluster and the
        order in which clusters are emitted.
    thresh : number
        Identity cutoff; only rows with column ``2`` strictly above it link sequences.

    Returns
    -------
    list of list of str
        One list per cluster, members sorted by length (longest first). Members
        of equal length keep their discovery order, which is fully determined
        by the row order of ``ani_df``, so the result is reproducible.

    Raises
    ------
    KeyError
        If a clustered sequence id from ``ani_df`` has no entry in ``records``.
    """
    ###Build the link graph once instead of re-filtering the dataframe for every sequence.
    ###Lists (not sets) keep neighbour order tied to row order, so output never depends on hashing.
    linked = ani_df[ani_df[2] > thresh]
    neighbours = {}
    for query, ref in zip(linked['query_seq'], linked['ref_seq']):
        neighbours.setdefault(query, []).append(ref)
        neighbours.setdefault(ref, []).append(query)

    lengths = dict(records)
    assigned = set()  # every sequence already placed in a cluster
    result = []
    for name, _ in records:
        ### Records must be sorted in order! (longest first)
        if name in assigned:
            continue
        ###Breadth-first expansion: `members` doubles as the work queue
        members = [name]
        assigned.add(name)
        cursor = 0
        while cursor < len(members):
            for other in neighbours.get(members[cursor], ()):
                if other not in assigned:
                    assigned.add(other)
                    members.append(other)
            cursor += 1
        ###Stable sort: the seed stays first among sequences of equal length
        result.append(sorted(members, key=lambda x: lengths[x], reverse=True))
    return result


thresh = 80
clusters = cluster_by_ani(input_df, records, thresh)

###Number of clusters, total number of clustered sequences
print(len(clusters), sum(len(cluster) for cluster in clusters))

480 1057


**Finally, write the clusters**

In [7]:
import json

# Persist the clusters (a list of lists of sequence ids) as a JSON document.
clusters_path = '../Data/fastANI_output/clusters.json'
with open(clusters_path, 'w') as outfile:
    json.dump(clusters, outfile)