In [32]:
# Imports
import csv
from Bio import AlignIO
from sklearn.metrics import silhouette_score
from sklearn.cluster import AgglomerativeClustering
from Bio.Phylo.TreeConstruction import DistanceMatrix

In [33]:
def get_sequence_names(sample):
    """Return the record IDs found in a sample's aligned FASTA file.

    Args:
        sample: Sample identifier; the alignment is read from
            ``data/fasta/<sample>_aligned.fasta``.

    Returns:
        A list with the ``id`` of every record, in file order.
    """
    # Build the path first so the read call stays short
    fasta_path = "data/fasta/" + sample + "_aligned.fasta"
    records = AlignIO.read(fasta_path, "fasta")

    # Collect the identifier of each aligned record
    names = []
    for entry in records:
        names.append(entry.id)
    return names

In [34]:
def load_distance_matrix(file_path):
    """Load a labelled distance matrix from ``<file_path>.csv``.

    The first CSV row is treated as the header: its first cell is skipped and
    the remaining cells become the sequence names. In every following row the
    first cell (the row label) is skipped and the remaining cells are parsed
    as floats.

    Args:
        file_path: Path to the CSV file *without* the ``.csv`` extension.

    Returns:
        A ``DistanceMatrix`` built from the header names and the parsed rows.

    Raises:
        ValueError: If a data cell cannot be converted to ``float``.
    """
    # NOTE(review): Bio.Phylo's DistanceMatrix appears to expect rows in
    # lower-triangular form (row i holds i + 1 values) -- confirm the CSV
    # files are written that way.
    # newline='' lets the csv module do its own newline handling, as the
    # csv documentation recommends for file objects passed to csv.reader.
    with open(file_path + ".csv", 'r', newline='') as file:
        reader = csv.reader(file)
        # Read the header (drop the empty corner cell)
        names = next(reader)[1:]
        # Read the rows; csv yields [] for blank lines (e.g. a trailing empty
        # line), which would otherwise become bogus empty matrix rows.
        matrix_list = [list(map(float, row[1:])) for row in reader if row]
    # Convert list of lists to DistanceMatrix
    return DistanceMatrix(names, matrix_list)

In [35]:
def identify_clusters(distance_matrix):
    """Choose the agglomerative clustering with the best average silhouette score.

    Complete-linkage agglomerative clustering is run on the precomputed
    distances for every cluster count from 2 up to ``len(distance_matrix) - 1``.
    The score of each attempt is printed, followed by a summary of the best one.

    Args:
        distance_matrix: Precomputed pairwise distance matrix. It is handed
            unchanged to scikit-learn and must support ``len()``.

    Returns:
        The cluster labels of the highest-scoring clustering, or ``None`` when
        the matrix has fewer than three entries and no cluster count can be
        evaluated.
    """
    # Initialize variables
    best_number_clusters = 0
    best_silhouette_score = -1
    best_cluster_labels = None

    # Iterate over possible number of clusters
    for n_clusters in range(2, len(distance_matrix)):
        # Perform AgglomerativeClustering algorithm
        model = AgglomerativeClustering(n_clusters=n_clusters, metric='precomputed', linkage='complete')
        # Predict the cluster labels using the distance matrix
        cluster_labels = model.fit_predict(distance_matrix)
        # Keep the raw score for comparisons; round only when printing
        silhouette_coefficient = silhouette_score(distance_matrix, cluster_labels, metric='precomputed')
        print("Number of clusters:", n_clusters, "\t Average silhouette score:", round(silhouette_coefficient, 3))

        # Compare unrounded scores: rounding first turns sub-0.001 differences
        # into ties, so a genuinely better later clustering would be ignored.
        # The first evaluated clustering is always accepted so that a result
        # is returned whenever at least one clustering was computed.
        if best_cluster_labels is None or silhouette_coefficient > best_silhouette_score:
            best_silhouette_score = silhouette_coefficient
            best_number_clusters = n_clusters
            best_cluster_labels = cluster_labels

    # Print the best results
    print("Best number of clusters:", best_number_clusters, "\t Best silhouette score:", round(best_silhouette_score, 3), "\n")
    # Return the best (optimal) cluster labels.
    return best_cluster_labels

In [36]:
def save_clusters(sequence_names, cluster_labels, sample_name=None, output_dir="data/clusters"):
    """Write one ``sequence name, cluster label`` row per sequence to a CSV file.

    The file is ``<output_dir>/<sample_name>_cluster.csv`` and starts with the
    header row ``Sequence Name, Cluster Label``.

    Args:
        sequence_names: Sequence identifiers, in the same order as
            ``cluster_labels``.
        cluster_labels: Cluster label of each sequence.
        sample_name: Sample identifier used to build the file name. When
            omitted, the module-level variable ``sample`` is used, which keeps
            existing two-argument calls working.
        output_dir: Directory the CSV is written to (not created here).

    Raises:
        NameError: If ``sample_name`` is omitted and no global ``sample`` exists.
        ValueError: If the two inputs differ in length.
    """
    if sample_name is None:
        # Legacy behaviour: rely on the notebook-level loop variable.
        sample_name = sample

    # zip() would silently drop the surplus items; fail before touching the
    # file system so no truncated file is left behind.
    if len(sequence_names) != len(cluster_labels):
        raise ValueError(
            "sequence_names and cluster_labels differ in length: "
            + str(len(sequence_names)) + " vs " + str(len(cluster_labels))
        )

    with open(output_dir + "/" + sample_name + "_cluster.csv", 'w', newline='') as csvfile:
        csv_writer = csv.writer(csvfile)
        # Writing header
        csv_writer.writerow(['Sequence Name', 'Cluster Label'])
        # Writing data
        csv_writer.writerows(zip(sequence_names, cluster_labels))

In [37]:
# Samples to process. Each name is used to build the input paths
# data/fasta/<sample>_aligned.fasta and data/matrices/<sample>_distance_matrix.csv.
samples = ["04.B1.W14.01_04.M1.W09.02", "05.B1.W14.04_05.M1.W08.03", 
           "27.B1.W13.06_27.M1.W10.07", "30.B1.W11.08_30.M1.W04.09", 
           "38.B1.W10.11_38.M1.W03.10", "39.B1.W11.12_39.M1.W03.13_39.M1.W05.14",
           "53.B1.W14.17_53.M1.W07.16", "56.B1.W09.22_56.M1.W03.21", 
           "63.B1.W09.29_63.M1.W02.30", "66.B1.W09.25_66.M1.W02.24"]

# Cluster every sample in turn.
# NOTE: the loop variable must stay named `sample`: save_clusters() is called
# below without a sample argument and reads this module-level name to build
# its output file name.
for sample in samples: 
    # Print the current sample's name so the per-sample output can be told apart
    print("Sample:", sample)
    # Get the names of the sequences in the current sample's aligned FASTA file
    sequence_names = get_sequence_names(sample)
    # Load the distance matrix of the current sample (".csv" is appended by the loader)
    distance_matrix = load_distance_matrix("data/matrices/" + sample + "_distance_matrix")
    # Cluster the sequences based on the loaded distance matrix
    cluster_labels =  identify_clusters(distance_matrix)
    # Save the determined clusters. Names and labels are paired by position,
    # which assumes the FASTA record order matches the matrix row order -- TODO confirm.
    save_clusters(sequence_names, cluster_labels)

Sample: 04.B1.W14.01_04.M1.W09.02
Number of clusters: 2 	 Average silhouette score: 0.64
Number of clusters: 3 	 Average silhouette score: 0.671
Number of clusters: 4 	 Average silhouette score: 0.684
Number of clusters: 5 	 Average silhouette score: 0.589
Number of clusters: 6 	 Average silhouette score: 0.513
Number of clusters: 7 	 Average silhouette score: 0.37
Number of clusters: 8 	 Average silhouette score: 0.242
Number of clusters: 9 	 Average silhouette score: 0.149
Number of clusters: 10 	 Average silhouette score: 0.111
Number of clusters: 11 	 Average silhouette score: 0.062
Best number of clusters: 4 	 Best silhouette score: 0.684 

Sample: 05.B1.W14.04_05.M1.W08.03
Number of clusters: 2 	 Average silhouette score: 0.536
Number of clusters: 3 	 Average silhouette score: 0.541
Number of clusters: 4 	 Average silhouette score: 0.328
Number of clusters: 5 	 Average silhouette score: 0.233
Number of clusters: 6 	 Average silhouette score: 0.227
Number of clusters: 7 	 Average 