# Clustering

This script debugs the clustering algorithm to ensure that results are deterministic. It contains the following steps:

- Padding reads with the reference genome to fill empty spaces
- Sliding a window over the region of interest to identify reads

# Load modules

In [33]:
import numpy as np
import pandas as pd
import json
import os

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans
from sklearn.preprocessing import OneHotEncoder
# Shared one-hot encoder instance; the main loop calls encoder.fit_transform on each window's reads
encoder = OneHotEncoder()

# Setup

In [37]:
# Run configuration

# Identifiers chosen by the user
run_number = "run1"
chrom = "chr11"
dis = "sca"

# Values derived from the identifiers above
chrom_dis = f"{chrom}_{dis}"
datadir = f"/mnt/aretian/genomics/nanopore/{run_number}"

# Publish the configuration as environment variables
os.environ.update(run_number=run_number, chrom_dis=chrom_dis, datadir=datadir)

# Import data

# Nanopore reads (CSV)
nanopore_reads_path = f'{datadir}/{run_number}_{chrom_dis}_clean.csv'
nanopore_reads = pd.read_csv(nanopore_reads_path)

# Reference genome for the location of interest without gaps (JSON)
with open(f'{datadir}/{run_number}_{chrom}_reference_genome.json', 'r') as f:
    ref_genome_json = json.load(f)

# Only the 'reference_genome' entry of the JSON document is used downstream
ref_genome = ref_genome_json['reference_genome']

# Helper functions

# Define hyper-parameters
window_width = 5000 # 5K BP window
jump_width = 1000 # 1k BP sliding window (ideally jump_width=1)

# Check whether a read overlaps the current window
def check_sliding_window(read):
    """Return True when `read` overlaps the current sliding window.

    The window limits are taken from the module-level names `left_bound`
    and `right_bound`, which the main loop sets before calling this function.

    Input: read --> mapping/Series with 'POS' and 'END_POS'
    Output: bool
    """
    read_start = read['POS']
    read_end = read['END_POS']

    # The read begins somewhere inside the window
    if left_bound <= read_start <= right_bound:
        return True

    # The read finishes somewhere inside the window
    if left_bound <= read_end <= right_bound:
        return True

    # The read begins before the window and finishes after it
    if read_start < left_bound and read_end > right_bound:
        return True

    return False

# Input: read --> Series (uses 'SEQ' and 'POS')
# Output: String aligned to the left edge of the window

def left_pad_read(read):
    """Align the left end of a read with the left edge of the current window.

    Uses the module-level `left_bound` and `window_ref_genome` set by the
    main loop. A read that starts inside the window is prefixed with the
    reference genome; a read that starts before the window loses the bases
    that fall outside of it.
    """
    sequence = read['SEQ']
    offset = read['POS'] - left_bound  # > 0: starts inside the window, < 0: starts before it

    if offset == 0:
        # Already aligned with the window start: hand the sequence back untouched
        return sequence

    if offset > 0:
        # Fill the gap between the window start and the read with reference bases
        return ''.join(window_ref_genome[:offset]) + ''.join(sequence)

    if offset < 0:
        # Drop the leading bases located before the window start
        return ''.join(sequence[-offset:])

    return None

# Input: read --> Series (uses 'left_padded' and 'END_POS')
# Output: String aligned to the right edge of the window

def right_pad_read(read):
    """Align the right end of a left-padded read with the right edge of the window.

    Uses the module-level `right_bound` and `window_ref_genome` set by the
    main loop. A read that ends inside the window is completed with the tail
    of the reference genome; a read that ends after the window loses the
    bases that fall outside of it.
    """
    padded = read['left_padded']
    gap = right_bound - read['END_POS']  # > 0: ends inside the window, < 0: ends after it

    if gap == 0:
        # Already aligned with the window end: hand the sequence back untouched
        return padded

    if gap > 0:
        # Append the last `gap` reference bases of the window
        return ''.join(padded) + ''.join(window_ref_genome[-gap:])

    if gap < 0:
        # Drop the trailing bases located after the window end
        return ''.join(padded[:gap])

    return None

# Should be updated
NUCLEOTIDE_VOCABULARY = [
    'A','C','G','T','X'
]

# Not being used, instead, we use the sklearn one hot encoding
def nucleotide_to_one_hot(nucleotide_sequence):
    """One-hot encode a nucleotide sequence against NUCLEOTIDE_VOCABULARY.

    Input: nucleotide_sequence --> iterable of single characters (e.g. a String)
    Output: float array of shape (len(sequence), len(NUCLEOTIDE_VOCABULARY));
            row i has a 1. in the column of the i-th character's vocabulary index.
            An empty sequence yields an array of shape (0, len(NUCLEOTIDE_VOCABULARY)).
    Raises: ValueError when a character is not part of the vocabulary.
    """
    symbols = list(nucleotide_sequence)  # materialise so generators are supported too
    column_of = {symbol: column for column, symbol in enumerate(NUCLEOTIDE_VOCABULARY)}

    # Pre-allocating keeps the result two-dimensional even for an empty sequence
    encoded = np.zeros((len(symbols), len(NUCLEOTIDE_VOCABULARY)))
    for row, char in enumerate(symbols):
        if char not in column_of:
            raise ValueError('Could not one-hot code character {}'.format(char))
        encoded[row, column_of[char]] = 1.
    return encoded

#nucleotide_to_one_hot('GTCATACX') # uncomment example to see what the encoding does

# Input: read (Series)

# Text file for results
# def write_results_to_file(read, results_file):
#     results_file.write(f"{read['ID']},{read['kmeans_cls2']},{read['window_num']}\n")
#     TODO: Add window number
#     results_file.close()

# Debugging loops

The following cell contains two nested for loops:
- Outer: clustering loop. Reruns the whole algorithm several times to check that the results are identical between runs.
- Inner: window iterations. Slides the window across the reads and, for each window, pads the reads and performs the clustering.

In [237]:
total_loops = 20  # number of times the whole clustering run is repeated
iter_break = 30  # window-counter threshold at which the inner loop breaks
# iter_break = 200
window_reads_list = []  # per clustering loop: DataFrame of ID / cluster / window number
onehot_list = []  # per clustering loop: list of one-hot matrices
pca_list = []  # per clustering loop: DataFrame of PCA coordinates

# The main loop visits range(first_position, last_position, jump_width), so the
# number of iterations is the length of that range (not last_position/jump_width)
first_position = min(nanopore_reads['POS'])
last_position = max(nanopore_reads['END_POS'])
total_jumps = len(range(first_position, last_position, jump_width))

# The concrete path is only built inside the clustering loop (one file name per
# loop), so show the naming pattern here instead of a not-yet-defined variable
results_file_pattern = f"{datadir}/{run_number}_{chrom}_read_clusters_debug<loop>.txt"

# User feedback
print("CLUSTERING ALGORITHM")
print("")
print(f"Total range: {last_position:,}")
print(f"Window width: {window_width}")
print(f"Jump width: {jump_width}")
print(f"Number of iterations required: {total_jumps}")
print("")
print(f"Reading Nanopore reads from: {nanopore_reads_path}")
print(f"Writing results to file:     {results_file_pattern}")
print("")
print("0: Found 0 reads in window. Saving empty window.")
print("1: Found 1 read in window. Applying arbitrary cluster 0.")
print("*: Found more than 1 read in window and ran clustering. Saving clusters.")

print("")
print(f"Total Clustering loops: {total_loops}")

def string_to_list(read):
    """Split the read's 'final_padded_read' sequence into a list of its characters."""
    padded_sequence = read['final_padded_read']
    return [*padded_sequence]

import time

# Outer loop: repeat the complete clustering run and compare every run with
# the previous one to detect non-deterministic results.
for i in range(total_loops):
    start = time.time()  # only needed by the optional timing report at the end

    # File output is currently disabled; results are kept in memory instead
    results_file_path = f"{datadir}/{run_number}_{chrom}_read_clusters_debug{i}.txt"

    # Accumulators for this run
    window_reads_df = pd.DataFrame(columns = ['ID', 'kmeans_cls2', 'window_num'])
    onehot = []
    pca_df = pd.DataFrame(columns = ['PCA1', 'PCA2'])

    # Main Loop: padding and clustering

    # Named window_counter (not `iter`) so the builtin iter() is not shadowed
    window_counter = 1
    empty_count = 0

    print("")
    print(f"Clustering loop #{i}: ", end="")

    # Inner loop: slide the window over the region covered by the reads.
    # left_bound, right_bound and window_ref_genome are module-level names read
    # by check_sliding_window, left_pad_read and right_pad_read.
    for left_bound in range(min(nanopore_reads['POS']),max(nanopore_reads['END_POS']),jump_width):

        progress = window_counter/total_jumps
        if round(progress*1000)%50==0:
            print(f" {round(progress*100):.0f}% ", end="")
        window_counter+=1

        right_bound = left_bound+window_width
        window_ref_genome = ref_genome[left_bound:left_bound+window_width]

        # Identify if each read is in the window: True/False
        nanopore_reads['read_filter'] = nanopore_reads.apply(check_sliding_window, axis=1)

        # Keep the reads in the window, renumbered from 0
        window_reads = nanopore_reads.loc[nanopore_reads['read_filter']==True]
        window_reads = window_reads.reset_index()

        if len(window_reads) == 0:
            # There are no nanopore reads in this window
            empty_count += 1
            print(f"0", end="")

            # Record window with empty read
            data = {'ID':[np.nan],'kmeans_cls2':[np.nan], 'window_num':[window_counter]}
            empty_window_reads = pd.DataFrame(data)
            window_reads_df = pd.concat([window_reads_df,empty_window_reads])

        elif len(window_reads) == 1:
            # There is only one read in this window
            empty_count += 1
            print(f"1", end="")

            # Record window and assign arbitrary individual
            window_reads['window_num']=window_counter
            window_reads['kmeans_cls2']=0
            window_reads_df = pd.concat([window_reads_df,window_reads[['ID','kmeans_cls2','window_num']]])

        else:
            # More than one read in window. Proceeding to padding and clustering

            # Padding
            window_reads['left_padded'] = window_reads.apply(left_pad_read, axis=1) # fill reference genome on the left of the read
            window_reads['right_padded'] = window_reads.apply(right_pad_read, axis=1) # fill reference genome on the right of the read
            window_reads['final_padded_read'] = window_reads['right_padded']
            window_reads['FINAL_SEQ_LEN'] = window_reads['final_padded_read'].apply(len) # should always be 5k

            # One hot encoding: one list of characters per read
            unique_reads = [list(row) for row in window_reads['final_padded_read']]
            X_onehot = encoder.fit_transform(unique_reads).toarray()

            # PCA
            pca = PCA(n_components=2, random_state=42)
            X_pca = pca.fit_transform(X_onehot)

            # Rows of X_pca line up with window_reads because its index was reset above
            window_reads['PCA1'] = X_pca[:, 0]
            window_reads['PCA2'] = X_pca[:, 1]

            # Standardizing the features
            X = window_reads[['PCA1','PCA2']]
            X = StandardScaler().fit_transform(X)

            # Run Kmeans
            model = KMeans(n_clusters=2, random_state=42)
            cls2 = model.fit(X)
            window_reads['kmeans_cls2'] = cls2.labels_
            window_reads['window_num'] = window_counter

            # Keep the results of this window
            window_reads_df = pd.concat([window_reads_df,window_reads[['ID','kmeans_cls2','window_num']]])
            onehot.append(X_onehot)
            pca_df = pd.concat([pca_df,window_reads[['PCA1','PCA2']]])

            print(f"*", end="")

        # Break after N iterations. Checked for every window, including empty and
        # single-read ones, so the limit also applies to sparse regions.
        if window_counter>iter_break:
            break

    window_reads_list.append(window_reads_df)
    onehot_list.append(onehot)
    pca_list.append(pca_df)

    if i == 0:
        print("")
        print("Done with first loop.")
    else:
        # Check window reads. DataFrame.equals treats NaNs in the same position as
        # equal (empty windows are stored as NaN rows) and returns False instead of
        # raising when the two frames have different shapes.
        print("")
        if window_reads_list[i].equals(window_reads_list[i-1]):
            print("Window reads: OK")
        else:
            print("Window reads: Found difference.")

        # One Hot Encoder: same number of windows and identical matrices
        current_onehot = onehot_list[i]
        previous_onehot = onehot_list[i-1]
        onehot_identical = len(current_onehot) == len(previous_onehot) and all(
            np.array_equal(current, previous)
            for current, previous in zip(current_onehot, previous_onehot)
        )
        if onehot_identical:
            print("One-Hot:      OK")
        else:
            print("One-Hot:      Found difference")

        # Check PCA
        if pca_list[i].equals(pca_list[i-1]):
            print("PCA:          OK")
        else:
            print("PCA:          Found difference.")

#     print(f"Total time: {time.time() - start:.2f} seconds.")
#     print("")
#     print("Finished running.")

#     print(f"Total time: {time.time() - start:.2f} seconds.")
#     print("")            
#     print("Finished running.")

CLUSTERING ALGORITHM

Total range: 3,965,548
Window width: 5000
Jump width: 1000
Number of iterations required: 3966

Reading Nanopore reads from: /mnt/aretian/genomics/nanopore/run1/run1_chr11_sca_clean.csv
Writing results to file:     /mnt/aretian/genomics/nanopore/run1/run1_chr11_read_clusters_debug2.txt

0: Found 0 reads in window. Saving empty window.
1: Found 1 read in window. Applying arbitrary cluster 0.
*: Found more than 1 read in window and ran clustering. Saving clusters.

Total Clustering loops: 20

Clustering loop #0:  0% 1111111111111111**************
Done with first loop.

Clustering loop #1:  0% 1111111111111111**************
Window reads: OK
One-Hot:      OK
PCA:          OK

Clustering loop #2:  0% 1111111111111111**************
Window reads: OK
One-Hot:      OK
PCA:          OK

Clustering loop #3:  0% 1111111111111111**************
Window reads: OK
One-Hot:      OK
PCA:          OK

Clustering loop #4:  0% 1111111111111111**************
Window reads: OK
One-Hot:   

# EXTRA CODE
Optimize number of clusters. Not needed in this case.

In [11]:
# from sklearn.metrics import silhouette_score
# from sklearn.preprocessing import StandardScaler
# from sklearn.cluster import KMeans

# # Standardizing the features
# X = test[['PCA1','PCA2']]
# X = StandardScaler().fit_transform(X)

# distortions = []

# for k in range(1, 5):
#     KMeans_model = KMeans(n_clusters=k, random_state=42)
#     KMeans_model.fit(X)
#     distortions.append(KMeans_model.inertia_)

# plt.plot(range(1, 5), distortions,  marker='o')
# plt.xlabel('Number of clusters (K)')
# plt.ylabel('Distortion')

# silhouette_scores = []
# for k in range(2, 5):
#     model = KMeans(n_clusters=k, random_state=42)
#     model.fit(X)
#     score = silhouette_score(X, model.labels_)
#     print("Silhouette Score for k = ", k, "is", score)
#     silhouette_scores.append(score)

# plt.plot(range(2, 5), silhouette_scores, marker='o')
# plt.xlabel('Number of clusters (K)')
# plt.ylabel('Silhouette score')

# from yellowbrick.cluster import KElbowVisualizer

# visualizer3 = KElbowVisualizer(KMeans(), k=(2,5))

# visualizer3.fit(X) # Fit the data to the visualizer

# model = KMeans(n_clusters=2, random_state=42)
# cls2 = model.fit(X)
# test['kmeans_cls2'] = cls2.labels_

# import seaborn as sns
# sns.scatterplot(data=window_reads,x='PCA1',y='PCA2',hue='kmeans_cls2')