In [1]:
import numpy as np
import pandas as pd
import sys
import re
import csv
import collections as co
import itertools as it
import umap
import hdbscan
import time 
import scipy.spatial.distance as ssd
from memory_profiler import memory_usage

In [2]:
class vectorizer(object):
    
    def __init__(self, k = 7, convert = 0):
    
        self.k = k
        self.convert = convert
        self.exist = co.defaultdict(int) 
        self.keys = list(self.exist.keys())
        self.col = len(self.keys)
        self.row = 0
        self.matrix = np.empty((self.row, self.col, ),dtype = "float32")
        self.amino = co.defaultdict(str, {
            'AAA':'K', 'AAC':'N', 'AAG':'K', 'AAT':'N',
            'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
            'AGA':'R', 'AGC':'S', 'AGG':'R', 'AGT':'S',
            'ATA':'I', 'ATC':'I', 'ATG':'M', 'ATT':'I',
            'CAA':'Q', 'CAC':'H', 'CAG':'Q', 'CAT':'H',
            'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
            'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
            'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
            'GAA':'E', 'GAC':'D', 'GAG':'E', 'GAT':'D',
            'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
            'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
            'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',    
            'TAA':'Y', 'TAC':'*', 'TAG':'*', 'TAT':'Y',
            'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
            'TGA':'*', 'TGC':'C', 'TGG':'W', 'TGT':'C',
            'TTA':'L', 'TTC':'F', 'TTG':'L', 'TTT':'F'
        })
                
    def translate(self, read):
    
        chain = ''

        for i in range(len(read) - 2):
            trip = read[i:i+3]
            chain += self.amino[trip]

        return(chain)
    
    
    def adjust_to_data(self, infile):
    
        self.row = infile.shape[0]
            
        for line, read in infile.itertuples(index=True, name=None):

            if self.convert == 1:
                seq = self.translate(read)
                del read

                num = len(seq) - self.k + 1

                for i in range(num):
                    kmer = seq[i:i+self.k]
                    self.exist[kmer] = 0

            else:
                seq = read
                del read

                num = len(seq) - self.k + 1

                if re.match('^[ACGT]*$', seq): 
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        self.exist[kmer] = 0
                else:
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        if re.match('^[ACGT]*$', kmer): 
                            self.exist[kmer] = 0
            
        self.keys = list(self.exist.keys())
        self.col = len(self.keys)
        self.matrix = np.empty((self.row, self.col, ), dtype="float32")
        
        del seq
    
    def calculate_frequence(self, infile):
        
        for line, read in infile.itertuples(index=True, name=None): 
                 
            if self.convert == 1:
                seq = self.translate(read)
                del read

                counts = self.exist.copy()
                num = len(seq) - self.k + 1

                for i in range(num):
                    kmer = seq[i:i+self.k]
                    counts[kmer] += 1

            else:
                seq = read
                del read

                counts = self.exist.copy()
                num = len(seq) - self.k + 1

                if re.match('^[ACGT]*$', seq): 
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        counts[kmer] += 1
                else:
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        if re.match('^[ACGT]*$', kmer): 
                            counts[kmer] += 1

            vector = np.array(list(counts.values()), dtype = "float32")/num

            self.matrix[line] = vector
            
            counts.clear()
            del vector
            del seq
            del counts
    
    
    def get_keys(self):
        
        return(self.keys)
    
    
    def get_matrix(self):
        
        return(self.matrix)

In [3]:
def main():

    print("Read input and settings file.", end = ' ')

    infile = 'Input/A.csv'   
    setfile = 'Input/settings.csv'
    outpath = 'Output/'

    settings = pd.read_csv(setfile, sep = ',', na_filter = False, index_col = 0)
    upload = pd.read_csv(infile, sep = ',', na_filter = False, header = None)
    
    print("Finished.")
    
    segment = 4
    print(f"Starting calculations for segment {segment}:")
    
    start_clust = time.perf_counter()
    
    parameter = settings.loc[segment].to_list()
    upload.columns = ['accession', 'strain', 'segment', 'protein', 'genus', 'subtype', 'date', 'host', 'curation', 'genome']
    subset = upload.query('segment == @segment').reset_index()

    sequence = subset[['genome']].copy()
    accession = subset[['accession']].copy()
    subtype = subset[['subtype']].copy()
    
    print("Nucleotide k-mer frequency calculation.", end = ' ')

    freq_nt = vectorizer(k = parameter[0].item(), convert = 0)
    freq_nt.adjust_to_data(sequence)
    freq_nt.calculate_frequence(sequence)

    matrix_nt = freq_nt.get_matrix()
    keys_nt = freq_nt.get_keys()

    del freq_nt

    print("Finished.")

    print("Running UMAP for dimension reduction.", end = ' ')

    matrix_nt_red = umap.UMAP(
        n_neighbors = parameter[1].item(),
        min_dist = parameter[2].item(),
        n_components = parameter[3].item(),
        random_state = parameter[4].item(),
        metric = parameter[5],
    ).fit_transform(matrix_nt)

    del matrix_nt

    print("Finished.")

    print("Aminoacid k-mer frequency calculation.", end = ' ')

    freq_aa = vectorizer(k = parameter[6].item(), convert = 1)
    freq_aa.adjust_to_data(sequence)
    freq_aa.calculate_frequence(sequence)

    matrix_aa = freq_aa.get_matrix()
    keys_aa = freq_aa.get_keys()

    del freq_aa

    print("Finished.")

    print("Running UMAP for dimension reduction.", end = ' ')

    matrix_aa_red = umap.UMAP(
        n_neighbors = parameter[7].item(),
        min_dist = parameter[8].item(),
        n_components = parameter[9].item(),
        random_state = parameter[10].item(),
        metric = parameter[11],
    ).fit_transform(matrix_aa)

    del matrix_aa

    print("Finished.")

    matrix = pd.concat([accession, pd.DataFrame(matrix_aa_red), pd.DataFrame(matrix_nt_red)], axis=1, copy = False, ignore_index = False).set_index('accession')

    print("Running HDBscan for clustering.", end = ' ')

    matrix_clust = hdbscan.HDBSCAN(
        min_samples = parameter[12].item(), #larger the value the more conservative the clustering (more points will be declared as noise)
        min_cluster_size = parameter[13].item(), #minimum size that can become a cluster
        cluster_selection_epsilon = parameter[14].item(), #don't seperate clusters with a distance less than value
        alpha = parameter[15].item(), #don't mess with this
    ).fit(matrix)

    print("Finished.")

    print("Centroid extraction.", end = ' ')

    clusterlabel = matrix_clust.labels_

    blank = pd.DataFrame(zip(clusterlabel, ['false'] * len(clusterlabel)), columns = ['cluster', 'centroid'])
    clusters = pd.concat([blank, subtype, accession], axis=1, copy = False).set_index('accession')

    num = clusters['cluster'].max()+1
    values = ['true']*num
    accessions = []
    exclude = []
    include = []
    overall_mean=0
    subs = co.defaultdict(list)

    for i in range(num):

        query = clusters.query('cluster == @i')
        match = query.index.values.tolist()
        sub = matrix.filter(items = match, axis=0)
        dist = ssd.cdist(sub, sub, metric = parameter[16])
        inner_mean = pd.DataFrame(dist, columns = match, index = match, dtype = 'float32').mean()
        accessions.append(inner_mean.idxmin())
        overall_mean = overall_mean + inner_mean.mean()

        for sub in query['subtype'].tolist():
            if re.match('^[H][0-9]+N[0-9]+$', sub): 
                subs['H'].append(re.search('[H][0-9]+', sub).group(0))
                subs['N'].append(re.search('[N][0-9]+', sub).group(0))
            else:
                subs['X'].append('X0')
                subs['X'].append('X0')

        if len(set(subs['H'])) == 1 and len(set(subs['N'])) == 1:
            exclude.append(2)
            if 'X' not in subs.keys():
                include.append(2)
        elif len(set(subs['H'])) == 1:
            exclude.append(1)
            if 'X' not in subs.keys():
                include.append(1)
        elif len(set(subs['N'])) == 1:
            exclude.append(0)
            if 'X' not in subs.keys():
                include.append(0)

        subs.clear()

    centroids = pd.DataFrame(values, columns=['centroid'], index = accessions)

    clusters.update(centroids)
    clusters.sort_values(by=['cluster', 'subtype']).to_csv(outpath + 'cluster.csv', index=True, header=True, sep=',')

    print("Finished.")

    stop_clust = time.perf_counter()
    print(f"Clustering done in {stop_clust - start_clust:0.2f} seconds.")
    diagnostic = co.Counter(clusterlabel)
    print(f"{str(len(clusterlabel))} sequences, {str(diagnostic[-1])} unclustered, {str(len(set(diagnostic)))} cluster.")
    print(f"Mean of inner cluster distance mean {overall_mean/num:0.10f}")
    print(f"{exclude.count(0) + exclude.count(2)}({include.count(0) + include.count(2)}) clusters containing matching NA types.")
    print(f"{exclude.count(1) + exclude.count(2)}({include.count(1) + include.count(2)}) clusters containing matching HA types.")

In [4]:
if __name__ == "__main__":

    #main()
    memory = memory_usage(main)
    print(f"Maximum memory used: {max(memory)/1000:0.2f} Gb.")

Read input and settings file. Finished.
Starting calculations for segment 4:
Nucleotide k-mer frequency calculation. Finished.
Running UMAP for dimension reduction. Finished.
Aminoacid k-mer frequency calculation. Finished.
Running UMAP for dimension reduction. Finished.
Running HDBscan for clustering. Finished.
Centroid extraction. Finished.
Clustering done in 1198.7109 seconds.
56617 sequences, 36 unclustered, 1052 cluster.
Mean of inner cluster distance mean 0.0000974158
779(692) clusters containing matching NA types.
1048(787) clusters containing matching HA types.
Maximum memory used: 16254.1406Mb.
