In [1]:
#HDB testing

In [1]:
import numpy as np
import pandas as pd
import sys
import re
import csv
import collections as co
import itertools as it
import umap
import hdbscan
import time 
from sklearn import metrics
import scipy.spatial.distance as ssd
#from memory_profiler import memory_usage
from plotnine.data import *
from plotnine import *
%matplotlib inline

In [2]:
class vectorizer(object):
    
    def __init__(self, k = 7, convert = 0):
    
        self.k = k
        self.convert = convert
        self.exist = co.defaultdict(int) 
        self.keys = list(self.exist.keys())
        self.col = len(self.keys)
        self.row = 0
        self.matrix = np.empty((self.row, self.col, ),dtype = "float32")
        self.amino = co.defaultdict(str, {
            'AAA':'K', 'AAC':'N', 'AAG':'K', 'AAT':'N',
            'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
            'AGA':'R', 'AGC':'S', 'AGG':'R', 'AGT':'S',
            'ATA':'I', 'ATC':'I', 'ATG':'M', 'ATT':'I',
            'CAA':'Q', 'CAC':'H', 'CAG':'Q', 'CAT':'H',
            'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
            'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
            'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
            'GAA':'E', 'GAC':'D', 'GAG':'E', 'GAT':'D',
            'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
            'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
            'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',    
            'TAA':'Y', 'TAC':'*', 'TAG':'*', 'TAT':'Y',
            'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
            'TGA':'*', 'TGC':'C', 'TGG':'W', 'TGT':'C',
            'TTA':'L', 'TTC':'F', 'TTG':'L', 'TTT':'F'
        })
                
    def translate(self, read):
    
        chain = ''

        for i in range(len(read) - 2):
            trip = read[i:i+3]
            chain += self.amino[trip]

        return(chain)
    
    
    def adjust_to_data(self, infile):
    
        self.row = infile.shape[0]
            
        for line, read in infile.itertuples(index=True, name=None):

            if self.convert == 1:
                seq = self.translate(read)
                del read

                num = len(seq) - self.k + 1

                for i in range(num):
                    kmer = seq[i:i+self.k]
                    self.exist[kmer] = 0

            else:
                seq = read
                del read

                num = len(seq) - self.k + 1

                if re.match('^[ACGT]*$', seq): 
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        self.exist[kmer] = 0
                else:
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        if re.match('^[ACGT]*$', kmer): 
                            self.exist[kmer] = 0
            
        self.keys = list(self.exist.keys())
        self.col = len(self.keys)
        self.matrix = np.empty((self.row, self.col, ), dtype="float32")
        
        del seq
    
    def calculate_frequence(self, infile):
        
        for line, read in infile.itertuples(index=True, name=None): 
                 
            if self.convert == 1:
                seq = self.translate(read)
                del read

                counts = self.exist.copy()
                num = len(seq) - self.k + 1

                for i in range(num):
                    kmer = seq[i:i+self.k]
                    counts[kmer] += 1

            else:
                seq = read
                del read

                counts = self.exist.copy()
                num = len(seq) - self.k + 1

                if re.match('^[ACGT]*$', seq): 
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        counts[kmer] += 1
                else:
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        if re.match('^[ACGT]*$', kmer): 
                            counts[kmer] += 1

            vector = np.array(list(counts.values()), dtype = "float32")/num

            self.matrix[line] = vector
            
            counts.clear()
            del vector
            del seq
            del counts
    
    
    def get_keys(self):
        
        return(self.keys)
    
    
    def get_matrix(self):
        
        return(self.matrix)

In [3]:
#Data Upload

upload = pd.read_csv('../../A.csv', sep = ',', na_filter = False, header = None)
upload.columns = ['accession', 'strain', 'segment', 'protein', 'genus', 'subtype', 'date', 'host', 'curation', 'genome']
upload.query('curation == "Pass"', inplace = True)

subset = upload.query('segment == 4').reset_index()

sequence = subset[['genome']].copy()
accession = subset[['accession']].copy()
subtype = subset[['subtype']].copy()

In [4]:
#Frequency Calculation

freq = vectorizer(k = 7, convert = 0)
freq.adjust_to_data(sequence)
freq.calculate_frequence(sequence)

matrix = freq_nt.get_matrix()
keys = freq_nt.get_keys()

In [116]:
#Dimension reduction with UMAP

reduced = umap.UMAP(
    n_neighbors = 30,
    min_dist = 0.0,
    n_components = 20,
    random_state = None,
    metric = 'cosine',
).fit_transform(matrix)
   
dataframe = pd.concat([accession, pd.DataFrame(reduced)], axis=1, copy = False, ignore_index = False).set_index('accession')

In [115]:
#Clustering with HDBSCAN

dbcv = 0.0
epsilon = 0.0
while epsilon <= 3.0:
    
    clusterer = hdbscan.HDBSCAN(
        min_samples = 1, #larger the value the more conservative the clustering (more points will be declared as noise)
        min_cluster_size = 2, #minimum size that can become a cluster
        cluster_selection_epsilon = epsilon, #don't seperate clusters with a distance less than value
        alpha = 1.0, #don't mess with this
        gen_min_span_tree = True
    ).fit(dataframe)
    
    dbcv = clusterer.relative_validity_
    label = clusterer.labels_
    
    diagnostic = co.Counter(label)
    
    print(f"{epsilon:0.2f} {dbcv:0.4f} {diagnostic[-1]} {len(set(diagnostic))-1}")
    epsilon += 0.05

0.00 0.3566 12085 13526
0.05 0.4052 1237 1987
0.10 0.5272 232 1142
0.15 0.6137 57 842
0.20 0.6457 24 725
0.25 0.6715 12 633
0.30 0.6909 7 578
0.35 0.7352 4 543
0.40 0.7397 3 521
0.45 0.7509 2 505
0.50 0.7563 2 486
0.55 0.7654 2 475
0.60 0.7813 2 461
0.65 0.8163 2 450
0.70 0.8192 2 445
0.75 0.8320 0 434
0.80 0.8381 0 430
0.85 0.8394 0 426
0.90 0.8390 0 422
0.95 0.8457 0 418
1.00 0.8530 0 415
1.05 0.8533 0 412
1.10 0.8488 0 411
1.15 0.8465 0 408
1.20 0.8543 0 404
1.25 0.8576 0 403
1.30 0.8624 0 400
1.35 0.8624 0 400
1.40 0.8624 0 400
1.45 0.8624 0 400
1.50 0.8624 0 396
1.55 0.8624 0 396
1.60 0.8628 0 394
1.65 0.8628 0 394
1.70 0.8628 0 394
1.75 0.8617 0 393
1.80 0.8629 0 392
1.85 0.8655 0 389
1.90 0.8695 0 388
1.95 0.8580 0 387
2.00 0.8532 0 386
2.05 0.8478 0 385
2.10 0.8423 0 383
2.15 0.8417 0 381
2.20 0.8439 0 380
2.25 0.8439 0 380
2.30 0.8439 0 380
2.35 0.8439 0 380
2.40 0.8457 0 377
2.45 0.8443 0 374
2.50 0.8417 0 373
2.55 0.8388 0 372
2.60 0.8298 0 369
2.65 0.8265 0 368
2.70 0.8265 

In [92]:
#Evaluation

print(f"DBCV: {matrix_clust.relative_validity_:0.6f}")

#print(f"Silhouette: {metrics.silhouette_score(matrix,clusterlabel):0.6f}")

diagnostic = co.Counter(clusterlabel)
print(f"Unclustered: {diagnostic[-1]}")

print(f"Cluster: {len(set(diagnostic))-1}")
    

DBCV: 0.033802
Unclustered: 0
Cluster: 1


In [93]:
len(matrix_clust.exemplars_)

13176

In [77]:
def exemplars(cluster_id, condensed_tree):
    raw_tree = condensed_tree._raw_tree
    # Just the cluster elements of the tree, excluding singleton points
    cluster_tree = raw_tree[raw_tree['child_size'] > 1]
    # Get the leaf cluster nodes under the cluster we are considering
    leaves = hdbscan.plots._recurse_leaf_dfs(cluster_tree, cluster_id)
    # Now collect up the last remaining points of each leaf cluster (the heart of the leaf)
    result = np.array([])
    for leaf in leaves:
        max_lambda = raw_tree['lambda_val'][raw_tree['parent'] == leaf].max()
        points = raw_tree['child'][(raw_tree['parent'] == leaf) &
                                   (raw_tree['lambda_val'] == max_lambda)]
        result = np.hstack((result, points))
    return result.astype(np.int)


In [83]:
x[0]

array([[ 4.85216093,  2.5750246 ,  4.97009897, 12.76915169,  6.566854  ,
         2.62767673,  4.3326149 ,  4.69362736,  2.86707449,  1.97168362,
         0.54293221,  4.63730049,  3.50527024,  2.7053144 ,  3.96359444,
         3.56041551,  2.82128739,  4.01322556,  4.30470181,  5.1190505 ],
       [ 4.85342741,  2.57323861,  4.97303915, 12.7682209 ,  6.56515884,
         2.62856627,  4.33245754,  4.69344044,  2.86640334,  1.97073472,
         0.5408926 ,  4.63664675,  3.50679851,  2.70577931,  3.96458411,
         3.55961752,  2.82133985,  4.012187  ,  4.30458164,  5.11897182]])

In [None]:
#Sanity check NA/HA

clusters = pd.concat([pd.DataFrame(clusterlabel, columns = ['cluster']), subtype, accession], axis=1, copy = False).set_index('accession')

num = clusters['cluster'].max()+1
values = ['True']*num
accessions = []
exclude = []
subs = co.defaultdict(list)

for i in range(num):

    query = clusters.query('cluster == @i')
    match = query.index.values.tolist()
    sub = matrix.filter(items = match, axis=0)
    dist = ssd.cdist(sub, sub, metric = 'cosine')
    inner_mean = pd.DataFrame(dist, columns = match, index = match, dtype = 'float32').mean()
    accessions.append(inner_mean.idxmin())
    
    for sub in query['subtype'].tolist():
        if re.match('^[H][0-9]+N[0-9]+$', sub): 
            subs['H'].append(re.search('[H][0-9]+', sub).group(0))
            subs['N'].append(re.search('[N][0-9]+', sub).group(0))
        else:
            subs['X'].append('X0')
            subs['X'].append('X0')

    if len(set(subs['H'])) == 1 and len(set(subs['N'])) == 1:
        exclude.append(2)
    elif len(set(subs['H'])) == 1:
        exclude.append(1)
    elif len(set(subs['N'])) == 1:
        exclude.append(0)

    subs.clear()

centroids = pd.DataFrame(values, columns=['centroid'], index = accessions)
clusters.update(centroids)

print(f"Matching NA types: {exclude.count(0) + exclude.count(2)}")
print(f"Matching HA types: {exclude.count(1) + exclude.count(2)}")

In [None]:
#Plot cluster size density

density = pd.DataFrame(result.value_counts(subset=['segment', 'cluster']), columns = ['size'])
density.reset_index(level = ["segment", "cluster"], inplace=True)

(ggplot(density, aes(x="size", colour = "factor(segment)", fill = "factor(segment)")) 
 + labs(
    x="Cluster Size",
    y="Density",
    fill="Segment",
    colour="Segment",
    title="Cluster Size Distribution",
 )
 + geom_density(alpha = 0.1)
 + scale_x_log10()
 + scale_y_continuous()
)