# K-mer Evolution Notebook

## Notes

- HDBSCAN (GitHub version) raises errors!
    - need to find a version without problems
    - if none can be found, revert to the MA version
- better inclusion of R, N, ... in the kmer
    - status: finished
- evolution on reading frame
    - difficult, ORF notebook necessary for it
    - status: unfinished

## Blueprint

![Class](Clusterer.svg)

In [57]:
import numpy as np
import pandas as pd
import itertools as it
from Bio import SeqIO
from Bio.Seq import Seq
import math
import re
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
import multiprocessing as mp
import hdbscan
import progressbar

In [118]:
# --- k-mer vectorisation (all passed to Vectors below) ---
k = 7                            # k-mer length
split = '|'                      # separator used to split each FASTA record name
quality = {'4': 2, 'Pass': 8}    # regex pattern -> index of the header field it must match
variable = 0.9                   # upper bound for the ambiguity measure used to filter records
state = 1.0                      # weight added for every observed k-mer
alpha = 0.05                     # weight for A<->G and C<->T single-base neighbours
beta = 0.025                     # weight for every other single-base neighbour

# --- dimensionality reduction and clustering ---
n_components = 50                # presumably the number of PCA components - confirm against the PCA cell
sample = 1                       # min_samples handed to HDBSCAN
min_clust = 2                    # min_cluster_size for HDBSCAN and for the tree cut
num_clust = 60                   # target value of the largest cluster label searched for by Cluster

In [106]:
class Vectors(object):
    """Convert FASTA records into normalised, mutation-smoothed k-mer vectors.

    Every k-mer found in a sequence adds ``state`` to its own counter and a
    smaller weight to each k-mer that differs from it by exactly one base:
    ``alpha`` for A<->G and C<->T substitutions, ``beta`` for all others.
    IUPAC ambiguity codes are expanded into every concrete k-mer they can
    stand for before counting.

    Parameters
    ----------
    k : int
        Length of the k-mers; the vectors have ``4 ** k`` columns.
    split : str or None
        Separator used to split the FASTA record name into header fields
        (``None`` splits on whitespace, as ``str.split`` does).
    quality : dict or None
        Maps a regular expression to the index of the header field it has to
        match (case-insensitively, anchored at the field start).  ``None``
        means ``{'': 0}``, which accepts every record.
    variable : float
        Largest accepted fraction of non-ACGT symbols in a sequence.
    state, alpha, beta : float
        Weights described above.
    """

    def __init__(self, k = 7, split = None, quality = None, variable = 0.9, state = 1.0, alpha = 0.05, beta = 0.025):

        self.k = k
        # A None sentinel avoids sharing one mutable default dict between instances.
        self.quality = {'': 0} if quality is None else quality
        self.split = split
        self.variable = variable
        self.nucleotides = ['A', 'C', 'G', 'T']
        # Translation table that deletes A/C/G/T, leaving only the other symbols.
        self.substit = dict.fromkeys(map(ord, self.nucleotides), None)
        # Counter for every possible k-mer; its insertion order fixes the column order.
        self.exist = dict.fromkeys(map(''.join, it.product(self.nucleotides, repeat = self.k)), 0)
        self.col = len(self.exist)
        self.state = state
        # IUPAC code -> concrete nucleotides it may represent.
        self.nucex = {
            'A': ['A'],
            'C': ['C'],
            'G': ['G'],
            'T': ['T'],
            'R': ['A', 'G'],
            'Y': ['C', 'T'],
            'W': ['A', 'T'],
            'S': ['C', 'G'],
            'M': ['A', 'C'],
            'K': ['G', 'T'],
            'B': ['G', 'C', 'T'],
            'H': ['A', 'C', 'T'],
            'D': ['A', 'G', 'T'],
            'V': ['A', 'C', 'G'],
            'N': ['A', 'C', 'G', 'T'],
        }
        # Nucleotide -> the three nucleotides it can be substituted by.
        self.nucmut = {
            'A': ['C', 'G', 'T'],
            'C': ['A', 'G', 'T'],
            'G': ['A', 'C', 'T'],
            'T': ['A', 'C', 'G'],
        }
        # (original, substitute) -> weight credited to the substituted k-mer.
        self.nucval = {
            ('A', 'C'): beta,
            ('A', 'G'): alpha,
            ('A', 'T'): beta,
            ('C', 'A'): beta,
            ('C', 'G'): beta,
            ('C', 'T'): alpha,
            ('G', 'A'): alpha,
            ('G', 'C'): beta,
            ('G', 'T'): beta,
            ('T', 'A'): beta,
            ('T', 'C'): alpha,
            ('T', 'G'): beta,
        }

    def _ambiguousFraction(self, sequence):
        """Return the share of symbols in *sequence* that are not A, C, G or T."""
        if not sequence:
            return 0
        ambiguous = len(sequence.translate(self.substit))
        # The previous code divided the other way round (length / ambiguous),
        # which is >= 1 whenever an ambiguity code is present and therefore
        # rejected every such sequence for any threshold below 1.
        return ambiguous / len(sequence)

    def _passesQuality(self, head):
        """Return True if every configured pattern matches its header field."""
        try:
            return all(
                re.match(pattern, head[field], re.IGNORECASE)
                for pattern, field in self.quality.items()
            )
        except IndexError:
            # Header has fewer fields than the filter needs: reject the record.
            return False

    def _profile(self, sequence):
        """Return the normalised float32 k-mer vector of one sequence.

        ``self.exist`` serves as the scratch counter and is always reset to
        zero before returning.  K-mers containing a symbol that is not in
        ``self.nucex`` are skipped.  If nothing was counted (for example the
        sequence is shorter than ``k``) an all-zero vector is returned instead
        of dividing by zero.
        """
        counts = self.exist
        expand = self.nucex
        neighbours = self.nucmut
        weight = self.nucval
        try:
            for start in range(len(sequence) - self.k + 1):
                kmer = sequence[start:start + self.k]
                choices = [expand.get(symbol) for symbol in kmer]
                if None in choices:
                    continue

                for sub in map(''.join, it.product(*choices)):
                    counts[sub] += self.state
                    for pos, nuc in enumerate(sub):
                        for mut in neighbours[nuc]:
                            mutation = sub[:pos] + mut + sub[pos + 1:]
                            counts[mutation] += weight[(nuc, mut)]

            total = sum(counts.values())
            vector = np.fromiter(counts.values(), dtype = 'float32', count = self.col)
            if total:
                vector = vector / total
        finally:
            # Leave the shared counter clean for the next sequence.
            counts.update((key, 0) for key in counts)

        return vector

    def countRows(self, infile):
        """Read *infile* (FASTA) and keep the records that pass both filters.

        A record is kept when its header fields satisfy ``self.quality`` and
        its fraction of non-ACGT symbols is at most ``self.variable``.  The
        first header field is used as the accession.

        Returns
        -------
        tuple
            ``(row, index, sequences)``: the number of kept records, their
            accessions in file order, and a dict accession -> sequence string.
        """
        sequences = {}
        index = []
        row = 0
        for entry in SeqIO.parse(infile, 'fasta'):

            head = entry.name.split(self.split)
            sequence = str(entry.seq)
            accession = head[0]

            if self._passesQuality(head) and self._ambiguousFraction(sequence) <= self.variable:
                row += 1
                sequences[accession] = sequence
                index.append(accession)

        return (row, index, sequences)

    def calculateFrequence(self, infile):
        """Build the k-mer frequency matrix for all accepted records of *infile*.

        Returns
        -------
        tuple
            ``(index, matrix)``: the accessions and a float32 array with one
            row per accession and ``self.col`` columns, in the key order of
            ``self.exist``.
        """
        row, index, sequences = self.countRows(infile)
        matrix = np.empty((row, self.col), dtype = 'float32')

        widgets = [' [', progressbar.Timer(format = 'elapsed time: %(elapsed)s'), '] ', progressbar.Bar('#'), ' (', progressbar.ETA(), ') ', ]
        bar = progressbar.ProgressBar(max_value = len(index), widgets = widgets).start()

        for pos, accession in enumerate(index):
            matrix[pos] = self._profile(sequences[accession])
            bar.update(pos)

        bar.finish()

        return (index, matrix)

In [107]:
def Cluster(linkage, min_clust, num_clust, max_iter = 10000):
    """Search for a cut distance at which the largest cluster label equals *num_clust*.

    The cut distance starts at 0 and is raised in steps of 1.0.  Whenever the
    largest label drops below the target (and is not -1), the last step is
    undone and the step size is divided by ten, so the search refines around
    the crossing point.

    Parameters
    ----------
    linkage : object
        Must provide ``get_clusters(cut_distance=..., min_cluster_size=...)``
        returning an array of labels (e.g. HDBSCAN's single linkage tree).
    min_clust : int
        Forwarded as ``min_cluster_size``.
    num_clust : int
        Required value of ``labels.max()``.
        NOTE(review): if labels start at 0 this yields ``num_clust + 1``
        clusters - confirm that this is intended.
    max_iter : int or None
        Maximum number of search steps; ``None`` disables the limit.

    Returns
    -------
    The label array of the first cut whose largest label equals *num_clust*.

    Raises
    ------
    RuntimeError
        If no such cut is found within *max_iter* steps.  Without this guard
        the search would loop forever when the target cannot be reached.
    """
    x = 0.0
    y = 1.0
    cluster = linkage.get_clusters(cut_distance = x, min_cluster_size = min_clust)
    n = cluster.max().item()
    steps = 0

    while n != num_clust:

        if max_iter is not None and steps >= max_iter:
            raise RuntimeError(
                'Cluster: no cut distance with largest label {} found after {} steps '
                '(last cut distance {}, last largest label {})'.format(num_clust, steps, x, n)
            )
        steps += 1

        if n < num_clust and n != -1:
            # Overshot the target: undo the last step and continue with a finer one.
            x = x - y
            y = y * 0.1

        else:
            # Largest label still above the target (or -1): cut higher.
            x = x + y

        cluster = linkage.get_clusters(cut_distance = x, min_cluster_size = min_clust)
        n = cluster.max().item()

    return cluster

In [109]:
# Build the k-mer vectoriser from the notebook parameters.
# FASTER: it can still be faster
vectors = Vectors(
    k = k,
    split = split,
    quality = quality,
    variable = variable,
    state = state,
    alpha = alpha,
    beta = beta,
)
# index: accepted accessions; matrixl1: one k-mer frequency row per accession.
index, matrixl1 = vectors.calculateFrequence('C.fasta')

 [elapsed time: 0:00:00] |##################################| (Time:  0:00:00) 


In [62]:
# Reduce the k-mer frequency matrix with PCA.  The component count comes from
# the n_components parameter cell instead of a second, hard-coded 50.
pca = PCA(n_components = n_components)
matrixpca = pca.fit_transform(matrixl1)
# Summed explained-variance ratio of the retained components.
variance = pca.explained_variance_ratio_.sum()

In [63]:
# Rescale every row (sample) of the PCA output to unit L2 norm.
matrixl2 = normalize(matrixpca, norm = 'l2', axis = 1)

In [64]:
# Configure HDBSCAN from the notebook parameters, then fit it in place.
hdbinit = hdbscan.HDBSCAN(
    metric = 'euclidean',
    min_cluster_size = min_clust,
    min_samples = sample,
    gen_min_span_tree = True,
)
hdbinit.fit(matrixl2)

In [65]:
# Single linkage tree of the fitted model; Cluster() cuts it at varying distances.
linkage = hdbinit.single_linkage_tree_

In [66]:
# TODO: add error handling for the case where the requested number cannot be
# found (small sample).
cluster = Cluster(linkage, min_clust = min_clust, num_clust = num_clust)

In [67]:
# One row per accession with its cluster label, indexed by accession.
framecl = pd.DataFrame(list(zip(index, cluster)), columns = ['accession', 'cluster'])
framecl = framecl.set_index('accession')
# Linkage tree as a table, indexed by its 'parent' column.
framelk = linkage.to_pandas().set_index('parent')

In [68]:
# Write both tables as comma-separated files, overwriting existing ones.
framecl.to_csv(
    'cluster.csv',
    sep = ',',
    mode = 'w',
    header = True,
    index = True,
)
framelk.to_csv(
    'linkage.csv',
    sep = ',',
    mode = 'w',
    header = True,
    index = True,
)

## Garbage Place