In [1]:
!awk '/^[>;]/ { if (seq) { print seq }; seq=""; print } /^[^>;]/ { seq = seq $0 } END { print seq }' ../../A.fasta > ../../A.tmp 
!sed -i 's/ /_/g;N;s/\n/,/g;s/|/,/g' ../../A.tmp
!awk -F , 'NF == 10' <../../A.tmp > ../../A.csv

In [3]:
import numpy as np
import pandas as pd
import sys
import re
import csv
import collections as co
import itertools as it
import umap
import hdbscan
import time 
import scipy.spatial.distance as ssd
from Bio.Align.Applications import MafftCommandline
#from memory_profiler import memory_usage

In [3]:
class vectorizer(object):
    
    def __init__(self, k = 7, convert = 0):
    
        self.k = k
        self.convert = convert
        self.exist = co.defaultdict(int) 
        self.keys = list(self.exist.keys())
        self.col = len(self.keys)
        self.row = 0
        self.matrix = np.empty((self.row, self.col, ),dtype = "float32")
        self.amino = co.defaultdict(str, {
            'AAA':'K', 'AAC':'N', 'AAG':'K', 'AAT':'N',
            'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
            'AGA':'R', 'AGC':'S', 'AGG':'R', 'AGT':'S',
            'ATA':'I', 'ATC':'I', 'ATG':'M', 'ATT':'I',
            'CAA':'Q', 'CAC':'H', 'CAG':'Q', 'CAT':'H',
            'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
            'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
            'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
            'GAA':'E', 'GAC':'D', 'GAG':'E', 'GAT':'D',
            'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
            'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
            'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',    
            'TAA':'Y', 'TAC':'*', 'TAG':'*', 'TAT':'Y',
            'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
            'TGA':'*', 'TGC':'C', 'TGG':'W', 'TGT':'C',
            'TTA':'L', 'TTC':'F', 'TTG':'L', 'TTT':'F'
        })
                
    def translate(self, read):
    
        chain = ''

        for i in range(len(read) - 2):
            trip = read[i:i+3]
            chain += self.amino[trip]

        return(chain)
    
    
    def adjust_to_data(self, infile):
    
        self.row = infile.shape[0]
            
        for line, read in infile.itertuples(index=True, name=None):

            if self.convert == 1:
                seq = self.translate(read)
                del read

                num = len(seq) - self.k + 1

                for i in range(num):
                    kmer = seq[i:i+self.k]
                    self.exist[kmer] = 0

            else:
                seq = read
                del read

                num = len(seq) - self.k + 1

                if re.match('^[ACGT]*$', seq): 
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        self.exist[kmer] = 0
                else:
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        if re.match('^[ACGT]*$', kmer): 
                            self.exist[kmer] = 0
            
        self.keys = list(self.exist.keys())
        self.col = len(self.keys)
        self.matrix = np.empty((self.row, self.col, ), dtype="float32")
        
        del seq
    
    def calculate_frequence(self, infile):
        
        for line, read in infile.itertuples(index=True, name=None): 
                 
            if self.convert == 1:
                seq = self.translate(read)
                del read

                counts = self.exist.copy()
                num = len(seq) - self.k + 1

                for i in range(num):
                    kmer = seq[i:i+self.k]
                    counts[kmer] += 1

            else:
                seq = read
                del read

                counts = self.exist.copy()
                num = len(seq) - self.k + 1

                if re.match('^[ACGT]*$', seq): 
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        counts[kmer] += 1
                else:
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        if re.match('^[ACGT]*$', kmer): 
                            counts[kmer] += 1

            vector = np.array(list(counts.values()), dtype = "float32")/num

            self.matrix[line] = vector
            
            counts.clear()
            del vector
            del seq
            del counts
    
    
    def get_keys(self):
        
        return(self.keys)
    
    
    def get_matrix(self):
        
        return(self.matrix)

In [4]:
def main():

    print('Read input and settings file.', end = ' ')

    infile = '../../A.csv'   
    setfile = 'Input/settings.csv'
    outpath = 'Output/'
    threads = 8

    settings = pd.read_csv(setfile, sep = ',', na_filter = False, index_col = 0)
    upload = pd.read_csv(infile, sep = ',', na_filter = False, header = None)
    upload.columns = ['accession', 'strain', 'segment', 'protein', 'genus', 'subtype', 'date', 'host', 'curation', 'genome']
    upload.query('curation == "Pass"', inplace = True)
    segments = settings.index.values.tolist()
    clusters = co.defaultdict(list)
    mafft_dict = co.defaultdict(list)

    exec_time = 0
    increment = 0

    print('Finished.')

    #segments = [6]
    for segment in segments:

        print(f'Starting calculations for segment {segment}:')

        start_clust = time.perf_counter()
        parameter = settings.loc[segment].to_list()
        setting = [para if type(para) == str else para.item() for para in parameter]

        subset = upload.query('segment == @segment').reset_index()

        sequence = subset[['genome']].copy()
        accession = subset[['accession']].copy()
        subtype = subset[['subtype']].copy()

        subset.set_index('accession', inplace = True)

        print('- Nucleotide k-mer frequency calculation.', end = ' ')

        freq_nt = vectorizer(k = setting[0], convert = 0)
        freq_nt.adjust_to_data(sequence)
        freq_nt.calculate_frequence(sequence)

        matrix_nt = freq_nt.get_matrix()
        keys_nt = freq_nt.get_keys()

        del freq_nt

        print('Finished.')

        print('- Running UMAP for dimension reduction.', end = ' ')

        matrix_nt_red = umap.UMAP(
            n_neighbors = setting[1],
            min_dist = setting[2],
            n_components = setting[3],
            random_state = setting[4],
            metric = setting[5],
        ).fit_transform(matrix_nt)

        del matrix_nt

        print('Finished.')

        print('- Aminoacid k-mer frequency calculation.', end = ' ')

        freq_aa = vectorizer(k = setting[6], convert = 1)
        freq_aa.adjust_to_data(sequence)
        freq_aa.calculate_frequence(sequence)

        matrix_aa = freq_aa.get_matrix()
        keys_aa = freq_aa.get_keys()

        del freq_aa

        print('Finished.')

        print('- Running UMAP for dimension reduction.', end = ' ')

        matrix_aa_red = umap.UMAP(
            n_neighbors = setting[7],
            min_dist = setting[8],
            n_components = setting[9],
            random_state = setting[10],
            metric = setting[11],
        ).fit_transform(matrix_aa)

        del matrix_aa

        print('Finished.')

        matrix = pd.concat([accession, pd.DataFrame(matrix_aa_red), pd.DataFrame(matrix_nt_red)], axis=1, copy = False, ignore_index = False).set_index('accession')

        print('- Running HDBscan for clustering.', end = ' ')

        matrix_clust = hdbscan.HDBSCAN(
            min_samples = setting[12], #larger the value the more conservative the clustering (more points will be declared as noise)
            min_cluster_size = setting[13], #minimum size that can become a cluster
            cluster_selection_epsilon = setting[14], #don't seperate clusters with a distance less than value
            alpha = setting[15], #don't mess with this
        ).fit(matrix)

        print('Finished.')

        print('- Centroid extraction and alignment.', end = ' ')

        clusterlabel = matrix_clust.labels_

        blank = pd.DataFrame(zip(clusterlabel, ['False'] * len(clusterlabel)), columns = ['cluster', 'centroid'])
        clusters[segment] = pd.concat([blank, subtype, accession], axis=1, copy = False).set_index('accession')

        num = clusters[segment]['cluster'].max()+1
        values = ['True']*num
        accessions = []
        exclude = []
        include = []
        overall_mean=0
        subs = co.defaultdict(list)

        align_dict = co.defaultdict(list)
        msa_dict = co.defaultdict(str)

        for i in range(num):

            query = clusters[segment].query('cluster == @i')
            match = query.index.values.tolist()
            sub = matrix.filter(items = match, axis=0)
            dist = ssd.cdist(sub, sub, metric = setting[16])
            inner_mean = pd.DataFrame(dist, columns = match, index = match, dtype = 'float32').mean()
            accessions.append(inner_mean.idxmin())
            overall_mean = overall_mean + inner_mean.mean()

            for sub in query['subtype'].tolist():
                if re.match('^[H][0-9]+N[0-9]+$', sub): 
                    subs['H'].append(re.search('[H][0-9]+', sub).group(0))
                    subs['N'].append(re.search('[N][0-9]+', sub).group(0))
                else:
                    subs['X'].append('X0')
                    subs['X'].append('X0')

            if len(set(subs['H'])) == 1 and len(set(subs['N'])) == 1:
                exclude.append(2)
                if 'X' not in subs.keys():
                    include.append(2)
            elif len(set(subs['H'])) == 1:
                exclude.append(1)
                if 'X' not in subs.keys():
                    include.append(1)
            elif len(set(subs['N'])) == 1:
                exclude.append(0)
                if 'X' not in subs.keys():
                    include.append(0)

            subs.clear()

            mafft_sub = subset.filter(items = match, axis=0)

            fasta = mafft_sub[['genome']].copy()

            fasta.to_csv('genome.fasta', header=None, index=True, sep='\n', mode='w')

            mafft_cline = MafftCommandline(input='genome.fasta', thread=threads)
            stdout, stderr = mafft_cline()

            for j in stdout.split('\n'):
                if j == '':
                    pass
                elif j[0] == '>':
                    accession = j
                else:
                    msa_dict[accession] += j

            alignment = pd.DataFrame.from_dict(msa_dict, orient='index', columns=['alignment'])
            alignment.index.rename('accession', inplace=True)

            msa_dict.clear()

            align_dict[i] = alignment  

        mafft_dict[segment] = pd.concat(align_dict)
        mafft_dict[segment].index.set_names(['cluster', 'accession'], inplace=True)
        mafft_dict[segment].reset_index(level = 'cluster', inplace=True)
        mafft_dict[segment]['cluster'] = mafft_dict[segment]['cluster'] + increment

        centroids = pd.DataFrame(values, columns=['centroid'], index = accessions)

        clusters[segment].loc[clusters[segment]['cluster'] != -1, ['cluster']] = clusters[segment].loc[clusters[segment]['cluster'] != -1, ['cluster']] + increment
        clusters[segment].update(centroids)

        increment += num - 1

        print('Finished.')

        stop_clust = time.perf_counter()
        exec_clust = stop_clust - start_clust
        exec_time = exec_time + exec_clust

        print(f'- Clustering and alignment done in {exec_clust:0.2f} seconds.')
        diagnostic = co.Counter(clusterlabel)
        print(f'- {str(len(clusterlabel))} sequences, {str(diagnostic[-1])} unclustered, {str(len(set(diagnostic)))} cluster.')
        print(f'- Mean of inner cluster distance mean {overall_mean/num:0.10f}')
        print(f'- {exclude.count(0) + exclude.count(2)}({include.count(0) + include.count(2)}) clusters containing matching NA types.')
        print(f'- {exclude.count(1) + exclude.count(2)}({include.count(1) + include.count(2)}) clusters containing matching HA types.')
        print('Finished.')

    result_cluster = pd.concat(clusters)
    result_cluster.index.set_names(['segment', 'accession'], inplace=True)
    result_cluster.reset_index(level = 'segment', inplace=True)  
    result_cluster.sort_values(by=['segment', 'cluster', 'subtype'], inplace = True)
    result_cluster.to_csv(outpath + 'cluster.csv', index=True, header=True, sep=',', mode='w')

    result_msa = pd.concat(mafft_dict)
    result_msa.index.set_names(['segment', 'accession'], inplace=True)
    result_msa.reset_index(level = ['segment'], inplace=True)  
    result_msa.sort_values(by=['segment', 'cluster', 'accession', 'alignment'], inplace = True)
    result_msa.to_csv(outpath + 'alignment.csv', index=True, header=True, sep=',', mode='w')

    print(f'Overall execution time {exec_time:0.2f} seconds.')

In [5]:
if __name__ == "__main__":

    main()
    #memory = memory_usage(main)
    #print(f"Maximum memory used: {max(memory)/1000:0.2f} Gb.")

Read input and settings file. Finished.
Starting calculations for segment 1:
- Nucleotide k-mer frequency calculation. Finished.
- Running UMAP for dimension reduction. Finished.
- Aminoacid k-mer frequency calculation. Finished.
- Running UMAP for dimension reduction. 

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!


Finished.
- Running HDBscan for clustering. Finished.
- Centroid extraction and alignment. Finished.
- Clustering and alignment done in 1831.45 seconds.
- 55436 sequences, 41 unclustered, 881 cluster.
- Mean of inner cluster distance mean 0.0000639726
- 558(516) clusters containing matching NA types.
- 576(526) clusters containing matching HA types.
Finished.
Starting calculations for segment 2:
- Nucleotide k-mer frequency calculation. Finished.
- Running UMAP for dimension reduction. Finished.
- Aminoacid k-mer frequency calculation. Finished.
- Running UMAP for dimension reduction. Finished.
- Running HDBscan for clustering. Finished.
- Centroid extraction and alignment. Finished.
- Clustering and alignment done in 1912.56 seconds.
- 55292 sequences, 38 unclustered, 885 cluster.
- Mean of inner cluster distance mean 0.0000689872
- 603(564) clusters containing matching NA types.
- 616(576) clusters containing matching HA types.
Finished.
Starting calculations for segment 3:
- Nucleot

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!
failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!


Finished.
- Running HDBscan for clustering. Finished.
- Centroid extraction and alignment. Finished.
- Clustering and alignment done in 1915.32 seconds.
- 55351 sequences, 40 unclustered, 882 cluster.
- Mean of inner cluster distance mean 0.0000759190
- 613(562) clusters containing matching NA types.
- 618(567) clusters containing matching HA types.
Finished.
Starting calculations for segment 4:
- Nucleotide k-mer frequency calculation. Finished.
- Running UMAP for dimension reduction. Finished.
- Aminoacid k-mer frequency calculation. Finished.
- Running UMAP for dimension reduction. Finished.
- Running HDBscan for clustering. Finished.
- Centroid extraction and alignment. Finished.
- Clustering and alignment done in 1587.13 seconds.
- 55281 sequences, 34 unclustered, 817 cluster.
- Mean of inner cluster distance mean 0.0000723253
- 622(548) clusters containing matching NA types.
- 815(605) clusters containing matching HA types.
Finished.
Starting calculations for segment 5:
- Nucleot

failed. This is likely due to too small an eigengap. Consider
adding some noise or jitter to your data.

Falling back to random initialisation!


Finished.
- Running HDBscan for clustering. Finished.
- Centroid extraction and alignment. Finished.
- Clustering and alignment done in 1679.24 seconds.
- 55541 sequences, 59 unclustered, 905 cluster.
- Mean of inner cluster distance mean 0.0001031753
- 633(583) clusters containing matching NA types.
- 648(594) clusters containing matching HA types.
Finished.
Starting calculations for segment 6:
- Nucleotide k-mer frequency calculation. Finished.
- Running UMAP for dimension reduction. Finished.
- Aminoacid k-mer frequency calculation. Finished.
- Running UMAP for dimension reduction. Finished.
- Running HDBscan for clustering. Finished.
- Centroid extraction and alignment. Finished.
- Clustering and alignment done in 1574.45 seconds.
- 55075 sequences, 83 unclustered, 920 cluster.
- Mean of inner cluster distance mean 0.0000566989
- 916(683) clusters containing matching NA types.
- 700(623) clusters containing matching HA types.
Finished.
Starting calculations for segment 7:
- Nucleot

In [None]:
#wie viele H's? 18! N11

In [6]:
alignment = pd.read_csv('Output/alignment.csv', sep = ',', na_filter = False, header = 0, index_col = 0)
cluster = pd.read_csv('Output/cluster.csv', sep = ',', na_filter = False, header = 0, index_col = 0)

In [4]:
alignment

Unnamed: 0_level_0,segment,cluster,alignment
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
>MK265637,1,0,tcaaatatattcaatatggagagaataaaagagctgagagacctaa...
>MK265655,1,0,tcaaatatattcaatatggagagaataaaagagctgagagacctaa...
>MK364024,1,0,tcaaatatattcaatatggagagaataaaagagctgagagacctaa...
>MK364444,1,0,tcaaatatattcaatatggagagaataaaagagctgagagacctaa...
>MK364640,1,0,tcaaatatattcaatatggagagaataaaagagctgagagacctaa...
...,...,...,...
>MH598129,8,7055,------------gtgacaaaaacataatggattccaacactgtgtc...
>MH620640,8,7055,agcaaaagcagggtgacaaaaacataatggattccaacactgtgtc...
>MK902671,8,7055,--------------------------atggattccaacactgtgtc...
>MN055190,8,7055,agcaaaagcagggtgacaaaaacataatggattccaacactgtgtc...


In [7]:
cluster

Unnamed: 0_level_0,segment,cluster,centroid,subtype
accession,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
>KF918700,1,-1,False,H1N1
>HQ263280,1,-1,False,H1N1
>CY064994,1,-1,False,H1N1
>LN846491,1,-1,False,H1N1
>CY065595,1,-1,False,H1N1
...,...,...,...,...
>CY102078,8,7055,False,mixed
>CY136447,8,7055,False,mixed
>CY116805,8,7055,False,mixed
>CY181173,8,7055,False,mixed


In [10]:
cluster.query('segment == 4 & cluster != -1')['cluster'].max() 

3457

In [12]:
cluster.query('segment == 4 & cluster == 3210')['subtype']

accession
>CY053174    H1N1
>CY053190    H1N1
>CY053198    H1N1
>CY053230    H1N1
>CY053222    H1N1
             ... 
>CY051463    H1N1
>CY051455    H1N1
>CY051447    H1N1
>CY051471    H1N1
>CY051487    H1N1
Name: subtype, Length: 186, dtype: object

In [8]:
alignment.query('cluster == 20')['alignment']

accession
>CY212951    tcaattatattcagcatggaaagaataaaagaactacggaatctaa...
>CY213159    tcaattatattcagcatggaaagaataaaagaactacggaatctaa...
>CY213951    tcaattatattcagcatggaaagaataaaagaactacggaatctaa...
>CY214159    tcaattatattcagcatggaaagaataaaagaactacggaatctaa...
>CY214583    tcaattatattcagcatggaaagaataaaagaactacggaatctaa...
                                   ...                        
>MG928474    tcaattatattcagcatggaaagaataaaagaactacggaatctaa...
>MG928482    tcaattatattcagcatggaaagaataaaagaactacggaatctaa...
>MH079590    tcaattatattcagcatggaaagaataaaagaactacggaatctaa...
>MN571974    ---------------atggaaagaataaaagaactacggaatctaa...
>MN572942    ---attatattcagcatggaaagaataaaagaactacggaatctaa...
Name: alignment, Length: 74, dtype: object