In [1]:
import functools as ft
import numpy as np
import pandas as pd
import multiprocessing as mp
import sys
import re
import csv
import collections as co
import itertools as it
import umap
import hdbscan
import time 
import difflib
import scipy.spatial.distance as ssd

In [2]:
class vectorizer(object):
    
    #def __init__(self, procs = 8, k = 7, convert = 0, row = 0, index = [], exist = co.defaultdict(int)):
    def __init__(self, k = 7, convert = 0):
    
        self.k = k
        self.convert = convert
        self.exist = co.defaultdict(int) 
        self.keys = list(self.exist.keys())
        self.col = len(self.keys)
        self.row = 0
        self.matrix = np.empty((self.row, self.col, ),dtype = "float32")
        self.subtype = np.empty((self.row, 1, ),dtype = "object")
        self.index = np.empty(self.row, dtype = "object")
        self.amino = co.defaultdict(str, {
            'AAA':'K', 'AAC':'N', 'AAG':'K', 'AAT':'N',
            'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
            'AGA':'R', 'AGC':'S', 'AGG':'R', 'AGT':'S',
            'ATA':'I', 'ATC':'I', 'ATG':'M', 'ATT':'I',
            'CAA':'Q', 'CAC':'H', 'CAG':'Q', 'CAT':'H',
            'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
            'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
            'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
            'GAA':'E', 'GAC':'D', 'GAG':'E', 'GAT':'D',
            'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
            'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
            'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',    
            'TAA':'Y', 'TAC':'*', 'TAG':'*', 'TAT':'Y',
            'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
            'TGA':'*', 'TGC':'C', 'TGG':'W', 'TGT':'C',
            'TTA':'L', 'TTC':'F', 'TTG':'L', 'TTT':'F'
        })
                
    def translate(self, read):
    
        chain = ''

        for i in range(len(read) - 2):
            trip = read[i:i+3]
            chain += self.amino[trip]

        return(chain)
    
    
    def adjust_to_data(self, infile):
    
        self.row = sum(1 for l in open(infile))
        data = pd.read_csv(infile, chunksize = 10000, sep = ';', na_filter = False, header = None)
        
        for chunk in data:

            for line, info, read in chunk.itertuples(index=True, name=None):
                
                if self.convert == 1:
                    #seq = self.translate(re.sub('[^ACGT]+', '', read))
                    seq = self.translate(read)
                    del read
                    
                    num = len(seq) - self.k + 1
                    
                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        self.exist[kmer] = 0
                    
                else:
                    #seq = re.sub('[^ACGT]+', '', read)
                    seq = read
                    del read

                    num = len(seq) - self.k + 1

                    if re.match('^[ACGT]*$', seq): 
                        for i in range(num):
                            kmer = seq[i:i+self.k]
                            self.exist[kmer] = 0
                    else:
                        for i in range(num):
                            kmer = seq[i:i+self.k]
                            if re.match('^[ACGT]*$', kmer): 
                                self.exist[kmer] = 0
            
        self.keys = list(self.exist.keys())
        self.col = len(self.keys)
        self.index = np.empty(self.row, dtype = "object")
        self.matrix = np.empty((self.row, self.col, ), dtype="float32")
        self.subtype = np.empty((self.row, 1, ), dtype = "object")
        
        del seq
    
    def calculate_frequence(self, infile):
        
        data = pd.read_csv(infile, chunksize = 10000, sep = ';', na_filter = False, header = None)
        
        for chunk in data:

            for line, info, read in chunk.itertuples(index=True, name=None):      
                
                if self.convert == 1:
                    #seq = self.translate(re.sub('[^ACGT]+', '', read))
                    seq = self.translate(read)
                    del read
                
                    counts = self.exist.copy()
                    num = len(seq) - self.k + 1

                    for i in range(num):
                        kmer = seq[i:i+self.k]
                        counts[kmer] += 1
                            
                else:
                    #seq = re.sub('[^ACGT]+', '', read)
                    seq = read
                    del read
                
                    counts = self.exist.copy()
                    num = len(seq) - self.k + 1

                    if re.match('^[ACGT]*$', seq): 
                        for i in range(num):
                            kmer = seq[i:i+self.k]
                            counts[kmer] += 1
                    else:
                        for i in range(num):
                            kmer = seq[i:i+self.k]
                            if re.match('^[ACGT]*$', kmer): 
                                counts[kmer] += 1

                vector = np.array(list(counts.values()), dtype = "float32")/num
                
                self.index[line] = info.split('|')[0][1:]
                self.matrix[line] = vector
                self.subtype[line] = info.split('|')[5]
                
                counts.clear()
                del vector
                del seq
                del counts
    
    def get_index(self):
        
        return(self.index)
    
    
    def get_keys(self):
        
        return(self.keys)
    
    
    def get_subtype(self):
        
        return(self.subtype)
    
    def get_matrix(self):
        
        return(self.matrix)

In [3]:
class extractor(object):
    
    def __init__(self):
        
        self.cities = co.defaultdict(list)
        self.subcountries = co.defaultdict(list)
        self.countries = co.defaultdict(list)
        
        self.cities_names = []
        self.subcountries_names = []
        self.countries_names = []
        
        self.matrix_genome = np.empty((0, 0, ), dtype=object)
        self.matrix_strain = np.empty((0, 0, ), dtype=object)
        self.matrix_head = np.empty((0, 0, ), dtype=object)
        
        self.index_accession = np.empty(0, dtype=object)
        self.index_strain = np.empty(0, dtype=object)
  

    def init_matrix(self, infile):
        
        lines = sum(1 for l in open(infile))
        
        self.matrix_genome = np.empty((lines, 1, ), dtype=object)
        self.matrix_strain = np.empty((lines, 6, ), dtype=object)
        self.matrix_head = np.empty((lines, 2, ), dtype=object)
        
        self.index_accession = np.empty(lines, dtype=object)
        self.index_strain = np.empty(lines, dtype=object)
    
    
    def fill_dicts(self, worldfile):
        
        data = pd.read_csv(worldfile, chunksize = 10000, sep = ',', na_filter = False)
        
        for split in data:

            for city, country, subcountry, geonameid in split.itertuples(index=False, name=None):

                self.cities[city].append([city, subcountry, country])
                self.subcountries[subcountry]. append(['null', subcountry, country])
                self.countries[country].append(['null', 'null', country])

        self.cities_names = list(self.cities.keys())
        self.subcountries_names = list(self.subcountries.keys())
        self.countries_names = list(self.countries.keys())

        
    def destination(self, entry):

        match_subcountry = difflib.get_close_matches(entry, self.subcountries_names, 1, 0.9)
        if not match_subcountry:
            match_city = difflib.get_close_matches(entry, self.cities_names, 1, 0.9)
            if not match_city:
                match_country = difflib.get_close_matches(entry, self.countries_names, 1, 0.9)
                if not match_country:
                    result = ['null', 'null', 'null']
                else:
                    match = match_country[0]
                    result = self.countries[match]
            else:
                match = match_city[0]
                result = self.cities[match]
        else:
            match = match_subcountry[0]
            result = self.subcountries[match]

        if any(isinstance(i, list) for i in result):
            output = result[0]
        else:
            output = result

        return(output)
    
    
    def process_rows(self, chunk):

        line = chunk[0]
        info = chunk[1]
        read = chunk[2]

        head = info.split('|')

        accession = head[0][1:]
        strain = head[1]
        segment = head[2]
        organism = head[4]
        subtype = head[5]
        if subtype == 'NA' or subtype == 'nan':
            subtype = 'null'
        host = head[7]

        info = strain.split('/')

        spec = info[0]
        del info[0]
        year = info[-1]
        del info[-1]

        if year.isdecimal():
            if len(year) == 2:
                year = '19'+year 
        else:
            year = 'null'

        if not info:
            pos = ['null', 'null', 'null']
        else:
            for i in info:
                pos = self.destination(i)
                if not all([item == 'null' for item in pos]):
                    break

        values = [accession, read, spec, pos[0], pos[1], pos[2], year, host, strain, segment]

        return(line, values)

    
    def input_sequences(self, infile, procs):
        
        data = pd.read_csv(infile, chunksize = 10000, sep = ';', na_filter = False, header = None)
        
        #dt = np.dtype([('R','u1'), ('G','u1'), ('B','u1'), ('A','u1')])
        
        for chunk in data:
            
            with mp.Pool(procs) as pool:
                
                for i, values in pool.map(self.process_rows, chunk.itertuples(index=True, name=None)):
            
                    self.matrix_genome[i] = np.array([values[1]])
                    self.matrix_strain[i] = np.array([values[2], values[3], values[4], values[5], values[6], values[7]])
                    self.matrix_head[i] = np.array([values[8], values[9]])
                    self.index_accession[i] = values[0]
                    self.index_strain[i] = values[8]
            
        
    def get_genomes(self):
        
        genomes = pd.DataFrame(self.matrix_genome, index = self.index_accession, columns = ['genome'])
        
        return(genomes)
    
    def get_strains(self):
        
        strains = pd.DataFrame(self.matrix_strain, index = self.index_strain, columns = ['species', 'city', 'subcountry', 'country', 'year', 'host']).drop_duplicates()
        
        return(strains)
    
    def get_header(self):
        
        header = pd.DataFrame(self.matrix_head, self.index_accession, columns = ['strain', 'segment'])
        
        return(header)

In [66]:
def main():

    print("Welcome. Read input and settings file.", end = ' ')
    #error checkup?

    infile = '../../../Desktop/Masterthesis_V5/A_HA.csv'   
    outfile = 'output.csv'
    setfile = 'settings.csv'
    #worldfile = 'cities.csv'
    
    settings = pd.read_csv(setfile, sep = ',', na_filter = False, index_col = 0)
    parameter = settings.loc[4].to_list()

    print("Finished.")

    start = time.perf_counter()

    print("Nucleotide k-mer frequency calculation.", end = ' ')

    freq_nt = vectorizer(k = parameter[0].item(), convert = 0)
    freq_nt.adjust_to_data(infile)
    freq_nt.calculate_frequence(infile)

    matrix_nt = freq_nt.get_matrix()
    index_nt = freq_nt.get_index()   
    subtype_nt = freq_nt.get_subtype()
    keys_nt = freq_nt.get_keys()

    del freq_nt

    print("Finished.")

    print("Running UMAP for dimension reduction.", end = ' ')

    matrix_nt_red = umap.UMAP(
        n_neighbors = parameter[1].item(),
        min_dist = parameter[2].item(),
        n_components = parameter[3].item(),
        random_state = parameter[4].item(),
        metric = parameter[5],
    ).fit_transform(matrix_nt)

    del matrix_nt

    print("Finished.")

    print("Aminoacid k-mer frequency calculation.", end = ' ')

    freq_aa = vectorizer(k = parameter[6].item(), convert = 1)
    freq_aa.adjust_to_data(infile)
    freq_aa.calculate_frequence(infile)

    matrix_aa = freq_aa.get_matrix()
    index_aa = freq_aa.get_index()
    keys_aa = freq_aa.get_keys()

    del freq_aa

    print("Finished.")

    print("Running UMAP for dimension reduction.", end = ' ')

    matrix_aa_red = umap.UMAP(
        n_neighbors = parameter[7].item(),
        min_dist = parameter[8].item(),
        n_components = parameter[9].item(),
        random_state = parameter[10].item(),
        metric = parameter[11],
    ).fit_transform(matrix_aa)

    del matrix_aa

    print("Finished.")

    matrix_aa_ind = pd.DataFrame(matrix_aa_red, index = index_aa)
    matrix_nt_ind = pd.DataFrame(matrix_nt_red, index = index_nt)

    matrix = pd.concat([matrix_nt_ind, matrix_aa_ind], axis=1, copy = False, ignore_index = True) #falsches Ergebnis? checken ob ignore_index = Fehler

    print("Running HDBscan for clustering.", end = ' ')

    matrix_clust = hdbscan.HDBSCAN(
        min_samples = parameter[12].item(), #larger the value the more conservative the clustering (more points will be declared as noise)
        min_cluster_size = parameter[13].item(), #minimum size that can become a cluster
        cluster_selection_epsilon = parameter[14].item(), #don't seperate clusters with a distance less than value
        alpha = parameter[15].item(), #don't mess with this
    ).fit(matrix)

    print("Finished.")

    print("Centroid extraction.", end = ' ')

    clusterlabel = matrix_clust.labels_

    blank = pd.DataFrame(zip(clusterlabel, ['false'] * len(clusterlabel)), index = index_nt, columns = ['cluster', 'centroid'])
    subtype = pd.DataFrame(subtype_nt, index = index_nt, columns = ['subtype'])
    clusters = pd.concat([blank, subtype], axis=1, copy = False)

    num = clusters['cluster'].max()+1
    values = ['true']*num
    accessions = []
    exclude = []
    include = []

    for i in range(num):

        query = clusters[clusters.cluster == i]
        match = query.index.values.tolist()
        sub = matrix.filter(items = match, axis=0)
        dist = ssd.cdist(sub, sub, metric = parameter[16])
        accessions.append(pd.DataFrame(dist, columns = match, index = match, dtype = 'float32').mean().idxmin())
        subs = co.defaultdict(list) 

        for sub in query['subtype'].tolist():
            if re.match('^[H][0-9]+N[0-9]+$', sub): 
                subs['H'].append(re.search('[H][0-9]+', sub).group(0))
                subs['N'].append(re.search('[N][0-9]+', sub).group(0))
            else: #mixed types sind nicht zwingend falsch, schwierig zu bewerten ob sie beachtet werden oder nicht
                subs['X'].append('X0')
                subs['X'].append('X0')

        if len(set(subs['H'])) == 1 and len(set(subs['N'])) == 1:
            exclude.append(2)
            if 'X' not in subs.keys():
                include.append(2)
        elif len(set(subs['H'])) == 1:
            exclude.append(1)
            if 'X' not in subs.keys():
                include.append(1)
        elif len(set(subs['N'])) == 1:
            exclude.append(0)
            if 'X' not in subs.keys():
                include.append(0)

    centroids = pd.DataFrame(values, columns=['centroid'], index = accessions)

    clusters.update(centroids)
    #clusters.sort_values(by=['cluster', 'subtype']).to_csv(outfile, index=True, header=True, sep=',')
    
    stop = time.perf_counter()

    print("Finished.")
    print(f"Clustering done in {stop - start:0.4f} seconds.")
    diagnostic = co.Counter(clusterlabel)
    print(f"{str(len(clusterlabel))} sequences, {str(diagnostic[-1])} unclustered, {str(len(set(diagnostic)))} cluster.")
    print(f"{exclude.count(0) + exclude.count(2)}({include.count(0) + include.count(2)}) clusters containing matching NA types.")
    print(f"{exclude.count(1) + exclude.count(2)}({include.count(1) + include.count(2)}) clusters containing matching HA types.")
    
    tables = extractor()
    tables.fill_dicts(worldfile)
    tables.init_matrix(infile)
    tables.input_sequences(infile, 8)
    
    header = tables.get_header()
    strains = tables.get_strains()
    genomes = tables.get_genomes()
    informations = pd.concat([header, clusters], axis=1, copy = False)
    
    #informations.sort_values(by=['cluster', 'subtype']).to_csv(outfile, index=True, header=True, sep=',')
    #strains.to_csv(outfile, index=True, header=True, sep=',')
    #genomes.to_csv(outfile, index=True, header=True, sep=',')

In [67]:
if __name__ == "__main__":

    main()

Welcome. Read input and settings file. Finished.
Nucleotide k-mer frequency calculation. Finished.
Running UMAP for dimension reduction. Finished.
Aminoacid k-mer frequency calculation. Finished.
Running UMAP for dimension reduction. Finished.
Running HDBscan for clustering. Finished.
Centroid extraction. Finished.
Clustering done in 1120.5723 seconds.
56600 sequences, 35 unclustered, 831 cluster.
630(556) clusters containing matching NA types.
827(618) clusters containing matching HA types.
