In [40]:
import numpy as np
import pandas as pd
import itertools as it
from Bio import SeqIO
from Bio.Seq import Seq
import math
import re
from sklearn.preprocessing import normalize

In [122]:
class Frequency(object):
    """Count k-mers per FASTA record and return them as a dense matrix.

    Records are optionally filtered on two fields of their header (the
    header is cut with ``str.split(split)``) and on the share of ambiguous
    nucleotides they contain. For every accepted record one row of
    ``4 ** k`` raw k-mer counts is produced; the rows are NOT normalised.
    IUPAC ambiguity codes are resolved by spreading one count evenly over
    all concrete k-mers the ambiguous k-mer can stand for.

    Parameters
    ----------
    k : int
        Length of the k-mers; the matrix has ``4 ** k`` columns.
    split : str or None
        Separator handed to ``str.split`` for the record name
        (``None`` splits on whitespace).
    segment : str or None
        Value that header field ``segpos`` must equal exactly.
        A falsy value disables this filter.
    quality : str, iterable of str, or None
        Regular-expression fragment, or several fragments that are treated
        as alternatives, searched case-insensitively in header field
        ``qualpos``. A falsy value disables this filter.
    qualpos, segpos : int
        Indices of the quality and segment fields in the split header.
    variable : float
        Largest tolerated fraction (0..1) of positions that are not
        A, C, G or T; records above it are dropped.

    Raises
    ------
    re.error
        If the pattern built from ``quality`` is not a valid expression.
    """

    def __init__(self, k = 7, split = None, segment = None, quality = None, qualpos = 0, segpos = 0, variable = 0.9):

        self.k = k
        self.qualpos = qualpos
        self.segpos = segpos
        self.split = split
        self.segment = segment
        self.variable = variable

        if quality:
            # A plain string is one fragment; any other iterable is a set of alternatives.
            fragment = quality if isinstance(quality, str) else '|'.join(quality)
            self.quality = '.*(?:% s).*' % fragment
            # Fail at construction time rather than on the first record.
            re.compile(self.quality, re.IGNORECASE)
        else:
            self.quality = None

        self.nucleotides = ['A', 'C', 'G', 'T']
        # Translation table that deletes A/C/G/T, leaving only the ambiguous symbols.
        self.substit = dict.fromkeys(map(ord, self.nucleotides), None)
        # Every possible k-mer; the key order defines the column order of the matrix.
        self.exist = dict.fromkeys(map(''.join, it.product(self.nucleotides, repeat = self.k)), 0)
        self.col = len(self.exist.keys())
        # k-mer -> column number, so counting can write straight into a vector.
        self._column = {kmer: number for number, kmer in enumerate(self.exist)}

        # IUPAC code -> concrete nucleotides it may represent.
        self.nucex = {
            'A':['A'],
            'C':['C'],
            'G':['G'],
            'T':['T'],
            'R':['A', 'G'],
            'Y':['C', 'T'],
            'W':['A', 'T'],
            'S':['C', 'G'],
            'M':['A', 'C'],
            'K':['G', 'T'],
            'B':['G', 'C', 'T'],
            'H':['A', 'C', 'T'],
            'D':['A', 'G', 'T'],
            'V':['A', 'C', 'G'],
            'N':['A', 'C', 'G', 'T'],
        }
        # Nucleotide -> the three possible substitutions (not used by the counting code).
        self.nucmut = {
            'A':['C', 'G', 'T'],
            'C':['A', 'G', 'T'],
            'G':['A', 'C', 'T'],
            'T':['A', 'C', 'G'],
        }

    def checkup(self, name):
        """Return True when the record header `name` passes every active filter.

        A header with too few fields for an active filter is rejected.
        Only the fields of active filters are looked at.
        """
        if not (self.segment or self.quality):
            return True

        try:
            head = name.split(self.split)
            if self.segment and head[self.segpos] != self.segment:
                return False
            if self.quality and not re.match(self.quality, head[self.qualpos], re.IGNORECASE):
                return False
        except IndexError:
            # Header is shorter than the configured field position.
            return False

        return True

    def _ambiguity(self, sequence):
        """Return the fraction of positions in `sequence` that are not A, C, G or T."""
        if not sequence:
            return 0.0
        return len(sequence.translate(self.substit)) / len(sequence)

    def _accepted(self, infile):
        """Yield (accession, sequence) for each FASTA record that passes all filters."""
        for entry in SeqIO.parse(infile, 'fasta'):
            name = entry.name
            # Upper-case so that lower-case (e.g. soft-masked) bases are counted too.
            sequence = str(entry.seq).upper()
            if self.checkup(name) and self._ambiguity(sequence) <= self.variable:
                fields = name.split(self.split)
                yield (fields[0] if fields else ''), sequence

    def _countKmers(self, sequence):
        """Return the vector of k-mer counts (length ``self.col``) of one sequence."""
        counts = np.zeros(self.col, dtype = '<f8')
        column = self._column

        for start in range(len(sequence) - self.k + 1):
            kmer = sequence[start:start + self.k]

            hit = column.get(kmer)
            if hit is not None:
                # Unambiguous k-mer: one full count.
                counts[hit] += 1.0
                continue

            choices = [self.nucex.get(symbol) for symbol in kmer]
            if any(choice is None for choice in choices):
                # Symbol outside the IUPAC table (gap, stop, ...): nothing to expand.
                continue

            # Spread a single count evenly over every concrete k-mer.
            share = 1.0 / np.prod([len(choice) for choice in choices])
            for sub in map(''.join, it.product(*choices)):
                counts[column[sub]] += share

        return counts

    def countRows(self, infile):
        """Return the number of records in `infile` that pass all filters."""
        return sum(1 for _ in self._accepted(infile))

    def calculateFrequence(self, infile):
        """Return ``(index, matrix)`` for the accepted records of a FASTA file.

        `infile` is parsed twice (first to size the result, then to fill it),
        so it has to be something that can be re-read, such as a file path.

        Returns
        -------
        index : numpy.ndarray of str
            First header field of each accepted record, in file order.
        matrix : numpy.ndarray, shape (len(index), 4 ** k), float64
            Raw k-mer counts, one row per accepted record.

        Raises
        ------
        ValueError
            If the second read yields a different number of records than
            the first one (for instance an already consumed file handle).
        """
        accessions = [accession for accession, _ in self._accepted(infile)]
        # Let numpy size the string dtype so that long accessions are not cut off.
        index = np.array(accessions, dtype = str)
        matrix = np.zeros((len(accessions), self.col), dtype = '<f8')

        filled = 0
        for filled, (_, sequence) in enumerate(self._accepted(infile), start = 1):
            matrix[filled - 1] = self._countKmers(sequence)

        if filled != len(accessions):
            raise ValueError('infile yielded a different number of records on the second read; '
                             'pass a file path instead of an open handle')

        return(index, matrix)

In [123]:
# Configure the counter: 7-mers, headers cut on '|', and only records whose
# header field 2 equals '4' and whose header field 8 matches 'Pass' are kept.
# `variable` is the threshold applied by the class's ambiguity filter.
Freq = Frequency(
    k = 7,
    split = '|',
    segment = '4',
    quality = 'Pass',
    qualpos = 8,
    segpos = 2,
    variable = 0.9,
)

In [125]:
# %%timeit
# (benchmark magic above is disabled; the timing printed under this cell
#  presumably comes from an earlier run with it enabled — TODO confirm)
# Parse 'A.fasta' with the filters configured on Freq and unpack the two
# values returned by calculateFrequence: the accession index and the
# k-mer count matrix.
index, matrix = Freq.calculateFrequence('A.fasta')

13 ms ± 453 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
