In [2]:
import numpy as np
import pandas as pd
import itertools as it
from Bio import SeqIO
import math
import re
from sklearn.preprocessing import normalize

In [172]:
class Frequency(object):
    """Compute L1-normalised k-mer frequency vectors for the records of a FASTA file.

    Records can be filtered by fields of their header: the header is split on
    ``split`` and the field at ``segpos`` must equal ``segment`` and/or the
    field at ``qualpos`` must start (case-insensitively) with one of the
    ``quality`` labels.  Ambiguous nucleotide codes are expanded into all
    concrete k-mers they stand for, and a fraction ``mutfac`` of every k-mer's
    weight is spread over its single-substitution neighbours.

    Parameters
    ----------
    k : length of the counted k-mers.
    segment : required value of the segment header field, or None for no filter.
    quality : one label (str) or an iterable of labels / regex fragments that
        the quality header field has to start with, or None for no filter.
    split : separator used to split the record name into fields.
    qualpos : index of the quality field in the split header.
    segpos : index of the segment field in the split header.
    mutfac : share (0..1) of each k-mer's weight redistributed to its
        3 * k single-nucleotide substitutions.
    """

    def __init__(self, k = 7, segment = None, quality = None, split = None, qualpos = None, segpos = None, mutfac = 0.0):

        self.k = k
        self.qualpos = qualpos
        self.segpos = segpos
        self.split = split
        self.segment = segment
        self.quality = quality
        self.mutfac = mutfac

        self.nucleotides = ['A', 'C', 'G', 'T']
        # One counter per possible k-mer; the insertion order of this dict
        # defines the column order of the frequency matrix.
        self.exist = dict.fromkeys(map(''.join, it.product(self.nucleotides, repeat = self.k)), 0)
        self.col = len(self.exist)

        # Standard genetic code (codon -> one-letter amino acid, '*' = stop).
        self.aminoacids = {
            'AAA':'K', 'AAC':'N', 'AAG':'K', 'AAT':'N',
            'ACA':'T', 'ACC':'T', 'ACG':'T', 'ACT':'T',
            'AGA':'R', 'AGC':'S', 'AGG':'R', 'AGT':'S',
            'ATA':'I', 'ATC':'I', 'ATG':'M', 'ATT':'I',
            'CAA':'Q', 'CAC':'H', 'CAG':'Q', 'CAT':'H',
            'CCA':'P', 'CCC':'P', 'CCG':'P', 'CCT':'P',
            'CGA':'R', 'CGC':'R', 'CGG':'R', 'CGT':'R',
            'CTA':'L', 'CTC':'L', 'CTG':'L', 'CTT':'L',
            'GAA':'E', 'GAC':'D', 'GAG':'E', 'GAT':'D',
            'GCA':'A', 'GCC':'A', 'GCG':'A', 'GCT':'A',
            'GGA':'G', 'GGC':'G', 'GGG':'G', 'GGT':'G',
            'GTA':'V', 'GTC':'V', 'GTG':'V', 'GTT':'V',
            # TAA is a stop codon and TAC codes for tyrosine (these two were swapped).
            'TAA':'*', 'TAC':'Y', 'TAG':'*', 'TAT':'Y',
            'TCA':'S', 'TCC':'S', 'TCG':'S', 'TCT':'S',
            'TGA':'*', 'TGC':'C', 'TGG':'W', 'TGT':'C',
            'TTA':'L', 'TTC':'F', 'TTG':'L', 'TTT':'F',
            # Not included: 'NNN':'X', 'RAY':'B', 'SAR':'Z'.
        }
        # Idea: translate to amino acids first and expand ambiguity codes
        # afterwards; then the ambiguous codons RAY and SAR could be kept.

        # IUPAC nucleotide codes -> the concrete bases they may stand for.
        self.nucex = {
            'A':['A'],
            'C':['C'],
            'G':['G'],
            'T':['T'],
            'R':['A', 'G'],
            'Y':['C', 'T'],
            'W':['A', 'T'],
            'S':['C', 'G'],
            'M':['A', 'C'],
            'K':['G', 'T'],
            'B':['G', 'C', 'T'],
            'H':['A', 'C', 'T'],
            'D':['A', 'G', 'T'],
            'V':['A', 'C', 'G'],
            'N':['A', 'C', 'G', 'T'],
        }
        # Base -> the three bases it can be substituted by.
        self.nucmut = {
            'A':['C', 'G', 'T'],
            'C':['A', 'G', 'T'],
            'G':['A', 'C', 'T'],
            'T':['A', 'C', 'G'],
        }
        # Amino-acid substitution scores (the values match the BLOSUM62 table).
        labels = ['A' , 'R' , 'N' , 'D' , 'C' , 'Q' , 'E' , 'G' , 'H' , 'I' , 'L' , 'K' , 'M' , 'F' , 'P' , 'S' , 'T' , 'W' , 'Y' , 'V' , 'B' , 'Z' , 'X' , '*']
        self.blosum = pd.DataFrame(
            np.array([
                [4, -1, -2, -2, 0, -1, -1, 0, -2, -1, -1, -1, -1, -2, -1, 1, 0, -3, -2, 0, -2, -1, 0, -4],
                [-1, 5, 0, -2, -3, 1, 0, -2, 0, -3, -2, 2, -1, -3, -2, -1, -1, -3, -2, -3, -1, 0, -1, -4],
                [-2, 0, 6, 1, -3, 0, 0, 0, 1, -3, -3, 0, -2, -3, -2, 1, 0, -4, -2, -3, 3, 0, -1, -4],
                [-2, -2, 1, 6, -3, 0, 2, -1, -1, -3, -4, -1, -3, -3, -1, 0, -1, -4, -3, -3, 4, 1, -1, -4],
                [0, -3, -3, -3, 9, -3, -4, -3, -3, -1, -1, -3, -1, -2, -3, -1, -1, -2, -2, -1, -3, -3, -2, -4],
                [-1, 1, 0, 0, -3, 5, 2, -2, 0, -3, -2, 1, 0, -3, -1, 0, -1, -2, -1, -2, 0, 3, -1, -4],
                [-1, 0, 0, 2, -4, 2, 5, -2, 0, -3, -3, 1, -2, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4],
                [0, -2, 0, -1, -3, -2, -2, 6, -2, -4, -4, -2, -3, -3, -2, 0, -2, -2, -3, -3, -1, -2, -1, -4],
                [-2, 0, 1, -1, -3, 0, 0, -2, 8, -3, -3, -1, -2, -1, -2, -1, -2, -2, 2, -3, 0, 0, -1, -4],
                [-1, -3, -3, -3, -1, -3, -3, -4, -3, 4, 2, -3, 1, 0, -3, -2, -1, -3, -1, 3, -3, -3, -1, -4],
                [-1, -2, -3, -4, -1, -2, -3, -4, -3, 2, 4, -2, 2, 0, -3, -2, -1, -2, -1, 1, -4, -3, -1, -4],
                [-1, 2, 0, -1, -3, 1, 1, -2, -1, -3, -2, 5, -1, -3, -1, 0, -1, -3, -2, -2, 0, 1, -1, -4],
                [-1, -1, -2, -3, -1, 0, -2, -3, -2, 1, 2, -1, 5, 0, -2, -1, -1, -1, -1, 1, -3, -1, -1, -4],
                [-2, -3, -3, -3, -2, -3, -3, -3, -1, 0, 0, -3, 0, 6, -4, -2, -2, 1, 3, -1, -3, -3, -1, -4],
                [-1, -2, -2, -1, -3, -1, -1, -2, -2, -3, -3, -1, -2, -4, 7, -1, -1, -4, -3, -2, -2, -1, -2, -4],
                [1, -1, 1, 0, -1, 0, 0, 0, -1, -2, -2, 0, -1, -2, -1, 4, 1, -3, -2, -2, 0, 0, 0, -4],
                [0, -1, 0, -1, -1, -1, -1, -2, -2, -1, -1, -1, -1, -2, -1, 1, 5, -2, -2, 0, -1, -1, 0, -4],
                [-3, -3, -4, -4, -2, -2, -3, -2, -2, -3, -2, -3, -1, 1, -4, -3, -2, 11, 2, -3, -4, -3, -2, -4],
                [-2, -2, -2, -3, -2, -1, -2, -3, 2, -1, -1, -2, -1, 3, -3, -2, -2, 2, 7, -1, -3, -2, -1, -4],
                [0, -3, -3, -3, -1, -2, -2, -3, -3, 3, 1, -2, 1, -1, -2, -2, 0, -3, -1, 4, -3, -2, -1, -4],
                [-2, -1, 3, 4, -3, 0, 1, -1, 0, -3, -4, 0, -3, -3, -2, 0, -1, -4, -3, -3, 4, 1, -1, -4],
                [-1, 0, 0, 1, -3, 3, 4, -2, 0, -3, -3, 1, -1, -3, -1, 0, -1, -3, -2, -2, 1, 4, -1, -4],
                [0, -1, -1, -1, -2, -1, -1, -1, -1, -1, -1, -1, -1, -1, -2, 0, 0, -2, -1, -1, -1, -1, -1, -4],
                [-4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, -4, 1]
            ]),
            columns = labels,
            index = labels
        )

    def _quality_pattern(self):
        """Return the compiled, case-insensitive alternation of all quality labels."""
        # A single label given as a plain string must not be joined character
        # by character ('Pass' would otherwise become 'P|a|s|s').
        if isinstance(self.quality, str):
            labels = [self.quality]
        else:
            labels = list(self.quality)
        return re.compile('(?:%s)' % '|'.join(labels), re.IGNORECASE)

    def checkup(self, name):
        """Return True if the record header ``name`` passes every configured filter.

        A filter that is not configured (``segment`` / ``quality`` empty or
        None) is ignored; without any filter every record is accepted.
        Headers that are too short, or that cannot be split or indexed with
        the configured positions, are rejected.
        """
        if not self.segment and not self.quality:
            return(True)

        pattern = self._quality_pattern() if self.quality else None
        try:
            head = name.split(self.split)
            if self.segment and head[self.segpos] != self.segment:
                return(False)
            if pattern is not None and not pattern.match(head[self.qualpos]):
                return(False)
        except (AttributeError, IndexError, TypeError):
            # Malformed header or unusable field position: treat as "no match".
            return(False)
        return(True)

    def countRows(self, infile):
        """Return the number of FASTA records in ``infile`` that pass ``checkup``."""
        return(sum(1 for entry in SeqIO.parse(infile, 'fasta') if self.checkup(entry.name)))

    def _count_kmers(self, sequence):
        """Add the weighted k-mer counts of one sequence to ``self.exist``.

        Every window of length ``k`` contributes a total weight of 1: it is
        split evenly over all concrete k-mers its ambiguity codes expand to;
        of each such share, ``1 - mutfac`` goes to the k-mer itself and
        ``mutfac`` is divided evenly among its 3 * k single-base
        substitutions.  Windows containing symbols missing from ``nucex``
        (e.g. gaps) are skipped.
        """
        k = self.k
        neighbours = 3 * k
        keep = 1 - self.mutfac
        spread = self.mutfac / neighbours if neighbours else 0.0

        for start in range(len(sequence) - k + 1):
            choices = [self.nucex.get(symbol) for symbol in sequence[start:start + k]]
            if None in choices:
                continue
            expansions = [''.join(bases) for bases in it.product(*choices)]
            size = len(expansions)
            for main in expansions:
                self.exist[main] += keep / size
                if not spread:
                    continue
                for l, nuc in enumerate(main):
                    for substitute in self.nucmut[nuc]:
                        self.exist[main[:l] + substitute + main[l+1:]] += spread / size

    def calculateFrequence(self, infile):
        """Return ``(index, matrix)`` for all accepted records of ``infile``.

        ``index`` has shape (rows, 1) and holds the text before the first '|'
        of each record name (dtype '<U16', so longer accessions are
        truncated); ``matrix`` has shape (rows, 4 ** k) and holds the
        L1-normalised k-mer weights, columns ordered like ``self.exist``.
        ``infile`` is parsed twice (once for counting rows), so it has to be
        re-readable, e.g. a file name.
        """
        row = self.countRows(infile)
        index = np.zeros((row, 1, ), dtype = '<U16')
        matrix = np.zeros((row, self.col, ), dtype = '<f8')

        pos = 0
        for entry in SeqIO.parse(infile, 'fasta'):
            if not self.checkup(entry.name):
                continue

            self._count_kmers(str(entry.seq).upper())
            counts = np.fromiter(self.exist.values(), dtype = '<f8', count = self.col)
            matrix[pos] = normalize(counts.reshape(1, -1), norm = 'l1')
            index[pos] = entry.name.split('|')[0]

            # Reset the shared counters in place for the next record.
            for kmer in self.exist:
                self.exist[kmer] = 0
            pos += 1

        return(index, matrix)

In [173]:
# `quality` is passed as a list of labels: the header filter joins its items with '|'
# into one regex alternation, so a bare string would be split into single characters
# ('Pass' -> 'P|a|s|s') and accept almost any quality field.
Freq = Frequency(k = 7, segment = '4', quality = ['Pass'], split = '|', qualpos = 8, segpos = 2, mutfac = 0.2)

In [178]:
# Build the accession index and the L1-normalised k-mer matrix for the records in 'C.fasta'.
index, matrix = Freq.calculateFrequence('C.fasta')

In [116]:
# Demo: expand the ambiguity codes of `seq` into every concrete sequence and
# list all single-base substitutions of each expansion.
candidate_bases = [nucex.get(symbol, symbol) for symbol in seq]
for combo in it.product(*candidate_bases):
    main = ''.join(combo)
    print(f'main: {main}')
    for pos, nuc in enumerate(main):
        prefix, suffix = main[:pos], main[pos+1:]
        for substitute in nucmut[nuc]:
            mutation = prefix + substitute + suffix
            print(f'mutation: {mutation}')

main: AAGT
mutation: CAGT
mutation: GAGT
mutation: TAGT
mutation: ACGT
mutation: AGGT
mutation: ATGT
mutation: AAAT
mutation: AACT
mutation: AATT
mutation: AAGA
mutation: AAGC
mutation: AAGG
main: AGGT
mutation: CGGT
mutation: GGGT
mutation: TGGT
mutation: AAGT
mutation: ACGT
mutation: ATGT
mutation: AGAT
mutation: AGCT
mutation: AGTT
mutation: AGGA
mutation: AGGC
mutation: AGGG


Add the mutation probability, divided by the number of possible mutations, to the vector entry of each mutated k-mer; that is all.

In [120]:
from Bio.Seq import Seq

seq = Seq("CCTCAGCGAGGACAGCAAGGGACTAGCCAGGAGGGAGAACAGAAACTCCAGAACATCTTGGAAATAGCTCCCAGAAAAGCAAGCAGCCAACCAGGCAGGTTCTGTCCCTTTCACTCACTGGCCCAAGGCGCCACATCTCCCTCCAGAAAAGACACCATGAGCACAGAAAGCATGATCCGCGACGTGGAACTGGCAGAAGAGGCACTCCCCCAAAAGATGGGGGGCTTCCAGAACTCCAGGCGGTGCCTATGTCTCAGCCTCTTCTCATTCCTGCTTGTGGCAGGGGCCACCACGCTCTTCTGTCTACTGAACTTCGGGGTGATCGGTCCCCAAAGGGATGAGAAGTTCCCAAATGGCCTCCCTCTCATCAGTTCTATGGCCCAGACCCTCACACTCAGATCATCTTCTCAAAATTCGAGTGACAAGCCTGTAGCCCACGTCGTAGCAAACCACCAAGTGGAGGAGCAGCTGGAGTGGCTGAGCCAGCGCGCCAACGCCCTCCTGGCCAACGGCATGGATCTCAAAGACAACCAACTAGTGGTGCCAGCCGATGGGTTGTACCTTGTCTACTCCCAGGTTCTCTTCAAGGGACAAGGCTGCCCCGACTACGTGCTCCTCACCCACACCGTCAGCCGATTTGCTATCTCATACCAGGAGAAAGTCAACCTCCTCTCTGCCGTCAAGAGCCCCTGCCCCAAGGACACCCCTGAGGGGGCTGAGCTCAAACCCTGGTATGAGCCCATATACCTGGGAGGAGTCTTCCAGCTGGAGAAGGGGGACCAACTCAGCGCTGAGGTCAATCTGCCCAAGTACTTAGACTTTGCGGAGTCCGGGCAGGTCTACTTTGGAGTCATTGCTCTGTGAAGGGAATGGGTGTTCATCCATTCTCTACCCAGCCCCCACTCTGACCCCTTTACTCTGACCCCTTTATTGTCTACTCCTCAGAGCCCCCAGTCTGTATCCTTCTAACTTAGAAAGGGGATTATGGCTCAGGGTCCAACTCTGTGCTCAGAGCTTTCAACAACTACTCAGAAACACAAGATGCTGGGACAGTGACCTGGACTGTGGGCCTCTCATGCACCACCATCAAGGACTCAAATGGGCTTTCCGAATTCACTGGAGCCTCGAATGTCCATTCCTGAGTTCTGCAAAGGGAGAGTGGTCAGGTTGCCTCTGTCTCAGAATGAGGCTGGATAAGATCTCAGGCCTTCCTACCTTCAGACCTTTCCAGATTCTTCCCTGAGGTGCAATGCACAGCCTTCCTCACAGAGCCAGCCCCCCTCTATTTATATTTGCACTTATTATTTATTATTTATTTATTATTTATTTATTTGCTTATGAATGTATTTATTTGGAAGGCCGGGGTGTCCTGGAGGACCCAGTGTGGGAAGCTGTCTTCAGACAGACATGTTTTCTGTGAAAACGGAGCTGAGCTGTCCCCACCTGGCCTCTCTACCTTGTTGCCTCCTCTTTTGCTTATGTTTAAAACAAAATATTTATCTAACCCAATTGTCTTAATAACGCTGATTTGGTGACCAGGCTGTCGCTACATCACTGAACCTCTGCTCCCCACGGGAGCCGTGACTGTAATCGCCCTACGGGTCATTGAGAGAAATAA")


# Scan all six reading frames of `seq` and report every stop-free protein
# stretch of at least `min_pro_len` residues.
table = 1  # NCBI translation table id passed to translate() (1 = standard code)
min_pro_len = 100

for strand, nuc in [(+1, seq), (-1, seq.reverse_complement())]:
    for frame in range(3):
        # Translate whole codons only: a trailing partial codon makes
        # Biopython emit a "partial codon" warning.
        length = 3 * ((len(nuc) - frame) // 3)
        for pro in nuc[frame:frame + length].translate(table).split("*"):
            if len(pro) >= min_pro_len:
                print("%s...%s - length %i, strand %i, frame %i" % (pro[:30], pro[-3:], len(pro), strand, frame))

PQRGQQGTSQEGEQKLQNILEIAPRKASSQ...IAL - length 287, strand 1, frame 0
PLYCLLLRAPSLYPSNLERGLWLRVQLCAQ...ILP - length 106, strand 1, frame 1
PGGRTETPEHLGNSSQKSKQPTRQVLSLSL...PKG - length 104, strand 1, frame 2
GWIRSQAFLPSDLSRFFPEVQCTAFLTEPA...LIW - length 115, strand 1, frame 2
