# Week 3

## Functions from previous weeks

Find all (k, d)-motifs in a collection of strings

In [4]:
def FrequentWordsWithMismatches(
    Text, k, d, print_output=False
):
    '''
    Find the most frequent k-mers in Text, allowing up to d mismatches.

    Every k-mer window of Text credits one occurrence to each pattern that
    Neighbors() produces for it; the patterns with the highest total win.

    Parameters:
        Text: string to scan.
        k: length of the patterns.
        d: mismatch budget handed to Neighbors().
        print_output: if true, print the patterns space-separated and
            return None; otherwise return them.

    Returns:
        List of the top-scoring patterns (empty when Text contains no
        k-mer window), or None when print_output is true.
    '''
    Patterns = list()
    freqMap = dict()
    n = len(Text)

    for i in range(0, n-k+1):
        Pattern = Text[i:i+k]  # slide a window of length k along Text
        neighborhood = Neighbors(Pattern, d)
        if isinstance(neighborhood, str):
            # A bare string would be walked character by character;
            # treat it as a neighborhood holding that single pattern.
            neighborhood = [neighborhood]

        # Credit every neighbor. Iterating directly (rather than indexing
        # up to len - 1) keeps the last neighbor and also works for sets.
        for neighbor in neighborhood:
            freqMap[neighbor] = freqMap.get(neighbor, 0) + 1

    if freqMap:  # nothing to rank when Text is shorter than k
        maxCount = MaxFreq(freqMap)
        Patterns = [key for key, count in freqMap.items() if count == maxCount]

    if print_output:
        print(*Patterns)
    else:
        return Patterns


def HammingDistance(p, q):
    '''
    Count the positions at which p and q hold different symbols.

    Positions are taken from p, so q must be at least as long as p
    (a shorter q raises IndexError); extra trailing symbols of q are ignored.
    '''
    return sum(1 for position, symbol in enumerate(p) if symbol != q[position])


def Neighbors(Pattern, d):
    '''
    Build the d-neighborhood of Pattern: every string of the same length
    over the alphabet A/T/G/C that differs from Pattern in at most d
    positions.

    Parameters:
        Pattern: the string to mutate.
        d: maximum number of mismatches allowed.

    Returns:
        A list of distinct neighbor strings (Pattern itself included).
        The order is unspecified. A list is returned in every case so that
        callers can iterate or index the result uniformly.
    '''
    if d == 0 or not Pattern:
        # No mismatches allowed (or nothing left to mutate): the only
        # neighbor is Pattern itself. The emptiness check also stops the
        # recursion below from looping forever on an empty string.
        return [Pattern]
    if len(Pattern) == 1:
        return ['A', 'T', 'G', 'C']

    suffix = Pattern[1:]
    Neighborhood = set()

    for Neighbor in Neighbors(suffix, d):
        # Hamming distance between the suffix and its neighbor; both always
        # have the same length here.
        mismatches = sum(a != b for a, b in zip(suffix, Neighbor))
        if mismatches < d:
            # Budget left: the first symbol may be anything.
            for base in 'ATGC':
                Neighborhood.add(base + Neighbor)
        else:
            # Budget spent in the suffix: the first symbol must match.
            Neighborhood.add(Pattern[0] + Neighbor)

    return list(Neighborhood)


def MaxFreq(Dictionary: dict):
    '''
    Return the largest value stored in Dictionary.

    Raises ValueError when Dictionary is empty.
    '''
    return max(Dictionary.values())


def SortPatternsDict(Dictionary):
    '''
    Return a new dictionary with the same items as Dictionary, ordered
    from the largest value to the smallest. Items with equal values keep
    their original relative order.
    '''
    rankedItems = sorted(
        Dictionary.items(),
        key=lambda pair: pair[1],
        reverse=True,
    )
    return dict(rankedItems)
    
    
def Suffix(Pattern):
    '''
    Return Pattern without its first element (empty input stays empty).
    '''
    tail = slice(1, None)
    return Pattern[tail]

## Greedy Algorithms

Greedy algorithms select the “most attractive” alternative at each iteration. For example, a greedy algorithm in chess might attempt to capture an opponent’s most valuable piece at every move.

Greedy algorithms rarely find an optimal solution. Instead, they are often fast heuristics that trade accuracy for speed in order to find an approximate solution.

Nevertheless, for many biological problems that we will study in this book, greedy algorithms will prove quite useful.

In [None]:
# Ask for one whitespace-separated row of probabilities per nucleotide
# (prompted in the order A, C, G, T) and collect the rows into a profile.
Profile = dict(
    (base, list(map(float, input(f"Enter probabilities for {base}").split())))
    for base in 'ACGT'
)
Profile

{'A': [], 'C': [], 'G': [], 'T': []}

**ProfileMostProbable** calculates the probability (via **CalculateProbability**) of every k-mer in a string under the given ACGT profile and returns the k-mer with the highest probability. If several k-mers share the highest probability, it returns the first one.

In [6]:
from math import prod


def ProfileMostProbable(DnaString, k, Profile:dict):
    '''
    Return the k-mer of DnaString that scores highest under Profile.

    Each window of length k is scored with CalculateProbability(); when
    several windows tie for the best score, the leftmost one is returned.
    Raises IndexError when DnaString holds no window of length k.
    '''
    windows = [DnaString[start:start+k] for start in range(len(DnaString) - k + 1)]
    scores = [CalculateProbability(window, k, Profile) for window in windows]

    bestIndex = 0
    for index in range(1, len(scores)):
        # Strict '>' keeps the earliest window on ties.
        if scores[index] > scores[bestIndex]:
            bestIndex = index

    return windows[bestIndex]


def CalculateProbability(Pattern, k, Profile:dict):
    '''
    Multiply, over the first k positions of Pattern, the Profile entry for
    the symbol found at each position (Profile[symbol][position]).
    '''
    factors = []
    for position in range(k):
        symbol = Pattern[position]
        factors.append(Profile[symbol][position])
    return prod(factors)


# Demo: pick the most probable 5-mer of a sample sequence under a
# hand-written profile (one probability per base and position).
ProfileMostProbable(
    DnaString='ACCTGTTTATTGCCTAAGTTCCGAACAAACCCAATATAGCCCGAGGGCCT',
    k=5, Profile={
        'A': [0.2, 0.2, 0.3, 0.2, 0.3],
        'C': [0.4, 0.3, 0.1, 0.5, 0.1],
        'G': [0.3, 0.3, 0.5, 0.2, 0.4],
        'T': [0.1, 0.2, 0.1, 0.1, 0.2]
    })

'CCGAG'

We will also need a **CreateProfile** function to build the ACGT profile for a list of motifs, and a **Score** function to calculate the score of that list.

In [7]:
from collections import Counter
import numpy as np


def CreateProfile(DnaList):
    '''
    Build the profile of a list of equally long strings: for every base in
    'ACGT', a list giving the fraction of strings that carry that base at
    each position. The width is taken from the first string.
    '''
    width = len(DnaList[0])
    # One string per position, holding the symbols of all inputs there.
    columns = [''.join(Dna[position] for Dna in DnaList) for position in range(width)]

    Profile = {}
    for base in 'ACGT':
        Profile[base] = [column.count(base) / len(column) for column in columns]
    return Profile


# def Score(Motifs):
#     Concensus = MedianStringMotif(Motifs, len(Motifs[0]))
#     print("Concensus Motif is:", Concensus)
#     return sum([1 for line in Motifs for i in range(len(line)) if line[i] != Concensus[i] ])


def Score(Motifs):
    '''
    Score a list of equally long motifs: for every position, count the
    motifs whose symbol differs from the most common symbol at that
    position, and add the counts up. Lower is better; 0 means all motifs
    are identical.
    '''
    total = 0
    for position in range(len(Motifs[0])):
        column = [motif[position] for motif in Motifs]
        mostCommonCount = max(Counter(column).values())
        total += len(column) - mostCommonCount
    return total

In [8]:
# Demo: profile of four 4-mers.
CreateProfile(["ACCT", "AGGA", "ACGT", "AGGT"])

{'A': [1.0, 0.0, 0.0, 0.25],
 'C': [0.0, 0.5, 0.25, 0.0],
 'G': [0.0, 0.5, 0.75, 0.0],
 'T': [0.0, 0.0, 0.0, 0.75]}

Now we try **GreedyMotifSearch** — an algorithm that starts with a list containing one k-mer from the first string and builds its profile. It then finds the most probable k-mer in the second string under the current profile, adds that k-mer to the list, rebuilds the profile, finds the most probable k-mer in the third string, and so on.

After processing all strings, it checks whether the score of the current k-mer list is better (lower) than the score of the best k-mer list so far and, if so, replaces the best list. It then repeats the whole cycle with the next k-mer of the first string until every k-mer has been tried.

In [9]:
def GreedyMotifSearch(DnaList, k, t, print_output=False):
    '''
    Greedy motif search.

    For every k-mer of the first string, grow a motif list one string at a
    time: build a profile from the motifs chosen so far (CreateProfile),
    take the most probable k-mer of the next string under that profile
    (ProfileMostProbable), and append it. The list with the lowest Score()
    is kept.

    Parameters:
        DnaList: list of DNA strings.
        k: motif length.
        t: number of strings from DnaList to use (the first t).
        print_output: if true, print the best motifs and their score and
            return None; otherwise return the best motifs.

    Returns:
        The best motif list found, or None when print_output is true.
    '''
    # Baseline: the first k-mer of each string in use. Every candidate set
    # built below holds the first string's k-mer plus one motif for each of
    # strings 1..t-1, so the baseline is cut to the same size; otherwise
    # scores of differently sized sets would be compared.
    BestMotifs = [Dna[0:k] for Dna in DnaList[:max(t, 1)]]
    BestScore = Score(BestMotifs)

    firstString = DnaList[0]
    for start in range(len(firstString) - k + 1):
        Motifs = [firstString[start:start+k]]  # seed with a k-mer of the first string
        for i in range(1, t):
            # Profile of the motifs picked so far decides the pick for string i.
            CurrentProfile = CreateProfile(Motifs)
            Motifs.append(ProfileMostProbable(DnaList[i], k, CurrentProfile))

        CurrentScore = Score(Motifs)
        if CurrentScore < BestScore:  # lower score = more conserved motifs
            BestMotifs = Motifs
            BestScore = CurrentScore

    if print_output:
        print("Best k-mer Motifs are:")
        print(*BestMotifs)
        print("Best Score is:", BestScore)
    else:
        return BestMotifs

In [None]:
# Run the greedy search (k=12, t=25) on whitespace-separated sequences
# read from the prompt, printing the result instead of returning it.
GreedyMotifSearch(
    k=12, t=25, DnaList=[line.strip() for line in input("Enter space-separated sequences:").split()], print_output=True
)

Best k-mer Motifs are:
ATTTCTTCTAAA TTTGCATCTACA TTTACATCTACA GTTTCCTCTACA CTTACGTCTATA GTTCCCTCTAGA TTTCCTTCTAGA ATTCCTTCTAGA TTTTCTTCTAGA GTTTCATCTATA GTTACCTCTAGA GTTACGTCTAGA ATTCCATCTAGA CTTCCGTCTAAA GTTGCATCTAGA TTTACATCTATA GTTACTTCTACA TTTCCATCTACA ATTGCCTCTACA TTTACGTCTAGA TTTGCCTCTAGA TTTCCATCTATA CTTACCTCTAGA ATTACTTCTATA TTTCCGTCTAGA
Best Score is: 61


In contrast to **MedianString**, **GreedyMotifSearch** is fast and can be run with k = 15 to solve the Subtle Motif Problem (recall that we settled for k = 13 in the case of MedianString). However, it trades accuracy for speed and returns, for example, *gtAAAtAgaGatGtG* (total distance: 58), which is very different from the true implanted motif *AAAAAAAAGGGGGGG*.

### Current problem: 
If a nucleotide has probability 0 at some position, the whole motif gets probability 0, even though the probabilities at the other positions can be very good.

In order to improve this unfair scoring, bioinformaticians often substitute zeroes with small numbers called pseudocounts.

In the case of motifs, pseudocounts often amount to adding 1 (or some other small number) to each element of Count(Motifs). 

The argument for pseudocounts is that if we know that both success and failure are possible, then we can treat these events as having already been observed. Thus we can add pseudocounts.

So we would change **CreateProfile function to use Pseudocounts**, and implement it in ***GreedyMotifSearch***

In [10]:
def CreateProfilePseudocounts(DnaList, pseudocount=1):
    '''
    Build the profile of a list of equally long strings with pseudocounts.

    Every base count in every column is increased by `pseudocount` before
    it is turned into a frequency, so no entry of the profile is zero
    (for a positive pseudocount).

    Parameters:
        DnaList: list of equally long strings; the width is taken from the
            first one.
        pseudocount: amount added to each of the four base counts of a
            column (default 1).

    Returns:
        Dict mapping each base of 'ACGT' to its list of per-position
        frequencies; the four frequencies of a column sum to 1.
    '''
    width = len(DnaList[0])
    # One string per position, holding the symbols of all inputs there.
    columns = [''.join(Dna[position] for Dna in DnaList) for position in range(width)]

    # A column holds len(DnaList) observed symbols plus one pseudocount for
    # each of the 4 bases. The denominator depends on the alphabet size,
    # not on the motif length.
    denominator = len(DnaList) + 4 * pseudocount

    Profile = {
        base: [(column.count(base) + pseudocount) / denominator for column in columns]
        for base in 'ACGT'
    }
    return Profile

In [11]:
# Demo: pseudocount profile of a single 4-mer.
CreateProfilePseudocounts(['ACCT'])

{'A': [0.4, 0.2, 0.2, 0.2],
 'C': [0.2, 0.4, 0.4, 0.2],
 'G': [0.2, 0.2, 0.2, 0.2],
 'T': [0.2, 0.2, 0.2, 0.4]}

In [None]:
def GreedyMotifSearchPseudocounts(DnaList, k, t, print_output=False):
    '''
    Greedy motif search using profiles with pseudocounts.

    For every k-mer of the first string, grow a motif list one string at a
    time: build a profile from the motifs chosen so far
    (CreateProfilePseudocounts), take the most probable k-mer of the next
    string under that profile (ProfileMostProbable), and append it. The
    list with the lowest Score() is kept.

    Parameters:
        DnaList: list of DNA strings.
        k: motif length.
        t: number of strings from DnaList to use (the first t).
        print_output: if true, print the best motifs and their score and
            return None; otherwise return the best motifs.

    Returns:
        The best motif list found, or None when print_output is true.
    '''
    # Baseline: the first k-mer of each string in use. Every candidate set
    # built below holds the first string's k-mer plus one motif for each of
    # strings 1..t-1, so the baseline is cut to the same size; otherwise
    # scores of differently sized sets would be compared.
    BestMotifs = [Dna[0:k] for Dna in DnaList[:max(t, 1)]]
    BestScore = Score(BestMotifs)

    firstString = DnaList[0]
    for start in range(len(firstString) - k + 1):
        Motifs = [firstString[start:start+k]]  # seed with a k-mer of the first string
        for i in range(1, t):
            # Profile of the motifs picked so far decides the pick for string i.
            CurrentProfile = CreateProfilePseudocounts(Motifs)
            Motifs.append(ProfileMostProbable(DnaList[i], k, CurrentProfile))

        CurrentScore = Score(Motifs)
        if CurrentScore < BestScore:  # lower score = more conserved motifs
            BestMotifs = Motifs
            BestScore = CurrentScore

    if print_output:
        print("Best k-mer Motifs are:")
        print(*BestMotifs)
        print("Best Score is:", BestScore)
    else:
        return BestMotifs

In [None]:
# Run the pseudocount variant (k=12, t=25) on whitespace-separated
# sequences read from the prompt, printing the result.
GreedyMotifSearchPseudocounts(
    k=12, t=25, print_output=True, DnaList=[ i.strip() for i in input().split()]
)

Best k-mer Motifs are:
ATTTCTTCTAAA TTTGCATCTACA TTTACATCTACA GTTTCCTCTACA CTTACGTCTATA GTTCCCTCTAGA TTTCCTTCTAGA ATTCCTTCTAGA TTTTCTTCTAGA GTTTCATCTATA GTTACCTCTAGA GTTACGTCTAGA ATTCCATCTAGA CTTCCGTCTAAA GTTGCATCTAGA TTTACATCTATA GTTACTTCTACA TTTCCATCTACA ATTGCCTCTACA TTTACGTCTAGA TTTGCCTCTAGA TTTCCATCTATA CTTACCTCTAGA ATTACTTCTATA TTTCCGTCTAGA
Best Score is: 61
