# Week 4

Note that RandomizedMotifSearch may change all t strings Motifs in a single iteration. This strategy may prove reckless, since some correct motifs (captured in Motifs) may potentially be discarded at the next iteration. GibbsSampler is a more cautious iterative algorithm that discards a single k-mer from the current set of motifs at each iteration and decides to either keep it or replace it with a new one. This algorithm thus moves with more caution in the space of all motifs, as illustrated below.

In [1]:
from collections import Counter
import numpy as np
import math


def CalculateProbability(Pattern, k, Profile:dict):
    return math.prod([Profile[Pattern[i]][i] for i in range(k)])


def CreateProfile(DnaList):
    Profile = { base: [ # делаем словарь основание-список вероятностей по позициям для каждой колонки нуклеотидов из списка последовательностей
        Nucleotides.count(base) / len(Nucleotides)
        for Nucleotides in [''.join([Dna[i] for Dna in DnaList]) for i in range(len(DnaList[0]))]
    ] for base in 'ACGT' }
    return Profile 


def CreateProfilePseudocounts(DnaList):
    Profile = { base: [ # делаем словарь основание-список вероятностей по позициям для каждой колонки нуклеотидов из списка последовательностей
        (Nucleotides.count(base)+1) / (len(DnaList[0])+len(DnaList))
        for Nucleotides in [''.join([Dna[i] for Dna in DnaList]) for i in range(len(DnaList[0]))]
    ] for base in 'ACGT' }
    return Profile 


def Score(Motifs):
    motifsList = [
        [Motifs[i][j] for i in range(len(Motifs))] for j in range(len(Motifs[0]))
    ]
    Score = 0
    for column_i in motifsList:
        count = Counter(column_i)
        maxFreq = count.most_common(1)[0][1]
        Score += len(column_i) - maxFreq
    return Score


def ProfileMostProbable(DnaString, k, Profile:dict):

    kMerList = [ DnaString[i:i+k] for i in range(len(DnaString) - k + 1) ]

    mostProbable = kMerList[0]
    probability = CalculateProbability(kMerList[0], k, Profile) # назначаем изначальные данные для первого k-мера

    for kMer in kMerList[1:]:
        probkMer = CalculateProbability(kMer, k, Profile)
        if probkMer > probability:
            probability = probkMer
            mostProbable = kMer

    return mostProbable

In [2]:
Score(['ACCT', 'ATGT', 'GCGT', 'ACGA', 'AGGT'])

5

In [3]:
CreateProfile(
    ['ACCT', 'ATGT', 'GCGT', 'ACGA', 'AGGT']
)

{'A': [0.8, 0.0, 0.0, 0.2],
 'C': [0.0, 0.6, 0.2, 0.0],
 'G': [0.2, 0.2, 0.8, 0.0],
 'T': [0.0, 0.2, 0.0, 0.8]}

In general, we can begin from a collection of randomly chosen k-mers Motifs in Dna, construct Profile(Motifs), and use this profile to generate a new collection of k-mers:

In [4]:
def Consensus(Motifs:list):
    Profile = CreateProfile(Motifs)
    
    return Profile


надо дописать конценсус

In [5]:
from random import choices, randint


def RandomNumber(N):
    return randint(0, N-1) # выводим рандомный номер в диапазоне 0:N


def RandomProbabilities(probabilityList):
    return choices(
        population=range(len(probabilityList)), weights=[prob/sum(probabilityList) for prob in probabilityList], k=1)[0]


print(RandomProbabilities(probabilityList=[0.1, 0.0005, 0.9, 0.001]))

2


In [6]:
RandomNumber(5)

4

Разобраться и переписать Random !

In [7]:
import random

def Random(probabilities):
    summand=sum(probabilities)
    for i in range(len(probabilities)):
        probabilities[i]=probabilities[i]/(summand)
    random_number=random.random()
    counter=0
    for j in range(len(probabilities)):
        if random_number>=counter and random_number<(counter+probabilities[j]):
            return j
        else:
            counter+=probabilities[j]

In [8]:
def GibbsSampler(DnaList, k, t, N):
    Motifs = [  # делаем список случайных мотивов длинной k
        Dna[position:position+k] for Dna in DnaList for position in [RandomNumber(len(DnaList[0])-k+1)]
    ]
    bestMotifs = Motifs[:]
    bestScore = Score(bestMotifs) # назначаем его лучшим и делаем ему счет

    for loop in range(N):
        dnaNumber = RandomNumber(t) # выбираем одну строку из внесенного списка, убираем ее k-мер из рассчета
        Motifs.pop(dnaNumber)
        currentProfile = CreateProfilePseudocounts(Motifs) # считаем профиль из t-1 k-меров
        newIndex = Random(probabilities=[ # считаем вероятность каждого k-мера из удаленной строки
            CalculateProbability(Pattern=DnaList[dnaNumber][s:s+k], k=k, Profile=currentProfile) 
            for s in range(len(DnaList[dnaNumber])-k+1) # на основании полученных вероятностей выдаем случайный взвешенный индекс 
        ])

        Motifs.insert(dnaNumber, DnaList[dnaNumber][newIndex:newIndex+k]) # вставляем на место удаленного k-мера полученный, считаем Score
        currentScore = Score(Motifs)

        if currentScore < bestScore: # делаем все выше случайно, чтобы сохранить возможность выйти из локального оптимума и найти глобальный
            bestMotifs = Motifs[:]
            bestScore = currentScore
        
    return [bestMotifs, bestScore]


def MultipleGibsSampler(DnaList, k, t, N, repeats, print_output=False):
    lastMotifs, lastScore = GibbsSampler( # делаем изначалальный набор мотивов
        DnaList, k, t, N
    ) 
    print("Initial iteration number:", 1)
    print("Initial score is", lastScore)

    for index in range(repeats-1):    # нужное количетво раз итерируемся и находим новый список bestMotifs и его Score
        bestMotifs, bestScore = GibbsSampler(
            DnaList, k, t, N
        )
        if bestScore < lastScore: # если вдруг этот список лучше, записываем его
            lastMotifs = bestMotifs[:]
            print(f"Changing score from {lastScore} to {bestScore}")
            lastScore = bestScore
        print("Completed iteration number", index+2)
    
    print('Complete!')
    if print_output:
        print(*lastMotifs)
        print("Score:", lastScore)
    else:
        return lastMotifs

In [9]:
# import inspect
# print(inspect.getsource(GibbsSampler))

In [10]:
import cProfile, pstats
pr = cProfile.Profile()
pr.enable()

MultipleGibsSampler(
    DnaList=[i.strip() for i in input().split()],
    k=15, t=20, N=2000, repeats=20,
    print_output=True
)

pr.disable()

Initial iteration number: 1
Initial score is 100
Completed iteration number 2
Completed iteration number 3
Changing score from 100 to 87
Completed iteration number 4
Changing score from 87 to 78
Completed iteration number 5
Completed iteration number 6
Changing score from 78 to 64
Completed iteration number 7
Completed iteration number 8
Completed iteration number 9
Completed iteration number 10
Completed iteration number 11
Completed iteration number 12
Completed iteration number 13
Completed iteration number 14
Completed iteration number 15
Completed iteration number 16
Completed iteration number 17
Completed iteration number 18
Completed iteration number 19
Completed iteration number 20
Complete!
TAGACAGCATGGAGT TCCTCGGTATACCGT TCCAGAAGGTACCGT TCCAGAGACAACCGT TCCAGTTCATACCGT ATAAGAGTATACCGT TCCAGAGTATATACT TCCAGAGTATACTTA TCCAGAGTATCTAGT TGGGGAGTATACCGT GCCAGAGTATACCCC TCGGTAGTATACCGT TCCGCTGTATACCGT TCCAGAGTCGGCCGT TCCAGATCGTACCGT TCCAGAGTAGCTCGT GGCAGAGTATACCGA TCCACCATATACCGT TCC

In [11]:
sortby = 'tottime'
ps = pstats.Stats(pr).sort_stats(sortby)
print(ps.print_stats())

         57235065 function calls (57235057 primitive calls) in 78.129 seconds

   Ordered by: internal time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
 12640000   29.671    0.000   29.671    0.000 /var/folders/xf/nzdtc5ts6vb6rqb4szc5hcjc0000gn/T/ipykernel_44456/4167868547.py:7(<listcomp>)
 12640000   13.232    0.000   48.121    0.000 /var/folders/xf/nzdtc5ts6vb6rqb4szc5hcjc0000gn/T/ipykernel_44456/4167868547.py:6(CalculateProbability)
    40000    9.003    0.000   57.123    0.001 /var/folders/xf/nzdtc5ts6vb6rqb4szc5hcjc0000gn/T/ipykernel_44456/4242661832.py:12(<listcomp>)
 12640000    5.218    0.000    5.218    0.000 {built-in method math.prod}
    40000    3.052    0.000    3.212    0.000 /var/folders/xf/nzdtc5ts6vb6rqb4szc5hcjc0000gn/T/ipykernel_44456/3566973495.py:3(Random)
   160000    1.940    0.000    7.136    0.000 /var/folders/xf/nzdtc5ts6vb6rqb4szc5hcjc0000gn/T/ipykernel_44456/4167868547.py:21(<listcomp>)
   160000    1.933    0.000    3.343    0.