## Task Name: Trie Construction


In [65]:


def buildTrie(patterns):
    """Build a trie over ``patterns`` as a dict-of-dicts adjacency map.

    Nodes are integer ids; node 0 is the root and new ids are handed out in
    the order nodes are first created. ``trie[node]`` maps an edge symbol to
    the id of the child reached through that symbol.

    Args:
        patterns: iterable of strings to insert.

    Returns:
        dict mapping every node id to its ``{symbol: child_id}`` dict
        (leaves map to an empty dict).
    """
    trie = {0: {}}
    next_id = 1
    for pattern in patterns:
        node = 0  # every pattern is threaded from the root
        for symbol in pattern:
            children = trie[node]
            child = children.get(symbol)
            if child is None:
                # No edge for this symbol yet: allocate a fresh node.
                child = next_id
                children[symbol] = child
                trie[child] = {}
                next_id += 1
            node = child
    return trie




In [66]:
# Example: build a trie from three patterns (node 0 is the root).
buildTrie(["ATAGA",
"ATC",
"GAT"])

{0: {'A': 1, 'G': 7},
 1: {'T': 2},
 2: {'A': 3, 'C': 6},
 3: {'G': 4},
 4: {'A': 5},
 5: {},
 6: {},
 7: {'A': 8},
 8: {'T': 9},
 9: {}}

## Task Name : Implement Randomized Motif Search

In [67]:
import numpy as np
def random_string_generator(DNA, k):
    """Pick one k-mer from each string in ``DNA`` at a random position.

    Args:
        DNA: iterable of strings to sample from.
        k: length of the k-mer to extract from each string.

    Returns:
        list with one randomly positioned k-mer per input string, in the
        same order as ``DNA``.

    Raises:
        ValueError: if any string is shorter than ``k``.
    """
    random_motifs = []
    for d in DNA:
        last_start = len(d) - k
        if last_start < 0:
            raise ValueError(
                "cannot draw a %d-mer from a string of length %d" % (k, len(d))
            )
        # randint's upper bound is exclusive; the +1 makes the final window
        # (starting at len(d) - k) selectable and allows len(d) == k.
        rand_idx = np.random.randint(0, last_start + 1)
        random_motifs.append(d[rand_idx:rand_idx + k])
    return random_motifs
def CountMatrix(motifs):
    """Count, per column, how often each nucleotide occurs in ``motifs``.

    The column count is taken from the first motif.

    Args:
        motifs: non-empty sequence of strings over the alphabet A/G/T/C.

    Returns:
        dict with keys "A", "G", "T", "C"; each value is a list whose j-th
        entry is the number of motifs having that symbol at position j.
    """
    width = len(motifs[0])
    counts = {base: [0] * width for base in ("A", "G", "T", "C")}
    for motif in motifs:
        for col in range(width):
            counts[motif[col]][col] += 1
    return counts

def Profile(motifs):
    """Turn the column counts of ``motifs`` into per-column frequencies.

    Each count from ``CountMatrix`` is divided by the number of motifs; no
    pseudocounts are added, so unseen symbols get frequency 0.0.

    Args:
        motifs: non-empty sequence of strings over A/G/T/C.

    Returns:
        dict keyed by nucleotide, each value a list of floats (one per
        column).
    """
    counts = CountMatrix(motifs)
    total = float(len(motifs))
    width = len(motifs[0])
    return {
        base: [float(row[col]) / total for col in range(width)]
        for base, row in counts.items()
    }

def calculateProbability(p, profile):
    """Return the probability of string ``p`` under ``profile``.

    Args:
        p: string whose symbols are keys of ``profile``.
        profile: mapping symbol -> list of per-position probabilities.

    Returns:
        Product of ``profile[symbol][position]`` over all positions of
        ``p``; 1.0 for an empty string.
    """
    likelihood = 1.0
    for position, symbol in enumerate(p):
        likelihood *= profile[symbol][position]
    return likelihood

def profileMostProbableKmer(s, k, profile):
    """Find the k-mer of ``s`` with the highest probability under ``profile``.

    When several windows share the highest probability, the one occurring
    last in ``s`` is returned.

    Args:
        s: string to scan.
        k: window length.
        profile: mapping symbol -> list of per-position probabilities.

    Returns:
        The winning k-mer, or "" when ``s`` is shorter than ``k``.
    """
    winner, top = "", -1.0
    for start in range(len(s) - k + 1):
        window = s[start:start + k]
        likelihood = calculateProbability(window, profile)
        if likelihood >= top:  # ">=" lets later windows win ties
            winner, top = window, likelihood
    return winner

def Consensus(motifs):
    """Return the consensus string of ``motifs``.

    For each column, the symbol with the highest frequency in
    ``Profile(motifs)`` is chosen; on ties the symbol that comes first in
    the profile's key order wins.

    Args:
        motifs: non-empty sequence of strings over A/G/T/C.

    Returns:
        String with one symbol per column of the first motif.
    """
    profile = Profile(motifs)
    width = len(motifs[0])

    def dominant(col):
        # max() keeps the first maximal key, matching a strict ">" scan.
        return max(profile, key=lambda base: profile[base][col])

    return "".join(dominant(col) for col in range(width))

def Score(motifs):
    """Score ``motifs`` by their total mismatches against the consensus.

    Starts from ``len(consensus) * len(motifs)`` and subtracts, per column,
    the count of the consensus symbol in that column.

    Args:
        motifs: non-empty sequence of strings over A/G/T/C.

    Returns:
        Integer score; lower means the motifs agree more closely.
    """
    consensus = Consensus(motifs)
    counts = CountMatrix(motifs)
    matches = sum(
        counts[symbol][col] for col, symbol in enumerate(consensus)
    )
    return len(consensus) * len(motifs) - matches

def randomizedMotifSearch(k, t, DNA):
    """Run one pass of randomized motif search over ``DNA``.

    Starts from randomly chosen k-mers, then repeatedly builds a profile
    from the current motifs and replaces each motif with the
    profile-most-probable k-mer of its string. Stops as soon as a round
    fails to lower the score.

    Args:
        k: motif length.
        t: number of strings from ``DNA`` to draw motifs from each round.
        DNA: sequence of strings to search.

    Returns:
        The best-scoring list of motifs seen before the first
        non-improving round.
    """
    current = random_string_generator(DNA, k)
    BestMotifs = current

    while True:
        profile = Profile(current)
        current = [
            profileMostProbableKmer(DNA[row], k, profile) for row in range(t)
        ]
        if not Score(current) < Score(BestMotifs):
            # No strict improvement: keep what we had and stop.
            return BestMotifs
        BestMotifs = current

In [68]:
# Example run with k=8 on five DNA strings. The starting motifs are drawn
# at random, so the returned motifs can differ from run to run.
randomizedMotifSearch(8, 5,
["CGCCCCTCTCGGGGGTGTTCAGTAAACGGCCA",
"GGGCGAGGTATGTGTAAGTGCCAAGGTGCCAG",
"TAGTACCGAGACCGAAAGAAGTATACAGGCGT",
"TAGATCAAGTTTCAGGTGCACGTCGGTGAACC",
"AATCCACCAGCTCCACGTGCAATGTTGGCCTA"])

['CTCTCGGG', 'GGCGAGGT', 'CGAGACCG', 'CACGTCGG', 'CTCCACGT']