## Helper Utils

In [40]:
import numpy as np
import random
import collections
def Count(Motifs):
    count = {} # initializing the count dictionary
    k = len(Motifs[0])
    count = {'A':[0]*k,'C':[0]*k,'G':[0]*k,'T':[0]*k}
    
    t = len(Motifs)
    for i in range(t):
        for j in range(k):
            symbol = Motifs[i][j]
            count[symbol][j] += 1
    return count

def Profile(Motifs):
    t = len(Motifs)
    k = len(Motifs[0])
    profile = Count(Motifs)
    for i in profile:  
        for j in range(k):
            profile[i][j] = profile[i][j]//t   
    return profile

def calculateProbability(pattern, profile_mat):
    temp_pr = 1.0
    for j in range(len(pattern)):
        if(pattern[j] == "A"):
            temp_pr  = temp_pr * profile_mat ["A"][j]
        elif(pattern[j] == "C"):
            temp_pr  = temp_pr * profile_mat ["C"][j]
        elif(pattern[j] == "G"):
            temp_pr  = temp_pr * profile_mat ["G"][j]
        elif(pattern[j] == "T"):
            temp_pr  = temp_pr * profile_mat ["T"][j]
    return temp_pr

def ProfileMostProbableKmer(text, k, profile):
    max_p = -1
    most_probable_kmer = ''
    for i in range(len(text)-k+1):
        pattern = text[i: i+k]
        p = calculateProbability(pattern, profile)
        if p > max_p:
            max_p = p
            most_probable_kmer = text[i:i+k]            
    return most_probable_kmer

def MotifFromProfile(Dna, k, profile):
    motifs=[]
    for i in range(len(Dna)):
        motifs.append(ProfileMostProbableKmer(Dna[i],k,profile))
    return motifs

def RandomMotifs(Dna, k):
    result = []
    for string in Dna:
        index = np.random.randint(0, len(string) - k)
        result.append(string[index:index + k])
    return result

def Consensus(Motifs):
    k = len(Motifs[0])
    count = Count(Motifs)
    consensus = ""
    for j in range(k):
        m = 0
        frequentSymbol = ""
        for symbol in "ACGT":
            if count[symbol][j] > m:
                m = count[symbol][j]
                frequentSymbol = symbol
        consensus += frequentSymbol
    return consensus

def Score(Motifs):
    # Insert code here
    consensus = Consensus(Motifs) #consensus string
    counts = Count(Motifs) # count matrix
    score = len(Motifs)*len(consensus) 
    i = 0
    for symbol in consensus:  
        score -= counts[symbol][i] 
        i += 1
    return score 

### Randomized Motif Search

In [7]:
def RandomizedMotifSearch(Dna,k, t):
    motifs = RandomMotifs(Dna,k)
    bestMotifs = motifs
    while True:
        profile_mat = Profile(motifs)
        motifs = MotifFromProfile(Dna,k,profile_mat)
        if Score(motifs) < Score(bestMotifs):
            bestMotifs = motifs
        else:
            return bestMotifs

### Randomized Motif Search with Gibbs Sampler

In [1]:
def GibbsSampler(Dna,k,t,N):
    motifs = RandomMotifs(Dna,k)
    bestMotifs = motifs
    for run in range(N):
        i = np.random.randint(t)
        motifs_except_ith = np.delete(motifs,i)
        profile_mat = Profile(motifs_except_ith)
        motifs[i] = ProfileMostProbableKmer(Dna[i],k,profile_mat)
        if Score(motifs) < Score(bestMotifs):
            bestMotifs = motifs
    return bestMotifs

### Test Code for Motif Search

In [22]:
if __name__ == "__main__":
    k,t = [int(a) for a in input().strip().split(" ")]
    Dna = []
    for _ in range(t):
        Dna.append(input())

    N = 1000
    BestMotifs = RandomizedMotifSearch(Dna, k, t)
    for i in range(N):
        m = RandomizedMotifSearch(Dna, k, t)
        if Score(m) < Score(BestMotifs):
            BestMotifs = m

    for i in BestMotifs:
        print(i)

ValueError: invalid literal for int() with base 10: ''

### Generate the k-mer Composition of a String

In [26]:
def KmerComposition(k,Text):
    kmers = []
    for i in range( len(Text) - k + 1):
        kmer = Text[i : i +k]
        kmers.append(kmer)

    return kmers

print(KmerComposition(5, "ABCDEFGHIJK"))

['ABCDE', 'BCDEF', 'CDEFG', 'DEFGH', 'EFGHI', 'FGHIJ', 'GHIJK']


### Reconstruct a String from its Genome Path

In [34]:
def geneReconstruct(kmers):
    text = kmers[0]
    for i in range(1,len(kmers)):
        print(kmers[i][-1:])
        text += kmers[i][-1:]
    return text

geneReconstruct(["ACCGA",
"CCGAA",
"CGAAG",
"GAAGC",
"AAGCT"])

A
G
C
T


'ACCGAAGCT'

### Construct the Overlap Graph of a Collection of k-mers

In [39]:
def get_prefix (pattern):
    return pattern[:len(pattern) - 1]
def get_suffix (pattern):
    return pattern[1:]

def overlapGraph(patterns):
    adj_matrix = [[0 for i in range(len(patterns))] for j in range(len(patterns))]

    for i in range ( len (patterns)):
        for j in range (len (patterns)):
            if i != j and get_suffix(patterns[i]) == get_prefix(patterns[j]) :
                adj_matrix[i][j] = 1

    for m in range (len(adj_matrix)):
        for n in range (len(adj_matrix)) :
            if adj_matrix[m][n] == 1:
                print (patterns[m], " -> ", patterns[n])



overlapGraph(["ATGCG", "GCATG", "CATGC", "AGGCA", "GGCAT"])

GCATG  ->  CATGC
CATGC  ->  ATGCG
AGGCA  ->  GGCAT
GGCAT  ->  GCATG


### Construct the De Bruijn Graph of a String

In [43]:

def constructDeBruijn(k, Text):
    adj_list_pair = {}
    for i in range (len(Text) - k + 1):
        kmer = Text[i : i +k]
        prefix = get_prefix(kmer)
        suffix = get_suffix (kmer)
        if prefix not in adj_list_pair:
            adj_list_pair[prefix] = (suffix,)
        else :
            adj_list_pair[prefix] += (suffix, )
    
    od = collections.OrderedDict(sorted(adj_list_pair.items()))
    for key in od.keys():
        print(str(key) + " -> ")
        if len(adj_list_pair[key]) == 1:
            print(adj_list_pair[key][0] + "\n")
        else:
            adj_list_pair[key] = sorted(adj_list_pair[key])
            print(adj_list_pair[key][0])
            del adj_list_pair[key][0]
            for item in adj_list_pair[key]:
                print("," + str(item))

constructDeBruijn(4,"AAGATTCTCTAC" )
        

AAG -> 
AGA

AGA -> 
GAT

ATT -> 
TTC

CTA -> 
TAC

CTC -> 
TCT

GAT -> 
ATT

TCT -> 
CTA
,CTC
TTC -> 
TCT



### Construct the De Bruijn Graph of a Collection of k-mers

In [62]:
def constructDeBruijnFromKmers (kmers):
    adj_list = {}
    for kmer in kmers:
        suffix = get_suffix(kmer)
        prefix = get_prefix(kmer)
        adj_list[prefix] = []
        adj_list[suffix] = []
    for kmer in kmers:
        suffix = get_suffix(kmer)
        prefix = get_prefix(kmer)
        adj_list[prefix].append(suffix)
    return adj_list

constructDeBruijnFromKmers(["GAGG", "CAGG","GGGG","GGGA","CAGG","AGGG","GGAG"])

{'GAG': ['AGG'],
 'AGG': ['GGG'],
 'CAG': ['AGG', 'AGG'],
 'GGG': ['GGG', 'GGA'],
 'GGA': ['GAG']}