# Assignment #1
## Greedy Motif Search

In [1]:
from Bio import SeqIO
import numpy as np
import math

In [44]:
def totalcount(x, pseudocount):
    """Return pseudocount-smoothed frequencies of A, T, G and C in ``x``.

    Parameters
    ----------
    x : iterable of single-character symbols
        For example a string, or one column of a k-mer character matrix.
    pseudocount : number
        Value added to each of the four raw nucleotide counts.

    Returns
    -------
    numpy.ndarray
        Four values ordered A, T, G, C, each computed as
        ``(count + pseudocount) / (len(x) + 4 * pseudocount)``.
        Symbols other than A/T/G/C only contribute to ``len(x)``.
    """
    symbols = list(x)
    # Row order A, T, G, C is the order the rest of the file indexes by.
    raw = np.array([symbols.count(base) for base in "ATGC"])
    denominator = len(symbols) + 4 * pseudocount
    return (raw + pseudocount) / denominator


def profile(DNA, k, pos, log_odd=True):
    """Build a 4 x k position weight matrix from one k-mer per sequence.

    Parameters
    ----------
    DNA : indexable collection of strings
        The input sequences.
    k : int
        Width of the motif window.
    pos : indexable collection of int
        ``pos[i]`` is the start of the k-mer taken from ``DNA[i]``.
    log_odd : bool, default True
        When equal to ``True`` the matrix is returned as base-4 logarithms of
        ``4 * frequency``; otherwise the smoothed frequencies are returned.

    Returns
    -------
    numpy.ndarray of shape (4, k)
        Rows follow the A, T, G, C order produced by ``totalcount``.
    """
    n_seqs = len(DNA)
    kmers = np.empty((n_seqs, k), dtype=str)  # one k-mer per row, one letter per cell
    for row in range(n_seqs):
        start = pos[row]
        kmers[row, :] = list(DNA[row][start:start + k])

    # Column-wise smoothed frequencies (pseudocount of 1 per nucleotide).
    weights = np.empty((4, k))
    for col in range(k):
        weights[:, col] = totalcount(kmers[:, col], pseudocount=1)

    if log_odd == True:  # noqa: E712 - exact comparison kept on purpose
        return np.log(4 * weights) / np.log(4)
    return weights

def llr(profile, s):
    """Score ``s`` against ``profile`` by summing one matrix entry per letter.

    Parameters
    ----------
    profile : 2-D array indexable as ``profile[row, column]``
        Rows are looked up in the order A, T, G, C.
    s : str
        Sequence to score; each character must be one of A, T, G, C
        (any other character raises ``KeyError``).

    Returns
    -------
    The sum of ``profile[row(s[i]), i]`` over all positions ``i`` of ``s``
    (``0`` for an empty string).
    """
    row_of = {'A': 0, 'T': 1, 'G': 2, 'C': 3}
    total = 0
    for col, base in enumerate(s):
        total += profile[row_of[base], col]
    return total

def motifpos(profile, DNA):
    """Find, for every sequence, the start of its best-scoring k-mer.

    Every window of width ``k = profile.shape[1]`` in each sequence is scored
    with ``llr``; the start of the first window reaching the highest score is
    recorded.

    Parameters
    ----------
    profile : numpy.ndarray with 4 rows and k columns
        Scoring matrix passed through to ``llr``.
    DNA : sequence of str
        The sequences to scan.

    Returns
    -------
    numpy.ndarray of int, length ``len(DNA)``
        Best window start per sequence.

    Raises
    ------
    ValueError
        If any sequence is shorter than ``k`` and therefore has no window.
    """
    k = profile.shape[1]

    # A sequence shorter than k has no window at all; previously its slot was
    # left as whatever np.empty happened to contain.
    too_short = [i for i, seq in enumerate(DNA) if len(seq) < k]
    if too_short:
        raise ValueError(
            f"sequence(s) at index {too_short} are shorter than the motif width k={k}"
        )

    # Zero-initialised so a slot is well defined even if no score ever
    # compares greater than -inf (e.g. all-NaN scores).
    pos = np.zeros(len(DNA), dtype=int)
    for i, seq in enumerate(DNA):
        maxllr = -math.inf
        for j in range(len(seq) - k + 1):
            llrval = llr(profile, seq[j:j + k])
            if llrval > maxllr:  # strict '>' keeps the earliest best window
                maxllr = llrval
                pos[i] = j
    return pos

def GreedySearch(DNA, k):
    """Iteratively refine one motif start per sequence until it stabilises.

    Starting from random window starts, the function alternates between
    building a profile from the current windows (``profile``) and re-picking
    the best-scoring window in every sequence (``motifpos``).

    Parameters
    ----------
    DNA : sequence of str
        The input sequences.
    k : int
        Motif width.

    Returns
    -------
    numpy.ndarray of int
        One window start per sequence.

    Raises
    ------
    ValueError
        If any sequence is shorter than ``k``.
    """
    for i, seq in enumerate(DNA):
        if len(seq) < k:
            raise ValueError(
                f"sequence {i} has length {len(seq)}, shorter than the motif width k={k}"
            )

    # randint's upper bound is exclusive: the '+ 1' makes the last valid start
    # (len - k) reachable and lets sequences of length exactly k work.
    pos = np.array(
        [np.random.randint(0, len(seq) - k + 1) for seq in DNA],
        dtype=int,
    )

    # Remember every visited position vector. Stopping on any repeat covers
    # the normal fixed point (new == previous) and also guarantees termination
    # should the updates ever oscillate between several states.
    seen = {tuple(pos.tolist())}
    while True:
        pwm = profile(DNA, k, pos)
        pos = motifpos(pwm, DNA)
        state = tuple(pos.tolist())
        if state in seen:
            break
        seen.add(state)
    return pos

In [76]:
# Load both synthetic data sets as plain strings.
strong = [str(rec.seq) for rec in SeqIO.parse("synth_50_strong.fa", "fasta")]
weak = [str(rec.seq) for rec in SeqIO.parse("synth_50_weak.fa", "fasta")]

# Run the search with the same motif width on each data set.
k = 20
strong_pos = GreedySearch(strong, k)
weak_pos = GreedySearch(weak, k)

# Print the selected k-mer of every sequence, one per line.
print(f"{k}-mers for 'synth_50_strong.fa':")
for seq, start in zip(strong, strong_pos):
    print(seq[start:start + k])

print(f"{k}-mers for 'synth_50_weak.fa':")
for seq, start in zip(weak, weak_pos):
    print(seq[start:start + k])

20-mers for 'synth_50_strong.fa':
ATGATACCAGGGCAAGTTAA
GGCAACAATGGCTACGAGCG
TAGAGCAAATCGTACGAAAT
GTCTACATCGCGAGGGAGAT
ATGATAAGTCTCCGGTATTG
GTGCGTTGCCGCTTGTCTTG
ATGATAACCTACGAATAAAG
GATATACCTCGCAAATATAA
TTTCACTAATCACAATTGTA
GCGATTTATGCGGGCGAAAT
20-mers for 'synth_50_weak.fa':
CTCAAAATCGCTTGAAATAT
AGATTATGCTGTCCGTCAAT
GTGATAGTATCACCCGTCTA
AGGCAGCACCGAGTCAGGAC
ATAGCAAACGTCGTAAGAGT
TTCTTAGCAAGAGATTCTGC
TTGCCGCTCAGTTCGGATCC
TACCTATGCGCACTATTCAT
GGATTGACCACACCGTTCCG
AGCCCGCGCGCGGCGCATGT
