# <u> BA1A - Compute the Number of Times a Pattern Appears in a Text </u>

In [None]:
def pattern_count(text, pattern):
  count, n, k = 0, len(text), len(pattern)

  for i in range(n-k+1):
    if text[i:i+k] == pattern:
      count += 1
  
  return count


# ===================== MORE PYTHONIC ==========================
def pattern_count_short(text, pattern):
  return sum(1 for i in range(len(text)- len(pattern)+1) if text[i: i+len(pattern)] == pattern)


In [None]:
# DRIVER CODE
text_judge = "CTTTCACGTTCTCACGTTATCACGTTTGTCACGTTTCACGTTTGGTCACGTTGCTCATCACGTTCGGTACGACACCCTCACGTTGGAAGTAGATCACGTTATCACGTTTTCACGTTTCTCACGTTTTCACGTTTACACTCACGTTTAGGACGTCGTCGATTCAAATCACGTTGCTTCGATCACGTTTTTCCGCCACTCACGTTTTCACGTTTCACGTTTGTATGTCACGTTTCACGTTCGCTCACGTTCTCACGTTTCACGTTTCACGTTAGAATCACGTTTCTCGGTTCACGTTCTCACGTTTCACGTTTCACGTTTTCACGTTGTCCTCTGTGCGACGTCTTCACGTTTCACGTTTCACGTTCTTATACCGTCACGTTTCACGTTCCCATCCAAGAATTAAATGGGTGTTGAGCAAGCTCACGTTGATCACGTTCTCACGTTTATCACGTTGCTTCTCTGCCTCACGTTTCACGTTGTCACGTTTCACGTTTCGTCCATCACGTTGATCACGTTCACAGTTCACGTTGACAATCACGTTAACTGCTCACGTTTCACGTTCTCACGTTGTTCACGTTTCGGTCACGTTGTTCTCACGTTTCCCTTCACGTTGTCACGTTTCACGTTTCACGTTTCTCATTCACGTTTCACGTTTCACGTTTCACGTTTCACGTTTCACGTTATCACGTTTTTGATAGTATCACGTTCCGTTCACGTTCTCACGTTTCACGTTGAATCACGTTGTGAATCACGTTACTCACGTTTCAGATATTCACGTTTCACGTTCTCAAATACGCTCACGTTCCTACGTCTCACGTTCAAGCTCACGTTTCACGTTTCACGTTTCACGTTTCACGTTATCACGTTGTCACGTTTCACGTTTTCACGTTCTCACGTTCCGTCACGTTTTTATCTCACGTTGTTCACGTTGTCACGTTTGAATCACGTTTCACGTTTCACGTT"
pattern_judge = "TCACGTTTC"

pattern_count_short(text_judge, pattern_judge)

36

<hr>

# <u> BA1B - Find the Most Frequent Words in a String </u>

In [None]:
def most_freq_kmer(text, k):
  n, mx = len(text), 0
  kmer_freq, most_freq = {}, []
  
  for i in range(n-k+1):
    pattern = text[i:i+k]
    kmer_freq[pattern] = pattern_count(text, pattern)

    mx = max(kmer_freq[pattern], mx)

  for key in kmer_freq:
    if kmer_freq[key] == mx:
      most_freq.append(key)

  
  return ' '.join(most_freq)


# ===================== MORE PYTHONIC ==========================
from collections import Counter

def most_freq_kmer_short(text , k):
  kmer_counter = Counter([text[i:i+k] for i in range(len(text)-k+1)]).most_common()

  return " ".join([element[0]  for element in kmer_counter if element[1] == kmer_counter[0][1]])

In [None]:
# DRIVER CODE
text = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
k = 4

text_judge = "GTACGTTCAAGTACGTTCAACGCTCGAATTCACCGGACCCGTTATACCGCTCGAATCTGAAAATACTGAAAATACTGAAAATATCACCGGACCCGCTCGAATCGCTCGAATCGCTCGAATCGCTCGAATCGTTATACTCACCGGACCCGTTATACCGTTATACTCACCGGACCCTGAAAATACTGAAAATACGCTCGAATGTACGTTCAACGTTATACCGCTCGAATCTGAAAATACGTTATACGTACGTTCAATCACCGGACCTCACCGGACCGTACGTTCAAGTACGTTCAAGTACGTTCAACGCTCGAATCTGAAAATACGCTCGAATCGCTCGAATCTGAAAATACTGAAAATAGTACGTTCAACGTTATACCGCTCGAATCGCTCGAATGTACGTTCAACGTTATACCGTTATACCGCTCGAATTCACCGGACCCGTTATACTCACCGGACCCTGAAAATACGTTATACCGCTCGAATCTGAAAATACTGAAAATATCACCGGACCGTACGTTCAATCACCGGACCTCACCGGACCCTGAAAATACGCTCGAATCGTTATACGTACGTTCAATCACCGGACCGTACGTTCAACTGAAAATACGTTATACGTACGTTCAACGCTCGAATCGTTATACCTGAAAATATCACCGGACCCGTTATACCTGAAAATACGCTCGAATGTACGTTCAAGTACGTTCAAGTACGTTCAATCACCGGACCCGTTATACCGCTCGAATCTGAAAATAGTACGTTCAACGCTCGAATCGTTATACCGCTCGAATTCACCGGACCCGTTATACTCACCGGACCCGCTCGAATTCACCGGACCTCACCGGACCCGCTCGAATGTACGTTCAACGCTCGAATCGTTATACTCACCGGACCTCACCGGACCGTACGTTCAACGCTCGAATGTACGTTCAAGTACGTTCAATCACCGGACCCGCTCGAATCGTTATAC"
k_judge = 13

most_freq_kmer_short(text_judge , k_judge)

'CGTTATACCGCTC GTTATACCGCTCG TTATACCGCTCGA TATACCGCTCGAA ATACCGCTCGAAT'

<hr>

# <u> B1AC - Find the Reverse Complement of a String </u>

In [42]:
def reverse_complement(pattern):
  complement_map = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}

  return ''.join([complement_map[letter] for letter in pattern][::-1])


In [43]:
# DRIVER CODE
pattern = "AAAACCCGGT"

pattern_judge = "CACGTTTACATTACGACTAGTTGTCAAGGGTGGGACGAATAGCGCATTAGTAAAGCGGGCTTACCGTAAGGGAGGAGCGCATGATACGCTACGAGCATTTAGTAATACCGTAGCTTGGGGGAGCGGAACCACGTAATACCAGTAGACCAAAACAGCGATTCCTCATCGGGTGATCGCGCTGCTAATTCATGAGCACAGATGGATTAATGCCTGCGGCGCGGTTCTAAGACAATGAAGATCTATCTTCTGTGTACATTTAAGGTATAGAACCTCTTCGTCAATGGCGGGACCGCCAGTGCTCTAAACTTTGGATGTGTCGGGAATAGTATGCGAATTTTCAGTGACATTGTGTCCCGCTGATGACCTTTTTGCGCGGTTTGTAGACCATCGGCTTATCGGGCGTGGTAAGGGTCCCTCTGGCTAAATCTATCGGATCTCTGCCAAACATTGGGGCCCGAGGAATACGGCGATCGTCTGCTACGTACCTGTTCGATAACATAGAACTTTCTCCTGAATGTGGAACAAGGTAACGGTAATTAAACTGCCTCGGAACCCTAGGGGGCGTATTGACAGTCGGGGTCTAGCATGGCCATCTAACGCGAACAGCGCGGAGAACAAGTGGAGTACCATCACATCAGCCTACGGGAGGACTATGCGCGGAGAGGTCAAGACCGTAGGTCGGATAGTGCGTTTTGGCTCCGACCTTCAAGAGTGCAAATACCAGGCGTGCGGCCGCAAGAGGTTTTTCTGGAATTCTAGTTTATCGGCAGGGTGCATCTGATTCTCTACTTCACTTCCTCCTTGGTCATGCCGCTCTTACGCAAAGTCCCGCTTAGCAAGGACCAGCTGCACTGTCATTCTATCTCGTGGTCCTCGGCTACTAATGAGTCCACCCACATCTCAATCTCCAGTGCATACTGCCACTATAATAAGTTCCTTAGTTATACAGCTAAGTTCCCGTTAGTAGTAATTCCTTTACGCTCTTGTGATACTCTTTACGACTTTGCGTCGGAACCACTCCTACAACTATGCGTATCTAGTTGTCGTGTGGGGATATCCAGAACATGATCTCGGAAGCTAAGGTTGGCTGCAATATCGGAACGCTACAAGTATTTCCCTTCCCTACGCGCAGGTTAGGTCAGAGATCGCCTCCGGGATTCGTCATTTACGGTAAATCACCAGGTAAAGGGTGAGTAATTGTCCCAGGTCAAGAGCAGGGTTTACAAGCCTCAGAGACAATACTTATGACCCACGGGAGTCATGTTCCTGCTCTCCCTATTTGGTGGTTCAAGTCGTGGTCTCCAATGCCCGTACACTGGACTCATCCCCTCAGAATTGCATTCTACGTGCGCTCTGACAGGTCAATTAACTTACTTAGAGCGCGAAGTCTAAAAATATGTAAGATCCAGTGTCCTCGTACGTTGCGACTCCGCGTAGATCGCTATTGTCGACTCTGTCGCTAGCTACGTATCTACGGATCCACAACGATTGAGATAAAAGTGATAAACCCTCGGGCGGGCTCGTAGACCTTGAATATTGATGTGAGATATTGCACGCTGAGGCTGAAACTTGTAGCTACCTACGGGTCCTATTGAAAGAACTTTTTGCTAAAGTAAATTTCAGCATGCACGGTCTCGACCTCGGTAGCCCCAAACGCTCCTAGGTTGTTAAGCCAATCTTTTTTTTTACACCTTGGGTACTTCTTCGGGGTACTTGTACCGTCTCGGCGCTTTGGCCTGCATACCAATTCACTACTGGGCAAACTGACTTTGATTATCGCAGACCGGAAGCGTCGAGATTGCTTCTATTTATGTGACTGGCGGAGCTTCGAATACGTCTAGAGCATAAATAGCCGGTCTGGATAGGTTGCGTCTACAACATCCACCAGGGTATAGTCCCGCTCGCAAATGCGCCTATGTCATCCCGCTGAGGTGGCAAGGTACATAAGAGTGTGTTGTACCAAAACGTTTAAAATTGATCATCCGCGGATGGGGGCCATAACTCCGATGATCCCTAGCCGAGTTGAAATTCACATTGTGAATCGGCGACCAGCCGTCTTCAATCTTTCGATCCGGGCGAGTCCACGCTCTATCCCTATTAGAATCTAGCTCTAATTAACAGTGTACGTAGAGGGCCGCCGAAGAGTTCATTCTAACACTGAGTCTCCTTTATGTTCGTGGTATCCTGGGGTGCACGTCAATCCATTAGAGTCCGGGTGGAGATATCGGATCTTTCTACGGTCGAGATGGGCGCTTGCATGACGCCGCTAAAGTTTGGCTTTGACGGTAAGTATCCAAATAAGAATAGCTTGATTGTTTTGCACTTAGTTTGGTTATGTTTCGAGATATAGGTCGTACGACTTTGACCTGTTGTCCGATGGCTGGTCCCTATTTGGATTGGCGGCCAATGTGAGGGGTGATTAGTCACCTCTCCGATCGGGTGTCAGGCTTAGTATAGATCATCTCGTTCCCTTTGAAAATATGCAAGTAAGATAGGTCGGTGAGCTCTGGACGGGGTGGTCCTCTATTGTTGCAATACAATGCCAATTGGTGTCTTTAGTCACTCTAATTACAGACTTCACTCGATATCGAAGGCAGGAAACGTTTCTAGTCGTCAGGGCCGTCAGGACGAGGCAGCGTCGCGTCAGTCGACTTTATGTTTACTCTGTGAGGCCGAGCATAGGCTAATTGACTTCGAGGCTTCGGTCGTTAAGTATTCTTGGACTGCGGGTTCAATTAAAGAAGGCAAGGTAACGCATCGCCTCAACAAAGTGTTAGTGCCGTTCGTCATAACGCTCGGCAAAATAAAGATCACAGATTATGCGCCAGATTGCACTCATAGCCTAGCACGACTTGAGGCGACTACCGAAGTGGTCCCGAAAGAACCCTATATCACCAGAACTAACTTCAGTCTTGATTCTCTGTATGTACTCACGCGGCGTTCGTCATATCCAATGTAGTCTTAATCTGCTGAGCCGTGGGTGAAAAGAGACAGATCTCCCCACGATTCGTACTCTCAGGTATCCCGGCCCTTGACATTATAAGGGCGAGTGCGTCGCCGTGTGCTTGCGGCCCTTCTCGTAGTCTTTGAGCAACTTGCAGGCACTAGGGATGTTGTTAATGGTTGAGCATAACGCGCACTTCACGGCCCGCATGTGGTCGTAGTCCGTGTTCATCCGGTACCGCGATTGTATTACAGAGATGACACTTGTCCGCCACAGCCACTTGAGTGGGTATGATTAAAGCTCAGGCTTTTCTGACGCGCGGGTTATGTCAACCGAGTAGGGCTATATGAAGGCTACTTATAATACAAATTCGTCACATGCCCGGTCTTCATGTGTAATACCATCAAGCATGCCGACCACCATGCCCGGTTAGCTTTGCAAGGCGACAAGGGCTACAAAGATACACCATACTATCGGTCAGCGTTACGCGTATCGGACACAATACGACTGTACAGGGTATAATATCGTCCAGAGGCCCGCGACCAATCCTGGTGAGCCAGTTCTCCTAAGAATTAAGGTGGGCAGCAGACCTTTAGGCAAACTAGAGGCCATGCGGATTGATCTTAAGTCTGGCGAACCAGAACAGCTGCAAATAAATTGCAACATTTACTCAGGAGCTAGATGACCACTCCGACATACTATTAAGGGCCTTTTCCAGTCTCCAGCGCCGAAACTAGGCTATTATGGAAGACCCACATCTTGTATTAGTAGAGGAGCTGCAATATCGTTTGAGCGCTTCTGCACGGGCACAGAAAGATCACTCTTCAGCGTGACGCGATGGGCAGAAGGAATCTGCTCTGTTGCGCTGGAGCAGGAGGCCTTCCTCGGCACGACGATTTCCGATATGTTTAGGAAAACCGCAGGCGGAGTAATCATTATGACCTCGTGTACGTCTTGCACTGTCCAGCCGTGACCAAGATTTTGTGATGTTAACGCCGTTTTTCTAGCCGCCTAGTAAGAACGAAGTACGTAGACAGATCCTAGTCCACAGTTAGGGAGCAACTCAACCAAACTGCAACAACATTAGCCTATACGCCCTTGGACTCCAACACCTCGCAACGATATTCGCTACCCTTCAGTATCCGGTCGGCGTCGTCCCCGCCCAATAACGTCGGAGCTAGTTTACACATTCACGAGTATGGATAACTCGTGTGCTATGGAATCTACTATTAGAGGATCCTGCGATAAAGTCTGGGGCTAATCACAGGAATTCGCTTGCGCATTGCGATGCTCTGCGTCTGACGCGCTGCATGGTGGTCGTTTACGAACAAGAGTCGCGGGAATCCTGCATAAGACATGTCAAAGGTACCAGCCTCACCCTGGAATATAAGGATTCTCGTGGAGAATCAAACTGGAAGAGAATCATATTAGAGGGGGGGGCAGAACGAACGACAATAAGGGACCCGCGATGAGATCACTCGCAGTTTGGGGGGAACATTCGGCGAAATAGCATAAGGCCGAACTCCCTGGTAAATATAGTAAAACGTGCAATTACCTTAAATTCCTGTGTAGCGTTGCGAGCACACGGCTACATCGCCACGTCGGACCGTAAATCACGTGGGCAGGAGGACGGGACTTATTGCAGAAAGCTAGATAACCACTGGAGATACCGGTGCTCATCCGCCTCCTCAGTCTCTGCCAACTCATACACCGAAACGACTGCAAACGGTGCGTTGCTGCCAGTTACTCTAATGTTGATAGAGCAAATGTTCCGGATGGCCACAGGTAAAATGCTCAGAGCCTGTCCGATTGTTCCTGGATGGATGTTGATATTCAGACGTGTCGAACGGGCAATCGGTCTGAAGGTAAAATTTCCGAGTATGACTTAAGTTAAACCCATGACGAAGCCTCGTCTATTACACCTCCATAGACCTAGGCTATAGGCGCTTGAAACCGTATCCAACAGGCGCCATCTGGCTAACTCAAGAGAAAGTACAGAATCATGTGGGAAGTCTTAAGGAGTAGTAGAGTACCACATCCACATCTGTGAACGTCGCACTTAGCCCCGGGAATACGCAGCTAATTACGGTCCCCTGACTCTAGTATTGTCTTTGACATTGACGCGTCTGCTGTCAATTAGTTGACAGGAAGTATGGCCCAGCGTGGTCATAGGGCCTTGGAGCGTTAAGCTCCCGAAATTCGGGTTAGTACGTAAACCAAGATATCCCCATGCGTTTGAGCTACTAGTCCTGCAACAAGTCCATATAAAATGTGATATTTCTGAGTCGAAACCTCGAAAAACAGTCCTGCGACCCGCATGAGGTGGAGTTGCCTTGCGCTACGAGGGGCAAGACATGTTGTATGGTAGCTCAGGCTAACCAAAAGTACGTTACCGAGGGCGCCTATCTCGGAATGGTAAAGCCCTTATTACGATTGGCCCGATCATAGGCTGTCTGACGCCTTTGGGTTGAGCAATGTGTATCCGCCAATAACTAAGCTCATTGGCAGATCCGCCCCGGTGCCCCGCCGTGATTCAAACTTTTCCTCGTCTTGCGAATTTGTGTCACACGTACGGCTACTCCCTCAATGTCATGTCTGTAACTGAAGCATTTCAGCTACACACAAATCTCACCCCGAAACGCAAACCGATACCGCTCAGTCGAGAATCCAATCGCTTAGTGAAGAAACAACTGAGTTCACCGGGAGATTTGTTTTGATCTGATAGTTCATGCTGTTTCCTTCGTGGCGGCTGCACCACCGTAGACCGTTGGAAAACTCTCTGCAGGGTATGCGGTTAGTTTAAAAGGACGGAAGAAGTGTCTTACATACACGAATTAAACTTTATTTGGTTCTACAATGGGTCCCTTGCCACCGGACAACATCCCATTCATCGGCCCATACTGGTCGAACGGCTACTGCGAAGAGAAGAACATGTGTCTGCCTCGTGGTATTGCGCCTTAGATTTACCAACCGCGTATTGACCTCTCAGACCTAATAATAATTTTCCCATAAAGGACACGGTCCCGTTGTGCGGCCCTCACTAATAATGCGGACCATGTATCTACGCTTCGGAAACTGGCTGGTTGGCCCACCATCAGGAATCATCCAAGTGGGGGGTTCCTCTGGCCGGCCGCCTGTTACCGAAAACGACTTTTGCGGGCACTAATGGTTAAAGAATCACCTAATTGATCGATGTTATCAATTGTCTCTATGAGGTGGGCTGCCCGTTACTCACGGAAGCAGACTCTATCGTAGGACAGCGGATAGTTAAAGCGCCTTTGTTGTTGAACACTGTGTCCAGAAAACGTTCAACTGGGATTGAGTTACAGCACCATCGGCCAAGCGACTTGCTCTTCTATACACGACCCAGGCGGTAGTCTGTGCTGATCGGGGAGGCTTACGACTCGTTGCGGGGTTATTAAGGATTGAAAGTTAGCTACACAGCGCAGCGGTCAGGGAAGTTACGTTCGAATACTGCTGCACCTGGCTTGATAGCAGGTTAAGATGTCGGCCATCCCATTCGATGCTAGGGGAAATTTGGCTGACTTCCGCGATGCCGCACCTCTAAGTTAACGATGTTCACCGTGTGAAGTCGGGGGGTCACCATAGTCAGACGAAACTCATCCTTGCAGCATATCAATCGGTGGGACAGCTGGCGTTGCTTTCGATTTCCGTGGCCGCCGAGCGCTGAATGCTCTAAAGCAAAGTCACCATTATTCGTAAACGAGGCGATGGTAGAGCGCCTGAAGTATTAGCCGGACTCATATAATAGGCCAGGCATAGAGTTTACTCCATGGTGACTATGGAGCCAAGGTAGGCGAGCTTTACGCCAGACATAAAGAACCCAGAAACTGCGGACACTCTAAAAGGGTAGACGTTTGTTCTTCTGCGCTAAATTTCCCCGGCATATGGGGATTCCCTGAGATATAACGCAGAGGGATTTGGAGAGAGGATCGGAATCCGCGAGTGTGAGATTTTCTATATGCACTCAAACCTGTCCCGGCCAGTACACGTACATGGATGATTACGATCCGAGGCCACACTGTTCTACCAGTACGAGAACCCTTTCCGCTCACAGCGCAATTATCATCGTAGCACGTTAGAGTATACGGTGGGTTGTGGTGACTACAAGCGATAAAAATTGAGTTTCCCTGTCTGGGCCGGGGCTCCACAAATTCCTGAGAACTCGCAAGCACGGGCACCTGATGTGCGCGGGGGTCGCGGCCTTTCGCGCCTTTGCCCCAGCGTCGCGGAAAGTCCAGACCAGTGCTCGACCTGGGAACTAGCCGCTTGTCAGCTCTCCAAATTTTAGGGAGGGCAAGTCCGGCTTCTGCCTCTCTGATAGCCCCATGCCGAAGGCTCGGCAGACCCCGTCCTAGCCGGGATAAAACATCTGCGATGAGGCATGACCTAAAGGAAATCCGTTATAACGTTGATCCCCCTATGAAATGTTCCTTCATAAACCAGGAGCTAATCGCGAAAGTGCGGATTATGTCTGTGCAATGGCTGACGGTATCGGAGTTGACACGCTCCCGTTGCCGGCTTCCGCGATAAGGTGGGAAGGCGTATCAAATACGTTAATGACTTGACCGAGAGCGCATGCGCGTTGCTCGGTTCGGACGGGGAGGTAGGAGACAGTTACACTTCCCGAGGGTAATCGACATTACTCCCGGATAGTGCGCGATAACTAGGGCCGAGAACACGTTACTTCTCATCGTGTATTAACACTTAGTGAAGGCTCAACCCATTTAAGCCGCAATCCACAAAAGCCCGGCAGGCAAGATACAACCCTCCGTAACGGGAACACTTACTGGAATTCCGGTCCCTGGCGTTCAATGCCGAGAGTGCTCTGTCAGCTTCAGGCGCGGCGTGTGTAATCCATCTGCAATTACGCGATCTAATCAGTCGGTGTTAGGGTGCGATTAGGGGTGCGGAGATCAAGTCGCCTTCATCAGGCGCACTTAATGGCAGCGGCACTATGAGTGAATGGTTGTGAAAGTCAAGCATTCGCTGCGCGGCGCCCTTAGCGATGCAGTTCAAGAGTTTAGTAGGTACTGGGGCGCAGACC"

reverse_complement(pattern_judge)

'GGTCTGCGCCCCAGTACCTACTAAACTCTTGAACTGCATCGCTAAGGGCGCCGCGCAGCGAATGCTTGACTTTCACAACCATTCACTCATAGTGCCGCTGCCATTAAGTGCGCCTGATGAAGGCGACTTGATCTCCGCACCCCTAATCGCACCCTAACACCGACTGATTAGATCGCGTAATTGCAGATGGATTACACACGCCGCGCCTGAAGCTGACAGAGCACTCTCGGCATTGAACGCCAGGGACCGGAATTCCAGTAAGTGTTCCCGTTACGGAGGGTTGTATCTTGCCTGCCGGGCTTTTGTGGATTGCGGCTTAAATGGGTTGAGCCTTCACTAAGTGTTAATACACGATGAGAAGTAACGTGTTCTCGGCCCTAGTTATCGCGCACTATCCGGGAGTAATGTCGATTACCCTCGGGAAGTGTAACTGTCTCCTACCTCCCCGTCCGAACCGAGCAACGCGCATGCGCTCTCGGTCAAGTCATTAACGTATTTGATACGCCTTCCCACCTTATCGCGGAAGCCGGCAACGGGAGCGTGTCAACTCCGATACCGTCAGCCATTGCACAGACATAATCCGCACTTTCGCGATTAGCTCCTGGTTTATGAAGGAACATTTCATAGGGGGATCAACGTTATAACGGATTTCCTTTAGGTCATGCCTCATCGCAGATGTTTTATCCCGGCTAGGACGGGGTCTGCCGAGCCTTCGGCATGGGGCTATCAGAGAGGCAGAAGCCGGACTTGCCCTCCCTAAAATTTGGAGAGCTGACAAGCGGCTAGTTCCCAGGTCGAGCACTGGTCTGGACTTTCCGCGACGCTGGGGCAAAGGCGCGAAAGGCCGCGACCCCCGCGCACATCAGGTGCCCGTGCTTGCGAGTTCTCAGGAATTTGTGGAGCCCCGGCCCAGACAGGGAAACTCAATTTTTATCGCTTGTAGTCACCACAACCCACCGTATACTCTAACGTGCTACGATGATAATTGCGCTGTGAGCG

<hr>

# <u> B1AD - Find All Occurrences of a Pattern in a String </u>

In [None]:
def all_occurences_pattern(pattern, genome):
  occurrences = []

  for i in range(len(genome)):
    if genome[i : i+ len(pattern)] == pattern:
      occurrences.append(str(i))

  return ' '.join(occurrences)

# ===================== MORE PYTHONIC ==========================

def all_occurences_pattern_short(pattern, genome):
  return ' '.join([str(i) for i in range(len(genome)) if genome[i : i+ len(pattern)] == pattern ])

In [None]:
# DRIVER CODE
pattern = "ATAT"
genome = "GATATATGCATATACTT"

pattern_judge = "TTTGTTCTT"
genome_judge = "ATATTTGTTCCTTTGTTCTGTTTGTTCATGTTTGTTCTTTGTTCTTTGTTCGTTTGTTCTTTGTTCATTTGTTCTTTTGTTCTTTGTTCAATACATTTGTTCATTTGTTCGCCTTTGTTCATTTTGTTCTTTGTTCACTGTTTGTTCAGTTTGTTCATTTGTTCGGTCTTTGTTCATGCAGTTTGTTCGGCTTTGTTCTGACTTTGTTCGTTTGTTCCTTTTGTTCTTATTTGTTCTTTGTTCGGATTTGTTCAGTTTGTTCGTCAATATACGTCGCCCCTGTTTGTTCTTTGTTCGTTTGTTCACTGCAATTTTTGTTCTTTTGTTCTTTGTTCGTTTGTTCTAGTTTGTTCCCGATTATTTGTTCTTTGTTCCATTTGTTCATTTGTTCCTTTGTTCTTTTGTTCTTTGTTCATTTGTTCTTGTAACCCGTTTGTTCGACTTTGTTCCATTTTTGTTCAATAATTTGTTCGTGATTTGTTCTTTGTTCGATATCTTTGTTCTATTTGTTCTTTGTTCTTTGTTCATTTGTTCATTTGTTCCAAACTGTTGACATTTGTTCACCTTTGTTCTTTGTTCTCACATTTTTGTTCTGTTTGTTCTTTGTTTGTTCCGTTTGTTCATTTGTTCGGTTTTGTTCGTTTGTTCAGTTTGTTCCGGGTTTTGTTCGGCTGCTCTTGTCGATATATTTGTTCCTTTGTTCACCTGCTTTGTTCATTCCTTTGTTCCAACATCGGTTTGTTCCATTTGTTCCTGTTCCAGATTTTGTTCTTATTTGTTCATTTGTTCTTTGTTCTTTGTTCTTTGTTCTTTGTTCATCATTTGTTCTTTGTTCTTTGTTCTCCTTTGTTCTTTGTTCTTTGTTCATTTGTTCTTTGTTCTTTGTTCACTTTGTTCTATTTATTTGTTCTTTGTTCTGTTTGTTCTAATAGTTTGTTCAGTTTGTTCGTTTGTTCTTTGTTCATGTTTGTTCTTTGTTCCCGGCTTTGTTCCTTTGTTCGTTTGTTCTCTCCCAAATTTGTTCTGTCTTTGTTCCACGTTTTTTGTTCTTTTGTTCATAAAAGCTTTGTTCCGTTTGTTCTTTGTTCATTTGTTCGCTCCGTTTGTTCATGTTTGTTCTTTGTTCTACTCTTTGTTCAGCATAACTTTGTTCGTAACTAACGTTTGTTCAATTTGTTCGCTTTGTTCTTGATTTGTTCATTTGTTCCGTTTGTTCGGTTTGTTCCGGCCCAATTTGTTCATACATTGTTTTGTTCATTTGTTCTTTTGTTCTCTTTTGTTCTTTTGTTCCTTTGTTCGCGCTTTTGTTCTTTGTTCTTTGTTCTTTGTTCCGTGCCATCTTTGTTCCATTGTACTTTTTGTTCGTTTGTTCTTTGTTCTTTGTTCTTTATTTGTTCTCTTTGTTCAGTTTGTTCATTTGTTCTTTGTTCCACACGGGACTTTGTTCTATTTGTTCATTTGTTCGCGCTTTGTTCGCGTTTGTTCAGTTTGTTCGAACTTTGTTCAAAGATTTGTTCCTTTGTTCGTTTGTTCGCATTTGTTCGTTTGTTCTCTGTTTGTTCATTTGTTCTTTTGTTCTTTGTTCCAACATTTGTTCTTTGTTCACTTTTGTTCTTTGTTCCCTTTTGTTCGACCTTTGTTCGTTTTTGTTCGCTTTGTTCTTTGTTCTTTGTTCTTTGTTCGGTTTGTTCCCCTTTGTTCAACGTTTGTTCATTTGTTCTGTTTTGTTCGTTTTGTTCTTTGTTCCCTTTGTTCTATTTGTTCTAAGCTAAAGGAACTTTGTTCGTCATTCTTTGTTCTTTGTTCATTTGTTCAATTTGTTCTTTGTTCAGAGATTTGTTCTTTGTTCCCTTTGTTCCTCAATTTGTTCCTTTGTTCACATCTTTGTTCTTTTGTTCTTTTTGTTCCTTTGTTCTTTTGTTCAGCTGTTTGTTCTTAATGGCTTTGTTCATTTGTTCGCAAAATTTGTTCTTTTGTTCCTTTGTTCCGCTTTGTTCTTTTTGTTCAGAGTTTGTTCATGCGTTTGTTCCCACCACTTTGTTCTTTGTTCTACCGTTTGTTCTTTGTTCCCTAGCTTTGTTCCAAATTTGTTCTTTGTTCCTTTGTTCATTTGTTCTTTGTTCATTCTTTGTTCGGTTTGTTCTTAGCTCTGAACAAATTTGTTCTTTTGTTCGGTTTGTTCCTTTGTTCTTTGTTCGGTTTGTTCATATTTTGTTCTTTGTTCTAACCTTTGTTCTTCACTTTGTTCCTTTGTTCGAATTTTGTTCTGTACATTGGGTTTGTTCGTCGTACTTTGTTCTTGCATTTGTTCTTTGTTCTTCGTCACTTTGTTCTTTGTTCTTTTGTTCCGTTTTGTTCTATTTTGTTCAGGAACAACTTTGTTCAGGGAGTTTTGTTCGTAACAGAACATGATTTGTTCTTTTGTTCCTTTGTTCTTTGTTCTTTTGTTCCGTACGGTCTTTGTTCAGTATTTTTGTTCCGACTTTGTTCTCTTTGTTCCGCTTTGTTCTTTCTTTGTTCCTTTGTTCTTTGTTCTGCATAGTTTGTTCCTAGTTTGTTCATTTGTTCCACCTTTGTTCCTTTGTTCTTTGTTCGAATTTGTTCAGGTAACTGTGTTTGTTCATTTGTTCGTTTGTTCTTTGTTCTTTGTTCTTTGTTCATTTGTTCTTTGTTCCTTTGTTCCTTTGTTCGAGTTTGTTCGGTCTATTTGTTCTTTGTTCTTTGTTCCACGTTCGTTTTGTTCCTTTTGTTCTTTGTTCTTTGTTCATTTTGTTCTTTGTTCTGTTTGTTCCTTTGTTCTGTTTGTTCTTGGATTTGTTCGCATTTGTTCGTTTGTTCCCATGGTTTGTTCCTTTGTTCTTTGTTCTCGTAATTTTGTTCATGTTTTGTTCGGGACCGTTTGTTCTTTTGTTCTAGTCTACATTTGTTCTCTTCCTTTGTTCTTTGTTCGCAGTTTGTTCCTTTGTTCATTTTGTTCCTAAGTTGCATTTGTTCCTTTGTTCTTTGTTCGGTTTGTTCTTTGTTCGCCTTTGTTCATAATTTGTTCGGGTTTTGTTCTTTTTTGTTCACATTTGTTCTTTGTTCTCTGCATACTTTGTTCGTTATTTGTTCGATTTTGTTCTGTTTGTTCCTTTGTTCTTTGTTCTTTGTTCATTTTGTTCGAACCCTTTTGTTCCTTTGTTCTTTGTTCAATTTGTTCTAATTTGTTCGGTTTGTTCCTTTGTTCTGGTTTGTTCATTTGTTCACTTTGTTCACATTTGTTCAGTCAGTTTGTTCGCTTTGTTCGGTTTGTTCAATTCACTATTTGTTCAGTTTTGTTCTCTCGCGGAATCCAAGTTTGTTCCATTTGTTCTTTGTTCTTTGTTCGTTTGTTCTTTGTTCTTTGTTCGACATTTGTTCCACTTTGTTCTTTGTTCTTTGTTCTGGATTTGTTCTTTGTTCAACTTTTGTTCACACTTTGTTCAATTTGTTCAATTTGTTCTTTGTTCCGTTTGTTCTGCTTTTTGTTCTCATTTGTTCTTTGTTCGTGTGGGTTTGTTCTTTGTTCTTTGTTCACGTTTGTTCATCCTTTGTTCTTTGTTCATTTGTTCGGTTTGTTCTTTGTTCTTTGTTCAAAGATTCGACTGAATATTTGTTCCAACCGCGACTTCTTTTGTTCTTTGTTCTTTTGTTCGTTTGTTCCTCTTTTGTTCAGGTTTGTTCTTTGTTCTTTGTTCGTTTGTTCTTTGTTCTTTGTTCTTTGTTCCGTTTGTTCTTAAACCCTTTGTTCAGTTTGTTCTTTGTTCGTTTGTTCTTTGTTCCAACTTTGTTCTTTGTTCTTTTGTTCATTTGTTCGCACCTCATTTGTTCGTTTGTTCTTTTTGTTCATTTGTTCATTTGTTCCACTGGCAGTTTGTTCCAGCTTTGTTCTTTGTTCACCTATTTGTTCGTTTGTTCTTTGTTCTGTTTGTTCTTTGTTCGCTCCTTTGTTCTTTGTTCTTTGTTCTCGTTTGTTCTAGGACGGTCCAGCTTTGTTCTTTTCTTTTTGTTCTTTGTTCCCACCTTTGTTCTTTGTTCTTTTGTTCGAGGTTTGTTCTTTGTTCCTTTGTTCGCTTGTTTGTTCTTTGTTCGTTTGTTCTTTTGTTCGGTTTGTTCATTTGTTCGCGCGAGTTTGTTCTTTTGTTCTTCTGCATCGTTTGTTCCCTTTGTTCTTTGTTCTGGTTTGTTCTTTGTTCATTATTTTGTTCCGCTTTGTTCTTTGTTCTTTGTTCGTGGGCTATTTGTTCTTTGTTCAAGATGTTTGTTCTCCATTTTGTTCGATCGTCAGTATTTTGTTCTTTACGGTTCCGTTTGTTCGCTTTGTTCTACATATCCAAATTTGTTCTTTGTTCTTTTGTTCGTTATCTTTGTTCGTCTTTGTTCATTTGTTCTTTGTTCCTTTGTTCTTTTGTTCTTTGTTCCACCCATTTGTTCAAGCTTTGTTCCATTTGTTCTTTGTTCCGCGCTATTTGTTCGAGTTTGTTCGCAGTTGTTTGTTCAACGTTTGTTCGTTTGTTCCTTTGTTCTGCGTGGCCCTTTGTTCTTTGTTCATTTTGTTCTTCGTCGCTTTGTTCCCATGCGAATTTGTTCTGGCTATCTTTGTTCTTTGTTCTTTGTTCTGTTTGTTCGACTTTGTTCTTTGTTCTTTGTTCTGTTTGTTCTTTGTTCGCTTTGTTCGAGTTTGTTCTTTGTTCTCGGTGTTTTGTTCGTTTGTTCTTTTTGTTCTTTTGTTCGCCGTTTGTTCCTCCGTTTGTTCTATTTGTTCGTTTGTTCGCTTTGTTCAAGTTTGTTCTATTTGTTCTATTTGTTCATTTGTTCCCATTTTGTTCATTTGTTCTTTGTTCCTGGACCAAGGACTTTTGTTCTTTTGTTCGCCCTTTGTTCTTTTGTTCTTTGTTCGTTTGTTCTTTGTTCTGCACGTTTGTTCCTTTGTTCTTTGTTCTGTTTGTTCCATTTTGTTCTTTGTTCTTTGTTCTTTGTTCCCTTTGTTCATTTGTTCTGTTTGTTCATAAGTTTGTTCACAGGTTAATTTCCACTTTGTTCGATGGTTTTGTTCACTTTGTTCTTTGTTCGGTCTTTGTTCACTACTCGTTTGTTCTTCTTTGTTCTTTGTTCTGTTTGTTCAGCATCTTGGGTCTTTGTTCATTTGTTCATTTGTTCATTTGTTCAGGTTTGTTCATTTGTTCTTTTGTTCTTTGTTCGAGTTTGTTCGGAGTTGGTTTGTTCTTTGTTCTTTGTTCATTGTTTGTTCGATATGGGACCAGGTGTTTGTTCTGTTTGTTCTTTGTTCCTTTGTTCTTTGTTCGTTTGTTCTTTGTTCGCATCCTTTGTTCGGTTTGTTCTTTGTTCCTTCGGTCCGCTTTGTTCTTTGTTCTGTTTGTTCAGTTTGTTCAGCTTTTGTTCGGTCTTTGTTCTTTGTTCTTTGTTCTTTGTTCTTTGATTTGTTCTGTTTGTTCGTTTGTTCTTTGTTCAGGCCGTTTGTTCTACGTTTTGTTCTTTGTTCCCTTTTGTTCTTTGTTCGGTTTTGTTCTTTTGTTCATCTTTGTTCGACTTTTGTTCAGCGGATTTTGTTCGCGAAGTATTTGTTCGGATTTTGTTCGTGCAGAATTTTTGTTCCCGACTGGCCTTTGTTCTTTGTTCTTTGTTCTTTGTTCATCTTTTTGTTCTGTTTGTTCGCGGTTTGTTCCGTTTGTTCGTTTGTTCCATTTGTTCTTTTTGTTCTTTGTTCATATTTTGTTCATTTGTTCGCTCTTTGTTCCAATTTGTTCGTTTGTTCATTTGTTCCGAGCTTTGTTCTTTGTTCTTAAATTTGTTCTTTGTTCATTTTGTTCTTTGTTCTGTTTGGGCTTTTAAGTTTGTTCTTTGTTCAACCTTTGTTCGGATTTGTTCGTTTTGTTCGTCTTTGTTCCTTTGTTCCGCTTTTGTTCTTGTTTTGTTCTTTGTTCTCCTTTGTTCGCACTTTGTTCTGTTTGTTCTCGTTTTTGTTCTTTTGTTCATTTGTTCATTTGTTCTTTGTTCTGTTTGTTCTCATTTTTGTTCTTTGTTCGTATTTGTTCCTTTGTTCCTTTGTTCCGGATTTGTTCTTTTGTTCGATTTGTTCTTTGTTCATTTGTTCAGCTTTGTTCTTTGTTCTTTGTTCTTTGTTCATTTGTTCAAGGGTTTGTTCTTTGTTCTGTTTGTTCTTTGTTCGTTTGTTCAAATGGACGGATTTTTGTTCGTATTTGTTCGTCCGATTTGTTCTTGCGGCTTTGTTCTTTTGTTCTGTGTTTTGTTCTGTTTGTTCTTTGTTCGTTTATTTGTTCTTTGTTCCTTTGTTCCTTTGTTCTTTGTTCATTTGTTCTTTTTGTTCTCTAATTTTGTTCTATTTGTTCATTTGTTCATATTTGTTCTCATTTGTTCATCGTTTTGTTCTTTTTGTTCGCGTTTGTTCTTTGTTCTCATGGTCCCGTTTGTTCTTTGTTCACTTTTGTTCTTTAATTTGTTCTTTGTTCTTTGTTCTTTGTTCTTTGTTCCACTTTGTTCCATTTGTTCTTTGTTCTTTGTTCGATTTGTTCTTTGTTCCTTTGTTCAGATTTGTTCTCCTTTGTTCATTTGTTCGCAGTGGTTTGTTCGCCTTTTTTGTTCCTTTGTTCGAGGATTTGTTCGTTTGTTCAAGGATTTGTTCGTTTGTTCTTTGTTCCTTTGTTCTAACTTTGTTCTAGCATTTGTTCTTTTGTTCTTTGTTCCTATTTTGTTCATTTGTTCTTTGTTCGGCATTTGTTCCTAACTTTGTTCGAGTTTGTTCGTTTGTTCATTTGTTCCGGTTTTGTTCTTTGTTCTTTTGTTCCTTTGTTCTTTGTTCTTATGGCCGAATTTGTTCCGTTTGTTCATTTGTTCTTTGTTCTGCGCTGTCTTATTTGTTCCCATTTATTTGTTCATTTGTTCTTTGTTCGATTTGTTCTCCGGTTTGTTCTTTTTGTTCTTTGTTCTTTTTGTTCTTTGTTCGTCATGTGGATTTGTTCTTTGTTCTTTGTTCTTTTTGTTCCTTTGTTCGTTCATTTGTTCGAAGCTCTGATTTGTTCTTTGTTCCATTTTTGTTCATTTGTTCATTTGTTCTTTGTTCGTTTGTTCTCTTCCTTTGTTCGATTTGTTCTTTGTTCTTTGTTCAGTGAGCTCCCCTAATGTTTTTGTTCTCATTTGTTCATTTGTTCGATTTGTTCCTTGTTTTTGTTCTGAAGTTTGTTCTTTGTTCGATTTGTTCATTTGTTCTTTGTTCGAATTTTGTTCATTTGTTCTTTGTTCCTTTGTTCCTATTTGTTCGTTTGTTCATATTTGTTCTTTGTTCTAGTTTGTTCCGGGGTAGTTTTGTTCAGATTTGTTCTTTGTTCTTTGTTCGAATGTGGTTTGTTCCCTCTTTAGTAATAGCATGTAGAGCTTTGTTCCATTTGTTCGAATTTGTTCGGATCTTTGTTCGTCTTTGTTCCGTGGCCGTTATTTGTTCTTTGTTCGTAAGCATCTTTGTTCTTAAGAACACGTTTGTTCGTATTTTTGTTCTTTTGTTCGTTTGTTCTTTGTTCCATTTGTTCCCTGTTTTGTTCGTTTGTTCTTTGTTCTTTGTTCTGATTTGTTCACTGTTTGTTCCCTGTTTGTTCCTATTTGTTCCTTTGTTCATTTTGTTCTTTTGTTCTGCGAAAATTTTTGTTCTATTTGTTCAAATTTGTTCTTTGTTCTTTGTTCTTTGTTCTTTGTTCGTTTGTTCGAGGTGTATTTGTTCTTTGTTCCATCCTTTTGTTCGCCAAGGCTTTGTTCTTTGTTCGCATTTGTTCCATTTTGTTCAAAGGCATTTGTTCTTTGTTCTCTTTTGTTCGTTTGTTCTAGATTTGTTCTATTTGTTCCGTACCCGTCTACTTTGTTCTTTGTTCTACTTTGTTCGGTTTGTTCGACTTTTGTTCAATAAGGAGTTTGTTCTAGCATGTTCTTTGTTCTTTGTTCTTTGTTCTTTTTGTTCCCCTTTGTTCATTTGTTCATCAGATAGGTGACGATTTGTTCTTTTGTTCTTTGTTCACGACATTTTGTTCGGTTTGTTCCGGTTTGTTCTTTGTTCTTTGTTCTTTGTTCTTTGTTCTAGGACATTTGTTCAACAACTTTGTTCCTTTGTTCGTTTGTTCTTGAAACTTTGTTCCGGTGTGTTTTGTTCTTTGTTCCTCAAATTTGTTCCTTTTGTTCTTTGTTCACTTTTGTTCTTTGTTCATTTGTTCTTTGTTCCTTTTTGTTCATTTGTTCGTTTGTTCGAGTTTGTTCCACCATTTGTTCCTTTTGTTCTTTGTTCTTTGTTCGTTTGTTCCACATTTGTTCATTGTTTGTTCTAATTTTGTTCATTTGTTCAATTTGTTCTCATTTGTTCTCCAAACTTTTTTGTTCTTTGTTCCCTTTTGTTCTTTGTTCCTTTGTTCGATAGTTTGTTCCTTTTGTTCTTTGTTCTTTGTTCTTTGTTCACTTTGTTCTTTTGTTCCCTGATTTGTTCTTTGTTCGGATTTGTTCGTTTGTTCTTTGTTCTTTTGTTCCCCTCTTTGTTCTTTGTTCCGGTTTTTGTTCCCGATTTGTTCCGTTTGTTCGCTTTGTTCTGTCATGGCGTGTTTGTTCTTTGTTCTTTGTTCCACTTTGTTCTTTGTTCCTTTGTTCTGTCATTTGTTCATCTGTTTGTTCGAGTTTGTTCTTTTGTTCCTACACTTTGTTCTTACGCGTTTGTTCATTTGTTCAGGTTTTGTTCTTTTGTTCTTTGTTCTTTGTTCTTTGTTCTTTTTGTTCAGGCAGGTCATCCCCCGTTTGTTCGTAATTTGTTCGAAATTTGTTCAGACTGATGTTTGTTCCTTTTGTTCTTTGTTCGCAAAACTTTGTTCTGCGGTACTCGTCCTTTTGTTCTTTGTTCCTTCTTTGTTCACATTTGTTCCCATTTGTTCTTTTGTTCGTTTGTTCCAGGTTTGTTCCGTTTGTTCATTTGTTCTTTGTTCTTTGTTCGCCATTTTGTTCCTTATGTTTGTTCTTTGTTCTGTGTAGAGTCTTTTGTTCAACGCTTTGTTCATTTTGTTCTCCTGGTTTGTTCTTTGTTCTTTTGTTCATTTGTTCACCTTTGTTCCCTTTGTTCAATTTGTTCCATTTGTTCAGTCTTTGTTCAGGTCCCGTTTGTTCCTTTGTTCTTTGTTCCTTTGTTCTTTGTTCCCAATCTTTGTTCTTTGTTCTTAGCAGGATTTGTTCCTTTGTTCGGTCCCGTCTTTGTTCGACCTTTGTTCTTTGTTCCTTTGTTCTTTGTTCTTTGTTCATCTGTAGTGATTTGTTCCTTTGTTC"

all_occurences_pattern(pattern_judge, genome_judge)

'30 37 52 67 75 122 219 229 282 313 321 360 392 400 415 476 505 512 565 595 764 782 789 796 803 821 828 845 852 867 874 903 949 966 1042 1074 1112 1181 1257 1275 1303 1310 1317 1365 1372 1379 1416 1563 1571 1590 1607 1654 1661 1668 1732 1792 1816 1835 1883 1891 1908 1928 1964 1990 2036 2055 2086 2109 2136 2158 2183 2210 2229 2292 2304 2311 2326 2333 2412 2428 2435 2502 2521 2580 2631 2638 2645 2660 2706 2713 2745 2752 2768 2801 2852 2898 2935 2995 3011 3050 3070 3131 3138 3176 3345 3352 3367 3374 3402 3409 3427 3474 3512 3533 3540 3568 3592 3599 3651 3658 3695 3702 3717 3724 3731 3747 3771 3786 3804 3811 3850 3902 3929 3945 3964 3971 4009 4023 4042 4049 4068 4095 4110 4149 4157 4183 4200 4229 4236 4258 4309 4356 4363 4402 4417 4425 4465 4554 4570 4616 4623 4649 4656 4672 4698 4727 4736 4848 4876 4895 4903 4918 4946 4972 4979 4986 5076 5109 5119 5197 5205 5237 5244 5294 5309 5324 5353 5378 5425 5432 5439 5446 5475 5507 5524 5541 5644 5651 5658 5723 5732 5807 5814 5826 5842 5872 5938 594

<hr>

# <u> BA1E - Find Patterns Forming Clumps in a String </u>

In [None]:
from collections import Counter

def pattern_forming_clumps(genome, k, L, t):
  clump_kmers = set()

  for i in range(len(genome)-L+1):
    clump_seq = genome[i:i+L]
    kmers_counter = Counter([clump_seq[j:j+k] for j in range(L-k+1)])

    for key in kmers_counter:
      if kmers_counter[key] >= t:
        clump_kmers.add(key)


  return " ".join(clump_kmers)



# ===================== MORE PYTHONIC ==========================

def pattern_forming_clumps_short(genome, k, L, t):
  clumping_kmers = set()
  
  [[clumping_kmers.add(kmer[0]) for kmer in Counter([genome[i+j:i+j+k] for j in range(L-k+1)]).most_common() if kmer[1] >= t] for i in range(len(genome)-L+1)]

  return ' '.join(clumping_kmers)

In [None]:
# DRIVER CODE
genome = "CGGACTCGACAGATGTGAAGAAATGTGAAGACTGAGTGAAGAGAAGAGGAAACACGACACGACATTGCGACATAATGTACGAATGTAATGTGCCTATGGC"
k, L, t = 5, 75, 4

genome_judge = "CAACCCTCTTGTCTCGTCGGAAAGTTTAGGTCCACCCCTCCAGGTTGCGTCGCCGTTTGGTGCAGCCTCTAAGAAAACCAAGATACGAGACCACGCGAGGGGGGATCTGTTGTGAGTCTTACGAGTAGTGGTCCGGCTAGCGTTCGTATTGGCAATTCTGATCAGGAGCATAAGACCTGCCTGGCCTGCTTTGGTTGCGGAGTGACACGGGTATCCGGCTCTAACACTGAGCTTTCTAGGGCGCTCGTTTCGAGTAATATACCTGGCCTGCTTCTCCGGCCTGGCCTGCCCTACGCTGGAACCCTGGCCTGCCTGGCCCCTACGCTGGAAAAGGGCGGCCGATGCCTCCTACGCTGGAATTCCTACGCTGGAACTACGCTGGAATAGCGTCAAAGACCTGGCCTGCTTACAATACCTACGCTGGAAGGAATTTGCCTGCCTGGCCTCCTACGCTGGAAGCTTGCTTCTATCTGGAAAAGCCCGTGACCTACGCTGGAAACGCTGGAAACCTCCTACGCTGGAACGCCCAGATTTCCTACGCCTACGCTGGAATACGCTGGAAGGAAGCTGGAAGGCCTGCTTCAAAAGAGTCCTGGCCTGCCCTACGCTGGAACTGGAATTTCCTGGCCTGCTTTGCTTGCCCTGGCCTGCTCCTACGCTGGAACTGGATATTAATCCTACGCTGGAACTTTAAACCGCCTGGCCTGCCTACGCTCCTACGCTGGAATACGCTGGAACTGCTTCTGGCCTGCCTACGCTGCCTACGCTGGAACTGGCCTACGCTGGAACCCTACGCTGGAAACGCTGGAACGGTCAGGGGCATTGATGACCTACGCTGGAAGCCTACGCTGGAATCCCCTACGCTGGAACAGGTCAGGCAGACTGAAGCGCGGACTAGTCCAACTAAAGCTTTTACCCTGAAAAATTCGCCCAGTTGATACGGGTGCACGCGAACGGAGGGGCGCTGGTTTACCCTGAAATGAAAGCTCACTATTTACCCTGAAATCAATTTTACCCTGAAACCCTTTTACCCTGAAAAAAGGGCTCCTAGCCGTTTACCCTGAAATCTACCTTTTTTTACCCTGAAAGTCCGACATTGATCTTTACCCTGAAAGTTATGCCTTTACCCTGAAAAGTAAATAATAGCGATAGTTCTACCCTCCCCGCTCCCCCATTTCGTTACGGGATAAGGATAAGGAAGGAACAGAACGGGATAAGGAGGAGGCTTACGGGATAAGGACACGGGATAAGGACGGGATAAGGAGATTACCCTGAAACCACGGGATAAGGAAGGTCCCAGTGCTCCAGATAGTCCCAGTGCTCTGAAATACGGGATAAGGATTTACCACGGGATAAGGAAATTTACCCTGAAAGAAATTTAACGGGATAAGGACGGGATAAGGACAGTGCTCTCACGGGATAAGGAATACGGGATAAGGAGAGGACGGGATAAGGAAATTGTCCCAGTGCACGGGATAAGGAGTGCACGGGATAACGGGATAAGGACACCTGTCGGATGGGAGAGCACTAGGAGTCCCAGTGCTCGTTGACGGGATAAGGAGTCCCACGGGATAAGGAACGGGATAAACGGGATAAGGACTCAGAGTCCGTCCCAGTGCACGGGATAAGGACCAGTGCTCGAGTCCCAGTGCTCGTTCCACGACAGGATGCGTCCCAGTGCTCAGACGGGATAAGGATACGGGATAAACGGGATAAGGATGCTCCGTCCCAGTCCCAGTGCTACGGGATAAGGAGTGCACGGGATAAGGAAGGAGTGCTCGTCCCAGTGCTCCCACGGAGCCAGAGTCCCAGTGCTCTAGCCGAGTAGCGGTCCCAGTGCGTCCCAGTGCTCTGCTCGCCGTCCCAGTGCTCGGTACGCCGCAGCTATGTACGAGCCACGCAAGATATAAGGTTTAGGAAGTGTTCTTCGAGGAGCCACGCAAGGATGGAGCCACGCAAGCAAGCTAGAGCCACGCAAGAAGTGTTTGTTCGTGCGAATTGTATCTCTTTTCAGAGCCACGCAAGAATCCGTGTTCCAAGATGGAGAGCCACGCAGAGCCACGCAAGGGAGCCACGAGCCACGCAAGGGGCAGGGGACAAGTGCTGTTCCTTGACTGGTGCGTTAAATGTCCGAGCCACGCAAGCTCTCACGAGTCTGTAATTGATGAATTCTCGTAAGAGAGCCACGCAAGAAGAGGCTGAGCCACGCAAGCTGGGAGATTTGGTTGGAGCCACGCAAGGGATATAGAGCCACGCAAGGCTCATGCTTTATTTCTAAGGTTTTTGGATCCCCAGTGCATGATGTCAATGGCCACTCTCGGGTTAAGTACGCGCGCACCGTCCTGGGGAAAGTTTATAGGCCACACGTAACTGGGTAGTTTTGCTGCAGGAGGCATCTGTCCCATGCGCACCTAACCGGAGACTCCTTGACACTCGAAGGTGACGTTCACTATGTACTGCTTCACAGGGCAGTTGGGAGCGCCGAAGCGCTTAAACATGGTTCTTCATACCAGACGATACTACTCTCTGTATAGAAATCAACGGTCTCGGAAGATTCTTTGACGGGAAGATTCTTTCGAACCGAAGCGCTGCGGAAGATTCTTTGGAGCCAGGGAAGATTCTTTGAGGAAGATTCTTTCAGTGGCAGTGGAATCCGGCGTGGAACAGTGGAATCCCAGTGGAATCCGTGGAATCCGTCAGTGGAATCCGTCTCAGTGGAATCCGCGTATGTATAACGCAGTGGAATCCGCTTTCAGTGGAATCCGTGGAATCCGTCTTTCAGTGGACAGTGGAATCCGCCGTTTTCTTGGAAGGGGGAAGATTCTTTCCAAGCTCCAGTGGAATCCGCGTGTGGGCAGTGGAATCCGACAGTGGAATCCGGTATGCTTAGCCCAGTGGAATCCGTCAGTGGACAGTGGAATCCGCGTTCGGAAGATTCTTTCAGTGGAATCCGGGAAGATTCTTTATTGTTAACACAGTGCAGTGGAATCCGGGAGGAAGATTCTTTGGAAGATTCTTCAGTGGAATCCGTTTCGGAAGATTCTCAGTGGAATCCGGTGGAATCCGTTGATTCTTTGGGAGAGGAAGATTCTTTAAATTCTCATCGTGGGGAAGATTCTTTACAGTGGAATCCGACCTGTTCGGTTCAGTGGAATCCGGTCCAGGTCAGTGGAATCCGAGTGGAATCCGAGTGGAATCCGAACTCAGATCGGAGGCTCAGACACTTGGGATTGTGAGTTTTCGCTATCAACTTTCTTGGCGCCCTAATAAAGCATGCCATCGATTCATCCGCAAGCTATAGGCCCGGAAGGAGTCAGACGGCGACGCACGCTGCTTACAGAGGGCTTACCCTCTTCATCGTATGGGCGGCTCTCAGGTCCGCCTTTCGCACGGGATCCGCAGTGCAATCTCGAGGCTTCAAGTCCGAGTGTCGGATCTGAACGGTGGTGATCTCGTTTCTTTCCGCTGACGTTATGCAGCGGATACCTACACCTTGCCGCTGAACTTCCTGGGCCCCTTCTACTTAGTTGGAAAGGGGATCGCTCCTTGAGGCGAGTCCCTGGAGGTCTGGGTACGTATGGTCTTGTCGTAGCGTCAGTCGACGCGTAAGCCATACATGGACCATTTTGTGCAGGTAAGGTACCGATATGATTGCGCGTGGAGCCAAGGTGCACGATAAAGTCTGGATTAACGAACAGTGAATTCACAACCAGAATTTTTGCTTATTAACCTTTGTCGTAATACGATCCTTATGCTCAGCACGCCGCCAACTAGTCATAAATCTAGCATCCAGCGCCGTCGCTTAATAGCGGTGAGCACCCTCCTCGGCTACACCGTTGACTGATGTGGCTACTTCTACTAATCATGAGTTGTCAAGAATCCCACGCAAGCCCTATTCCGCATTTTAGGTAGGAGTTCTTAACTCATGGTAGGGATATACCAGCTTAAACAGTTGGGGTTAGTGGGACGAGTATACGATGTGAGGTTAACTCGAATGATAGCCCAACAATCCTTCGTTGACAATGTAATCCCAAAAGATGAGGACTACTCCTACGATGCTTTTTGCTTGTAGTACTACTCCTGTGTGCCAAGTACGCCAGTGAGAGATGACATATCGCCAGTACATCTTAAAACAGGCTAGAGTTACACTTAGACCCAGGTACATAGAGGCATAGCCTCTCTGGGCTACCAGCTTTTAGGCCAGGCCCGATGCGACTCCGGGCGATCATCCGATCCGTTGGGGAGTGTCTAGTTCCGGTCATTCGAAAACACACTGTATGTGGGTACGACGAGAGGTGGACTTATGATAGTCTTATTGGCAGGCACACGCAGGGTTTGCACAACGGGTCGGACCCACTTAAAAGCTAGCCAGGAACTTAAACTGACACTTGTTTGGCTGAGTAGTTCGTTTTTAACGGGCTTATCTGCACAACGCCGGGAGCTTCCCCCCTCGTACGGCTTCGGCTCGGACTAACAGTATTTACATATTGACCCTAGAGTCTCCTAAAAATTGTCATAGAATCCGTACCGGTTTTCATGGCTGTATATTGAGGCGACTTCTAGTATTTATAGCCTTCCTCCTGTTGGCTTTGGGTCGTCAACCCGCCACCCACTCCGCACTTCCCACCAAGCCTATTCGACCATCCGGTCCTCAGCCTTGTTAGTAAAGTAGTATCCCCTGCTTGAGCAAATCGCTCGTCGTGCTCCCCCGGTCGTCTACTATCGGAAATCAAGGATCGAGTAATCGCAAGGATCGAGTAGGCGACTGCAACATCTACCATTCCGTTGTTATTCCGTTAAGGAAGGAAGGATCGAGTAGAAGGTTATTCCAGGAAGGAAAGGATCGAGTATAAGGATCGAGTATCGAGTAGTATAATCAAGGATCGAGTACAAGGATCGAGTATTAAGGATCAAGGATCGAGTAGGGTTGACGTGGTGTGGTCATGAAGGATCGAGTATATCCAGGGATTCCAGGGGGAAAGGAAAGGATCGAGTAGGATCGAGTAGAAGGATCGAGTATATTCCAGGGGGTTATTAAGGATCGAGTAGGTCAGTTATTCGTTAAGGATCGAGTATTCCAGGGAAAAGGATCGAGTAGTCATGGGTGGGTGTTGTGTGTGGTCATGGGTATAAGGATCGAGTAAGGTTATTCAAGGATCGAGTATTATAAGGATCGAGTAGAAGGATCGAGTATATCGTTATAAGGATCGAGTAAGAGGATAAAGGATCGAGTAGGAAGGATCGAAAGGATCGAGTAGGTCAAAGGATCGAGAAGGATCGAGTAGACGGTTATTCCAGGGGGAAGGATCGAGTATTGGTGGTCATGGGTTGGGTTTTACGGGACTGTGGTCATGGGGTGGTCATGGGTGTCATGGGTTCAGCCACTGGCCGTGGTCATGGGTGGATCAGGTGGTCATGGGTACACAGGACTCACCCGTGGTCATGGGTGAGCGCTAGCAGCCGTTGGGCGGTGGTCATGGGTAAAAAACCCCTAGCTAGCAGATAATGTACAAGCTCGCCAACTAAGCAATCGGGAATGGGTTGTTCCACGTCCAGCACTTAGCTCCAAGTTGGCAACGAAGCGTACCCGTTATTATTCGAAAGCCGTAGCGGCACTTCCTGTCGTTTAGCGGCTGCGTCGGATTTCTGGACTATTCACAATCTCACCCTAAGTTGTTTGACGATAGTGATCCTCTCGATAAATTGCAAAACGTTATGCTTAACGTCATTTTGGCATTTGCTCACATCATCCGGCCTGCGCTTAACAATCTACTGTCTATCCGTAGCACGATTCTGCGAGCTCTGTGATAAGAGACATGCGCGTTGAGACACTTGGGTAGCGCCCCAGCGCATAGGACGGTTCATACATCGCAGTTACGCCTTTTAGCACAATCCGAATTACTGTGCGCCTCGGTACAGTAAGCATACCTGGGCATAAGGGCCCACGTCCGTTGGACGGATCAATATCTGGACAACTCCCCTGTACGCAACTAGACACTTGCAGCAGACTCGGAGCCTTAAATGACGCACACTCGGCATTCTGGAATTGCTGTTCACGGCCAGGTACATCAATGCCATTGAGAGGATTAATATGCGTCACTGATTAAGCAATCTACTCACTAGCAGACGCGCAGCGGGCCCGAAAGGTGCGGTTTTATGTGGAATCATCAAGACCCTAGGGAATTGGACTTTACTCTCACAGTATTGATCGAGGGAACGCTTAGGTCGACTAATCCAGCAGTTTCTTATCTGATGAAGCATGTACGGTTAGGCAAAGTTTGTTCTTGAAAAGCCAGTTAGTGTACTCCCACGATGAGCATCTCGAAAACTATGGCTATATAACAGAATGGTGCACGATCTACCCGAGGGGTCATCAAATCCCTCCCGTCCTCCCTTCCCTCCCGTCCTCAAGCCGTGCACACGACAATGAACCTCAGGGTTAACAACTGGGTTATCCTTTCCCTTCCCTCCCGTCCGCGAGAACCGATTCATCCCTCCCGTCCCGTCCTATAATGGAACTGATTCGGATTCCCTCCCGTCCAGGTTCAACCAGTTTGTTCCCTCCTCCCTCCCGTCCCCTTCCCTCCCGTCCGTTATTCCCGGTCCCTCCCGTCCCTGCAGTTCCCTCCCGTCCCCCCTCACTTCCTCCCTCCCGTCCTCCCTCCCGTCCATTTACTGCGTCCGGATCACGGATCTTCCCTCCCGTCCTCCCGTGTTCACCCTGCTATGTGGGTCCTCATAAAACATTTAGTAACGACGAAGCACTTCCCTCCCGTCCGATAGAGAACGCTCCCTCCCGTCCCCATTTAGCTAATTCCCTCCCGTCCTCGTCCCTCCCGTCCACTGCCGCTCGAGGACCTAGAGACGCGGTCCCTCCCTCCCGTCCTCCCTCCCGTCCCCACTAGCAATAGCTGACCTGCCCATCTCCCTCCCGTTCCCTCCCGTCCTCCTCCCTCCCTCCCTCCCGTCCCCGCGAGTAGCCCAAATGCTCCAGTTAGAACCTGACAGCAACTTCGATCAGTTAAGTTGTCAGGCATAAAGTAGTAAAGGCACCGTGAGTCGGAATACCCGGCGCTCATTCGGGTTTTGCCCGTTCGAACTATACGGTTCAGGACCTTTATACAGTTTGATGTTAACCATGGCCCGTTGAGTTGGTCTCGAAAATCCGTAATGGAGGTGTTCCTCGACGGGGGGCCCTCTTACAAACTTCCAATGGAGGATTGGGGCACGCAAGCCTCAGGGACATCGAGGGTTCTCCAATTCCAATGGAGGAGGAAAGGAACTAATCCAATGGAGGATAGCCACGTGCTGGCACTACTCCACGTGGCCGACAGGAGCTCACTATCCAATGGAGGACGCATCCAATGGAGGAATGGAGGATCGTTACATCGCGATCATTCACCGTGGGCACCTTCTCCAATGGAGGAATGGAGGTCCAATCCAATGGAGGACCTCCAATGGAGGATCCAATGGAGGATCCAATGGAGGAAGGAGGAGGACCTGACAGTAAACCATCCAATGGAGGAGTAAACCACTACCACTATCCAATGGAGGACCAATGGAGGAAATTTAGGTCCAATGGAGGAGATATGTTGAAGTCATCAACTCCAATGGAGGACTCCAATGGAGGAGACGAGTGGGCATTAGTATCGGTACTCCAATGGAGGAAATGGAGGAAGGACAATGGAGGACATCCAATGGAGGATTCCAATGGAGGAACCACTTACAGTTCAGTAAACCACTCCAATTCCAATGGAGGAGGAACTCCAATGGAGGATAAACGCAGTAAACCACTCAAGGATGTGGTGGTGGCTCAGTAAACCACTCCACTTGTCACAGTAAACCACTTCTTACTCATTCAGTAAACCACTGCAGTAAACCACTCAACCGCAGTAAACAGTAAACCACTCTGGGGTGCAGGAGATGCCCTGTTGCAATAGGTCACAGTAAACCACTACCACTCCAGGCCGATCCAGTAACAGTAAACCACCAGTAAACCACTACCACTATGCCAGTAAACCACCAGTAAACCACTAATTGTCTTTTGGAAATCATAGAACCAAATCACTGGATGCTATGACAAGACTACAGCCCAAAGTAGCCTTACCTGTTACTTGCACATCCGTTGACCGCATGCATCCATTGTGCTTATTTGCAGACTGAAGATGAGAGCGTTTATTTAAAATCGCCATAAAGCAGTAGGGGGCATAAACGGCGGGTAACTCGCTCGCCTGAAAACCTCAACAACCTTCTCCATATTCCACATCCGGTCGTGGTCAGGATCGCTGCTGCACGACATCGCATCAGATCGCGTCCCCTTATTCACGCCTCGTCTGACAGGGACTTAATTATCATTTGTGGTCCCAGATAGCACAGGCCGCCTTTTACCCTCTGCACATACTGAAAGTAGCATGATATTCGGATAACAGGGAACCCTCTTTGACGCGCCGTGGGACACGCACAAGGGTATCTCGCGAGAGTGTGCGATAAAAATCCGTGGAGTTCTTAGTCATTTCTTCCCATCGGTCTGCTCGAAAATCTCCGTCGCCACTGCTACCACTTACCCCCGAAGTAACTATCGAAAACCCTACCTCAAAATTTGTCTGGTACTGCCGCATCACTGCCCTCTGACGTTAACCGAACCCCACCAAAAATAAGTTACAACGAGCCGCTGTAAGTTGATCGGGTTCCCCCCCATTGCTCGCTATGCGAAATCTATGACCTCCGTTT"
k_judge, L_judge, t_judge = 12, 509, 19

pattern_forming_clumps_short(genome_judge, k_judge, L_judge, t_judge)

'ACGGGATAAGGA AAGGATCGAGTA'

<hr>

# <u> BA1F - Find a Position in a Genome Minimizing the Skew </u>

In [None]:
def pos_minimizing_skew(genome):
  weights = {'A' : 0, 'T': 0, 'C' : -1, 'G': 1}

  cumsum_gc_list = []
  min_val, cumsum_val = 10000, 0 

  for nu in genome:
    cumsum_val += weights[nu]
    cumsum_gc_list.append(cumsum_val)
    min_val = min(min_val, cumsum_val)

  return " ".join([str(idx+1) for idx, gc in enumerate(cumsum_gc_list) if gc == min_val])

# ===================== MORE PYTHONIC ==========================
import numpy as np

def pos_minimizing_skew_short(genome):
  weights = {'A' : 0, 'T': 0, 'C' : -1, 'G': 1}

  cumsum_gc_list = np.cumsum([weights[nu] for nu in genome])
  min_val = min(cumsum_gc_list)

  return " ".join([str(idx+1) for idx, gc in enumerate(cumsum_gc_list) if gc == min_val ])



In [None]:
# DRIVER CODE
genome = "CCTATCGGTGGATTAGCATGTCCCTGTACGTTTCGCCGCGAACTAGTTCACACGGCTTGATGGCAAATGGTTTTTCCGGCGACCGTAATCGTCCACCGAG"


pos_minimizing_skew_short(genome)

'53 97'

# <u> BA1G - Compute the Hamming Distance Between Two Strings </u>

In [None]:
def hamming_distance(pattern1, pattern2):
  cnt = 0

  for p1, p2 in zip(pattern1, pattern2):
    if p1 != p2:
      cnt += 1
  
  return cnt

# ===================== MORE PYTHONIC ==========================
def hamming_distance_short(pattern1, pattern2):
  return sum( p1!=p2 for p1, p2 in zip(pattern1, pattern2))

In [None]:
# DRIVER CODE
pattern1 = "TAGAACCAAAAGCGTCCTATTATAGATAGTCAACATCACTTTGACATCTAGATTAGTAAAAGAGCGAGATCGACAGATGCTAAAGCGCTTTTGCTAAACTCGTGTGCAACTGTTACTGGTACTCGGGCCATCAAGAGTCCATTAGCGCCTACAGCAATTCCTCGGAGACCTGAGTGTTGTGCGCTTCCAGGTGTTGTCCAGCCGACACACCTAAGCCGTAAGGGACTCTAGTTCGACGGAATGGCAGTTAAACCGCTGGGTGCTCGTAGCATGCCATATAGCTTACTATGTCGAAGCACCCTTCAGCTTATACTTTCTGGAATCCGACTCTCGTATCTCCAAGCGGAAACTGCATAATTTGAACGCCATGATCGTGCAGTTCCCTCGATCACCTCTTAATTGCCGTTTCAACATAGGAACTTTGTTGGTGATTGACAGTTGGTTACCAAATGTCTGGTCGGAAACGGGGTTCCAGGATTAGTCCATGGCTGTCGTTGGATTTTGCGTCGTGGAGTTGTCCGATTTACCCATTACCAGTTTTCAAAATTACATAGATCCTCATCTAGAGGGCGGAAAGCCGCATACGCGCGGCCAGCGAACACTATGCGGGACGCACCCAGAACGGTGCACGACGTACGCCCCGTAATACATGCCGACCTGCTATCTGCAGGACAGGTTTGCTGTTTAAAAAGAAATGGAGGTGTAGAAACAATAGTGTATGTACATCCAGCGACCTGTGTTTTCTACGCGTTAGCTTAGAGGCTGTCACTTAACCGACCATCAGCTGATAATGGTCTCTTATAACCGGGCTCATACTGAGTCGGCTCAATTCGGCGCTAGCATCTCTTAATGCTGCAAACGAGTCCCCCGAAGATAAGGGTTTTGCTTCGGTGCTATCCGAACGTGTTACTCTTGACTTCTAGATCTCCCTAAAGGTGTAGGCAAAGCAGCTCCCAAGAGTGCCTGAAGCCAGCCGAACAGGTAGCCGGGCACGACGGGGCCATCTGAGCGACCAGCTCTATGTATGCAGCTTTGTATCATGACATTGAAGTCCAGTGGAACTTCGCCCGCATTACGTGGTTGCCAACAAGGAGTGCGATGTACCCAACTGGGCTTATCTGCTACTATCACCCCGATATTCCCAGGCACGCACCCAAGCTATCAATAGTTGGCTTAAAGGAATTGACTA"
pattern2 = "GAGAGGTACTGATTCCAATAAAGGTTTGAGACAAAGTAATTGCGCCTGCGGCTGGAGTCTTCTCAAAGTTCCCTGTTGTCATGACCTGCGTAGGACTCGCGTTCGTTGGTCCCTCTTTCGTACGAATCAGCGGGGTCAGCAATGCCATGAAGAGAAATACGGTTCCTCAATCCAACGAGGAGCTATTTGAGGACGTATGGATCATGCTACTGCATCAAGACGAAAGGCAATGCACCAAGGGGGTCTTTCGCAATGCTACCCAAGTGTTTTGTATTAGCGCGAGTAAACTGTGCGATGCGGTGACGGGAATTAACGCGTGCCCGCCGCAGTGCTCTATAGGGCTTAGCGAAGTGACCAATAACCATCTCACGAGCTACCAGAACGACATTTTGAGGTGATATCTTACCTATTCGTACATTTCGCCCGATGAACTTACCCATTACAAAATAATCAGCGCCGCCGACTCGGATCTGCGCGCTTGTCGTTGGGAGTAGTAGAACAGACCGTAGCCGCATACCAAAGTCGTCAATCGCGGCTACCCTGCGGAATGCGATATCTGCGCGTGGCTACGTTAGGAATCGGGCATATGGGTTAATGAATGTGATTCATACTAGTATCTCGAGGCCCTCCCCGTGTCAGCACTCTCCGACTTTAGAAAGGTTCCACAGGCCAATTTTGTTGGGCGAAAACACATTAACGGGGCATTTAGCGCTGTAATTTGTAAGGATCATGCGGATCCTGGCTTTATACGCCAAAGTCCGATAGGCCGTGTTCATTAGATCATTGTAGATCTCACATCGGGGCACCCTCGTATTAGTAAGAGGTGTTCACGGAACCATTGCGGAGAGATACAACCACACATGCAGCATGCTTGTACCGTCTAAGGTGGATGTCCACGCGGGCTTCACGGTCTAGTTCGACTCTCTTGCCACTGATGCACTTGGTATTGCGGGTGATCATGCATCGGCTTCGCCTGTTTTCCTTCCCTCATGCAGACCCCCCAGGACGTTTACACTTGCGGCCGGCGTTAGTATAGGCCCTGCCATGGAGAGCTGCCGGGGAACCTCCAACTCTGCACTTACCTATGAGGCACAACGGCCCAGAAAGCTTGTAAAGCGGCTTCATAAGCAAACTACCAGGATGCAGAAAGTATTGCATGTCGAATGGAAATTCGCTAGAAACCGATGCA"

hamming_distance_short(pattern1, pattern2)

895

<hr>

# <u> BA1H - Find All Approximate Occurrences of a Pattern in a String

In [None]:
def approx_occurrences(genome, pattern, d):
  occurrences = []
  for i in range(len(genome)-len(pattern)):
    if hamming_distance(genome[i:i+len(pattern)], pattern) <= d:
      occurrences.append(str(i))

  return ' '.join(occurrences)

# ===================== MORE PYTHONIC ==========================

def approx_occurrences_short(g , p, d):
  return " ".join([str(i) for i in range(len(g)-len(p)) if hamming_distance(g[i:i+len(p)], p) <= d])

In [None]:
# DRIVER CODE
pattern = "ATTCTGGA"
genome = "CGCCCGAATCCAGAACGCATTCCCATATTTCGGGACCACTGGCCTCCACGGTACGGACGTCAATCAAATGCCTAGCGGCTTGTGGTTTCTCCTACGCTCC"
d = 3

approx_occurrences_short(genome, pattern, d)

'6 7 26 27 78'

<hr>

# <u> BA1I - Find the Most Frequent Words with Mismatches in a String

In [39]:
from itertools import product

######################## HELPER FUNCTIONS ##############################
def count_with_mismatch(g , p, d):
  return sum(hamming_distance(g[i:i+len(p)], p) <= d for i in range(len(g)-len(p)+1))

def all_combinations(k):
  return [''.join(s) for s in product('ACGT', repeat=k)]
########################################################################



def most_freq_with_mistmatches(text, k, d):
  word_dict = {}
  max_len = 0

  for pattern in all_combinations(k):
    word_dict[pattern] = count_with_mismatch(text, pattern , d)

    max_len = max(word_dict[pattern], max_len)
  

  return " ".join([key for key in word_dict if word_dict[key] == max_len])


In [41]:
# DRIVER CODE
text = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
k,d = 4, 1

most_freq_with_mistmatches(text, k, d)

'ATGC ATGT GATG'

# B1AJ - <u> Find Frequent Words with Mismatches and Reverse Complements

In [46]:
from itertools import product

######################## HELPER FUNCTIONS ##############################
def count_with_mismatch(g , p, d):
  return sum(hamming_distance(g[i:i+len(p)], p) <= d for i in range(len(g)-len(p)+1))

def reverse_complement(pattern):
  complement_map = {'A':'T', 'T':'A', 'G':'C', 'C':'G'}

  return ''.join([complement_map[letter] for letter in pattern][::-1])

def all_combinations(k):
  return [''.join(s) for s in product('ACGT', repeat=k)]
########################################################################



def most_freq_with_mistmatches_and_reverse_complement(text, k, d):
  word_dict = {}
  max_len = 0

  for pattern in all_combinations(k):
    word_dict[pattern] = count_with_mismatch(text, pattern , d) + count_with_mismatch(text, reverse_complement(pattern) , d)

    max_len = max(word_dict[pattern], max_len)
  

  return " ".join([key for key in word_dict if word_dict[key] == max_len])


In [48]:
# DRIVER CODE
text = "ACGTTGCATGTCGCATGATGCATGAGAGCT"
k, d = 4, 1

text = "TTTTCCGGGATAATAGCTTTATGAAGATTTTCCGGGACAGTCGGTTATGAAGATTTTCCGGGATAATAGCTTTATGAAGAACAGTCGGATAATAGCTACAGTCGGTTTTCCGGGACAGTCGGATAATAGCTGCTGCAGCGACAGTCGGTTATGAAGAATAATAGCTATAATAGCTGCTGCAGCGATAATAGCTTTATGAAGAATAATAGCTACAGTCGGTTATGAAGAGCTGCAGCGTTTTCCGGGATAATAGCTACAGTCGGTTTTCCGGGTTATGAAGAATAATAGCTTTTTCCGGGACAGTCGGACAGTCGGTTATGAAGAACAGTCGGTTATGAAGAGCTGCAGCGTTATGAAGATTTTCCGGGTTTTCCGGGTTATGAAGAGCTGCAGCGACAGTCGGTTATGAAGAGCTGCAGCGGCTGCAGCGGCTGCAGCGTTTTCCGGGACAGTCGGGCTGCAGCGATAATAGCTATAATAGCTGCTGCAGCGATAATAGCTGCTGCAGCGGCTGCAGCGACAGTCGGACAGTCGGTTATGAAGATTATGAAGAGCTGCAGCGATAATAGCTACAGTCGGATAATAGCTACAGTCGGTTATGAAGATTATGAAGATTATGAAGAATAATAGCTGCTGCAGCGATAATAGCTTTATGAAGAACAGTCGGTTATGAAGAGCTGCAGCGACAGTCGGGCTGCAGCGATAATAGCTATAATAGCTACAGTCGGTTATGAAGATTTTCCGGGTTATGAAGAGCTGCAGCGACAGTCGGGCTGCAGCGATAATAGCTTTTTCCGGGTTTTCCGGGTTATGAAGATTATGAAGA"
k, d = 7, 3

most_freq_with_mistmatches_and_reverse_complement(text, k, d)

'ATCAGCA TGCTGAT'

# BA1K - <u> Generate the Frequency Array of a String

In [15]:
from itertools import product

######################### HELPIER FUNCTIONS #####################
def all_combinations(k):
  return [''.join(s) for s in product('ACGT', repeat=k)]

def pattern_count(text, pattern):
  return sum(1 for i in range(len(text)- len(pattern)+1) if text[i: i+len(pattern)] == pattern)
#####################################################################


def freq_array(dna, k):
  ln = len(dna)
  all_combo = all_combinations(k)

  return ' '.join([str(pattern_count(dna, pattern)) for pattern in all_combo])
    


In [16]:
# DRIVER CODE
dna = 'ACGCGGCTCTGAAA'
k = 2

freq_array(dna, k)

'2 1 0 0 0 0 2 2 1 2 1 0 0 1 1 0'

# BA1L - <u> Implement PatternToNumber

In [8]:
def pattern_to_num(pattern):
  val_map = {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}
  res, pos = 0, 0

  for l in pattern[: : -1]:
    res += val_map[l]* (4**pos)
    pos += 1

  return res


# ================== MORE PYTHONIC ====================
def pattern_to_num_short(pattern):
  val_map = {'A' : 0, 'C' : 1, 'G' : 2, 'T' : 3}

  return sum(val_map[letter]*(4**pos) for pos, letter in enumerate(pattern[: : -1]))

In [9]:
pattern = 'AGT'
pattern = 'AATTATACTTAGAAGTTGCAAATTAA'
pattern = 'ATC'

pattern_to_num_short(pattern)

13

# B1AM - <u> Implement NumberToPattern

In [7]:
def num_to_pattern(num, k):
  val_map = {0 : 'A', 1: 'C', 2: 'G', 3: 'T'}
  res = ''
  for i in range(k):
    res += val_map[(num // (4**i)) % 4]


  return res[: : -1]


# ================== MORE PYTHONIC ====================
def num_to_pattern_short(num, k):
  val_map = {0 : 'A', 1: 'C', 2: 'G', 3: 'T'}
  
  return ''.join([val_map[(num // (4**i)) % 4] for i in range(k)][: : -1])
    

In [9]:
# DRIVER CODE
num = 5917
k = 7

num_to_pattern_short(num, k)

'CCTACTC'

# B1AN - <u> Generate the d-Neighborhood of a String

In [28]:
from itertools import product

######################## HELPER FUNCTIONS ##############################
def hamming_distance(pattern1, pattern2):
  return sum( p1!=p2 for p1, p2 in zip(pattern1, pattern2))

def all_combinations(k):
  return [''.join(s) for s in product('ACGT', repeat=k)]
########################################################################

def d_neighbourhood(pattern, d):
  all_combo = all_combinations(len(pattern))
  res = []
  for combo in all_combo:
    if hamming_distance(pattern, combo) <= d:
      res.append(combo)
  
  return '\n'.join(res)

# ======================== MORE PYTHONIC ======================
def d_neighbourhood_short(pattern, d):
  return '\n'.join([combo for combo in all_combinations(len(pattern)) if hamming_distance(pattern, combo) <= d])



In [29]:
# DRIVER CODE
pattern = 'GCG'
d = 2

print(d_neighbourhood_short(pattern, d))

AAG
ACA
ACC
ACG
ACT
AGG
ATG
CAG
CCA
CCC
CCG
CCT
CGG
CTG
GAA
GAC
GAG
GAT
GCA
GCC
GCG
GCT
GGA
GGC
GGG
GGT
GTA
GTC
GTG
GTT
TAG
TCA
TCC
TCG
TCT
TGG
TTG
