<a href="https://colab.research.google.com/github/alibekk93/PWM-for-Secondary-Structure/blob/v2/PWM_for_SS.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# PWM for SS

## Setup

In [1]:
# import libraries
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm

In [90]:
#@markdown Choose how many types of SS you have

types_of_SS = 'Nucleotides' #@param ['3 types of SS', '8 types of SS', 'Nucleotides']

if types_of_SS == '3 types of SS':
  SS_TYPES = ['C', 'E', 'H']
elif types_of_SS == '8 types of SS':
  SS_TYPES = ['B', 'C', 'E', 'G', 'H', 'I', 'S', 'T']
elif types_of_SS == 'Nucleotides':
  SS_TYPES = ['A', 'C', 'G', 'T']

CHAR_TO_INDEX = {ss:i for i, ss in enumerate(SS_TYPES)}
INDEX_TO_CHAR = {i:ss for i, ss in enumerate(SS_TYPES)}

#@markdown Choose a pseudocount value
pseudocount = '1e-100' #@param {type:'string'}
pseudocount = float(pseudocount)

#@markdown Set motif length

motif_length = 7 #@param {type:'integer'}

#@markdown Set number of iterations
n_iter = 1000 #@param {type:'integer'}

In [91]:
seqs = ['ATGAAACGCATTAGCACCACCATTACCACCACCATCACCATTACCACAGGTAACGGTGCGGGCTGA',
        'ATGGTTAAAGTTTATGCCCCGGCTTCCAGTGCCAATATGAGCGTCGGGTTTGATGTGCTCGGGGCGGCGGTGACACCTGTTGATGGTGCATTGCTCGGAGATGTAGTCACGGTTGAGGCGGCAGAGACATTCAGTCTCAACAACCTCGGACGCTTTGCCGATAAGCTGCCGTCAGAACCACGGGAAAATATCGTTTATCAGTGCTGGGAGCGTTTTTGCCAGGAACTGGGTAAGCAAATTCCAGTGGCGATGACCCTGGAAAAGAATATGCCGATCGGTTCGGGCTTAGGCTCCAGTGCCTGTTCGGTGGTCGCGGCGCTGATGGCGATGAATGAACACTGCGGCAAGCCGCTTAATGACACTCGTTTGCTGGCTTTGATGGGCGAGCTGGAAGGCCGTATCTCCGGCAGCATTCATTACGACAACGTGGCACCGTGTTTTCTCGGTGGTATGCAGTTGATGATCGAAGAAAACGACATCATCAGCCAGCAAGTGCCAGGGTTTGATGAGTGGCTGTGGGTGCTGGCGTATCCGGGGATTAAAGTCTCGACGGCAGAAGCCAGGGCTATTTTACCGGCGCAGTATCGCCGCCAGGATTGCATTGCGCACGGGCGACATCTGGCAGGCTTCATTCACGCCTGCTATTCCCGTCAGCCTGAGCTTGCCGCGAAGCTGATGAAAGATGTTATCGCTGAACCCTACCGTGAACGGTTACTGCCAGGCTTCCGGCAGGCGCGGCAGGCGGTCGCGGAAATCGGCGCGGTAGCGAGCGGTATCTCCGGCTCCGGCCCGACCTTGTTCGCTCTGTGTGACAAGCCGGAAACCGCCCAGCGCGTTGCCGACTGGTTGGGTAAGAACTACCTGCAAAATCAGGAAGGTTTTGTTCATATTTGCCGGCTGGATACGGCGGGCGCACGAGTACTGGAAAACTAA',
        'GTGAAAAAGATGCAATCTATCGTACTCGCACTTTCCCTGGTTCTGGTCGCTCCCATGGCAGCACAGGCTGCGGAAATTACGTTAGTCCCGTCAGTAAAATTACAGATAGGCGATCGTGATAATCGTGGCTATTACTGGGATGGAGGTCACTGGCGCGACCACGGCTGGTGGAAACAACATTATGAATGGCGAGGCAATCGCTGGCACCTACACGGACCGCCGCCACCGCCGCGCCACCATAAGAAAGCTCCTCATGATCATCACGGCGGTCATGGTCCAGGCAAACATCACCGCTAA',
        'ATGCTGATTCTTATTTCACCTGCGAAAACGCTTGATTACCAAAGCCCGTTGACCACCACGCGCTATACGCTGCCGGAGCTGTTAGACAATTCCCAGCAGTTGATCCATGAGGCGCGGAAACTGACGCCTCCGCAGATTAGCACGCTGATGCGCATCAGCGACAAACTGGCGGGTATCAACGCCGCTCGCTTTCATGACTGGCAGCCAGATTTCACGCCGGCGAATGCCCGCCAGGCGATTCTGGCGTTTAAAGGTGATGTCTACACCGGCTTGCAGGCCGAAACCTTCAGCGAAGACGATTTCGATTTTGCCCAACAGCATTTGCGAATGCTTTCCGGCTTGTATGGCGTACTCCGCCCGCTCGATTTAATGCAGCCTTATCGTCTGGAAATGGGGATCCGTCTTGAGAATGCCCGAGGGAAAGATCTGTATCAATTCTGGGGAGATATCATCACCAACAAGCTGAACGAGGCGCTCGCAGCACAAGGCGATAATGTGGTGATTAATCTGGCGTCAGATGAGTATTTTAAATCGGTGAAGCCGAAGAAATTGAATGCCGAGATTATCAAGCCGGTGTTCCTCGATGAGAAGAACGGCAAATTTAAGATCATCAGCTTCTACGCTAAGAAAGCACGCGGTCTGATGAGTCGTTTCATTATTGAAAATCGGCTGACCAAACCAGAGCAACTGACTGGTTTTAATAGCGAAGGTTACTTCTTTGATGAAGATTCCTCCAGCAATGGCGAACTGGTGTTTAAACGCTACGAGCAGCGTTAA',
        'ATGAATACTTTACGTATTGGCTTAGTTTCCATCTCTGATCGCGCATCCAGCGGCGTTTATCAGGATAAAGGCATCCCTGCGCTGGAAGAATGGCTGACATCGGCGCTAACCACGCCGTTTGAACTGGAAACCCGCTTAATCCCCGATGAGCAGGCGATCATCGAGCAAACGTTGTGTGAGCTGGTGGATGAAATGAGTTGCCATCTGGTGCTCACCACGGGCGGAACTGGCCCGGCGCGTCGTGACGTAACGCCCGATGCGACGCTGGCAGTAGCGGACCGCGAGATGCCTGGCTTTGGTGAACAGATGCGCCAGATCAGCCTGCATTTTGTACCAACTGCGATCCTTTCGCGTCAGGTGGGCGTGATTCGCAAACAGGCGCTGATCCTTAACTTACCCGGTCAGCCGAAGTCTATTAAAGAGACGCTGGAAGGTGTGAAGGACGCTGAGGGTAACGTTGTGGTACACGGTATTTTTGCCAGCGTACCGTACTGCATTCAGTTGCTGGAAGGGCCATACGTTGAAACGGCACCGGAAGTGGTTGCAGCATTCAGACCGAAGAGTGCAAGACGCGACGTTAGCGAATAA',
        'ATGGGCAACACTAAGTTGGCTAATCCGGCACCGCTGGGCCTGATGGGCTTCGGCATGACCACCATTCTGCTTAACCTGCACAACGTGGGTTATTTCGCTCTGGACGGTATTATTCTTGCCATGGGCATTTTCTACGGCGGCATCGCGCAAATTTTTGCTGGTCTGCTGGAGTACAAAAAAGGCAACACTTTCGGTTTAACCGCATTCACCTCTTACGGTTCTTTCTGGCTGACGCTGGTTGCGATTCTGCTGATGCCGAAACTGGGTCTGACCGATGCGCCAAATGCACAGTTCCTTGGTGTCTACCTGGGTCTGTGGGGCGTATTTACGCTGTTTATGTTCTTCGGCACGCTGAAAGGCGCACGCGTTCTGCAATTCGTTTTCTTTAGCCTGACCGTGCTGTTTGCCCTGCTGGCGATCGGTAACATTGCCGGTAACGCCGCAATCATCCACTTTGCCGGCTGGATTGGGCTGATCTGCGGTGCCAGCGCAATCTATCTGGCGATGGGTGAAGTACTGAACGAGCAGTTTGGTCGCACCGTTCTGCCGATTGGTGAATCCCACTAA',
        'ATGAAATCCGTTTTTACGATTTCCGCCAGCCTGGCGATTAGCCTGATGCTGTGCTGCACGGCGCAGGCAAACGACCATAAACTCCTCGGCGCCATTGCAATGCCGCGTAACGAAACCAACGATCTGGCGCTGAAACTTCCTGTTTGTCGCATTGTGAAACGCATACAACTCTCTGCCGACCATGGCGATTTACAGTTAAGCGGTGCATCGGTTTATTTCAAAGCCGCCCGTAGCGCCAGTCAGAGCCTGAATATTCCTTCAGAAATAAAAGAAGGGCAAACCACTGACTGGATCAACATTAACAGCGATAACGACAATAAACGCTGCGTCTCAAAAATCACCTTTTCGGGTCATACGGTGAACTCATCGGATATGGCCACGCTGAAAATTATCGGCGACGACTAA',
        'ATGAAGCAGCATAAGGCGATGATTGTCGCCCTGATCGTCATCTGTATCACCGCCGTAGTGGCGGCGCTGGTAACGAGAAAAGACCTCTGTGAGGTTCACATCCGAACTGGCCAGACGGAGGTTGCTGTTTTCACGGCTTACGAATCCGAGTAA',
        'ATGTCCTTGATTAACACCAAAATTAAACCTTTTAAAAACCAGGCATTCAAAAACGGCGAATTCATCGAAATCACCGAAAAAGATACCGAAGGCCGCTGGAGCGTCTTCTTCTTCTACCCGGCTGACTTTACTTTCGTATGCCCGACCGAACTGGGTGACGTTGCTGACCACTACGAAGAACTGCAGAAACTGGGCGTAGACGTATACGCAGTATCTACCGATACTCACTTCACCCACAAAGCATGGCACAGCAGCTCTGAAACCATCGCTAAAATCAAATATGCGATGATCGGCGACCCGACTGGCGCCCTGACCCGTAACTTCGACAACATGCGTGAAGATGAAGGTCTGGCTGACCGTGCGACCTTCGTTGTTGACCCGCAGGGTATCATCCAGGCAATCGAAGTTACCGCTGAAGGCATTGGCCGTGACGCGTCTGACCTGCTGCGTAAAATCAAAGCAGCACAGTACGTAGCTTCTCACCCAGGTGAAGTTTGCCCGGCTAAATGGAAAGAAGGTGAAGCAACTCTGGCTCCGTCTCTGGACCTGGTTGGTAAAATCTAA',
        'ATGTATAAGACAATCATTATGCCAGTTGATGTATTTGAAATGGAATTGAGCGACAAAGCTGTTCGCCACGCTGAATTCCTCGCCCAGGATGACGGAGTTATTCATCTACTTCACGTACTACCCGGGTCAGCCAGCCTGAGCCTGCACCGTTTTGCCGCTGATGTGCGTCGTTTTGAAGAGCATCTGCAACATGAAGCACAAGAACGTCTGCAAACGATGGTCAGCCACTTCACCATCGATCCTTCCCGCATTAAACAACATGTCCGTTTTGGTAGCGTGCGGGATGAAGTCAATGAGTTGGCAGAAGAACTGGGGGCTGATGTTGTAGTTATTGGTTCTCGCAACCCATCGATTTCGACCCATCTGTTAGGTTCTAACGCCTCGAGCGTGATCCGCCACGCCAATCTGCCGGTGCTGGTTGTGCGTTAA',
        'ATGTCCAGACCAACTATCATCATTAACGACCTGGATGCCGAACGCATCGATATTCTGCTGGAGCAACCCGCCTATGCTGGTTTGCCAATCGCCGACGCGTTAAACGCAGAGTTGGATCGCGCCCAAATGTGTTCGCCAGAAGAGATGCCACACGACGTGGTGACAATGAACAGCCGGGTTAAATTCCGCAATCTTAGCGATGGCGAAGTGCGTGTGCGCACGCTGGTGTATCCGGCAAAAATGACCGATAGCAATACTCAGCTTTCCGTTATGGCTCCGGTAGGTGCCGCACTGCTGGGGCTGCGCGTTGGCGATTCCATTCACTGGGAACTTCCGGGCGGCGTTGCAACCCACCTTGAAGTGCTGGAACTCGAATACCAGCCAGAAGCTGCTGGCGACTACCTGCTTTAA',
        'ATGAAAGCATTCTGGCGTAACGCCGCGTTGCTCGCGGTTTCTCTGCTTCCCTTCTCTTCTGCCAACGCCTTAGCGTTGCAGGCAAAACAGTATGGCGATTTTGATCGCTATGTCCTGGCCCTCTCCTGGCAAACCGGATTTTGCCAGAGTCAACACGATCGAAATCGTAACGAACGAGATGAATGTCGCCTGCAAACCGAAACGACCAACAAAGCTGATTTTCTGACCGTACATGGTCTGTGGCCAGGATTGCCTAAATCGGTTGCTGCCCGTGGTGTTGATGAACGCCGCTGGATGCGCTTCGGTTGCGCTACTCGCCCAATCCCGAATCTACCAGAAGCGCGCGCCAGCCGAATGTGTTCATCGCCGGAAACCGGATTATCACTGGAAACGGCCGCTAAACTAAGTGAAGTCATGCCAGGAGCTGGCGGACGTTCCTGCCTGGAACGCTACGAATATGCCAAACACGGTGCCTGCTTTGGTTTTGATCCGGACGCATACTTCGGTACGATGGTACGCCTGAATCAAGAAATTAAAGAGAGCGAAGCCGGAAAATTCCTTGCGGATAATTACGGTAAAACAGTGAGCCGCCGTGACTTTGACGCCGCCTTTGCCAAAAGCTGGGGAAAAGAGAACGTGAAAGCAGTTAAGCTAACGTGCCAGGGTAACCCTGCGTATTTGACTGAAATTCAGATCTCGATCAAAGCTGACGCCATCAACGCTCCGCTTTCTGCAAACTCATTCTTGCCACAACCTCACCCAGGTAACTGTGGCAAAACCTTTGTGATTGATAAAGCGGGTTATTAA',
        'ATGCACCTGCTTCCTGAACTCGCCAGCCACCATGCGGTATCAATTCCCGAGCTGCTCGTCAGCCGGGATGAAAGGCAAGCACGGCAACACGTCTGGCTCAAGCGCCATCCTGTTCCACTGGTCTCCTTTACCGTGGTTGCGCCTGGGCCGATTAAAGACAGCGAGGTCACACGCCGAATTTTTAATCATGGCGTGACAGCCTTGCGTGCCTTAGCCGCAAAACAGGGCTGGCAAATTCAGGAGCAGGCTGCACTGGTTTCCGCCAGCGGGCCGGAGGGCATGTTGAGCATTGCCGCCCCGGCTCGCGACCTCAAGCTCGCCACCATTGAGCTTGAACATAGTCATCCTCTCGGGCGGTTATGGGATATCGATGTCCTGACGCCCGAAGGCGAAATTCTCTCCCGCCGCGACTATTCACTGCCGCCTCGCCGCTGCCTGTTGTGCGAACAAAGCGCAGCCGTCTGCGCGCGTGGAAAAACCCATCAACTGACCGATTTACTCAACCGCATGGAGGCACTGCTGAACGATGTCGATGCCTGCAACGTCAACTAA',
        'ATGATTTCCGCTTCGCTGCAACAACGTAAAACTCGCACCCGCCGCAGCATGTTGTTTGTGCCTGGTGCCAATGCCGCGATGGTCAGCAACTCCTTCATCTACCCGGCTGATGCCCTGATGTTTGACCTCGAAGACTCCGTAGCATTGCGTGAAAAAGACACCGCCCGCCGCATGGTTTACCACGCGCTGCAACATCCGCTGTATCGCGATATTGAAACCATTGTGCGTGTCAACGCGCTGGATTCCGAATGGGGTGTTAACGACCTGGAAGCCGTCGTTCGCGGTGGTGCGGACGTTGTGCGTCTGCCGAAAACCGATACCGCTCAGGATGTTCTGGATATTGAAAAAGAGATCCTGCGTATCGAAAAAGCCTGTGGTCGTGAACCCGGCAGCACCGGCCTGCTGGCGGCGATTGAATCTCCGCTGGGGATTACCCGCGCAGTGGAAATCGCTCACGCTTCCGAGCGTTTGATCGGTATCGCCCTCGGTGCAGAAGACTATGTGCGCAACCTGCGTACAGAACGCTCCCCGGAAGGAACTGAACTGCTGTTCGCACGCTGTTCCATTTTGCAGGCCGCGCGCTCTGCGGGTATTCAGGCGTTCGATACCGTCTATTCCGACGCTAACAACGAAGCCGGATTTCTGCAAGAAGCCGCCCACATCAAACAGCTGGGCTTTGACGGCAAATCGCTGATCAACCCGCGTCAGATTGATCTGCTGCACAACCTCTACGCACCGACCCAGAAAGAAGTGGATCACGCCCGCCGCGTCGTAGAAGCCGCTGAAGCCGCCGCTCGCGAAGGCCTCGGCGTGGTTTCCCTGAACGGCAAGATGGTGGACGGTCCGGTTATCGATCGCGCCCGTCTGGTGCTCTCCCGTGCAGAACTTTCCGGCATCCGCGAAGAATAA',
        'ATGAAAATAAACCAGCCCGCCGTTGCAGGCACCCTTGAGTCTGGGGATGTGATGATACGCATCGCCCCACTCGATACGCAGGATATCGACCTGCAAATCAATAGCAGCGTTGAGAAACAGTTTGGCGATGCAATTCGCACCACCATTCTGGACGTTCTCGCCCGCTACAACGTGCGCGGCGTACAGCTGAATGTCGATGACAAAGGCGCACTGGACTGCATTTTACGTGCACGACTGGAAGCCCTGCTGGCACGCGCCAGCGGTATCCCGGCTCTGCCATGGGAGGATTGCCAATGA',
        'ATGTCTAAGATTAAAGGTAACGTTAAGTGGTTTAATGAGTCCAAAGGATTCGGTTTCATTACTCCGGAAGACGGCAGCAAAGACGTGTTCGTACACTTCTCTGCAATCCAGACTAATGGTTTTAAAACTCTTGCTGAAGGTCAGCGCGTAGAGTTCGAAATCACTAACGGTGCCAAAGGCCCTTCTGCTGCAAACGTAATCGCTCTGTAA',
        'ATGAACGTGAGTAAATATGTCGCTATCTTTTCCTTTGTTTTTATTCAGTTAATCAGCGTTGGTAAAGTTTTTGCTAACGCAGATGAGTGGATGACAACGTTTAGAGAAAATATTGCACAAACCTGGCAACAGCCTGAACATTATGATTTATATATTCCTGCCATCACCTGGCATGCACGTTTCGCTTACGACAAAGAAAAAACCGATCGCTATAACGAGCGACCGTGGGGTGGCGGTTTTGGCCTGTCGCGTTGGGATGAAAAAGGAAACTGGCATGGCCTGTATGCCATGGCATTTAAGGACTCGTGGAACAAATGGGAACCGATTGCCGGATACGGATGGGAAAGTACCTGGCGACCGCTGGCGGATGAAAATTTTCATTTAGGTCTGGGATTCACCGCTGGCGTAACGGCACGCGATAACTGGAATTACATCCCTCTCCCGGTTCTACTGCCATTGGCCTCCGTGGGTTATGGCCCAGTGACTTTTCAGATGACCTACATTCCGGGTACCTACAACAATGGCAATGTGTACTTTGCCTGGATGCGCTTTCAGTTTTGA',
        'GTGTTACAACTTCTTTTAGCAGTTTTTATTGGCGGTGGTACGGGAAGCGTGGCGAGATGGCTGTTAAGTATGCGATTTAACCCACTGCATCAGGCGATTCCGTTGGGGACGCTGACAGCAAACCTGATTGGGGCATTCATCATAGGAATAGGATTCGCATGGTTCAGCAGGATGACGAACATTGATCCAGTGTGGAAAGTATTAATCACCACCGGATTTTGTGGCGGTCTAACAACCTTCTCAACATTTTCGGCAGAAGTGGTGTTTTTGTTACAAGAGGGCCGCTTTGGCTGGGCATTACTGAACGTTTTCGTCAACCTTCTGGGGTCTTTTGCCATGACCGCACTGGCATTCTGGCTGTTTTCGGCCTCAACCGCACACTAA'
        'ATGGGTGAGATTAGTATTACCAAACTGCTGGTAGTTGCGGCGCTGGTCGTTCTGCTGTTTGGGACTAAGAAGTTACGTACGCTGGGCGGAGACCTTGGAGCGGCCATTAAAGGGTTCAAGAAGGCGATGAATGATGACGATGCTGCGGCGAAAAAAGGCGCAGACGTTGATCTTCAGGCTGAAAAGCTCTCTCATAAAGAGTGA',
        'TTGTATCAGGATAAAATTCTTGTCCGCCAGCTCGGTCTTCAGCCTTACGAGCCAATCTCCCAGGCTATGCATGAATTCACCGATACCCGCGATGATAGTACCCTTGATGAAATCTGGCTGGTCGAGCACTATCCGGTATTCACCCAAGGTCAGGCAGGAAAAGCGGAGCACATTTTAATGCCGGGTGATATTCCGGTGATCCAGAGCGATCGCGGTGGGCAGGTGACTTATCACGGGCCGGGGCAACAGGTGATGTATGTGTTGCTTAACCTGAAACGCCGTAAACTCGGTGTGCGTGAACTGGTGACCTTGCTTGAGCAAACAGTGGTGAATACCCTGGCTGAACTGGGTATAGAAGCGCATCCTCGGGCTGACGCGCCAGGTGTCTATGTTGGGGAAAAGAAAATTTGCTCACTGGGTTTACGTATTCGACGCGGTTGTTCATTCCACGGTCTGGCATTAAACGTCAATATGGATCTTTCACCATTTTTACGTATTAATCCTTGTGGGTATGCCGGAATGGAAATGGCTAAAATATCACAATGGAAACCCGAAGCGACGACTAATAATATTGCTCCACGTTTACTGGAAAATATTTTAGCGCTACTAAACAATCCGGACTTCGAATATATTACCGCTTAA',
        'ATGAAAACCAAACTTAACGAACTGCTTGAATTCCCTACTCCTTTTACTTACAAAGTTATGGGGCAGGCGTTACCTGAGCTGGTTGATCAGGTGGTTGAAGTGGTACAGCGCCATGCGCCAGGTGACTACACCCCAACGGTAAAACCAAGCAGCAAAGGCAACTACCACTCGGTATCTATCACTATCAACGCCACTCATATCGAGCAGGTTGAAACACTGTATGAAGAACTGGGCAAAATCGATATTGTCCGCATGGTTCTGTAA',
        'ATGGTCGATATGAAAACTACGCATACCTCCCTCCCCTTTGCCGGACATACGCTGCATTTTGTTGAGTTCGATCCGGCGAATTTTTGTGAGCAGGATTTACTCTGGCTGCCGCACTACGCACAACTGCAACACGCTGGACGTAAACGTAAAACAGAGCATTTAGCCGGACGGATCGCTGCTGTTTATGCTTTGCGGGAATATGGCTATAAATGTGTGCCCGCAATCGGCGAGCTACGCCAACCTGTCTGGCCTGCGGAGGTATACGGCAGTATTAGCCACTGTGGGACTACGGCATTAGCCGTGGTATCTCGTCAACCGATTGGCATTGATATAGAAGAAATTTTTTCTGTACAAACCGCAAGAGAATTGACAGACAACATTATTACACCAGCGGAACACGAGCGACTCGCAGACTGCGGTTTAGCCTTTTCTCTGGCGCTGACACTGGCATTTTCCGCCAAAGAGAGCGCATTTAAGGCAAGTGAGATCCAAACTGATGCAGGTTTTCTGGACTATCAGATAATTAGCTGGAATAAACAGCAGGTCATCATTCATCGTGAGAATGAGATGTTTGCTGTGCACTGGCAGATAAAAGAAAAGATAGTCATAACGCTGTGCCAACACGATTAA',
        'ATGTCCTTGATTAACACCAAAATTAAACCTTTTAAAAACCAGGCATTCAAAAACGGCGAATTCATCGAAATCACCGAAAAAGATACCGAAGGCCGCTGGAGCGTCTTCTTCTTCTACCCGGCTGACTTTACTTTCGTATGCCCGACCGAACTGGGTGACGTTGCTGACCACTACGAAGAACTGCAGAAACTGGGCGTAGACGTATACGCAGTATCTACCGATACTCACTTCACCCACAAAGCATGGCACAGCAGCTCTGAAACCATCGCTAAAATCAAATATGCGATGATCGGCGACCCGACTGGCGCCCTGACCCGTAACTTCGACAACATGCGTGAAGATGAAGGTCTGGCTGACCGTGCGACCTTCGTTGTTGACCCGCAGGGTATCATCCAGGCAATCGAAGTTACCGCTGAAGGCATTGGCCGTGACGCGTCTGACCTGCTGCGTAAAATCAAAGCAGCACAGTACGTAGCTTCTCACCCAGGTGAAGTTTGCCCGGCTAAATGGAAAGAAGGTGAAGCAACTCTGGCTCCGTCTCTGGACCTGGTTGGTAAAATCTAA'
        ]

In [92]:
# background frequencies and counts
seqs_concat = ''.join(seqs)
background_counts = [seqs_concat.count(ss) for ss in SS_TYPES]
background_freqs = [count / len(seqs_concat) for count in background_counts]
# print({ss:f for ss, f in zip(SS_TYPES, background_freqs)})

# get initial random n-mers
random_nmers = []
for seq in seqs:
  effective_len = len(seq) + 1 - motif_length
  nmer_start = np.random.choice(range(effective_len))
  random_nmer = seq[nmer_start:nmer_start + motif_length]
  random_nmers.append(random_nmer)
# print(random_nmers)

In [93]:
def optimize_nmer(seq_i, seqs=seqs, random_nmers=random_nmers, motif_length=motif_length,
                  background_counts=background_counts, pseudocount=pseudocount):
  # removing optimized nmer
  seq = seqs[seq_i]
  effective_len = len(seq) + 1 - motif_length
  optimized_nmer = random_nmers[seq_i]
  nmers_for_counts = random_nmers.copy()
  nmers_for_counts.remove(nmers_for_counts[seq_i])
  # counting SS in nmers other than optimized
  counts = np.zeros((motif_length, len(SS_TYPES)))
  for nmer in nmers_for_counts:
    for i, char in enumerate(nmer):
      index = CHAR_TO_INDEX[char]
      counts[i, index] += 1
  # adding optimized nmer to background counts
  background_counts_with_nmer = [optimized_nmer.count(ss) + background + pseudocount*4 for ss, background in zip(SS_TYPES, background_counts)]
  # getting background freqs
  background_freqs_with_nmer = [(bc) / (sum(background_counts_with_nmer)) for bc in background_counts_with_nmer]
  # calculating pwm
  pwm = np.zeros_like(counts)
  for i in range(len(SS_TYPES)):
    pwm[:, i] = np.log((counts[:, i]/sum(counts[:, i]) + pseudocount)/(background_freqs_with_nmer[i]))
  # calculate pwms for each nmer in sequence
  pwms_all = []
  for nmer_i in range(effective_len):
    nmer = seq[nmer_i:nmer_i + motif_length]
    pwms_all.append(np.sum([pwm[i, CHAR_TO_INDEX[ss]] for i, ss in enumerate(nmer)]))
    optimal_nmer_i = np.where(pwms_all==max(pwms_all))[0][0]
    random_nmers[seq_i] = seqs[seq_i][optimal_nmer_i:optimal_nmer_i + motif_length]
  return random_nmers, optimal_nmer_i, pwm

In [96]:
# get optimal nmer for each sequence
n_nochange = 0
optimal_nmer_idx = []
max_F = 0
F_values = []
for iter in tqdm(range(n_iter)):
  for seq_i in range(len(seqs)):
    random_nmers, optimal_nmer_i, pwm = optimize_nmer(seq_i)
    optimal_nmer_idx.append(optimal_nmer_i)
  # calculate F
  counts = np.zeros((motif_length, len(SS_TYPES)))
  for nmer in random_nmers:
    for i, char in enumerate(nmer):
      index = CHAR_TO_INDEX[char]
      counts[i, index] += 1
  new_F = sum(sum(counts * pwm))
  F_values.append(new_F)
  if new_F > max_F:
    max_F = new_F
  else:
    n_nochange += 1
  if n_nochange > 50:
    print('Completed after {n} iterations'.format(n=n_nochange))
    break

  1%|          | 11/1000 [00:05<07:31,  2.19it/s]

Completed after 11 iterations





In [98]:
random_nmers

['ACGGTGC',
 'GCGATGA',
 'AAGATGC',
 'GCGATAA',
 'CCGATGC',
 'CCGATGC',
 'GCGATAA',
 'GCGATGA',
 'GCGATGA',
 'CCGGTGC',
 'GAGATGC',
 'GAGATGA',
 'TCGATGC',
 'CCGATAC',
 'GCGATGC',
 'ACGGTGC',
 'GCGATAA',
 'GCGATGA',
 'GCGATGA',
 'ACGGTAA',
 'CAGATAA',
 'GCGATGA']