<a href="https://colab.research.google.com/github/ahmedembeddedx/BioInformatics/blob/main/DNAaBox.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
import pandas as pd
from tqdm import tqdm

In [3]:
# Read the genome text file from the URL below. header=None means no row is
# treated as a header, and the single column is named 'Sequence'.
df = pd.read_table('https://bioinformaticsalgorithms.com/data/realdatasets/Replication/Vibrio_cholerae.txt', header=None, names=['Sequence'])

# Keep the value of the first row as the genome string.
# NOTE(review): assumes the whole genome sits on the first line of the file - confirm.
genome = df.Sequence[0]

# Preview the first 100 characters as the cell output.
genome[:100]

'ACAATGAGGTCACTATGTTCGAGCTCTTCAAACCGGCTGCGCATACGCAGCGGCTGCCATCCGATAAGGTGGACAGCGTCTATTCACGCCTTCGTTGGCA'

In [48]:
def find_kmer(k:int, genome: str, threshold: int = 3):
  """Count every overlapping k-mer of ``genome`` and keep the frequent ones.

  Parameters
  ----------
  k : int
    Length of the substrings (k-mers) to count. Must be at least 1.
  genome : str
    Sequence to scan.
  threshold : int, default 3
    A k-mer is kept only if it occurs strictly more than ``threshold`` times.

  Returns
  -------
  pandas.DataFrame or None
    Table with the columns ``k_mers`` and ``count`` in the order produced by
    ``Series.value_counts`` (most frequent first). ``None`` is returned when
    ``genome`` is shorter than ``k``, i.e. when no k-mer can be formed.

  Raises
  ------
  ValueError
    If ``k`` is smaller than 1.
  """
  if k < 1:
    raise ValueError(f"k must be a positive integer, got {k}")
  if len(genome) < k:
    return None

  # One window per start position: len(genome) - k + 1 overlapping k-mers.
  windows = pd.Series(
    [genome[start:start + k] for start in range(len(genome) - k + 1)],
    name='k_mers',
  )

  # Name both columns explicitly instead of relying on the default labels of
  # value_counts(), which differ between pandas 1.x and pandas 2.x.
  table = windows.value_counts().rename_axis('k_mers').reset_index(name='count')

  return table[table['count'] > threshold]

In [53]:
# Collect the 9-mers of the genome that pass find_kmer's frequency filter.
kmers = find_kmer(9, genome)

In [54]:
# Display the first rows of the k-mer table as the cell output.
kmers.head()

Unnamed: 0,k_mers,count
0,GCGTTTGTT,128
1,TAACGCCCG,127
2,AACGCCCGC,122
3,CGTTTGTTA,122
4,GGCGTTTGT,120


In [58]:
def similar_kmers(gen1:str, gen2:str)->bool:
  """Tell whether ``gen1`` can be turned into ``gen2`` by a fixed character substitution.

  Each distinct character of ``gen1`` must correspond to one and the same
  character of ``gen2`` at every position where it occurs.

  The check is one-directional: two different characters of ``gen1`` may map
  to the same character of ``gen2``, so ``similar_kmers(a, b)`` and
  ``similar_kmers(b, a)`` can disagree (``('AB', 'AA')`` is True while
  ``('AA', 'AB')`` is False).

  Parameters
  ----------
  gen1 : str
    Sequence whose characters are the substitution sources.
  gen2 : str
    Sequence whose characters are the substitution targets.

  Returns
  -------
  bool
    False if the lengths differ or some character of ``gen1`` is paired with
    two different characters of ``gen2``; True otherwise (including for two
    empty strings).
  """
  if len(gen1) != len(gen2):
    return False

  substitution = {}
  for source, target in zip(gen1, gen2):
    # setdefault records the first pairing seen for `source` and returns the
    # recorded one afterwards, so no exception handling is needed here.
    if substitution.setdefault(source, target) != target:
      return False
  return True

In [73]:
def DNAaBox(df):
    """Collect every pair of rows whose k-mers satisfy ``similar_kmers``.

    Parameters
    ----------
    df : object exposing a ``k_mers`` column (e.g. a pandas DataFrame)
        The k-mers to compare. Rows are addressed by position, so the index
        labels of ``df`` do not matter.

    Returns
    -------
    list of tuple
        ``(earlier_kmer, later_kmer)`` for each pair of rows ``i < j`` where
        ``similar_kmers(earlier_kmer, later_kmer)`` is true, in row order.
    """
    # Materialise the column once: plain list indexing is positional, whereas
    # ``df.k_mers[i]`` looks up the *label* ``i`` and fails on a frame whose
    # index does not start at 0 (for example a slice such as ``kmers[100:200]``).
    sequences = list(df.k_mers)
    n_rows = len(sequences)

    matches = []
    # n * (n - 1) / 2 unordered pairs are compared in total.
    with tqdm(total=(n_rows * (n_rows - 1)) // 2, desc="Processing") as pbar:
        for i in range(n_rows - 1):
            first = sequences[i]
            for j in range(i + 1, n_rows):
                second = sequences[j]
                if similar_kmers(first, second):
                    matches.append((first, second))
                pbar.update(1)

    return matches

In [75]:
# Only the first 200 rows of `kmers` are passed in; DNAaBox compares every
# pair of the rows it receives.
print('Possible DNAa Boxs')
DNAaBox(kmers[:200])

Possible DNAa Boxs


Processing: 100%|██████████| 19900/19900 [00:00<00:00, 51085.15it/s]


[('CGTTTGTTA', 'CGCCCGCCT'),
 ('TGGGACTGG', 'CAAACACAA'),
 ('TGGGACTGG', 'CAAACTCAA'),
 ('TGGGACTGG', 'CAAAACCAA'),
 ('GGGACTGGA', 'AAACACAAC'),
 ('GGGACTGGA', 'AAACCAAAC'),
 ('GACTGGAAA', 'CAAGCCAAA'),
 ('ACTGGAAAC', 'TTTGGTTTT'),
 ('GGACTGGAA', 'TTTGATTTT'),
 ('GGACTGGAA', 'TTTGGTTTT'),
 ('CCGCCTAAG', 'AAAAAGCCA'),
 ('CGCCTAAGG', 'ACAACAACC'),
 ('CGCCTAAGG', 'AAAAGCCAA'),
 ('GCCCGCCTA', 'GTTTGTTAT'),
 ('GCCCGCCTA', 'GTTTGTTAG'),
 ('CCTAAGGGG', 'TTTGGTTTT'),
 ('CGCGTTGAC', 'ACACAACAA'),
 ('GCGTTGACA', 'TTTGGTTTT'),
 ('GCCTAAGGG', 'AAAGCCAAA'),
 ('GCCTAAGGG', 'ATTGAAAAA'),
 ('CTTGAGGCG', 'AAACGCCAC'),
 ('GTTGACAGT', 'AAAACTCAA'),
 ('CAGTCCCTC', 'ACTCAAACA'),
 ('AGTCCCTCT', 'CTCAAACAC'),
 ('CCTCTTGAG', 'CCGCGGCTC'),
 ('ACAGTCCCT', 'AAACCAAAC'),
 ('CACGCGTTG', 'CGCGCGTTG'),
 ('GCCACGCGT', 'GCCGCGCGT'),
 ('AGGGGCTGG', 'CAAAACCAA'),
 ('GGCAACGCA', 'AAACCAAAC'),
 ('ATGGGACTG', 'CCAAACTCA'),
 ('ATGGGACTG', 'CAAAACCAA'),
 ('AATGGGACT', 'TTTGGGTAT'),
 ('CCACGCGTT', 'CCGCGCGTT'),
 ('GCAACGCAT',