<a href="https://colab.research.google.com/github/anihab/tokenization/blob/main/vocabulary.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Libraries
import argparse
import gzip
import os

import pandas as pd
import numpy as np

!pip install Bio
!pip install tokenizers
from Bio import SeqIO
from tokenizers import Tokenizer, models, trainers, normalizers

Collecting Bio
  Downloading bio-1.6.2-py3-none-any.whl (278 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m278.6/278.6 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting biopython>=1.80 (from Bio)
  Downloading biopython-1.83-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
Collecting mygene (from Bio)
  Downloading mygene-3.2.2-py2.py3-none-any.whl (5.4 kB)
Collecting gprofiler-official (from Bio)
  Downloading gprofiler_official-1.0.0-py3-none-any.whl (9.3 kB)
Collecting biothings-client>=0.2.6 (from mygene->Bio)
  Downloading biothings_client-0.3.1-py2.py3-none-any.whl (29 kB)
Installing collected packages: biopython, gprofiler-official, biothings-client, mygene, Bio
Successfully installed Bio-1.6.2 biopython-1.83 biothings-client-0.3.1 gprofiler-official-1.0.0 mygene-3.2.2


In [3]:
# Mount Google Drive when running on Colab. Outside Colab the google.colab
# package is not importable, so the mount is skipped instead of failing the cell.
try:
  from google.colab import drive
except ImportError:
  IN_COLAB = False
else:
  IN_COLAB = True
  drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Train a BPE tokenizer on the full bacterial training directory to create the vocabulary file.

### Step 0

define vocabulary size and input data path
- input should be a path to a directory of bacterial fasta files

In [13]:
# Globals
# vocabulary size passed to the BPE trainer
VOCAB_SIZE = 4096

# Input
# Path to a directory of bacterial fasta files. Set the TOKENIZER_INPUT_PATH
# environment variable to point the notebook at another location without
# editing this cell; the default below is used when the variable is unset.
INPUT_PATH = os.environ.get(
  "TOKENIZER_INPUT_PATH",
  "/ocean/projects/bio230026p/lindseyl/DATA/SEGMENTS/bacteria",
)

### Step 1

parse through all fasta files in full bacterial training directory
- to make input similar to input for pre-training, use sequences of at least 1500 nt (i.e., min sequence length = 1500)
- do not limit max sequence length

In [14]:
def _list_fasta_paths(input_path):
  '''
  input: a directory of fasta files or a .txt file that has a list of fasta files
  output: a list of fasta file paths (empty if input_path is neither)
  '''
  if os.path.isdir(input_path):
    # sorted so the result does not depend on the order os.listdir happens to return
    candidates = (os.path.join(input_path, name) for name in sorted(os.listdir(input_path)))
    # sub-directories and other non-file entries are skipped
    return [path for path in candidates if os.path.isfile(path)]
  if os.path.isfile(input_path) and input_path.endswith('.txt'):
    # one path per line, used exactly as written; blank lines are ignored and
    # no existence check is done here
    with open(input_path, 'r', encoding='utf-8') as listing:
      return [line.strip() for line in listing if line.strip()]
  return []


def _iter_long_sequences(source, min_length):
  '''
  input: a fasta file path or an open text handle, and the minimum length to keep
  output: yields the upper-cased sequence of every record with at least min_length characters
  '''
  for record in SeqIO.parse(source, 'fasta'):
    seq = str(record.seq).upper()
    if len(seq) >= min_length:
      yield seq


def parse_sequences(input_path, min_length=1500):
  '''
  input: a directory of fasta files or a .txt file that has a list of fasta files
         (one path per line); files whose name ends in .gz are read as gzip text
         min_length - shortest sequence to keep (default 1500); there is no maximum length
  output: a list of upper-cased sequences, empty if input_path is neither a
          directory nor a .txt file
  '''
  sequences = [] # the only information we need is the sequences
  for path in _list_fasta_paths(input_path):
    if path.endswith('.gz'):
      # the with-block closes the gzip handle once the file has been fully parsed
      with gzip.open(path, 'rt', encoding='utf-8') as handle:
        sequences.extend(_iter_long_sequences(handle, min_length))
    else:
      sequences.extend(_iter_long_sequences(path, min_length))
  return sequences

### Step 2

build vocabulary json - limit vocab size to 4096

In [15]:
def build_vocab(sequences, vocab_size=None, output_path="my_vocabulary.json"):
  '''
  input: a list of sequences
         vocab_size - vocabulary size given to the BPE trainer; when None the
                      notebook-level VOCAB_SIZE is read at call time
         output_path - where the trained tokenizer json is written
  output: a vocabulary json file
  raises: ValueError if sequences is an empty collection
  '''
  # refuse to train on nothing, otherwise a vocabulary file is written silently
  try:
    is_empty = len(sequences) == 0
  except TypeError:
    is_empty = False # iterators/generators have no len(); let the trainer consume them
  if is_empty:
    raise ValueError("cannot build a vocabulary from an empty list of sequences")
  if vocab_size is None:
    vocab_size = VOCAB_SIZE

  tokenizer = Tokenizer(models.BPE())
  # NFKC unicode normalization is the only normalization applied to the input
  tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC()])
  # train the tokenizer on DNA sequences
  trainer = trainers.BpeTrainer(vocab_size=vocab_size)
  tokenizer.train_from_iterator(sequences, trainer=trainer)
  tokenizer.save(output_path)

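### Step 3

run both steps on the input: parse the sequences from the input path, then train the tokenizer and write the vocabulary json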

In [17]:
# build vocabulary
sequences = parse_sequences(INPUT_PATH)
# Stop here when nothing was parsed (for example INPUT_PATH is not a directory
# on this machine) rather than training a tokenizer on no data.
if not sequences:
  raise ValueError(f"parse_sequences returned no sequences for INPUT_PATH={INPUT_PATH!r}")
build_vocab(sequences)