<a href="https://colab.research.google.com/github/anihab/tokenization/blob/main/Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [18]:
# Core data libraries used throughout the notebook
import pandas as pd
import numpy as np

# Install the third-party packages only when google.colab can be imported
try:
    import google.colab
    # Running on Google Colab, so install Biopython first
    !pip install biopython
    # tokenizers / transformers are needed for byte pair encoding
    !pip install tokenizers
    !pip install transformers
except ImportError:
    # google.colab is not importable here, so the pip installs are skipped
    pass

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [None]:
# Mount Google Drive at /content/drive (requires the google.colab package)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Resources:
# https://www.tutorialspoint.com/biopython/biopython_sequence_io_operations.htm
# https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt
# https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt
# https://huggingface.co/docs/transformers/model_doc/roberta
# https://huggingface.co/docs/tokenizers/pipeline
# https://www.philschmid.de/fine-tune-a-non-english-gpt-2-model-with-huggingface

In [21]:
import os

from Bio import SeqIO
from google.colab import files

# Token budget per sequence chunk; tokenize() derives the maximum raw
# sequence length for each method from this value.
# NOTE(review): presumably 512 minus two special tokens for the downstream
# model -- confirm against the model that consumes these CSVs.
MAX_TOKENS = 510

## Given a phage directory and a bacteria directory, tokenize all fasta files according to method of choice 

def read_files(phage_dir, bacteria_dir, method, *args, **kwargs):
  """Tokenize every regular file found directly inside two directories.

  Files in phage_dir are tokenized with label 1 and files in bacteria_dir
  with label 0. Entries that are not regular files (e.g. sub-directories)
  are skipped.

  Input:
    phage_dir -- str, directory containing phage fasta files
    bacteria_dir -- str, directory containing bacteria fasta files
    method -- str, tokenization method passed through to tokenize()
    k -- int, optional kmer length; given as keyword k=... or as the first
         extra positional argument
  """
  k = kwargs.get('k', args[0] if args else None)
  for directory, label in ((phage_dir, 1), (bacteria_dir, 0)):
    for filename in os.listdir(directory):
      path = os.path.join(directory, filename)
      if os.path.isfile(path):
        # k has to be passed by keyword: tokenize() looks it up in **kwargs,
        # so a positional k would be dropped and kmer mode would see None.
        tokenize(path, label, method, k=k)

## Tokenizes the sequences of one fasta file with the chosen method

"""\
Runs one fasta file through the chosen tokenizer and gives every row the
supplied label (1 for phage, 0 for bacteria). Then shuffles the rows in the
dataframe and saves to CSV

Input:
  filepath -- str, path to the fasta file
  label -- int, 1 for phage or 0 for bacteria
  method -- str, tokenization method of choice ('codon', 'kmer' or 'bpe')
  k -- int, length of k if using kmer tokenization
"""
def tokenize(filepath, label, method, *args, **kwargs):
  """Tokenize one fasta file, label the rows, shuffle them and save to CSV.

  Input:
    filepath -- str, path to the fasta file
    label -- int, label stored on every row (1 for phage, 0 for bacteria)
    method -- str, one of 'codon', 'kmer' or 'bpe'
    k -- int, kmer length (required for 'kmer'); accepted as keyword k=...
         or as the first extra positional argument
    random_state -- optional keyword forwarded to DataFrame.sample so the
         shuffle can be reproduced; omitted means an unseeded shuffle

  Returns:
    df -- dataframe with name, start, sequence, tokenized and label columns

  Raises:
    ValueError -- if method is unknown, or method is 'kmer' and k is missing
  """
  # Accept k either way so a positionally passed k is no longer dropped.
  k = kwargs.get('k', args[0] if args else None)
  filename = os.path.basename(filepath)

  # Maximum number of raw characters per chunk for the chosen method.
  if method == 'codon':
    max_length = MAX_TOKENS * 3
  elif method == 'kmer':
    if k is None:
      raise ValueError("method 'kmer' requires k")
    # NOTE(review): a chunk of this length yields MAX_TOKENS - 2*(k - 1)
    # overlapping kmers; MAX_TOKENS + (k - 1) would fill the budget exactly.
    # Kept as-is -- confirm which one is intended.
    max_length = MAX_TOKENS - (k - 1)
  elif method == 'bpe':
    max_length = MAX_TOKENS
  else:
    raise ValueError("unknown tokenization method: %r" % (method,))

  # Process data to get sequences of appropriate length
  df = preprocess_data(filepath, max_length)
  sequences = df['sequence'].values.tolist()

  # Tokenize according to chosen method
  if method == 'codon':
    tokens = [seq2codon(seq) for seq in sequences]
  elif method == 'kmer':
    tokens = [seq2kmer(seq, k) for seq in sequences]
  else:
    # FOR NOW -- the BPE tokenizer is (re)trained on this file's sequences
    train_bpe_tokenizer(sequences)
    tokens = [seq2bpe(seq) for seq in sequences]
  df['tokenized'] = tokens
  df['label'] = [label] * len(tokens)

  # Shuffle and save to csv
  df = df.sample(frac=1, random_state=kwargs.get('random_state')).reset_index(drop=True)
  write_csv(filename, df)
  return df

"""\
Read fasta file and truncate sequences to appropriate length, returns dataframe

Input:
  file -- str, path to fasta file
  max_length -- int, maximum sequence length

Returns:
  df -- dataframe, includes the > input line, start position, and sequence
""" 
def preprocess_data(file, max_length):
  """Read a fasta file and split every record into chunks of max_length.

  Each sequence is upper-cased and cut into consecutive, non-overlapping
  pieces of at most max_length characters; the last piece holds the
  remainder. A record with an empty sequence still produces one row.

  Input:
    file -- str, path to fasta file
    max_length -- int, maximum sequence length (must be at least 1)

  Returns:
    df -- dataframe with columns name, start and sequence; the columns are
          present even when the file contains no records

  Raises:
    ValueError -- if max_length is smaller than 1
  """
  # A non-positive chunk size can never consume the sequence.
  if max_length < 1:
    raise ValueError("max_length must be at least 1, got %r" % (max_length,))

  records = []
  for record in SeqIO.parse(file, 'fasta'):
    name = str(record.name)
    seq = str(record.seq).upper()
    # Slice by offset instead of repeatedly re-copying the remaining tail.
    # max(len(seq), 1) keeps a single (empty) row for an empty record.
    for pos in range(0, max(len(seq), 1), max_length):
      records.append(
        {
          'name': name,
          'start': pos,
          'sequence': seq[pos:pos + max_length]
        }
      )
  # Explicit columns so callers can index df['sequence'] on an empty file.
  return pd.DataFrame(records, columns=['name', 'start', 'sequence'])

"""\
Read in sequences and tokens to attach labels and return dataframe

Input:
  sequences -- list, original sequences
  tokens -- list, tokenized sequences
  label -- int, 1 for phage or 0 for bacteria

Returns:
  df -- dataframe
""" 
def attach_labels(sequences, tokens, label):
  """Pair each original sequence with its tokens and a constant label.

  Input:
    sequences -- list, original sequences
    tokens -- list, tokenized sequences (one per sequence)
    label -- int, 1 for phage or 0 for bacteria

  Returns:
    df -- dataframe with sequence, tokenized and label columns
  """
  rows = [
    {
      'sequence': sequences[idx],
      'tokenized': tokenized,
      'label': label
    }
    for idx, tokenized in enumerate(tokens)
  ]
  return pd.DataFrame(data=rows)

"""\
Save the given dataframe to two separate csv files:
1. <filename>_full_output.csv includes the name, start position, sequence,
   tokenized sequence, and label.
2. <filename>_tokenized_output.csv includes the tokenized sequence and the
   label.

Input:
  filename -- str, prefix used for the two output file names
  df -- dataframe, full dataframe of tokenized sequences
""" 
def write_csv(filename, df):
  """Write the full and the tokenized-only CSV and trigger their download.

  Input:
    filename -- str, prefix for the two output file names
    df -- dataframe, must contain the 'tokenized' and 'label' columns
  """
  # 1. Every column of the dataframe
  full_path = filename + '_full_output.csv'
  df.to_csv(full_path, encoding='utf-8', index=False)
  files.download(full_path)

  # 2. Only the tokenized sequence and its label
  tokenized_path = filename + '_tokenized_output.csv'
  df[['tokenized', 'label']].to_csv(tokenized_path, encoding='utf-8', index=False)
  files.download(tokenized_path)

## Different tokenization methods

"""\
Convert a sequence to codons

Input:
  seq -- str, original sequence

Returns:
  codons -- str, codons separated by space
"""
def seq2codon(seq):
  """Split a sequence into consecutive, non-overlapping triplets.

  Input:
    seq -- str, original sequence

  Returns:
    codons -- str, codons separated by space; a trailing piece shorter than
              three characters is kept as its own token
  """
  pieces = []
  start = 0
  while start < len(seq):
    pieces.append(seq[start:start + 3])
    start += 3
  return " ".join(pieces)

"""\
Convert a sequence to kmers

Input:
  seq -- str, original sequence
  k -- int, kmer of length k

Returns:
  kmers -- str, kmers separated by space
"""
def seq2kmer(seq, k):
  """Convert a sequence to overlapping kmers (sliding window, step 1).

  Input:
    seq -- str, original sequence
    k -- int, kmer of length k (must be at least 1)

  Returns:
    kmers -- str, kmers separated by space; empty string when the sequence
             is shorter than k

  Raises:
    ValueError -- if k is None or smaller than 1
  """
  # k <= 0 would otherwise silently produce empty or truncated "kmers".
  if k is None or k < 1:
    raise ValueError("k must be a positive integer, got %r" % (k,))
  return " ".join(seq[i:i + k] for i in range(len(seq) - k + 1))

In [83]:
#  TODO: byte pair encoding

from tokenizers import Tokenizer, models, trainers, normalizers
from transformers import PreTrainedTokenizerFast

def train_bpe_tokenizer(sequences, vocab_size=1000, save_path="dna_tokenizer.json"):
  """Train a byte pair encoding tokenizer on the sequences and save it.

  Input:
    sequences -- iterable of str, sequences to train on
    vocab_size -- int, vocabulary size handed to the BPE trainer
                  (default 1000)
    save_path -- str, file the trained tokenizer is written to
                 (default "dna_tokenizer.json", the file seq2bpe loads)
  """
  # Initialize the tokenizer with an empty BPE model
  tokenizer = Tokenizer(models.BPE())

  # Apply Unicode NFKC normalization to the input text before tokenizing
  tokenizer.normalizer = normalizers.Sequence([normalizers.NFKC()])

  # Train the tokenizer on the given sequences
  trainer = trainers.BpeTrainer(vocab_size=vocab_size)
  tokenizer.train_from_iterator(sequences, trainer=trainer)

  # Save the trained tokenizer
  tokenizer.save(save_path)

def seq2bpe(sequence, tokenizer=None):
  """Tokenize one sequence with the trained BPE tokenizer.

  RIGHT NOW, just doing the full sequence. probably want to split it

  Input:
    sequence -- str, original sequence
    tokenizer -- optional, an already loaded tokenizer to reuse; when
                 omitted the tokenizer is loaded from "dna_tokenizer.json"
                 on every call

  Returns:
    list with one str, the decoded tokens of the sequence
  """
  # Load the trained tokenizer unless the caller supplied one to reuse
  if tokenizer is None:
    tokenizer = PreTrainedTokenizerFast(tokenizer_file="dna_tokenizer.json")

  # Plain Python ids are enough for decoding; no tensor framework is needed
  token_ids = tokenizer(sequence).input_ids

  # Wrap the ids as a batch of one so the result stays a one-element list
  return tokenizer.batch_decode([token_ids])

In [84]:
## Test: tokenize one fasta file with label 0 using the 'bpe' method
tokenize("/content/sample_data/GCF_022922415.1_ASM2292241v1_cds_from_genomic.fna", 0, "bpe")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,name,start,sequence,tokenized,label
0,lcl|NZ_CP094094.1_cds_WP_080131717.1_603,0,ATGTCTTTACTTTCTAATCCATTCTTTCATTCGCTCTTCTCCATCA...,[A TGTC TTTA CTT TCTAA TCCATT CTT TCATT CGC TC...,0
1,lcl|NZ_CP094094.1_cds_WP_245066239.1_70,0,ATGCAAGATTTTATCAATGGATTTTTAAAGGCGTGGAAAGCTTGGA...,[A TGCAAGA TTTTA TCAA TGGATTTT TAAAGGC GTGG AA...,0
2,lcl|NZ_CP094094.1_cds_WP_245066250.1_77,0,ATGAATAACAGCGTCATCATTATTGAAAGTCCTAATAAGGTAGCTA...,[ATGAA TAA CAGC GTCA TCA TTATT GAAA GTC CTAA T...,0
3,lcl|NZ_CP094094.1_cds_WP_245066901.1_631,0,ATGGGAAAATTTTCTAAATTAGGCTTTATTTTAGCCACTTTAGGTA...,[ATGG GAAAA TTTT CTAAA TTA GGCTTTA TTTTA GCCA ...,0
4,lcl|NZ_CP094094.1_cds_WP_245065555.1_813,0,ATGCCATTTGAAGCTGTAATCGGGCTAGAAGTCCATGTCCAACTCA...,[ATGC CATT TGAAGC TGTAA TC GGGC TAGAA GTCCA TG...,0
...,...,...,...,...,...
1539,lcl|NZ_CP094094.1_cds_WP_245065929.1_1132,0,ATGCAAAAAAATATATTAAAAATGACTCTGTTGTTGGTTTTCCTCT...,[ATGC AAAAAAA TATA TTAAAAA TGA CTC TGTT GTT GG...,0
1540,lcl|NZ_CP094094.1_cds_WP_245065656.1_901,0,GTGAAAAAAATCGTTGTGAGTTGGTGTGTGGCGTTGGCTTTTTTAA...,[GTG AAAAAAA TCGTT GTGA GTTGG TGTG TGGCGTT GGC...,0
1541,lcl|NZ_CP094094.1_cds_WP_245066490.1_272,0,ATGAAAAAATTGGTTTTAATCATTTTTTTAACGCTAACACTTTCAA...,[A TGAAAAAA TT GG TTTTAA TCA TTTTTT TAACGC TAA...,0
1542,lcl|NZ_CP094094.1_cds_WP_245066469.1_259,0,ATGGCTAAAGAAAATCTGCCTGTCGTTTTTGGGCCTGTTTTATCCA...,[ATGGC TAAAGAAAA TC TGCC TGTC GTTTT TGG GCC TG...,0


In [None]:
# import random
#
# def seq2bpe(sequence):
#   splitted = []
#   prev = 0
#   while True:
#     n = random.randint(1,3)
#     splitted.append(sequence[prev:prev+n])
#     prev = prev + n
#     if prev >= len(sequence)-1:
#         break
#   return splitted