<a href="https://colab.research.google.com/github/anihab/tokenization/blob/main/Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import libraries
import pandas as pd
import numpy as np

# Install Biopython, but only when the notebook runs on Google Colab
try:
    # Importing google.colab is used as the "are we on Colab?" check
    import google.colab
    # Running on Google Colab, so install Biopython first
    !pip install biopython
except ImportError:
    # Not on Colab: nothing is installed here.
    # NOTE(review): Biopython is presumably already available locally -- confirm
    pass

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m21.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [None]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Resources:
# https://www.tutorialspoint.com/biopython/biopython_sequence_io_operations.htm
# https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt
# https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt

In [None]:
import os
from Bio import SeqIO
from google.colab import files

# Token budget per sequence chunk; tokenize() derives the maximum chunk
# length (in bases) from this value for each tokenization method.
# NOTE(review): presumably 512 minus two special tokens -- confirm
MAX_TOKENS = 510

## Given a phage directory and a bacteria directory, tokenize all fasta files according to method of choice

def read_files(phage_dir, bacteria_dir, method, *args, **kwargs):
  """
  Tokenize every regular file found directly inside the two directories.

  Files in phage_dir are tokenized with label 1, files in bacteria_dir with
  label 0. Sub-directories are skipped. The phage directory is processed
  first.

  Input:
    phage_dir -- str, path to directory of phage fasta files
    bacteria_dir -- str, path to directory of bacteria fasta files
    method -- str, tokenization method of choice
    k -- int, length of k if using kmer tokenization; may be given as the
         keyword k=... or as the first extra positional argument
  """
  k = kwargs.get('k', args[0] if args else None)

  # NOTE(review): tokenize() saves through write_csv(), which always writes
  # the same two file names, so each file processed here replaces the output
  # of the previous one -- confirm this is intended.
  for directory, label in ((phage_dir, 1), (bacteria_dir, 0)):
    for file in os.listdir(directory):
      f = os.path.join(directory, file)
      if os.path.isfile(f):
        # k must be passed by keyword: tokenize looks it up as kwargs['k'],
        # so a positional k would not reach the kmer tokenizer.
        tokenize(f, label, method, k=k)

## Tokenize a single fasta file with the chosen method

def tokenize(file, label, method, *args, **kwargs):
  """
  Run one fasta file through the tokenizer and attach the given label
  (1 for phage, 0 for bacteria). Then shuffle the rows in the dataframe and
  save to CSV.

  Input:
    file -- str, path to fasta file
    label -- int, 1 for phage or 0 for bacteria
    method -- str, tokenization method of choice: 'codon' or 'kmer'
    k -- int, length of k if using kmer tokenization; may be given as the
         keyword k=... or as the first extra positional argument
    random_state -- optional keyword, forwarded to DataFrame.sample so the
         shuffle can be reproduced; when omitted the shuffle is random

  Returns:
    df -- dataframe, the shuffled rows produced by preprocess_data with the
          added 'tokenized' and 'label' columns

  Raises:
    ValueError -- unknown method, or 'kmer' with k missing, smaller than 1,
                  or so large that no positive chunk length remains
  """
  # Accept k either as a keyword or as the first extra positional argument,
  # so tokenize(f, 0, 'kmer', 6) and tokenize(f, 0, 'kmer', k=6) agree.
  k = kwargs.get('k', args[0] if args else None)

  if method == 'codon':
    max_length = MAX_TOKENS * 3
  elif method == 'kmer':
    if k is None or k < 1:
      raise ValueError(
          "kmer tokenization requires a positive integer k, got %r" % (k,))
    # NOTE(review): a chunk of length L yields L - k + 1 kmers (see
    # seq2kmer), so this cap stays below MAX_TOKENS by 2 * (k - 1) tokens;
    # MAX_TOKENS + (k - 1) would fill the budget exactly -- confirm intent.
    max_length = MAX_TOKENS - (k - 1)
    if max_length < 1:
      # A non-positive chunk length can never consume the sequence.
      raise ValueError(
          "k=%r is too large for MAX_TOKENS=%r" % (k, MAX_TOKENS))
  else:
    raise ValueError(
        "unknown tokenization method %r (expected 'codon' or 'kmer')"
        % (method,))

  # Process data to get sequences of appropriate length
  df = preprocess_data(file, max_length)
  sequences = df["sequence"].values.tolist()

  # Tokenize according to chosen method
  if method == 'codon':
    tokens = [seq2codon(seq) for seq in sequences]
  else:
    tokens = [seq2kmer(seq, k) for seq in sequences]
  df["tokenized"] = tokens
  df["label"] = [label] * len(tokens)

  # Shuffle and save to csv
  shuffled = df.sample(frac=1, random_state=kwargs.get('random_state'))
  df = shuffled.reset_index(drop=True)
  write_csv(df)
  return df

"""\
Read fasta file and split each sequence into consecutive chunks of at most max_length, returns dataframe

Input:
  file -- str, path to fasta file
  max_length -- int, maximum sequence length

Returns:
  df -- dataframe, includes the record name, start position, and sequence
""" 
def preprocess_data(file, max_length):
  """
  Read a fasta file and split every record into consecutive chunks of at
  most max_length characters. Nothing is discarded: a record longer than
  max_length simply produces several rows.

  Input:
    file -- str, path to fasta file
    max_length -- int, maximum chunk length, must be at least 1

  Returns:
    df -- dataframe with the columns 'name' (record name), 'start' (offset
          of the chunk inside the record) and 'sequence' (upper-cased
          chunk); the columns are present even when the file has no records

  Raises:
    ValueError -- if max_length is smaller than 1
  """
  if max_length < 1:
    # A non-positive chunk length would never consume the sequence.
    raise ValueError("max_length must be at least 1, got %r" % (max_length,))

  records = []
  for record in SeqIO.parse(file, 'fasta'):
    name = str(record.name)
    seq = str(record.seq).upper()
    # Step through the sequence by offset instead of re-slicing the
    # remainder each time. max(len(seq), 1) keeps one row (with an empty
    # sequence) for an empty record.
    for pos in range(0, max(len(seq), 1), max_length):
      records.append(
        {
          'name': name,
          'start': pos,
          'sequence': seq[pos:pos + max_length]
        }
      )
  # Explicit columns so a file without records still yields the full schema
  df = pd.DataFrame(data=records, columns=['name', 'start', 'sequence'])
  return df

"""\
Read in sequences and tokens to attach labels and return dataframe

Input:
  sequences -- list, original sequences
  tokens -- list, tokenized sequences
  label -- int, 1 for phage or 0 for bacteria

Returns:
  df -- dataframe
""" 
def attach_labels(sequences, tokens, label):
  """
  Pair each tokenized sequence with its original sequence and the label.

  One row is built per entry of tokens; sequences is indexed by the same
  position.
  """
  rows = [
      {
        'sequence': sequences[index],
        'tokenized': tokenized,
        'label': label
      }
      for index, tokenized in enumerate(tokens)
  ]
  return pd.DataFrame(data=rows)

"""\
Save the given dataframe to two separate csv files:
1. full_output.csv includes the name, start position, sequence, tokenized
   sequence, and label.
2. tokenized_output.csv includes the tokenized sequence and the label.

Input:
  df -- dataframe, full dataframe of tokenized sequences
""" 
def write_csv(df):
  """
  Write the dataframe to full_output.csv and its 'tokenized' and 'label'
  columns to tokenized_output.csv, handing each file to files.download
  right after it is written.
  """
  # (file name, columns to keep); None keeps the whole dataframe
  outputs = (
      ('full_output.csv', None),
      ('tokenized_output.csv', ['tokenized', 'label']),
  )
  for filename, columns in outputs:
    frame = df if columns is None else df[columns]
    frame.to_csv(filename, encoding='utf-8', index=False)
    files.download(filename)

## Different tokenization methods

"""\
Convert a sequence to codons

Input:
  seq -- str, original sequence

Returns:
  codons -- str, codons separated by space
"""
def seq2codon(seq):
  """
  Cut seq into consecutive, non-overlapping pieces of three characters and
  join them with single spaces. The last piece is shorter when len(seq) is
  not a multiple of three.
  """
  triplets = []
  for start in range(0, len(seq), 3):
    triplets.append(seq[start:start + 3])
  return " ".join(triplets)

"""\
Convert a sequence to kmers

Input:
  seq -- str, original sequence
  k -- int, kmer of length k

Returns:
  kmers -- str, kmers separated by space
"""
def seq2kmer(seq, k):
  """
  Slide a window of length k over seq one character at a time and join the
  resulting overlapping kmers with single spaces. A sequence shorter than k
  gives an empty string.
  """
  window_count = len(seq) + 1 - k
  windows = []
  for start in range(window_count):
    windows.append(seq[start:start + k])
  return " ".join(windows)

#  TODO: byte pair encoding

In [None]:
## Test: codon-tokenize one sample fasta file with label 0 (bacteria)
tokenize("/content/sample_data/GCF_022922415.1_ASM2292241v1_cds_from_genomic.fna", 0, "codon")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# ## OLD TESTING

# # Swap between tokenization methods (used for testing)
# def tokenize_sequences(file_path, method, *args, **kwargs):
#   sequences = []
#   tokens = []

#   k = kwargs.get('k', None)

#   if method == 'codon':
#     max_length = 512*3
#     sequences = preprocess_data(file_path, max_length)
#     for i in range(len(sequences)):
#       tokens.append(seq2codon(sequences[i]))
  
#   elif method == 'kmer':
#     max_length = 512
#     sequences = preprocess_data(file_path, max_length)
#     for i in range(len(sequences)):
#       tokens.append(seq2kmer(sequences[i], k))

#   # Get output
#   output = attach_labels(sequences, tokens, 0)
#   return output

# # Test 
# tokenize_sequences("/content/sample_data/GCF_022922415.1_ASM2292241v1_cds_from_genomic.fna", 'codon')