<a href="https://colab.research.google.com/github/anihab/tokenization/blob/main/Tokenization.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Import the core data libraries
import pandas as pd
import numpy as np

# When running on Google Colab, install the third-party packages first
try:
    import google.colab
    # Running on Google Colab, so install Biopython first
    !pip install biopython
    # HuggingFace transformers for byte pair encoding
    !pip install transformers
except ImportError:
    # google.colab is not importable, so we are not on Colab: skip the installs
    pass

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting transformers
  Downloading transformers-4.29.1-py3-none-any.whl (7.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.1/7.1 MB[0m [31m50.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m224.5/224.5 kB[0m [31m16.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m49.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: tokenizers, huggingface-hub, transformer

In [None]:
# Mount Google Drive at /content/drive (Colab only)
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Resources:
# https://www.tutorialspoint.com/biopython/biopython_sequence_io_operations.htm
# https://huggingface.co/learn/nlp-course/chapter2/4?fw=pt
# https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt

In [None]:
import os

from Bio import SeqIO
from transformers import AutoTokenizer
from collections import defaultdict
from google.colab import files

# Token budget per chunk; tokenize() derives the maximum sequence length for
# each tokenization method from this value.
# NOTE(review): 510 presumably leaves room for two special tokens in a
# 512-token model input -- confirm.
MAX_TOKENS = 510

## Given a phage directory and a bacteria directory, tokenize all fasta files according to method of choice

def read_files(phage_dir, bacteria_dir, method, *args, **kwargs):
  """Tokenize every file in a phage directory and in a bacteria directory.

  Regular files directly inside phage_dir are tokenized with label 1, then
  regular files directly inside bacteria_dir with label 0. Subdirectories are
  skipped.

  Input:
    phage_dir -- str, path to the directory of phage fasta files
    bacteria_dir -- str, path to the directory of bacteria fasta files
    method -- str, tokenization method of choice, forwarded to tokenize
    k -- int, length of k if using kmer tokenization; given as the keyword
         argument k or as the first extra positional argument
  """
  k = kwargs.get('k', args[0] if args else None)
  for directory, label in ((phage_dir, 1), (bacteria_dir, 0)):
    for filename in os.listdir(directory):
      f = os.path.join(directory, filename)
      if os.path.isfile(f):
        # tokenize looks k up in its keyword arguments, so k has to be passed
        # by name; a positional k would be ignored there.
        tokenize(f, label, method, k=k)

## Tokenizes the sequences of one fasta file

def tokenize(filepath, label, method, *args, **kwargs):
  """Tokenize one fasta file, label every row, shuffle the rows and save to CSV.

  The file is split into chunks by preprocess_data, each chunk is tokenized
  with the chosen method, the label is attached to every row, the rows are
  shuffled, and the result is passed to write_csv under the file's base name.

  Input:
    filepath -- str, path to the fasta file
    label -- int, 1 for phage or 0 for bacteria
    method -- str, tokenization method of choice: 'codon', 'kmer' or 'bpe'
    k -- int, length of k if using kmer tokenization; given as the keyword
         argument k or as the first extra positional argument
    random_state -- optional keyword argument, seed forwarded to the shuffle
                    so a run can be reproduced (default None: unseeded)

  Returns:
    df -- dataframe with the columns produced by preprocess_data plus
          'tokenized' and 'label', in shuffled row order

  Raises:
    ValueError -- if method is not one of the supported names, or if method
                  is 'kmer' and k is missing or smaller than 1
  """
  # Accept k by keyword or as the first extra positional argument.
  k = kwargs.get('k', args[0] if args else None)
  random_state = kwargs.get('random_state', None)
  filename = os.path.basename(filepath)

  if method == 'codon':
    max_length = MAX_TOKENS * 3
  elif method == 'kmer':
    if k is None or k < 1:
      raise ValueError("method 'kmer' requires a positive integer k, got {!r}".format(k))
    # NOTE(review): a chunk of this length yields MAX_TOKENS - 2 * (k - 1)
    # kmers, i.e. fewer than MAX_TOKENS -- confirm whether
    # MAX_TOKENS + (k - 1) was intended.
    max_length = MAX_TOKENS - (k - 1)
  elif method == 'bpe':
    max_length = MAX_TOKENS * 8
  else:
    raise ValueError(
        "unknown tokenization method {!r}; expected 'codon', 'kmer' or 'bpe'".format(method))

  # Process data to get sequences of appropriate length
  df = preprocess_data(filepath, max_length)
  sequences = df['sequence'].values.tolist()

  # Tokenize according to chosen method
  if method == 'codon':
    tokens = [seq2codon(seq) for seq in sequences]
  elif method == 'kmer':
    tokens = [seq2kmer(seq, k) for seq in sequences]
  else:
    # 'bpe': without this branch no tokens were produced and the column
    # assignment below failed. seq2bpe is still a TODO placeholder.
    tokens = [seq2bpe(seq) for seq in sequences]
  df['tokenized'] = tokens
  df['label'] = [label] * len(tokens)

  # Shuffle and save to csv
  df = df.sample(frac=1, random_state=random_state).reset_index(drop=True)
  write_csv(filename, df)
  return df

"""\
Read fasta file and truncate sequences to appropriate length, returns dataframe

Input:
  file -- str, path to fasta file
  max_length -- int, maximum sequence length

Returns:
  df -- dataframe, includes the record name, start position, and sequence
""" 
def preprocess_data(file, max_length):
  """Read a fasta file and split every record into chunks of at most max_length.

  Each record's sequence is upper-cased and cut into consecutive,
  non-overlapping pieces. A record with an empty sequence still produces one
  row with an empty 'sequence'.

  Input:
    file -- str, path to fasta file
    max_length -- int, maximum sequence length of a chunk, must be at least 1

  Returns:
    df -- dataframe with the columns 'name' (record name), 'start' (offset of
          the chunk within its record) and 'sequence'; the columns are present
          even when the file holds no records

  Raises:
    ValueError -- if max_length is smaller than 1
  """
  # A max_length of 0 would never consume the sequence and loop forever.
  if max_length < 1:
    raise ValueError('max_length must be at least 1, got {!r}'.format(max_length))

  records = []
  for record in SeqIO.parse(file, 'fasta'):
    name = str(record.name)
    seq = str(record.seq).upper()
    # Step through the sequence by offset rather than re-slicing the remainder,
    # which copied the rest of the sequence on every pass. max(len(seq), 1)
    # keeps the single empty row for a record without sequence.
    for pos in range(0, max(len(seq), 1), max_length):
      records.append(
        {
          'name': name,
          'start': pos,
          'sequence': seq[pos:pos + max_length]
        }
      )
  # Naming the columns keeps them present when there are no records at all.
  df = pd.DataFrame(data=records, columns=['name', 'start', 'sequence'])
  return df

"""\
Read in sequences and tokens to attach labels and return dataframe

Input:
  sequences -- list, original sequences
  tokens -- list, tokenized sequences
  label -- int, 1 for phage or 0 for bacteria

Returns:
  df -- dataframe
""" 
def attach_labels(sequences, tokens, label):
  """Pair each tokenized sequence with its original sequence and one label.

  One row is built per entry of tokens; sequences is indexed at the same
  positions and label is repeated on every row.

  Input:
    sequences -- list, original sequences
    tokens -- list, tokenized sequences
    label -- int, 1 for phage or 0 for bacteria

  Returns:
    df -- dataframe with the columns 'sequence', 'tokenized' and 'label'
  """
  rows = [
    {'sequence': sequences[idx], 'tokenized': tokens[idx], 'label': label}
    for idx in range(len(tokens))
  ]
  return pd.DataFrame(data=rows)

"""\
Save the given dataframe to two separate csv files and download both:
1. <filename>_full_output.csv includes the name, start position, sequence,
   tokenized sequence, and label.
2. <filename>_tokenized_output.csv includes the tokenized sequence and the label.

Input:
  filename -- str, prefix used for the names of both output files
  df -- dataframe, full dataframe of tokenized sequences
""" 
def write_csv(filename, df):
  """Write df to two CSV files named after filename and download each of them.

  The first file, filename + '_full_output.csv', holds every column of df.
  The second, filename + '_tokenized_output.csv', holds only the 'tokenized'
  and 'label' columns. Each file is passed to files.download right after it
  has been written.

  Input:
    filename -- str, prefix used for the names of both output files
    df -- dataframe, must contain the columns 'tokenized' and 'label'
  """
  full_path = filename + '_full_output.csv'
  df.to_csv(full_path, encoding='utf-8', index=False)
  files.download(full_path)

  tokenized_path = filename + '_tokenized_output.csv'
  df[['tokenized', 'label']].to_csv(tokenized_path, encoding='utf-8', index=False)
  files.download(tokenized_path)

## Different tokenization methods

"""\
Convert a sequence to codons

Input:
  seq -- str, original sequence

Returns:
  codons -- str, codons separated by space
"""
def seq2codon(seq):
  """Split a sequence into consecutive, non-overlapping pieces of three characters.

  Splitting starts at the first character. When the length is not a multiple
  of three, the last piece is shorter.

  Input:
    seq -- str, original sequence

  Returns:
    codons -- str, the pieces joined by single spaces
  """
  pieces = []
  for start in range(0, len(seq), 3):
    pieces.append(seq[start:start + 3])
  return " ".join(pieces)

"""\
Convert a sequence to kmers

Input:
  seq -- str, original sequence
  k -- int, kmer of length k

Returns:
  kmers -- str, kmers separated by space
"""
def seq2kmer(seq, k):
  """Slide a window of length k over a sequence, one character at a time.

  Every window is a full k characters long; a sequence shorter than k gives
  an empty string.

  Input:
    seq -- str, original sequence
    k -- int, kmer of length k

  Returns:
    kmers -- str, the overlapping kmers joined by single spaces
  """
  window_count = len(seq) + 1 - k
  windows = (seq[start:start + k] for start in range(window_count))
  return " ".join(windows)

#  TODO: byte pair encoding - https://huggingface.co/learn/nlp-course/chapter6/5?fw=pt
def seq2bpe(seq):
    """Placeholder for byte pair encoding; not implemented yet.

    Input:
      seq -- str, original sequence (currently unused)

    Returns:
      0 for every input, until the TODO above is done
    """
    placeholder_result = 0
    return placeholder_result

In [None]:
## Test: codon-tokenize one sample fasta file with label 0.
## tokenize also writes its CSV output through write_csv; the returned
## dataframe is shown as the cell output.
tokenize("/content/sample_data/GCF_022922415.1_ASM2292241v1_cds_from_genomic.fna", 0, "codon")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

Unnamed: 0,name,start,sequence,tokenized,label
0,lcl|NZ_CP094094.1_cds_WP_220899399.1_541,0,ATGTTTGAAGATTTAAAACCGCATTTACAGGAATTAAGAAAGCGTT...,ATG TTT GAA GAT TTA AAA CCG CAT TTA CAG GAA TT...,0
1,lcl|NZ_CP094094.1_cds_WP_220951370.1_62,0,ATGCTAAATAACAAACTCACCAAATCTCAGAGAGAACTCTTTTGTA...,ATG CTA AAT AAC AAA CTC ACC AAA TCT CAG AGA GA...,0
2,lcl|NZ_CP094094.1_cds_WP_245066367.1_171,0,ATGGCTCAGTTAGAAGATTTGAAAGCACATGAAAAATACAATTTGT...,ATG GCT CAG TTA GAA GAT TTG AAA GCA CAT GAA AA...,0
3,lcl|NZ_CP094094.1_cds_WP_245066258.1_85,0,ATGCAAGAATTCAGTTTGTGGTGCGATTTTATAGAAAGGGATTTTT...,ATG CAA GAA TTC AGT TTG TGG TGC GAT TTT ATA GA...,0
4,lcl|NZ_CP094094.1_cds_WP_245066478.1_264,0,ATGCTAAACATGAACACACACACAAGAGGCATTGACAGCAATCTGA...,ATG CTA AAC ATG AAC ACA CAC ACA AGA GGC ATT GA...,0
...,...,...,...,...,...
1779,lcl|NZ_CP094094.1_cds_WP_000394639.1_54,0,ATGGAATTTAAAAACACTAAAAAAGACAGGCTGAGCGATCTAGAAA...,ATG GAA TTT AAA AAC ACT AAA AAA GAC AGG CTG AG...,0
1780,lcl|NZ_CP094094.1_cds_WP_245066051.1_1289,0,ATGCTACATAAAAAATATCGTCCTAATGTTGCGGCCATTATCATGT...,ATG CTA CAT AAA AAA TAT CGT CCT AAT GTT GCG GC...,0
1781,lcl|NZ_CP094094.1_cds_927,1530,AAACAGGAAAAACCACGAAAGAGCGTTATAACCAATGGAATCCGGC...,AAA CAG GAA AAA CCA CGA AAG AGC GTT ATA ACC AA...,0
1782,lcl|NZ_CP094094.1_cds_WP_245066113.1_1423,0,ATGCAAAAAAAGATTTTTTTACTAGAAGACGATTACCTTTTAAGCG...,ATG CAA AAA AAG ATT TTT TTA CTA GAA GAC GAT TA...,0
