# Encode with HS Tokenizer

In [1]:
import helix_swap_bpe as hsb
import pandas as pd

In [3]:
tokenizer_path = '/scratch/jbusch/ma/tokenizer/hs_tokenizer/fixed_encode_decode/sub5M_tokenizer.json'
input_file = '/scratch/jbusch/ma/data/dnabert_2_pretrain/dev.txt'
output_file = '/scratch/jbusch/ma/data/dnabert_2_pretrain/hs_tokenized/sub5M_dev.txt'

In [4]:
# load tokenizer
tokenizer = hsb.helixswap_loader(tokenizer_path)

## Short Example

In [5]:
sequences = ["AATTATATATATATATATATGGCCCACACggacaaaaaa","AGGAGTATAVGGACCAGATTGCAC"]

In [6]:
tokenized = tokenizer.encode(sequences)

In [7]:
print(sequences[0])
print(sequences[1])

print('Examples of decoded sequences')
print(tokenizer.decode(tokenized[0]))
print(tokenizer.decode(tokenized[1]))

AATTATATATATATATATATGGCCCACACggacaaaaaa
AGGAGTATAVGGACCAGATTGCAC
Examples of decoded sequences
AATTATATATATATATATATGGCCCACACGGACAAAAAA
AGGAGTATA[UNK]GGACCAGATTGCAC


In [8]:
sequences[0].upper() == tokenizer.decode(tokenized[0])

True

## Long Example

In [9]:
column_name = 'input_ids'
dev = pd.read_csv(input_file, sep=" ", names=[column_name])
test = dev.head()

In [10]:
input_list = test.input_ids.to_list()
input_list[1]

'CTGTGAACAAGCTTCTCTCGTGCACTCATGAACGCGCAACAGATTTTCACTACAAAATGACTTCGCCCAAAATTTAAAATGTTTTCTTAATTTACTCAACCTCATGCTATCCCAGATCTTTGACTATTTTCATCTGCTAAACACAAACGAAATTTAGAAAAATATCTCAGCTCTGGTGAATGGTGATCAGAACTTTGAAGGTCCAGAAAGCACAAAGGCAGCATAATAGTAATCCACATGACTCCAGTGGTTAAATCCATATCTTCAGAAGCAATATGGGTGAGAAAACGATCCCAATTCTCCTCCCTGCCCAGCAGGTGTCGATATGCACAAAGAATGTGAATTGCCAAAAACAAAAGAAGATTTCTAGTAAAAAAAGGACTTAAATATGGATCTGTGTCTTACCCACACCTATCATACCACTTCTGAATATATAGACTTTACCACTGGTGTCTTATGGATTACTTTTATGCTGCCTTTATATGCTTTTTGGACCTTAAAAGTTCTGGCCACCATTCACTTGCATTGTACAGAGCTGAGATATTTTAAAAAAAAATCTTTGTGTTCAGCAGAAGAAAGACATTTATACACATCTGGGATGGCATGAGGGTGTTTTAACGAAGAGAATTTTAAAATTGTATGCCTTCAGAAGACCTGGAAAATGACACTGGAGTCGCATGAGGGCGAGTAAAGTAGGACAGAATTTCCATTTTTTGGATGAAATATTCCTTTGATAAAAAGTATTTTCTCAAAGTTATGCCTTATGACTTATCTCAAAGTGTAACTACTTTGTAACAATTGTAACAATATGATCAAACACAGCCTTAAGTGTTGTCTTGACATATTATCACTTAAGTTTTTGACCCTTTTTATTATAATTCAACTTACATTTTTTAAATAGCGTCTCTGAAACACTGTGGTACCCAGCAGACCAGATTTCTAGGGTTTTGAGTGGTGGCTTGCTAGCCCAAGTCAAAAGAGCCCATCCTCAGGTCTCTA

In [11]:
tokenized = tokenizer.encode(input_list)
tokenized[1]

[3376,
 1385,
 84,
 57,
 1423,
 859,
 49,
 1096,
 98,
 846,
 132,
 512,
 178,
 3520,
 34,
 170,
 50,
 2901,
 1777,
 80,
 1654,
 122,
 509,
 42,
 644,
 120,
 342,
 72,
 125,
 524,
 198,
 18,
 731,
 3419,
 224,
 326,
 168,
 52,
 584,
 294,
 164,
 1021,
 275,
 164,
 167,
 34,
 1225,
 208,
 64,
 785,
 71,
 55,
 661,
 43,
 340,
 183,
 609,
 1365,
 269,
 24,
 136,
 982,
 49,
 438,
 109,
 848,
 1479,
 452,
 100,
 158,
 606,
 190,
 2520,
 949,
 391,
 50,
 512,
 30,
 304,
 192,
 1234,
 240,
 40,
 150,
 46,
 438,
 1233,
 128,
 42,
 53,
 260,
 491,
 53,
 165,
 3276,
 67,
 893,
 184,
 52,
 980,
 182,
 2285,
 209,
 1224,
 160,
 46,
 1751,
 274,
 86,
 41,
 165,
 28,
 296,
 1000,
 2344,
 105,
 169,
 327,
 540,
 280,
 757,
 278,
 99,
 3222,
 315,
 1031,
 628,
 126,
 521,
 397,
 90,
 139,
 51,
 49,
 788,
 578,
 50,
 2695,
 1122,
 597,
 1753,
 3493,
 660,
 1932,
 90,
 2497,
 36,
 460,
 159,
 795,
 3232,
 409,
 73,
 119,
 1318,
 360,
 68,
 1570,
 214,
 872,
 174,
 4064,
 160,
 214,
 133,
 36,
 2670,
 314

In [12]:
print(input_list[0] == tokenizer.decode(tokenized[0]))

True


In [14]:
df = pd.DataFrame({'tokenized_sequence': tokenized})
df

Unnamed: 0,tokenized_sequence
0,"[626, 97, 363, 1522, 899, 130, 110, 74, 136, 4..."
1,"[3376, 1385, 84, 57, 1423, 859, 49, 1096, 98, ..."
2,"[269, 67, 185, 1081, 92, 34, 797, 302, 6, 173,..."
3,"[3844, 1684, 1084, 95, 95, 111, 1518, 964, 144..."
4,"[8, 197, 41, 126, 50, 2377, 3999, 166, 228, 60..."


In [27]:
df.to_csv('test_tokenized_sequences.csv', index=False)

# Helix Swap as HF Tokenizer

In [8]:
from transformers import Tokenizer

tokenizer = Tokenizer.from_pretrained(tokenizer_path)

  from .autonotebook import tqdm as notebook_tqdm
None of PyTorch, TensorFlow >= 2.0, or Flax have been found. Models won't be available and only tokenizers, configuration and file/data utilities can be used.


ImportError: cannot import name 'Tokenizer' from 'transformers' (/home/cluster_home/jbusch/conda/envs/bens_tokenizer_39_less_output/lib/python3.9/site-packages/transformers/__init__.py)