# Encode with HS Tokenizer

In [1]:
import helix_swap_bpe as hsb
import pandas as pd

In [2]:
tokenizer_path = '/scratch/jbusch/ma/tokenizer/hs_tokenizer/fixed_encode_decode/sub5M_tokenizer.json'
input_file = '/scratch/jbusch/ma/data/dnabert_2_pretrain/dev.txt'
output_file = '/scratch/jbusch/ma/data/dnabert_2_pretrain/hs_tokenized/sub5M_dev.txt'

In [3]:
# load tokenizer
tokenizer = hsb.helixswap_loader(tokenizer_path)

## Short Example

In [4]:
sequences = ["AATTATATATATATATATATGGCCCACACggacaaaaaa","AGGAGTATAVGGACCAGATTGCAC"]

In [5]:
tokenized = tokenizer.encode(sequences)

In [6]:
print(sequences[0])
print(sequences[1])

print('Examples of decoded sequences')
print(tokenizer.decode(tokenized[0]))
print(tokenizer.decode(tokenized[1]))

AATTATATATATATATATATGGCCCACACggacaaaaaa
AGGAGTATAVGGACCAGATTGCAC
Examples of decoded sequences
AATTATATATATATATATATGGCCCACACGGACAAAAAA
AGGAGTATA[UNK]GGACCAGATTGCAC


In [7]:
sequences[0].upper() == tokenizer.decode(tokenized[0])

True

## Long Example

In [8]:
column_name = 'input_ids'
dev = pd.read_csv(input_file, sep=" ", names=[column_name])
test = dev.head()

In [9]:
input_list = test.input_ids.to_list()
input_list[1]

'CTGTGAACAAGCTTCTCTCGTGCACTCATGAACGCGCAACAGATTTTCACTACAAAATGACTTCGCCCAAAATTTAAAATGTTTTCTTAATTTACTCAACCTCATGCTATCCCAGATCTTTGACTATTTTCATCTGCTAAACACAAACGAAATTTAGAAAAATATCTCAGCTCTGGTGAATGGTGATCAGAACTTTGAAGGTCCAGAAAGCACAAAGGCAGCATAATAGTAATCCACATGACTCCAGTGGTTAAATCCATATCTTCAGAAGCAATATGGGTGAGAAAACGATCCCAATTCTCCTCCCTGCCCAGCAGGTGTCGATATGCACAAAGAATGTGAATTGCCAAAAACAAAAGAAGATTTCTAGTAAAAAAAGGACTTAAATATGGATCTGTGTCTTACCCACACCTATCATACCACTTCTGAATATATAGACTTTACCACTGGTGTCTTATGGATTACTTTTATGCTGCCTTTATATGCTTTTTGGACCTTAAAAGTTCTGGCCACCATTCACTTGCATTGTACAGAGCTGAGATATTTTAAAAAAAAATCTTTGTGTTCAGCAGAAGAAAGACATTTATACACATCTGGGATGGCATGAGGGTGTTTTAACGAAGAGAATTTTAAAATTGTATGCCTTCAGAAGACCTGGAAAATGACACTGGAGTCGCATGAGGGCGAGTAAAGTAGGACAGAATTTCCATTTTTTGGATGAAATATTCCTTTGATAAAAAGTATTTTCTCAAAGTTATGCCTTATGACTTATCTCAAAGTGTAACTACTTTGTAACAATTGTAACAATATGATCAAACACAGCCTTAAGTGTTGTCTTGACATATTATCACTTAAGTTTTTGACCCTTTTTATTATAATTCAACTTACATTTTTTAAATAGCGTCTCTGAAACACTGTGGTACCCAGCAGACCAGATTTCTAGGGTTTTGAGTGGTGGCTTGCTAGCCCAAGTCAAAAGAGCCCATCCTCAGGTCTCTA

In [10]:
tokenized = tokenizer.encode(input_list)
tokenized[1]

[3376,
 1385,
 84,
 57,
 1423,
 859,
 49,
 1096,
 98,
 846,
 132,
 512,
 178,
 3520,
 34,
 170,
 50,
 2901,
 1777,
 80,
 1654,
 122,
 509,
 42,
 644,
 120,
 342,
 72,
 125,
 524,
 198,
 18,
 731,
 3419,
 224,
 326,
 168,
 52,
 584,
 294,
 164,
 1021,
 275,
 164,
 167,
 34,
 1225,
 208,
 64,
 785,
 71,
 55,
 661,
 43,
 340,
 183,
 609,
 1365,
 269,
 24,
 136,
 982,
 49,
 438,
 109,
 848,
 1479,
 452,
 100,
 158,
 606,
 190,
 2520,
 949,
 391,
 50,
 512,
 30,
 304,
 192,
 1234,
 240,
 40,
 150,
 46,
 438,
 1233,
 128,
 42,
 53,
 260,
 491,
 53,
 165,
 3276,
 67,
 893,
 184,
 52,
 980,
 182,
 2285,
 209,
 1224,
 160,
 46,
 1751,
 274,
 86,
 41,
 165,
 28,
 296,
 1000,
 2344,
 105,
 169,
 327,
 540,
 280,
 757,
 278,
 99,
 3222,
 315,
 1031,
 628,
 126,
 521,
 397,
 90,
 139,
 51,
 49,
 788,
 578,
 50,
 2695,
 1122,
 597,
 1753,
 3493,
 660,
 1932,
 90,
 2497,
 36,
 460,
 159,
 795,
 3232,
 409,
 73,
 119,
 1318,
 360,
 68,
 1570,
 214,
 872,
 174,
 4064,
 160,
 214,
 133,
 36,
 2670,
 314

In [11]:
print(input_list[0] == tokenizer.decode(tokenized[0]))

True


In [13]:
df = pd.DataFrame({'tokenized_sequence': tokenized})
df

Unnamed: 0,tokenized_sequence
0,"[626, 97, 363, 1522, 899, 130, 110, 74, 136, 4..."
1,"[3376, 1385, 84, 57, 1423, 859, 49, 1096, 98, ..."
2,"[269, 67, 185, 1081, 92, 34, 797, 302, 6, 173,..."
3,"[3844, 1684, 1084, 95, 95, 111, 1518, 964, 144..."
4,"[8, 197, 41, 126, 50, 2377, 3999, 166, 228, 60..."


In [27]:
df.to_csv('test_tokenized_sequences.csv', index=False)

# Helix Swap as HF Tokenizer

In [29]:
import helix_swap_bpe as hsb
from transformers import PreTrainedTokenizer
from collections import OrderedDict

class HelixSwap(PreTrainedTokenizer):
	def __init__(self,trained_tokenizer_file):
		
		self.helix_swap_pretrained = hsb.helixswap_loader(trained_tokenizer_file)
		self._vocab = OrderedDict(sorted(self.helix_swap_pretrained.vocab().items(), key=lambda item: item[1]))
		super().__init__()
		self.add_special_tokens({'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

	def get_vocab(self):
		return(self._vocab)
	
	def _tokenize(self,x):
		return self.helix_swap_pretrained.tokenize(x)

	def _convert_id_to_token(self,ids):
		return self.helix_swap_pretrained.decode([ids])

	@property
	def vocab_size(self):
		letzter_key, lasttokenid = next(reversed(self._vocab.items()))
		return lasttokenid
		# requires new wheel of helixswapbpe
		#self.helix_swap_pretrainedvocab_size()

	def _convert_token_to_id(self,token):
		print("_convert_token_to_id used! Please double check expected outcome!!!")
		return self._vocab[token]

	#def encode(self,sequence,**kwargs):
	#	# diese Zeile auskom
	#	return self.helix_swap_pretrained.encode(sequence)

In [None]:
# class HelixSwap(PreTrainedTokenizer):
# 	def __init__(self,trained_tokenizer_file):
		
# 		self.helix_swap_pretrained = hsb.helixswap_loader(trained_tokenizer_file)
# 		super().__init__()

# 	def get_vocab(self):
# 		return(self.helix_swap_pretrained.vocab())
	
# 	def _tokenize(self,x):
# 		return self.helix_swap_pretrained.tokenize(x)

# 	def _convert_id_to_token(self,ids):
# 		return self.helix_swap_pretrained.decode([ids])

# 	def vocab_size(self):
# 		return len(self.helix_swap_pretrained.get_vocab())
# 		# requires new wheel of helixswapbpe
# 		#self.helix_swap_pretrainedvocab_size()

# 	#def _convert_token_to_id(self,token):
# 	#	print(token)
# 	#	print (type(token))
# 	#	return self.helix_swap_pretrained.encode(token)

# 	def encode(self,sequence):
# 		return self.helix_swap_pretrained.encode(sequence)

In [14]:
tokenizer_path2 = '/scratch/jbusch/ma/tokenizer/hs_tokenizer/fixed_encode_decode/sub5M_tokenizer.json'

In [30]:
hs1 = HelixSwap(tokenizer_path2)

In [32]:
hs1.vocab_size

4095

In [20]:
vocab = hs1.get_vocab()
vocab

OrderedDict([('[UNK]', 0),
             ('[CLS]', 1),
             ('[SEP]', 2),
             ('[PAD]', 3),
             ('[MASK]', 4),
             ('[UNUSED]', 5),
             ('A', 6),
             ('T', 7),
             ('C', 8),
             ('G', 9),
             ('AA', 10),
             ('TT', 11),
             ('TG', 12),
             ('CA', 13),
             ('AG', 14),
             ('CT', 15),
             ('TC', 16),
             ('GA', 17),
             ('AC', 18),
             ('GT', 19),
             ('CC', 20),
             ('GG', 21),
             ('AT', 23),
             ('TA', 25),
             ('TTT', 26),
             ('AAA', 27),
             ('GC', 29),
             ('GAA', 30),
             ('TTC', 31),
             ('CTG', 32),
             ('CAG', 33),
             ('CAA', 34),
             ('TTG', 35),
             ('TAA', 36),
             ('TTA', 37),
             ('ATT', 38),
             ('AAT', 39),
             ('CTT', 40),
             ('AAG', 41),
   

In [28]:
hs1.tokenize(sequences[0])

['AATT', 'ATATATATATAT', 'ATA', 'TGG', 'CCCA', 'CAC', 'GGA', 'CAAAAAA']

In [31]:
hs1.encode(sequences)

[[134, 3075, 119, 43, 137, 53, 96, 890], [316, 7, 119, 0, 96, 647, 3560]]