# Custom Tokenizer for mutation data

* Tokenize mutation ids (chrom + pos), genotypes (0/0, 0/1, 1/0, 1/1), mutation alleles (ref + alt)

In [None]:
from importlib import reload
from data.data_utils import SEQUENCE_PATTERN

SOURCE_DATA_DIR = 'data/sources/json/'
POP_CODES = ['EUR', 'EAS', 'AFR', 'AMR', 'SAS']

chrom = 22
num_samples = 100 # 2504

# CORPUS_DATA = f'{SOURCE_DATA_DIR}corpus_chr{chrom}_small/'
CORPUS_DATA = f'{SOURCE_DATA_DIR}corpus_chr{chrom}.json'
RANDOM_CORPUS = f'data/generated/json/random/rand_corpus_chr{chrom}_{num_samples}ids_1000muts.json'

# TEST_CORPUS_PATH = f'data/sources/test/test_corpus_chr{chrom}.json'
# TEST_CORPUS_PATH = f'/Volumes/asia_2T/genomics/reference/1000GP/corpus/chr_ALL.json'
TEST_CORPUS_PATH = f'/Volumes/asia_2T/genomics/reference/1000GP/corpus/chr_{chrom}.json'

In [None]:
# from opacus import PrivacyEngine
from models.tokenizers import *

corpus = load_corpus(VCF_corpus='/Volumes/asia_2T/genomics/reference/1000GP/corpus/chr_ALL.json')

Loading corpus from /Volumes/asia_2T/genomics/reference/1000GP/corpus/chr_ALL.json...


# A. Manual Custom VCF Tokenizer
___

In [None]:
manual_tokenizer = ManualTokenizer(subtokenize=True)

seq = 'HG00097 22:938472:T>GCTTG/T_1/1 2:93920:TACACAC>C/T_0|0'
token_ids = manual_tokenizer.encode(seq, allowed_special='all')
print(manual_tokenizer._tokenize(seq)[0]) # encoded tokens
tokens = manual_tokenizer.decode(token_ids)
print(token_ids) # tokenizaotion ids
print(tokens) # decoded tokens

['2', '2', ':', '9', '3', '8', '4', '7', '2', 'T', '>', 'GCTTG/T', '1', '/', '1', '2', ':', '9', '3', '9', '2', '0', 'TACACAC', '>', 'C/T', '0', '|', '0']
[11, 11, 0, 18, 12, 17, 13, 16, 11, 8, 2, 35, 10, 4, 10, 11, 0, 18, 12, 18, 11, 9, 39, 2, 65, 9, 3, 9]
22:938472:T>GCTTG/T_1/1 2:93920:TACACAC>C/T_0|0 


In [93]:
manual_tokens = manual_tokenizer.tokenize_corpus(corpus)
manual_tokenizer.vocab_size, manual_tokenizer.vocab, # len(manual_tokens), 

Tokenizing corpus: 100%|██████████| 823/823 [00:02<00:00, 317.10it/s]


(1059,
 {':': 0,
  '_': 1,
  '>': 2,
  '|': 3,
  '/': 4,
  'A': 5,
  'C': 6,
  'G': 7,
  'T': 8,
  '0': 9,
  '1': 10,
  '2': 11,
  '3': 12,
  '4': 13,
  '5': 14,
  '6': 15,
  '7': 16,
  '8': 17,
  '9': 18,
  '[UNK]': 19,
  '[PAD]': 20,
  '[CLS]': 21,
  '[SEP]': 22,
  '[MASK]': 23,
  'A/T': 24,
  'CT': 25,
  'TG': 26,
  'CGG': 27,
  'TGA': 28,
  'CCT': 29,
  'GC': 30,
  'CA': 31,
  'GT': 32,
  'AT': 33,
  'TCTTG': 34,
  'GCTTG/T': 35,
  'GTA': 36,
  'TAC': 37,
  'TACAC': 38,
  'TACACAC': 39,
  'TACACACACAC/TACACACACACAC': 40,
  'GTT': 41,
  'GTAT': 42,
  'CTG': 43,
  'TTGAG': 44,
  'AAAAAC': 45,
  'TTAAA': 46,
  'CAT': 47,
  'GA': 48,
  'TTTG': 49,
  'CAG': 50,
  'TC': 51,
  'TGAGTAGATGATGGGC': 52,
  'TA': 53,
  'GAAC': 54,
  'GATGA': 55,
  'CCATTCATA': 56,
  'A/C': 57,
  'TGAA': 58,
  'TTC': 59,
  '22': 60,
  '938472': 61,
  'T/A': 62,
  '93920': 63,
  'TAAAGCCTAC': 64,
  'C/T': 65,
  '16070603': 66,
  '16144208': 67,
  '16172288': 68,
  '16410474': 69,
  '16424795': 70,
  '16425962': 

In [None]:
token_ids = manual_tokenizer.encode(seq, allowed_special='all')
print(manual_tokenizer._tokenize(seq)[0]) # encoded tokens
tokens = manual_tokenizer.decode(token_ids)
print(token_ids) # tokenizaotion ids
print(tokens) # decoded tokens

['2', '2', ':', '9', '3', '8', '4', '7', '2', 'T', '>', 'GCTTG/T', '1', '/', '1', '2', ':', '9', '3', '9', '2', '0', 'TACACAC', '>', 'C/T', '0', '|', '0']
[11, 11, 0, 18, 12, 17, 13, 16, 11, 8, 2, 35, 10, 4, 10, 11, 0, 18, 12, 18, 11, 9, 39, 2, 65, 9, 3, 9]
22:938472:T>GCTTG/T_1/1 2:93920:TACACAC>C/T_0|0 


In [None]:
start_sequence = corpus[list(corpus.keys())[0]]
print(start_sequence) # original sequence

HG00099 22:16070603:C>T_0|0 22:16144208:G>T_0|0 22:16172288:G>A_0|0 22:16410474:C>T_0|0 22:16424795:G>A_0|0 22:16425962:G>T_0|0 22:16508084:A>C_0|0 22:16538214:G>A_0|0 22:16568186:G>A_0|0 22:16597246:T>G_0|0 22:16617559:T>C_0|0 22:16628905:G>A_0|0 22:16632066:T>A_0|0 22:16663192:T>G_0|0 22:16880368:C>T_0|0 22:16895958:T>C_0|0 22:16922887:G>A_0|0 22:16932440:A>G_0|1 22:16944153:C>T_0|0 22:16957431:T>C_0|0 22:17031861:A>G_0|0 22:17049243:C>T_0|0 22:17079196:G>T_0|0 22:17105304:A>G_0|0 22:17109100:A>G_0|0 22:17147671:A>G_0|0 22:17253129:C>A_0|0 22:17321089:C>A_0|0 22:17336994:T>C_0|0 22:17338489:A>G_0|0 22:17339041:G>A_1|0 22:17354299:T>A_0|0 22:17443906:T>C_0|0 22:17460640:C>T_0|0 22:17512175:G>A_0|0 22:17553198:G>A_0|0 22:17556466:G>A_0|0 22:17629300:C>A_0|0 22:17705808:C>T_1|1 22:17772561:G>A_0|0 22:17792855:G>A_0|0 22:17816909:C>G_0|0 22:17893234:T>C_0|0 22:17942592:G>T_0|0 22:17974508:G>C_0|0 22:17985294:G>A_0|0 22:18017054:G>A_0|0 22:18018960:T>C_0|0 22:18030910:C>T_0|0 22:18040368:

In [None]:
start_sequence = corpus[list(corpus.keys())[0]]
corpus_token_ids = manual_tokenizer.encode(start_sequence, allowed_special='all')
# print(corpus_token_ids) # tokenized corpus
recon_corpus = manual_tokenizer.decode(corpus_token_ids)
print(recon_corpus) # reconstructed corpus (without sample ID)

22:16070603:C>T_0|0 22:16144208:G>T_0|0 22:16172288:G>A_0|0 22:16410474:C>T_0|0 22:16424795:G>A_0|0 22:16425962:G>T_0|0 22:16508084:A>C_0|0 22:16538214:G>A_0|0 22:16568186:G>A_0|0 22:16597246:T>G_0|0 22:16617559:T>C_0|0 22:16628905:G>A_0|0 22:16632066:T>A_0|0 22:16663192:T>G_0|0 22:16880368:C>T_0|0 22:16895958:T>C_0|0 22:16922887:G>A_0|0 22:16932440:A>G_0|1 22:16944153:C>T_0|0 22:16957431:T>C_0|0 22:17031861:A>G_0|0 22:17049243:C>T_0|0 22:17079196:G>T_0|0 22:17105304:A>G_0|0 22:17109100:A>G_0|0 22:17147671:A>G_0|0 22:17253129:C>A_0|0 22:17321089:C>A_0|0 22:17336994:T>C_0|0 22:17338489:A>G_0|0 22:17339041:G>A_1|0 22:17354299:T>A_0|0 22:17443906:T>C_0|0 22:17460640:C>T_0|0 22:17512175:G>A_0|0 22:17553198:G>A_0|0 22:17556466:G>A_0|0 22:17629300:C>A_0|0 22:17705808:C>T_1|1 22:17772561:G>A_0|0 22:17792855:G>A_0|0 22:17816909:C>G_0|0 22:17893234:T>C_0|0 22:17942592:G>T_0|0 22:17974508:G>C_0|0 22:17985294:G>A_0|0 22:18017054:G>A_0|0 22:18018960:T>C_0|0 22:18030910:C>T_0|0 22:18040368:C>T_0|0 

In [None]:
recon_corpus[:len(recon_corpus)-1] == start_sequence[8:]  # check if reconstruction is correct 
                                                        # (without sample ID and space at the end)

True

# B. Custom Tokenizer from Pre-Trained
___

In [None]:
vcf_tokenizer = VCFTokenizer()

print("Before training:")
print(f"Vocab size: {vcf_tokenizer.vocab_size}")
print(f"Vocab: {vcf_tokenizer.vocab}")

Before training:
Vocab size: 24
Vocab: {':': 0, '_': 1, '>': 2, '|': 3, '/': 4, 'A': 5, 'C': 6, 'G': 7, 'T': 8, '0': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, '9': 18, '[UNK]': 19, '[PAD]': 20, '[CLS]': 21, '[SEP]': 22, '[MASK]': 23}


In [None]:
encoded_ids = vcf_tokenizer.encode("1:23854:A/T>T_1|0 1:507248:GCTTG/T>A_0|0 1:23854:A>CCATTCATA_1/0 1:507248:AC>A_1|1", 
                                   allowed_special='all')
print(f"Encoded IDs: {encoded_ids}")
decoded_seq = vcf_tokenizer.decode(encoded_ids)
print(f"Decoded Sequence: {decoded_seq}")

Encoded IDs: [10, 0, 11, 12, 17, 14, 13, 19, 2, 8, 10, 3, 9, 10, 0, 14, 9, 16, 11, 13, 17, 19, 2, 5, 9, 3, 9, 10, 0, 11, 12, 17, 14, 13, 5, 2, 19, 10, 4, 9, 10, 0, 14, 9, 16, 11, 13, 17, 19, 2, 5, 10, 3, 10]
Decoded Sequence: 1:23854:[UNK]>T_1|0 1:507248:[UNK]>A_0|0 1:23854:A>[UNK]_1/0 1:507248:[UNK]>A_1|1 


In [156]:
vcf_tokenizer.train()

Loading corpus from data/sources/test/test_corpus_chr22.json...


Training tokenizer on corpus: 100%|██████████| 823/823 [00:04<00:00, 200.07it/s]

Tokenizer trained on corpus with 17000 tokens.





In [157]:
print("After training:")
print(f"Vocab size: {vcf_tokenizer.vocab_size}")
print(f"Vocab: {vcf_tokenizer.vocab}")

After training:
Vocab size: 60
Vocab: {':': 0, '_': 1, '>': 2, '|': 3, '/': 4, 'A': 5, 'C': 6, 'G': 7, 'T': 8, '0': 9, '1': 10, '2': 11, '3': 12, '4': 13, '5': 14, '6': 15, '7': 16, '8': 17, '9': 18, '[UNK]': 19, '[PAD]': 20, '[CLS]': 21, '[SEP]': 22, '[MASK]': 23, 'A/T': 24, 'CT': 25, 'TG': 26, 'CGG': 27, 'TGA': 28, 'CCT': 29, 'GC': 30, 'CA': 31, 'GT': 32, 'AT': 33, 'TCTTG': 34, 'GCTTG/T': 35, 'GTA': 36, 'TAC': 37, 'TACAC': 38, 'TACACAC': 39, 'TACACACACAC/TACACACACACAC': 40, 'GTT': 41, 'GTAT': 42, 'CTG': 43, 'TTGAG': 44, 'AAAAAC': 45, 'TTAAA': 46, 'CAT': 47, 'GA': 48, 'TTTG': 49, 'CAG': 50, 'TC': 51, 'TGAGTAGATGATGGGC': 52, 'TA': 53, 'GAAC': 54, 'GATGA': 55, 'CCATTCATA': 56, 'A/C': 57, 'TGAA': 58, 'TTC': 59}


In [None]:
encoded_ids = vcf_tokenizer.encode("1:23854:A/T>T_1|0 1:507248:GCTTG/T>A_0|0 1:23854:A>CCATTCATA_1/0 1:507248:AC>A_1|1", 
                                   allowed_special='all')
print(f"Encoded IDs: {encoded_ids}")
decoded_seq = vcf_tokenizer.decode(encoded_ids)
print(f"Decoded Sequence: {decoded_seq}")

Encoded IDs: [10, 0, 11, 12, 17, 14, 13, 24, 2, 8, 10, 3, 9, 10, 0, 14, 9, 16, 11, 13, 17, 35, 2, 5, 9, 3, 9, 10, 0, 11, 12, 17, 14, 13, 5, 2, 56, 10, 4, 9, 10, 0, 14, 9, 16, 11, 13, 17, 19, 2, 5, 10, 3, 10]
Decoded Sequence: 1:23854:A/T>T_1|0 1:507248:GCTTG/T>A_0|0 1:23854:A>CCATTCATA_1/0 1:507248:[UNK]>A_1|1 


# C. Custom BPE-based Tokenizer
___

In [None]:
CORPUS_PATH = '/Volumes/asia_2T/genomics/reference/1000GP/corpus/chr_ALL.json'
separators = {'[EOS]': '<|endoftext|>',
            # '[BOS]': '<|startoftext|>'},
            # '[START_SUPERPOP]': '<|startofsuperpop|>', 
            # '[END_SUPERPOP]': '<|endofsuperpop|>', 
            '[SEP]': 'Ġ'}
preprocess_corpus_text(load_corpus(CORPUS_PATH),
                        pattern=SEQUENCE_PATTERN,
                        separators=separators,
                        save_path="data/sources/processed_corpus_ALL_CHR_with_separators.txt")
print(f"Corpus preprocessed with tokens: {list(separators.keys())}")

Loading corpus from /Volumes/asia_2T/genomics/reference/1000GP/corpus/chr_ALL.json...


Processing corpus: 100%|██████████| 2543/2543 [00:50<00:00, 50.83it/s] 


Corpus preprocessed with tokens: ['[EOS]', '[SEP]']


In [None]:
separators = {'[SEP]': 'Ġ'
            # '[EOS]': '<|endoftext|>',
            # '[BOS]': '<|startoftext|>'},
            # '[START_SUPERPOP]': '<|startofsuperpop|>', 
            # '[END_SUPERPOP]': '<|endofsuperpop|>', 
}
preprocess_corpus_text(load_corpus(CORPUS_PATH),
                        pattern=SEQUENCE_PATTERN,
                        separators={'[SEP]': 'Ġ'},#, '[BOS]': '<BOS>', '[EOS]': '<EOS>'},
                        save_path="data/sources/processed_corpus_ALL_CHR_gpt_style.txt")
                        
print(f"Corpus preprocessed with tokens: {list(separators.keys())}")

Loading corpus from /Volumes/asia_2T/genomics/reference/1000GP/corpus/chr_ALL.json...


Processing corpus: 100%|██████████| 2543/2543 [00:47<00:00, 53.10it/s]


Corpus preprocessed with tokens: ['[SEP]']


In [None]:
separators = {'[SEP]': ' ',}
preprocess_corpus_text(load_corpus(CORPUS_PATH),
                        pattern=SEQUENCE_PATTERN,
                        separators=separators,
                        save_path="data/sources/processed_corpus_ALL_CHR.txt")
                        
print(f"Corpus preprocessed with tokens: {list(separators.keys())}")

Loading corpus from /Volumes/asia_2T/genomics/reference/1000GP/corpus/chr_ALL.json...


Processing corpus: 100%|██████████| 2543/2543 [00:47<00:00, 53.18it/s]


Corpus preprocessed with tokens: ['[SEP]']


In [None]:
CORPUS_PATH = '/Volumes/asia_2T/genomics/reference/1000GP/corpus/chr22.json'
# use the same special tokens used for the finetuning model
separators = {'[SEP]': 'Ġ',
            '[BOS]': '<START_SAMPLE>',
            '[EOS]': '<END_SAMPLE>',
            # '[START_SUPERPOP]': '<|startofsuperpop|>', 
            # '[END_SUPERPOP]': '<|endofsuperpop|>', 
}
preprocess_corpus_text(load_corpus(CORPUS_PATH),
                        pattern=SEQUENCE_PATTERN,
                        separators=separators,
                        save_path="data/sources/training_tokenizer_corpus_chr22_with_separators.txt")
                        
print(f"Corpus preprocessed with tokens: {list(separators.keys())}")

Loading corpus from /Volumes/asia_2T/genomics/reference/1000GP/corpus/chr22.json...


Processing corpus:  79%|███████▉  | 1989/2504 [07:23<01:56,  4.42it/s]

: 

## C.1. BPEVCFTokenizer
___

In [3]:
from models.tokenizers import BPEVCFTokenizer

bpe_tokenizer = BPEVCFTokenizer()
# TRAIN_CORPUS = "data/sources/processed_corpus_ALL_CHR_with_separators.txt"
TRAIN_CORPUS = "data/sources/processed_corpus_with_separators.txt"
bpe_tokenizer.train(TRAIN_CORPUS, vocab_size=4096, allowed_special={"<|endoftext|>", 'Ġ'})

Loading training corpus...
Building vocabulary from processed text...



[A
[A

In [None]:
input_text = "1:23854:A/T>T_1|0 1:507248:GCTTG/T>A_0|0 1:23854:A>CCATTCATA_1/0 1:507248:AC>A_1|1"
token_ids = bpe_tokenizer.encode(input_text, allowed_special='all')
print(token_ids)
print(bpe_tokenizer.decode(token_ids))

[10, 0, 57, 458, 13, 384, 4, 46, 8, 84, 33, 28, 10, 0, 463, 99, 17, 0, 7, 647, 7, 4, 46, 5, 34, 33, 28, 10, 0, 57, 458, 13, 384, 2, 6, 122, 320, 122, 181, 84, 4, 9, 28, 10, 0, 463, 99, 17, 384, 38, 5, 84, 85]
1:23854:A/T>T_1|0 1:507248:GCTTG/T>A_0|0 1:23854:A>CCATTCATA_1/0 1:507248:AC>A_1|1


In [6]:
for token_id in token_ids:
    print(f"{token_id} -> {bpe_tokenizer.decode([token_id])}")

10 -> 1
0 -> :
57 -> 23
458 -> 85
13 -> 4
384 -> :A
4 -> /
46 -> T>
8 -> T
84 -> _1
33 -> |0
28 ->  
10 -> 1
0 -> :
463 -> 507
99 -> 24
17 -> 8
0 -> :
7 -> G
647 -> CTT
7 -> G
4 -> /
46 -> T>
5 -> A
34 -> _0
33 -> |0
28 ->  
10 -> 1
0 -> :
57 -> 23
458 -> 85
13 -> 4
384 -> :A
2 -> >
6 -> C
122 -> CA
320 -> TT
122 -> CA
181 -> TA
84 -> _1
4 -> /
9 -> 0
28 ->  
10 -> 1
0 -> :
463 -> 507
99 -> 24
17 -> 8
384 -> :A
38 -> C>
5 -> A
84 -> _1
85 -> |1


In [7]:
bpe_tokenizer.save_vocab_and_merges(vocab_path="models/saved/tokenizers/BPE/vocab.json", 
                                    bpe_merges_path="models/saved/tokenizers/BPE/bpe_merges.txt")

In [16]:
len(bpe_tokenizer.vocab), bpe_tokenizer.vocab[2500]

(4096, '704645:C>A_0|0Ġ22:47')

In [None]:
preprocess_corpus_text(load_corpus(TEST_CORPUS_PATH),
                        pattern=SEQUENCE_PATTERN,
                        separators={'[SEP]': ' '}, # '[BOS]': '<|startoftext|>'},
                        save_path="data/sources/processed_corpus.txt")
print("Corpus preprocessed with space tokens")

Loading corpus from data/sources/test/test_corpus_chr22.json...


Processing corpus: 100%|██████████| 823/823 [00:02<00:00, 354.29it/s]


Corpus preprocessed with space tokens


In [2]:
import models.tokenizers
reload(models.tokenizers)
from models.tokenizers import *

## C.2. Regex Tonkenizer
___

In [None]:
from models.tokenizers import RegexTokenizer

special_tokens = {
    '<|endoftext|>': 4097,
    'Ġ': 4098,  # space token
}
# special_tokens = {'<START_SAMPLE>':4097,
#                 '<END_SAMPLE>': 4098,
#                 '<MUT_SEP>': 4099,
#                 '<START_ID>': 4100,
#                 '<END_ID>': 4101,
#                 '<START_POP>': 4102,
#                 '<END_POP>': 4103,
#                 '<PAD>': 4104,
#                 '<UNK>': 4105}
TRAIN_CORPUS = "data/sources/processed_corpus_with_separators.txt"
with open(TRAIN_CORPUS, 'r', encoding='utf-8') as file:
    corpus = file.read()


In [None]:
vocab_size = 4096 #+ len(special_tokens)
regex_tokenizer = RegexTokenizer(special_tokens=special_tokens)
regex_tokenizer.train(corpus, vocab_size=vocab_size, verbose=True)
regex_tokenizer.save()

Found 999093 formatted mutations, encoded into 19134089 byte tokens


Training Regex BPE tokenizer: 100%|██████████| 3840/3840 [1:21:40<00:00,  1.28s/it, merge 3840/3840: (3216, 333) -> 4095 (b'22:48719458:A>C_0|0') had 659 occurrences]                         


In [None]:
regex_tokenizer.decode(regex_tokenizer.encode("22:23854:A/T>T_0|1 22:507248:GCTTG/T>A_0|0 22:23854:A>CCATTCATA_1/0 22:507248:AC>A_1|1", 
                                              allowed_special='all'))

'22:23854:A/T>T_0|1 22:507248:GCTTG/T>A_0|0 22:23854:A>CCATTCATA_1/0 22:507248:AC>A_1|1'

In [12]:
prova_tokenizer = RegexTokenizer()
prova_tokenizer.load()

# input_text = "22:23854:A/T>T_3|222:507248:GCTTG/T>A_0|0 22:23854:A>CCATTCATA_1/0 22:507248:AC>A_1|1"
input_text = "22:23854:A/T>T_3|1<|endoftext|>22:507248:GCTTG/T>A_0|0Ġ22:23854:A>CCATTCATA_1/0Ġ22:507248:AC>A_1|1"
token_ids = prova_tokenizer.encode(input_text, allowed_special="all")
print(token_ids)
print(prova_tokenizer.decode(token_ids))

[348, 56, 357, 643, 47, 273, 84, 95, 51, 310, 4097, 3351, 307, 56, 58, 71, 997, 71, 47, 273, 264, 4098, 348, 56, 357, 277, 67, 356, 572, 356, 410, 309, 47, 48, 4098, 3351, 307, 56, 643, 262, 65, 347]
22:23854:A/T>T_3|1 22:507248:GCTTG/T>A_0|0 22:23854:A>CCATTCATA_1/0 22:507248:AC>A_1|1
