In [None]:
# pip install datasets --user

In [1]:
from run_train import create_model_and_diffusion
from utils.step_sample import create_named_schedule_sampler
from train_util import TrainLoop
from utils.data import load_data_text

from transformers import AutoTokenizer, PreTrainedTokenizerFast, BertTokenizerFast, set_seed
import json, torch, os
from utils import dist_util
from functools import partial
import pickle
import random

2024-02-04 19:24:59.309549: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
# with open('vocab_list.pickle', 'rb') as handle:
#     vocab_list = pickle.load(handle)
# vocab_list = list(vocab_list.values())

In [3]:
dist_util.clear_cache()

In [4]:
lr=0.0001
batch_size=64
microbatch=20
epochs=5000
eval_interval=1000
ema_rate='0.9999' 
schedule_sampler='uniform'
diffusion_steps=1000
noise_schedule='sqrt'
vocab='custom'
use_plm_init='no' # embedding in transformer
vocab_size=0
config_name='bert-base-uncased'
cc_data_dir='data/commonsense'
ss_data_dir='data/shakespeare'
combined_data_dir='data/combined'
data_dir=combined_data_dir
seq_len=128
hidden_t_dim=128
hidden_dim=64
dropout=0.1
seed=102
weight_decay=0.0
predict_xstart=True
rescale_timesteps=True
emb_scale_factor=1.0

In [5]:
set_seed(seed)

In [6]:
class myTokenizer():
    """
    Load tokenizer from bert config or defined BPE vocab dict
    """
    ################################################
    ### You can custome your own tokenizer here. ###
    ################################################
    def __init__(self, vocab, config_name):
        if vocab == 'bert':
            tokenizer = AutoTokenizer.from_pretrained(config_name)
            self.tokenizer = tokenizer
            self.sep_token_id = tokenizer.sep_token_id
            self.pad_token_id = tokenizer.pad_token_id
        elif vocab == 'shakespeare':
            tokenizer = BertTokenizerFast('shakespeare-tokenizer-bert/vocab.txt')
            self.tokenizer = tokenizer
            self.sep_token_id = tokenizer.sep_token_id
            self.pad_token_id = tokenizer.pad_token_id
        elif vocab == 'combined':
            # look into common english words
            # modern english to old english encoder decoder
            tokenizer = AutoTokenizer.from_pretrained(config_name)
            ss_tokenizer = BertTokenizerFast('shakespeare-tokenizer-bert/vocab.txt')
            new_tokens = list(set(ss_tokenizer.vocab.keys())-set(tokenizer.vocab.keys()))
            n_new_tokens = tokenizer.add_tokens(new_tokens)
            print(f'### Total of {n_new_tokens} new tokens added')
            self.tokenizer = tokenizer
            self.sep_token_id = tokenizer.sep_token_id
            self.pad_token_id = tokenizer.pad_token_id

        self.vocab_size = len(self.tokenizer)
    
    def encode_token(self, sentences):
        if isinstance(self.tokenizer, dict):
            input_ids = [[0] + [self.tokenizer.get(x, self.tokenizer['[UNK]']) for x in seq.split()] + [1] for seq in sentences]
        elif isinstance(self.tokenizer, PreTrainedTokenizerFast):
            input_ids = self.tokenizer(sentences, add_special_tokens=True)['input_ids']
        else:
            assert False, "invalid type of vocab_dict"
        return input_ids
        
    def decode_token(self, seq):
        if isinstance(self.tokenizer, dict):
            seq = seq.squeeze(-1).tolist()
            while len(seq)>0 and seq[-1] == self.pad_token_id:
                seq.pop()
            tokens = " ".join([self.rev_tokenizer[x] for x in seq]).replace('__ ', '').replace('@@ ', '')
        elif isinstance(self.tokenizer, PreTrainedTokenizerFast):
            seq = seq.squeeze(-1).tolist()
            while len(seq)>0 and seq[-1] == self.pad_token_id:
                seq.pop()
            tokens = self.tokenizer.decode(seq)
        else:
            assert False, "invalid type of vocab_dict"
        return tokens


def load_model_emb(hidden_dim, tokenizer):
    ### random emb or pre-defined embedding like glove embedding. You can custome your own init here.
    model = torch.nn.Embedding(tokenizer.vocab_size, hidden_dim)
    torch.nn.init.normal_(model.weight)

    return model, tokenizer


def load_tokenizer(vocab, config_name):
    tokenizer = myTokenizer(vocab, config_name)
    return tokenizer

In [7]:
tokenizer = load_tokenizer('combined', config_name)

### Total of 19044 new tokens added


In [8]:
tokenizer.vocab_size

49566

In [9]:
tokenizer.encode_token('find we a time for fright peace to pant')

[101,
 10882,
 33872,
 2057,
 1037,
 2051,
 39230,
 1054,
 10424,
 35550,
 1044,
 2102,
 3521,
 2000,
 43917,
 102]

In [10]:
model_weight, tokenizer = load_model_emb(hidden_dim, tokenizer)

In [11]:
model_weight

Embedding(49566, 64)

In [12]:
## very very important to set this!!!!!
vocab_size = tokenizer.vocab_size

In [13]:
vocab_size

49566

In [14]:
data = load_data_text(
        batch_size=batch_size,
        seq_len=seq_len,
        data_dir=data_dir,
        loaded_vocab=tokenizer,
        model_emb=model_weight # use model's weights as init
    )

############################## 
Loading text data...
############################## 
Loading dataset from data/combined...
### Loading form the TRAIN set...
### Data samples...
 ['under their constitution , japan cannot have a military . everybody in the self - defense force is a volunteer .', "i do n't see how a mewtwo only bracket will advance his meta when his meta also involves how how fares against other characters ."] ['sounds like technicality . i mean , is it viable career to be a soldier in japan ?', "i think it 's just to encourage more people to play him . it may not help developing mu knowledge , but tech knowledge and general discovery should get a nice boost ."]
RAM used: 1785.95 MB
This is raw_datasets:  Dataset({
    features: ['src', 'trg'],
    num_rows: 1099754
})
RAM used: 2009.96 MB


Running tokenizer on dataset (num_proc=4):   0%|          | 0/1099754 [00:00<?, ? examples/s]

### tokenized_datasets Dataset({
    features: ['input_id_x', 'input_id_y'],
    num_rows: 1099754
})
### tokenized_datasets...example [101, 1057, 33872, 9413, 2037, 48572, 1045, 43961, 10163, 1010, 2900, 34525, 27178, 34503, 1041, 1037, 32755, 1061, 1012, 1041, 42960, 1061, 44186, 1061, 1999, 1996, 2969, 1011, 48867, 24978, 2063, 36381, 1041, 2003, 1037, 49537, 25212, 2099, 1012, 102]
RAM used: 3159.72 MB


merge and mask:   0%|          | 0/1099754 [00:00<?, ? examples/s]

RAM used: 4299.55 MB


padding:   0%|          | 0/1099754 [00:00<?, ? examples/s]

Dataset({
    features: ['input_id_x', 'input_id_y', 'input_ids', 'input_mask'],
    num_rows: 1099754
}) padded dataset
RAM used: 6485.15 MB
RAM used: 6292.14 MB


Passed in as batch in TrainLoop - this is the batch data

In [15]:
# next(data)[0].shape # batch_size, seq_len, hidden_dim

In [16]:
# next(data)[0]

Passed in as cond in TrainLoop - this is a dictionary of input_ids and input_mask

In [17]:
# next(data)[1]

In [18]:
# next(data)[1]['input_ids'].shape # batch_size, hidden_dim

In [19]:
# next(data)[1]['input_mask'].shape # batch_size, hidden_dim

In [20]:
model, diffusion = create_model_and_diffusion(
                        hidden_t_dim,
                        hidden_dim,
                        vocab_size,
                        config_name,
                        use_plm_init,
                        dropout,
                        diffusion_steps,
                        noise_schedule,
                        predict_xstart,
                        rescale_timesteps,
                    )

In [21]:
model.to(dist_util.dev())

TransformerNetModel(
  (word_embedding): Embedding(49566, 64)
  (lm_head): Linear(in_features=64, out_features=49566, bias=True)
  (time_embed): Sequential(
    (0): Linear(in_features=128, out_features=512, bias=True)
    (1): SiLU()
    (2): Linear(in_features=512, out_features=768, bias=True)
  )
  (input_up_proj): Sequential(
    (0): Linear(in_features=64, out_features=768, bias=True)
    (1): Tanh()
    (2): Linear(in_features=768, out_features=768, bias=True)
  )
  (input_transformers): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features

In [22]:
pytorch_total_params = sum(p.numel() for p in model.parameters())

In [23]:
pytorch_total_params

90411358

In [24]:
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'

In [25]:
schedule_sampler = create_named_schedule_sampler('uniform', diffusion)

TrainLoop(
        model=model,
        diffusion=diffusion,
        data=data,
        batch_size=batch_size,
        microbatch=microbatch,
        lr=lr,
        ema_rate=ema_rate,
        schedule_sampler=schedule_sampler,
        weight_decay=weight_decay,
        epochs=epochs,
#         eval_data=data_valid,
        eval_interval=eval_interval
    ).run_loop()

Epoch 0 Loss: 1.017941951751709
Epoch 1 Loss: 0.7939045429229736
Epoch 2 Loss: 0.8087126016616821
Epoch 3 Loss: 0.8104199767112732
Epoch 4 Loss: 0.7441797256469727
Epoch 5 Loss: 0.7625315189361572
Epoch 6 Loss: 0.7080917954444885
Epoch 7 Loss: 0.6782647371292114
Epoch 8 Loss: 0.6661217212677002
Epoch 9 Loss: 0.64247727394104
Epoch 10 Loss: 0.6533815860748291
Epoch 11 Loss: 0.6965190172195435
Epoch 12 Loss: 0.5396015048027039
Epoch 13 Loss: 0.5966410636901855
Epoch 14 Loss: 0.5668760538101196
Epoch 15 Loss: 0.581430196762085
Epoch 16 Loss: 0.5724841356277466
Epoch 17 Loss: 0.5836612582206726
Epoch 18 Loss: 0.5455548763275146
Epoch 19 Loss: 0.5476232767105103
Epoch 20 Loss: 0.5333210229873657
Epoch 21 Loss: 0.5537278652191162
Epoch 22 Loss: 0.5353633761405945
Epoch 23 Loss: 0.5387117862701416
Epoch 24 Loss: 0.5012481212615967
Epoch 25 Loss: 0.5682761073112488
Epoch 26 Loss: 0.49232715368270874
Epoch 27 Loss: 0.5420951843261719
Epoch 28 Loss: 0.5308223366737366
Epoch 29 Loss: 0.5457789897

In [27]:
pickle.dump(model, open("models/model_0204.pkl", 'wb'))

In [51]:
with open('models/model_0204.pkl', 'rb') as handle:
    model = pickle.load(handle)

In [None]:
model.eval().to(dist_util.dev())

model_emb = torch.nn.Embedding(
        num_embeddings=tokenizer.vocab_size, 
        embedding_dim=hidden_dim, 
        _weight=model.word_embedding.weight.clone().cpu()
    ).eval()

In [63]:
ss_data = load_data_text(
        batch_size=10,
        seq_len=seq_len,
        data_dir=ss_data_dir,
        loaded_vocab=tokenizer,
        model_emb=model_emb.cpu() # use model's weights as init
    )

############################## 
Loading text data...
############################## 
Loading dataset from data/shakespeare...
### Loading form the TRAIN set...
### Data samples...
 ['so i do, madonna, but to read his right wits is to', 'why, cousin! why, rosalind! cupid have mercy! not a word?'] ['read thus therefore perpend, my princess, and give ear <EOS>', 'not one to throw at a dog <EOS>']
RAM used: 6025.28 MB
This is raw_datasets:  Dataset({
    features: ['src', 'trg'],
    num_rows: 99754
})
RAM used: 6039.30 MB


Running tokenizer on dataset (num_proc=4):   0%|          | 0/99754 [00:00<?, ? examples/s]

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f3d93aceee0>
Traceback (most recent call last):
  File "/home/xykong/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/xykong/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
    if w.is_alive():
  File "/opt/conda/lib/python3.9/multiprocessing/process.py", line 160, in is_alive
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f3d93aceee0>
Traceback (most recent call last):
  File "/home/xykong/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/xykong/.local/lib/python3.9/site-packages/torch/utils/data/dataloader.py", line 1461, in _shutdown_workers
 

### tokenized_datasets Dataset({
    features: ['input_id_x', 'input_id_y'],
    num_rows: 99754
})
### tokenized_datasets...example [101, 2061, 1045, 2079, 1010, 1049, 45216, 1050, 2532, 1010, 2021, 2000, 3191, 2010, 1054, 35550, 1044, 2102, 25433, 2003, 2000, 102]
RAM used: 5920.31 MB


merge and mask:   0%|          | 0/99754 [00:00<?, ? examples/s]

RAM used: 5970.70 MB


padding:   0%|          | 0/99754 [00:00<?, ? examples/s]

Dataset({
    features: ['input_id_x', 'input_id_y', 'input_ids', 'input_mask'],
    num_rows: 99754
}) padded dataset
RAM used: 6144.10 MB
RAM used: 6136.56 MB


In [67]:
model.train() # TURNING THE TRAIN MODE BACK ON TO ENABLE BATCHNORM/DROPOUT!!

TrainLoop(
        model=model,
        diffusion=diffusion,
        data=ss_data,
        batch_size=batch_size,
        microbatch=microbatch,
        lr=lr,
        ema_rate=ema_rate,
        schedule_sampler=schedule_sampler,
        weight_decay=weight_decay,
        epochs=epochs,
#         eval_data=data_valid,
        eval_interval=eval_interval
    ).run_loop()

Epoch 0 Loss: 0.05167607218027115
Epoch 1 Loss: 0.0734344944357872
Epoch 2 Loss: 0.07919331640005112
Epoch 3 Loss: 0.07576844841241837
Epoch 4 Loss: 0.06197904422879219
Epoch 5 Loss: 0.0658922865986824
Epoch 6 Loss: 0.06832915544509888
Epoch 7 Loss: 0.06828071177005768
Epoch 8 Loss: 0.09899711608886719
Epoch 9 Loss: 0.06782440841197968
Epoch 10 Loss: 0.06965398043394089
Epoch 11 Loss: 0.08135969191789627
Epoch 12 Loss: 0.08371102064847946
Epoch 13 Loss: 0.04756990075111389
Epoch 14 Loss: 0.09945707768201828
Epoch 15 Loss: 0.07712647318840027
Epoch 16 Loss: 0.058792319148778915
Epoch 17 Loss: 0.061239417642354965
Epoch 18 Loss: 0.06195593997836113
Epoch 19 Loss: 0.06561761349439621
Epoch 20 Loss: 0.07065131515264511
Epoch 21 Loss: 0.08140132576227188
Epoch 22 Loss: 0.07403337210416794
Epoch 23 Loss: 0.06517104804515839
Epoch 24 Loss: 0.0710412785410881
Epoch 25 Loss: 0.0761718899011612
Epoch 26 Loss: 0.07438506186008453
Epoch 27 Loss: 0.06601317971944809
Epoch 28 Loss: 0.086243771016597

In [68]:
model.eval().to(dist_util.dev())

model_emb = torch.nn.Embedding(
        num_embeddings=tokenizer.vocab_size, 
        embedding_dim=hidden_dim, 
        _weight=model.word_embedding.weight.clone().cpu()
    ).eval()

In [69]:
data_test = load_data_text(
        batch_size=20,
        seq_len=seq_len,
        deterministic=True,
        data_dir=data_dir,
        split="test",
        loaded_vocab=tokenizer,
        model_emb=model_emb.cpu(),  # using the same embedding wight with tranining data
        loop=False
    )

############################## 
Loading text data...
############################## 
Loading dataset from data/combined...
### Loading form the TEST set...
### Data samples...
 ['smile upon this contract, whose ceremony', "if you 'd like , i 'll play a game with you . we 'll just need to download , edit , and re - upload each iteration . would you like to be dark or light ?"] ['shall seem expedient on the now-born brief', 'make a subreddit . each new circle is a new post . submit your moves in a reply chain in the post . do . it .']
RAM used: 6083.43 MB
This is raw_datasets:  Dataset({
    features: ['src', 'trg'],
    num_rows: 21084
})
RAM used: 6089.57 MB


Running tokenizer on dataset (num_proc=4):   0%|          | 0/21084 [00:00<?, ? examples/s]

### tokenized_datasets Dataset({
    features: ['input_id_x', 'input_id_y'],
    num_rows: 21084
})
### tokenized_datasets...example [101, 41026, 1041, 1057, 46813, 2023, 47594, 2552, 1010, 49357, 1041, 37872, 1061, 102]
RAM used: 6122.86 MB


merge and mask:   0%|          | 0/21084 [00:00<?, ? examples/s]

RAM used: 6146.83 MB


padding:   0%|          | 0/21084 [00:00<?, ? examples/s]

Dataset({
    features: ['input_id_x', 'input_id_y', 'input_ids', 'input_mask'],
    num_rows: 21084
}) padded dataset
RAM used: 6184.80 MB
RAM used: 6184.80 MB


In [70]:
all_test_data = []

idx = 0

try:
    while True:
        batch, cond = next(data_test)
        # print(batch.shape)
        all_test_data.append(cond)
        idx += 1

except StopIteration:
    print('### End of reading iteration...')

model_emb.to(dist_util.dev())


### End of reading iteration...


Embedding(49566, 64)

In [71]:
len(all_test_data) # number of batches

1055

In [72]:
import numpy as np

def get_efficient_knn(model_emb, text_emb):
    emb_norm = (model_emb**2).sum(-1).view(-1, 1) # vocab
    text_emb_t = torch.transpose(text_emb.view(-1, text_emb.size(-1)), 0, 1) # d, bsz*seqlen
    arr_norm = (text_emb ** 2).sum(-1).view(-1, 1) # bsz*seqlen, 1
    # print(emb_norm.shape, arr_norm.shape)
    dist = emb_norm + arr_norm.transpose(0, 1) - 2.0 * torch.mm(model_emb, text_emb_t) # (vocab, d) x (d, bsz*seqlen)
    dist = torch.clamp(dist, 0.0, np.inf)
    # print(dist.shape)
    topk_out = torch.topk(-dist, k=1, dim=0)
    return topk_out.values, topk_out.indices

def denoised_fn_round(model, text_emb, t):
    # print(text_emb.shape) # bsz, seqlen, dim
    model_emb = model.weight  # input_embs
    # print(t)
    old_shape = text_emb.shape
    old_device = text_emb.device

    if len(text_emb.shape) > 2:
        text_emb = text_emb.reshape(-1, text_emb.size(-1))
    else:
        text_emb = text_emb
    # val, indices = get_knn(model_emb, text_emb.to(model_emb.device), dist=dist)
    val, indices = get_efficient_knn(model_emb, text_emb.to(model_emb.device))
    rounded_tokens = indices[0]
    # print(rounded_tokens.shape)
    new_embeds = model(rounded_tokens).view(old_shape).to(old_device)

    return new_embeds

In [73]:
step = 1000
clip_denoised = False
model_kwargs = {}
top_p = 0
clamp_step = 0

In [74]:
iterator = iter(all_test_data)
word_lst_recover = []
word_lst_ref = []
word_lst_source = []

for cond in iterator:

    input_ids_x = cond.pop('input_ids').to(dist_util.dev())
    x_start = model.get_embeds(input_ids_x)
    input_ids_mask = cond.pop('input_mask')
    input_ids_mask_ori = input_ids_mask

    noise = torch.randn_like(x_start)
    input_ids_mask = torch.broadcast_to(input_ids_mask.unsqueeze(dim=-1), x_start.shape).to(dist_util.dev())
    x_noised = torch.where(input_ids_mask == 0, x_start, noise)

    model_kwargs = {}

    if step == diffusion_steps:
        use_ddim = False
        step_gap = 1
    else:
        use_ddim = True
        step_gap = diffusion_steps//step

    sample_fn = (
        diffusion.p_sample_loop if not use_ddim else diffusion.ddim_sample_loop
    )

    sample_shape = (x_start.shape[0], seq_len, hidden_dim)

    samples = sample_fn(
        model,
        sample_shape,
        noise=x_noised,
        clip_denoised=clip_denoised,
        denoised_fn=partial(denoised_fn_round, model_emb),
        model_kwargs=model_kwargs,
        top_p=top_p,
        clamp_step=clamp_step,
        clamp_first=True,
        mask=input_ids_mask,
        x_start=x_start,
        gap=step_gap
    )

    # print(samples[0].shape) # samples for each step

    sample = samples[-1]

    # print('decoding for seq2seq', )
    # print(sample.shape)

    logits = model.get_logits(sample)  # bsz, seqlen, vocab
    cands = torch.topk(logits, k=1, dim=-1)

#     word_lst_recover = []
#     word_lst_ref = []
#     word_lst_source = []

    # tokenizer = load_tokenizer(args)

    for seq, input_mask in zip(cands.indices, input_ids_mask_ori):
        len_x = seq_len - sum(input_mask).tolist()
        tokens = tokenizer.decode_token(seq[len_x:])
        word_lst_recover.append(tokens)

    for seq, input_mask in zip(input_ids_x, input_ids_mask_ori):
        # tokens = tokenizer.decode_token(seq)
        len_x = seq_len - sum(input_mask).tolist()
        word_lst_source.append(tokenizer.decode_token(seq[:len_x]))
        word_lst_ref.append(tokenizer.decode_token(seq[len_x:]))
    break # after 1 batch
    

  0%|          | 0/1000 [00:00<?, ?it/s]

Generating 20 sentences takes 5 minutes

In [75]:
cond

{}

In [76]:
word_lst_source

['[CLS] smil e u pon this contr act, whos e ceremon y [SEP] [SEP]',
 "[CLS] if you'd like, i'll pla y a gam e w ith you. we'll just need to do wn loa d, e dit, a nd re - up loa d e ach iter ation. would you like to be dark or lig ht? [SEP] [SEP]",
 '[CLS] you l ack the s eas on of all natures, slee p < eos > [SEP] [SEP]',
 '[CLS] fo r engl a nd, cous in, go [SEP] [SEP]',
 '[CLS] happ y bir thd ay wow i ne ver t hou gh t i would get gold fo r somet hing like s ay ing happ y bir thd ay. thank you ra nd om red dit man. [SEP] [SEP]',
 "[CLS] you s hou ld see if th ey make any fe nd er sty le knobs that fit the ped al beca use that'd loo k awe som e. [SEP] [SEP]",
 '[CLS] nob od y in that hou se likes m ath, appar ent ly. th ey alm ost all learned to coun t to 19, i gu ess. [SEP] [SEP]',
 "[CLS] i hav e no inte rest in buying gta v. it's not a g enr e i like. yet i watched 25 minutes on tb talking a bou t its perfor m anc e on m ach ine i cou ldn't aff ord. the man is a wiz ard. [SEP] [SEP]

In [77]:
word_lst_recover

['[CLS] to our, hav e e t your, t wor - [SEP]',
 '[CLS] > y will thee nd all he ast gh on [SEP]',
 "[CLS] the, m'will or to shal e the as [SEP]",
 '[CLS] gh 氷 you, hy ay so co [SEP]',
 "[CLS] ey a, to,'e [PAD] [SEP]",
 '[CLS]',
 '[CLS] a nd not hou be [SEP]',
 '[CLS] nd ve e a nd - g [PAD] a y a [SEP] [SEP]',
 '[CLS] would b sir gh p e ed to bef gh [SEP]',
 '[CLS] wh me at 氷, nd it 氷 i gh e < [SEP]',
 '[CLS] le shal of he ve, tur be [SEP] eos',
 '[CLS]',
 "[CLS] th their, this not e a'' h e first of it king s your [SEP]",
 '[CLS] serv nd oes er n am the [SEP]',
 '[CLS] he e that - a, < eos >? [SEP]',
 '[CLS] a nd wel r a nd be fo r e ost e d, [SEP]',
 '[CLS] thee y then the you the [SEP]',
 "[CLS] at you'the y l wel m e [SEP]",
 '[CLS] be c he o is e, in < eos > [SEP]',
 '[CLS] ent wh e [SEP] ef [SEP] [PAD] ig']

In [78]:
word_lst_ref

['[CLS] shal l seem expedient on the now - bor n bri ef [SEP]',
 '[CLS] make a s ub red dit. e ach n ew circl e is a n ew pos t. s ub mit your m ov es in a repl y chain in the pos t. do. it. [SEP]',
 "[CLS] come, we'll to slee p < eos > my strang e a nd self - abuse [SEP]",
 '[CLS] h ub ert shal l be your man, att e nd on you [SEP]',
 '[CLS] this comme nt made my d ay. [SEP]',
 '[CLS] a nd paint e ach section to resem ble the amp th ey repres ent. [SEP]',
 '[CLS] i feel like t hos e kids wer e hig h ly co ach ed by their insane pare nts. so sad. poo r kids. [SEP]',
 '[CLS] his voice makes me belie ve i hav e a comp uter as g oo d as his a nd the gam e on my ha nd s. its a fucking ad diction [SEP]',
 '[CLS] more than i dare make faul ts < eos > you f ew that l ov ed me [SEP]',
 '[CLS] stri ke, fel lows, stri ke, this is the man i seek < eos > [SEP]',
 "[CLS] it's an old meta pos t, but it c hec ks out [SEP]",
 "[CLS] ok ay ok ay but it's not as bad the leag ue of lege nd s bo ard s team