In [1]:
from datasets import load_dataset,concatenate_datasets, Dataset,DatasetDict
from torch.utils.data import DataLoader
from transformers import AutoTokenizer
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence
import torch

from DMLP.models.my_transformers import MODEL_CLASS
from DMLP.models.models import VAE, DDPM, MLPSkipNet, TransformerNet,VAE_DDPM
from DMLP.train.reconstruction import *
from DMLP.utils.ddpm_schedule import ddpm_schedule
from DMLP.utils.random_init import weights_init_random
from DMLP.train.train_function import train_vae_ddpm


  from .autonotebook import tqdm as notebook_tqdm


In [2]:
class MyCollator(object):
    """Collate tokenized examples into padded encoder/decoder id tensors.

    Every example in a batch must be a mapping that provides a
    ``'bert_token'`` and a ``'gpt2_token'`` entry, each a sequence of
    integer token ids.
    """

    def __init__(self, encoder_token, decoder_token):
        """Remember the padding values for both sides.

        Args:
            encoder_token: value used to pad the ``'bert_token'`` sequences.
            decoder_token: value used to pad the ``'gpt2_token'`` sequences.
        """
        self.encoder_token = encoder_token
        self.decoder_token = decoder_token

    def __call__(self, batch):
        """Pad a list of examples to common lengths.

        Args:
            batch: list of examples (see the class docstring).

        Returns:
            A tuple ``(input_ids_bert, input_ids_gpt, token_lengths)``:
            the two padded ``torch.long`` tensors (batch first) and a
            ``torch.long`` tensor of shape ``(len(batch), 2)`` whose rows are
            ``[bert_length, gpt2_length]`` before padding.
        """
        # Build each per-example tensor once; padding and the length table
        # are both derived from these.
        bert_seqs = [torch.tensor(f['bert_token'], dtype=torch.long) for f in batch]
        gpt_seqs = [torch.tensor(f['gpt2_token'], dtype=torch.long) for f in batch]

        input_ids_bert = pad_sequence(bert_seqs, batch_first=True,
                                      padding_value=self.encoder_token)
        input_ids_gpt = pad_sequence(gpt_seqs, batch_first=True,
                                     padding_value=self.decoder_token)

        # No try/except here: once the tensors above exist, taking their
        # lengths cannot fail, and a blanket handler would only hide
        # unrelated errors (including KeyboardInterrupt).
        token_lengths = torch.tensor(
            [[len(b), len(g)] for b, g in zip(bert_seqs, gpt_seqs)],
            dtype=torch.long,
        )
        return (input_ids_bert, input_ids_gpt, token_lengths)

In [3]:
# Number of examples per batch for both loaders below.
batch_size = 2

# Tokenizers for the encoder side and the decoder side.
tokenizer_encoder = AutoTokenizer.from_pretrained("prajjwal1/bert-small")
tokenizer_decoder = AutoTokenizer.from_pretrained("gpt2-xl")

# Register the pad / bos / eos special tokens on the decoder tokenizer.
special_tokens_dict = dict(pad_token='<PAD>', bos_token='<BOS>', eos_token='<EOS>')
num_added_toks = tokenizer_decoder.add_special_tokens(special_tokens_dict)

# The collator pads each side with that side's own pad id.
bert_pad_token = tokenizer_encoder.pad_token_id
gpt2_pad_token = tokenizer_decoder.pad_token_id
my_collator = MyCollator(encoder_token=bert_pad_token, decoder_token=gpt2_pad_token)

# Fetch the dataset and wrap its 'test' and 'train' splits in loaders.
print("download data")
train_eval_dataset = load_dataset("guangyil/yelp_short_v2")

eval_dataloader = DataLoader(
    train_eval_dataset['test'],
    batch_size=batch_size,
    collate_fn=my_collator,
    num_workers=0,
)
train_dataloader = DataLoader(
    train_eval_dataset['train'],
    batch_size=batch_size,
    collate_fn=my_collator,
    num_workers=0,
)

tokenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 105kB/s]


download data


In [4]:
# Display the eval DataLoader object as this cell's output (repr only; no batch is drawn).
eval_dataloader

<torch.utils.data.dataloader.DataLoader at 0x7fd997142500>

In [9]:
# Load model: settings used when the encoder/decoder are built.
# Size of the latent vector passed to both from_pretrained calls and to VAE.
latent_size = 128
# Encoder / decoder classes looked up by name in the project's MODEL_CLASS table.
encoder_model_class = MODEL_CLASS['BertForLatentConnectorAVG']
decoder_model_class = MODEL_CLASS['GPT2ForLatentConnectorNew']

# Alternative output location kept for reference (disabled):
# output_dir = "home/AD/yul080/out_temp"
# Directory handed to VAE as its output_dir; relative to the notebook's working directory.
output_dir = "../../out_temp"

In [10]:
# Size of the decoder tokenizer; this is the value passed to resize_token_embeddings.
len(tokenizer_decoder)

50260

In [8]:
# Build the decoder from the "gpt2-xl" weights, with the latent size and the two
# latent_as_gpt_* options enabled.
model_decoder = decoder_model_class.from_pretrained("gpt2-xl", latent_size=latent_size,
                                                            latent_as_gpt_emb=True,
                                                            latent_as_gpt_memory=True,local_files_only=False)
# Resize the embedding table to the current decoder tokenizer length.
model_decoder.resize_token_embeddings(len(tokenizer_decoder))
# Number of transformer layers, read from the decoder's config.
decoder_n_layer = model_decoder.transformer.config.n_layer
# NOTE(review): change_order() is a project-specific method; its effect is not visible here — confirm.
model_decoder.transformer.change_order()

# Build the encoder from the "prajjwal1/bert-small" weights, using the encoder tokenizer's pad id.
model_encoder = encoder_model_class.from_pretrained("prajjwal1/bert-small", latent_size=latent_size,
                                                        pad_id=tokenizer_encoder.pad_token_id,local_files_only=False)
# Wrap encoder, decoder and both tokenizers in the project's VAE.
model_vae = VAE(model_encoder, model_decoder, tokenizer_encoder, tokenizer_decoder, latent_size, output_dir)


# Read the training checkpoint onto the CPU.
# NOTE(review): absolute, user-specific path — the notebook only runs on a machine that has this file.
# NOTE(review): torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load('/home/AD/yul080/runs/checkpoint-full-2/training.bin',map_location=torch.device('cpu'))
# NOTE(review): weights_init_random is applied before the non-strict load below, so any
# parameter missing from the checkpoint keeps whatever this initializer assigned — confirm intended.
model_vae.apply(weights_init_random)
model_vae.load_state_dict(checkpoint['model_state_dict'], strict=False)  # strict=False: key mismatches are not raised
model_vae.to('cuda')  # last expression of the cell, so its return value is displayed

Some weights of GPT2ForLatentConnectorNew were not initialized from the model checkpoint at gpt2-xl and are newly initialized: ['linear.weight', 'h.48.ln_1.bias', 'h.48.mlp.c_proj.weight', 'linear_emb.weight', 'h.48.attn.c_proj.bias', 'lm_head.bias', 'h.48.attn.c_proj.weight', 'h.48.mlp.c_fc.weight', 'h.48.attn.c_attn.weight', 'h.48.attn.c_attn.bias', 'h.48.ln_1.weight', 'h.48.ln_2.bias', 'h.48.mlp.c_fc.bias', 'h.48.mlp.c_proj.bias', 'h.48.ln_2.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of BertForLatentConnectorAVG were not initialized from the model checkpoint at prajjwal1/bert-small and are newly initialized: ['bert.linear.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


VAE(
  (encoder): BertForLatentConnectorAVG(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 512, padding_idx=0)
      (position_embeddings): Embedding(512, 512)
      (token_type_embeddings): Embedding(2, 512)
      (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=512, out_features=512, bias=True)
              (key): Linear(in_features=512, out_features=512, bias=True)
              (value): Linear(in_features=512, out_features=512, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=512, out_features=512, bias=True)
              (LayerNorm): LayerNorm((512,), eps=1e-12, elementwise_af

In [32]:
# Walk the training loader and keep the 10th batch (or the last batch seen
# if the loader yields fewer than 10).
count = 0
for count, i in enumerate(train_dataloader, start=1):
    x0, x1, x_lengths = i
    if count == 10:
        break

# Column-wise maxima of the length table: [0] encoder side, [1] decoder side.
max_len_values, _ = x_lengths.max(dim=0)

# Trim the padded ids to those maxima and move everything to the GPU.
x0 = x0[:, :max_len_values[0]].to("cuda")
x1 = x1[:, :max_len_values[1]].to("cuda")
x_lengths = x_lengths.to("cuda")

# Decoder prompt: the ids of the BOS token.
context_tokens = tokenizer_decoder.encode(tokenizer_decoder.bos_token)

# 1.0 where a position holds a real token, 0.0 where it holds padding.
attention_mask = x0.ne(tokenizer_encoder.pad_token_id).float()
reconstruction_mask = x1.ne(tokenizer_decoder.pad_token_id).float()

# Second element of the encoder output, then the encoder's linear layer split
# in two halves along the last dimension.
pooled_hidden_fea = model_vae.encoder(x0, attention_mask)[1]
mean, logvar = torch.chunk(model_vae.encoder.linear(pooled_hidden_fea), 2, dim=-1)

# Only the mean half is used as the latent code here.
latent_z = torch.squeeze(mean, 1)


In [33]:
# Decode one sequence per latent code, starting from the BOS context.
# The value passed as `length` is the longest decoder-side sequence in the
# batch (column 1 of x_lengths). Taking only row 0 would cap every example at
# the first example's length and cut off longer ones; eos_id is still passed
# for marking sentence ends.
out = sample_sequence_conditional(
    model=model_vae.decoder,
    context=context_tokens,
    past=latent_z,
    length=x_lengths[:, 1].max(),
    num_samples=latent_z.size(0),
    device="cuda",
    decoder_tokenizer=model_vae.tokenizer_decoder,
    eos_id=model_vae.eos_token_id
)

In [34]:
# Display the generated token ids returned by sample_sequence_conditional.
out

tensor([[50258,  5290,  7962,  1312,   705,   303,  1683,  9658,   287,   764,
           198, 50259],
        [50258,   428,  1295,   318,   262,  1266,  5145,   198, 50259,  5145,
           198, 50259]], device='cuda:0')

In [38]:
# Decode the encoder-side ids of the batch's second example (index 1) back to text.
tokenizer_encoder.decode(x0[1])

'[CLS] this place is by far the best! [SEP] [PAD]'

In [39]:
# Decode the decoder-side ids of the same example (index 1); this is the reference text.
tokenizer_decoder.decode(x1[1])

'<BOS> this place is by far the best!\n<EOS><PAD>'

In [40]:
# Decode the generated ids for example index 1, to compare with the reference text.
# All generated positions are decoded, including any that follow the first <EOS>.
tokenizer_decoder.decode(out[1])

'<BOS> this place is the best!\n<EOS>!\n<EOS>'