In [1]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
from DDT.denoising_transformer import DenoisingDiffusionTransformer
from DDT.noise_scheduling import SqrtNoiseSchedule
from DDT.xlm_roberta_embeddings import XLMRobertaEmbeddings

%reload_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained masked-LM checkpoint and its matching tokenizer.
model_name = "xlm-roberta-large"
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tunable sizes, named once so the places that consume them cannot drift apart.
MAX_TIME_STEPS = 2000  # diffusion time steps: written to the config AND given to the noise schedule
CONTEXT_LEN = 128      # forwarded as `context_len` below
PRED_LEN = 128         # forwarded as `pred_len` below
PREDICTION_STEPS = 10  # forwarded as `prediction_steps` below

# Create an XLMRobertaEmbeddings instance and initialize its weights from the original XLM-Roberta.
# `max_time_steps` is attached to the shared config *before* the embedding layer is built from it.
# NOTE(review): presumably XLMRobertaEmbeddings reads `config.max_time_steps` — confirm in DDT.
model.config.max_time_steps = MAX_TIME_STEPS
embedding_layer = XLMRobertaEmbeddings(model.config)
embedding_layer.from_pretrained(model.roberta.embeddings)

# Create a DenoisingDiffusionTransformer model that reuses the pretrained encoder and LM head.
denoising_model = DenoisingDiffusionTransformer(
    encoder=model.roberta.encoder,
    embedding_layer=embedding_layer,
    unembedding_layer=model.lm_head,
    embedding_dim=model.config.hidden_size,
    max_seq_len=model.config.max_position_embeddings,
    context_len=CONTEXT_LEN,
    pred_len=PRED_LEN,
    sep_token=tokenizer.sep_token_id,
    bos_token=tokenizer.bos_token_id,
    pad_token=tokenizer.pad_token_id,
    prediction_steps=PREDICTION_STEPS,
    # Same step count as `model.config.max_time_steps` above.
    noise_schedule=SqrtNoiseSchedule(MAX_TIME_STEPS)
)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
# Tokenize a single prompt into a (1, seq_len) tensor of input ids.
prompt = "Previous text"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, padding_side="left")

# Fix the torch RNG so repeated runs of this cell print the same text.
# NOTE(review): assumes generation draws its noise from the global torch RNG — confirm in DDT.
torch.manual_seed(0)

# Now let's use the denoising model.
# This is inference only, so autograd bookkeeping is disabled to avoid holding
# a graph across the denoising steps.
# NOTE(review): the second positional argument (32) is presumably the number of
# tokens to generate — confirm against DenoisingDiffusionTransformer.generate.
with torch.no_grad():
    pred = denoising_model.generate(inputs["input_ids"], 32, denoise_steps=10)

print(tokenizer.decode(pred[0]))

torch.Size([1, 36, 1024]) torch.Size([1, 36])
torch.Size([1, 68, 1024]) torch.Size([1, 68])
torch.Size([1, 68, 1024]) torch.Size([1, 68])
torch.Size([1, 68, 1024]) torch.Size([1, 68])
torch.Size([1, 68, 1024]) torch.Size([1, 68])
torch.Size([1, 68, 1024]) torch.Size([1, 68])
torch.Size([1, 68, 1024]) torch.Size([1, 68])
torch.Size([1, 68, 1024]) torch.Size([1, 68])
torch.Size([1, 68, 1024]) torch.Size([1, 68])
torch.Size([1, 68, 1024]) torch.Size([1, 68])
:::::::::::::::::::::::::::::::</s>


In [4]:
# Smoke-test the Lightning-style step methods and `generate` on a synthetic batch.
#
# NOTE(review): in the recorded run the first model call in this cell raised
#   RuntimeError: The size of tensor a (256) must match the size of tensor b (4)
#   at non-singleton dimension 2
# right after the model printed shapes (4, 256, 1024) and (4, 256), whereas the
# earlier batch-size-1 `generate` call succeeded. That pattern looks like a 2-D
# mask being broadcast against 4-D attention scores inside the model — TODO
# confirm in DDT.denoising_transformer. It cannot be fixed from this cell; the
# changes here only make the synthetic inputs well-formed and reproducible.

# Make the random token ids reproducible across runs.
torch.manual_seed(0)

# Define a batch
batch_size = 4
context_len = 128  # must equal the `context_len` the denoising model was built with
pred_len = 128     # must equal the `pred_len` the denoising model was built with

# Random "content" ids are drawn strictly above the bos/pad/sep/unk ids so that a
# special token never appears by accident inside a sequence whose mask is all ones.
first_content_id = 1 + max(
    tokenizer.bos_token_id,
    tokenizer.pad_token_id,
    tokenizer.sep_token_id,
    tokenizer.unk_token_id,
)
content_id_limit = first_content_id + 100  # exclusive upper bound, 100 distinct ids

batch = (
    torch.cat([
        torch.full((batch_size, 1), tokenizer.bos_token_id, dtype=torch.long),  # bos token
        torch.randint(first_content_id, content_id_limit, (batch_size, context_len - 2)),  # content: context_len - 2 special tokens
        torch.full((batch_size, 1), tokenizer.sep_token_id, dtype=torch.long)  # sep token
    ], dim=-1),  # context (bos - seq - sep)
    torch.cat([
        torch.randint(first_content_id, content_id_limit, (batch_size, pred_len - 1)),  # content: pred_len - 1 special token
        torch.full((batch_size, 1), tokenizer.sep_token_id, dtype=torch.long)  # sep token (no padding is added)
    ], dim=-1),  # x_0 (seq - sep)
    torch.ones((batch_size, context_len), dtype=torch.long),  # context mask (all positions attended)
    torch.ones((batch_size, pred_len), dtype=torch.long)  # x mask (all positions attended)
)
print(batch[1].shape)

# Test the training step
loss = denoising_model.training_step(batch, 0)
print("Training loss:", loss["loss"])

# Test the evaluation step
eval_result = denoising_model.evaluation_step(batch, 0)
print("Evaluation loss:", eval_result["loss"])
print("Evaluation accuracy:", eval_result["acc"])

# Test the test step
test_result = denoising_model.test_step(batch, 0)
print("Test loss:", test_result["loss"])
print("Test accuracy:", test_result["acc"])

# Test the prediction step
prediction = denoising_model.predict_step(batch, 0)
print("Prediction:", prediction)

# Test the generate method on a single random 10-token prompt
input_ids = torch.randint(first_content_id, content_id_limit, (1, 10))  # input_ids
generated_sequence = denoising_model.generate(input_ids, 20)
print("Generated sequence:", generated_sequence)

torch.Size([4, 128])
torch.Size([4, 256, 1024]) torch.Size([4, 256])


RuntimeError: The size of tensor a (256) must match the size of tensor b (4) at non-singleton dimension 2