In [1]:
import torch
from transformers import AutoModelForMaskedLM, AutoTokenizer
import torch
from DDT.denoising_transformer import DenoisingDiffusionTransformer
from DDT.noise_scheduling import SqrtNoiseSchedule
from DDT.xlm_roberta_embeddings import XLMRobertaEmbeddings
from datasets import load_dataset
from DDT.fineweb_edu import HDF5ShardWriter, FinewebHDF5ShardDataset, dataset_tokenizer

%reload_ext autoreload
%autoreload 2

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Initialize the model and tokenizer
model_name = "xlm-roberta-large"
model = AutoModelForMaskedLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create an XLMRobertaEmbeddings instance and initialize its weights from the original XLM-Roberta
model.config.max_time_steps = 2000
embedding_layer = XLMRobertaEmbeddings(model.config)
embedding_layer.from_pretrained(model.roberta.embeddings)

# Create a DenoisingDiffusionTransformer model
denoising_model = DenoisingDiffusionTransformer(
    encoder=model.roberta.encoder,
    embedding_layer=embedding_layer,
    unembedding_layer=model.lm_head,
    embedding_dim=model.config.hidden_size,
    max_seq_len=model.config.max_position_embeddings,
    context_len=128,
    pred_len=128,
    sep_token=tokenizer.sep_token_id,
    bos_token=tokenizer.bos_token_id,
    pad_token=tokenizer.pad_token_id,
    prediction_steps=10,
    noise_schedule=SqrtNoiseSchedule(2000)
)

Some weights of the model checkpoint at xlm-roberta-large were not used when initializing XLMRobertaForMaskedLM: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing XLMRobertaForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
prompt = "Previous text"
inputs = tokenizer(prompt, return_tensors="pt", padding=True, padding_side="left")
print(inputs)

# Now let's use the denoising model
pred = denoising_model.generate(inputs["input_ids"], 32, denoise_steps=10)

print(tokenizer.decode(pred[0]))

{'input_ids': tensor([[   0, 6422, 7986,    2]]), 'attention_mask': tensor([[1, 1, 1, 1]])}
</s><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad><pad>


In [4]:
# Define a batch
batch_size = 4
batch = (
    torch.cat([
        torch.ones((batch_size, 1), dtype=torch.long) * tokenizer.bos_token_id,  # bos token
        torch.randint(0, 100, (batch_size, 126)),  # sequence 128 - 2 special tokens
        torch.ones((batch_size, 1), dtype=torch.long) * tokenizer.sep_token_id  # sep token
    ], dim=-1),  # context (bos - seq - sep)
    torch.cat([
        torch.randint(0, 100, (batch_size, 127)),  # sequence 128 - 1 special tokens
        torch.ones((batch_size, 1), dtype=torch.long) * tokenizer.sep_token_id  # sep token and pad token
    ], dim=-1),  # x_0 (seq - sep - pad)
    torch.ones((batch_size, 128), dtype=torch.long),  # context mask
    torch.ones((batch_size, 128), dtype=torch.long)  # x mask
)
print(batch[1].shape)

# Test the training step
loss = denoising_model.training_step(batch, 0)
print("Training loss:", loss["loss"])

# Test the evaluation step
eval_result = denoising_model.evaluation_step(batch, 0)
print("Evaluation loss:", eval_result["loss"])
print("Evaluation accuracy:", eval_result["acc"])

# Test the test step
test_result = denoising_model.test_step(batch, 0)
print("Test loss:", test_result["loss"])
print("Test accuracy:", test_result["acc"])

# Test the prediction step
prediction = denoising_model.predict_step(batch, 0)
print("Prediction:", prediction)

# Test the generate method
input_ids = torch.randint(0, 100, (1, 10))  # input_ids
generated_sequence = denoising_model.generate(input_ids, 20)
print("Generated sequence:", generated_sequence)

torch.Size([4, 128])


/Users/aszfalt/Projects/LetThereBeText/.venv/lib/python3.12/site-packages/lightning/pytorch/core/module.py:441: You are trying to `self.log()` but the `self.trainer` reference is not registered on the model yet. This is most likely because the model hasn't been passed to the `Trainer`


Training loss: tensor(10.1657, grad_fn=<DivBackward0>)
Evaluation loss: tensor(10.1033, grad_fn=<AddBackward0>)
Evaluation accuracy: tensor(0.0273)
Test loss: tensor(2.7817, grad_fn=<AddBackward0>)
Test accuracy: tensor(0.8672)
Prediction: {'loss': tensor(12.7001, grad_fn=<AddBackward0>), 'ppl': tensor(47252.1836, dtype=torch.float64), 'acc': tensor(0.0137)}
Generated sequence: tensor([[2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])


In [5]:

fineweb_edu = load_dataset("HuggingFaceFW/fineweb-edu", name="sample-10BT", split="train", streaming=True)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [6]:
context_size = 128
input_size = 128

# 2**20 ~ 1GB
writer = HDF5ShardWriter(output_dir='./Fineweb-edu', shard_size=2**20, context_shape=(context_size,), input_shape=(input_size,), compression="gzip")
for sample in dataset_tokenizer(fineweb_edu, tokenizer, context_size, input_size):
    writer.write(*sample)


'(MaxRetryError("HTTPSConnectionPool(host='cdn-lfs-us-1.hf.co', port=443): Max retries exceeded with url: /repos/f9/62/f9624802c1be2232e6fec88964f9dacb3400085637511885c4340451d91ef958/b1ba7b2ce4cb5ea6ef42dca40263eabb85f37700d01693a68e9b30a31d78e871?response-content-disposition=inline%3B+filename*%3DUTF-8%27%27000_00000.parquet%3B+filename%3D%22000_00000.parquet%22%3B&Expires=1744461559&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTc0NDQ2MTU1OX19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmhmLmNvL3JlcG9zL2Y5LzYyL2Y5NjI0ODAyYzFiZTIyMzJlNmZlYzg4OTY0ZjlkYWNiMzQwMDA4NTYzNzUxMTg4NWM0MzQwNDUxZDkxZWY5NTgvYjFiYTdiMmNlNGNiNWVhNmVmNDJkY2E0MDI2M2VhYmI4NWYzNzcwMGQwMTY5M2E2OGU5YjMwYTMxZDc4ZTg3MT9yZXNwb25zZS1jb250ZW50LWRpc3Bvc2l0aW9uPSoifV19&Signature=CMGbGEnOJjC4EHz2lUwHv8M7QSmwIJJMUeh6iaUZQ8DHB8RkMjpT6xwTCmn5SdNR04zf2eMR3805bUev7N5Fc7cG~DZdqt9wQA7bfgELdTWYhYdXU4OIZ5~IKkTgWmnow~zw4qA4XCjcNw0fu~OI-wLzXTePF4eJVkfVi-h~LgBpm1TRsiyO9lBwBtR0b1cotWXSnt25Lp3bBXDMN