In [135]:
import os
from tqdm import tqdm
import torch
import torch.nn as nn
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForCausalLM
from torch.utils.data import Dataset, DataLoader

class DiffusionLM(nn.Module):
    def __init__(self, model_name="gpt2", num_diffusion_steps=200, noise_schedule="linear", device = 'cpu'):
        super(DiffusionLM, self).__init__()
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        if self.tokenizer.pad_token is None:
            self.tokenizer.add_special_tokens({'pad_token': '[PAD]'})

        if self.tokenizer.mask_token is None:
            # If not, set the mask token (this could be any token you'd like to use, e.g., '[MASK]')
            self.tokenizer.add_special_tokens({'mask_token': '[MASK]'})
        
        self.model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
        self.num_diffusion_steps = num_diffusion_steps
        self.vocab_size = self.tokenizer.vocab_size
        
        # Define the noise schedule (linear as default)
        self.noise_schedule = self._create_noise_schedule(noise_schedule).to(device)

    def _create_noise_schedule(self, schedule_type):
        if schedule_type == "linear":
            return torch.linspace(1e-4, 2e-2, self.num_diffusion_steps)
        elif schedule_type == "exponential":
            return torch.exp(torch.linspace(-4, 0, self.num_diffusion_steps))
        else:
            raise ValueError(f"Unsupported noise schedule: {schedule_type}")

    def forward_diffusion(self, input_ids, t):
        """
        Forward diffusion: Corrupt tokens by replacing a proportion with noise.
        """
        batch_size, seq_len = input_ids.shape
        noise_level = self.noise_schedule[t].to(input_ids.device)
        mask = torch.rand(batch_size, seq_len, device=input_ids.device) < noise_level
        noisy_input = input_ids.clone()
        noisy_input[mask] = self.tokenizer.mask_token_id  # Replace tokens with [MASK]
        return noisy_input, mask

    def reverse_diffusion(self, noisy_input, t, attention_mask=None):
        """
        Reverse diffusion: Predict the token distribution for denoising.
        """
        outputs = self.model(input_ids=noisy_input, attention_mask=attention_mask)
        logits = outputs.logits
        return logits

    def forward(self, input_ids, attention_mask, t):
        """
        Full diffusion process: forward + reverse.
        """
        noisy_input, mask = self.forward_diffusion(input_ids, t)
        logits = self.reverse_diffusion(noisy_input, t, attention_mask)
        return logits, mask

In [134]:
def train_diffusion_lm(model, train_dataloader, num_epochs=5, lr=5e-5):
    """
    Training loop for Diffusion-LM with tqdm for progress tracking.
    """
    optimizer = optim.AdamW(model.parameters(), lr=lr)
    loss_fn = nn.CrossEntropyLoss()

    #device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    device = 'cpu'
    model.to(device)

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        # Initialize tqdm progress bar
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)
        
        for batch in progress_bar:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)

            t = torch.randint(0, model.num_diffusion_steps - 1, (input_ids.size(0),)).to(device)
            t = t.unsqueeze(1).expand(-1, input_ids.size(1))
            
            optimizer.zero_grad()

            print(input_ids.shape, attention_mask.shape, t.shape)
            
            logits, mask = model(input_ids, attention_mask, t)
            target = input_ids.clone()
            target[~mask] = -100  # Ignore uncorrupted tokens in loss calculation

            loss = loss_fn(logits.view(-1, model.vocab_size), target.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()
            progress_bar.set_postfix(loss=loss.item())  # Update tqdm bar with current loss

        # End of epoch summary
        print(f"Epoch {epoch + 1}/{num_epochs}, Average Loss: {total_loss / len(train_dataloader):.4f}")

In [129]:
class E2EDataset(Dataset):
    """
    Custom Dataset for E2E data (input-output pairs).
    """
    def __init__(self, file_path, tokenizer, max_length=128, device = 'cpu'):
        self.data = []
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.device = device

        # Read and parse the file
        with open(file_path, 'r', encoding='utf-8') as f:
            for line in f:
                self.data.append(line.strip())

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        # Tokenize each line as input and target
        text = self.data[idx]
        encoded = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": encoded["input_ids"].squeeze(0).to(self.device),
            "attention_mask": encoded["attention_mask"].squeeze(0).to(self.device)
        }

def load_e2e_data(data_dir, tokenizer_name="gpt2", batch_size=16, max_length=128):
    """
    Load train, validation, and test datasets from the E2E folder.
    """
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)

    if tokenizer.pad_token is None:
        tokenizer.add_special_tokens({'pad_token': '[PAD]'})

    if tokenizer.mask_token is None:
        # If not, set the mask token (this could be any token you'd like to use, e.g., '[MASK]')
        tokenizer.add_special_tokens({'mask_token': '[MASK]'})

    # Paths to the train, validation, and test files
    train_path = os.path.join(data_dir, "src1_train.txt")
    val_path = os.path.join(data_dir, "src1_valid.txt")
    test_path = os.path.join(data_dir, "src1_test.txt")

    # Create datasets
    train_dataset = E2EDataset(train_path, tokenizer, max_length=max_length)
    val_dataset = E2EDataset(val_path, tokenizer, max_length=max_length)
    test_dataset = E2EDataset(test_path, tokenizer, max_length=max_length)

    # Create DataLoaders
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size)
    test_loader = DataLoader(test_dataset, batch_size=batch_size)

    return train_loader, val_loader, test_loader

In [130]:
# Path to the E2E dataset folder
data_dir = "Diffusion-LM/datasets/e2e_data"

# Load the dataset
train_dataloader, val_dataloader, test_dataloader = load_e2e_data(data_dir, tokenizer_name="gpt2", batch_size=64)

In [140]:
# Define the model
model = DiffusionLM(model_name="gpt2", num_diffusion_steps=128, device = 'cpu')

# Train the model
train_diffusion_lm(model, train_dataloader, num_epochs=1)

AttributeError: 'DiffusionLM' object has no attribute 'resize_token_embeddings'

In [101]:
epoch = 1
num_epochs = 2
progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}", leave=False)
        
for batch in tqdm(train_dataloader):
    input_ids = batch['input_ids']
    attention_mask = batch["attention_mask"]


100%|██████████| 329/329 [00:07<00:00, 46.64it/s] [A


In [29]:
test_dataloader

['name : Blue Spice | Type : coffee shop | area : city centre||A coffee shop in the city centre area called Blue Spice .',
 'name : Blue Spice | Type : coffee shop | area : city centre||Blue Spice is a coffee shop in city centre .',
 'name : Blue Spice | Type : coffee shop | area : riverside||There is a coffee shop Blue Spice in the riverside area .',
 'name : Blue Spice | Type : coffee shop | area : riverside||At the riverside , there is a coffee shop called The Blue Spice .',
 'name : Blue Spice | Type : coffee shop | customer rating : 5 out of 5 | near : Crowne Plaza Hotel||The coffee shop Blue Spice is based near Crowne Plaza Hotel and has a high customer rating of 5 out of 5 .',
 'name : Blue Spice | Type : coffee shop | customer rating : 5 out of 5 | near : Crowne Plaza Hotel||The Blue Spice coffee shop , near Crowne Plaza Hotel , has a customer rating of 5 out of 5 .',
 'name : Blue Spice | Type : coffee shop | customer rating : 5 out of 5 | near : Crowne Plaza Hotel||If you wan