<a href="https://colab.research.google.com/github/ayyucedemirbas/Denoising_Autoencoder_for_Text_Generation/blob/main/Transformer_Based_Denoising_Autoencoder_for_Text_Generation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import requests
import string
import re
from collections import Counter

In [2]:
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
text = requests.get(url).text

In [3]:
def preprocess_text(text, min_word_freq=3):
    words = re.findall(r"\w+[\w']*\w+", text.lower())
    word_counts = Counter(words)
    vocab = ['<pad>', '<unk>', '<mask>'] + \
            [word for word, count in word_counts.items() if count >= min_word_freq]

    word2idx = {word:i for i, word in enumerate(vocab)}
    idx2word = {i:word for i, word in enumerate(vocab)}

    data = []
    for word in words:
        if word in word2idx:
            data.append(word2idx[word])
        else:
            data.append(word2idx['<unk>'])

    return data, word2idx, idx2word, len(vocab)

data, word2idx, idx2word, vocab_size = preprocess_text(text)

In [4]:
seq_length = 32
batch_size = 16
embed_dim = 256
num_heads = 4
ff_dim = 512
num_layers = 2
noise_prob = 0.3
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [5]:
def create_batches(data, batch_size, seq_length):
    num_batches = len(data) // (batch_size * seq_length)
    data = data[:num_batches * batch_size * seq_length]
    data = torch.tensor(data).view(batch_size, -1)
    return data

def add_noise(batch):
    device = batch.device
    noisy_batch = batch.clone()

    mask = torch.rand_like(noisy_batch.float(), device=device) < noise_prob
    random_words = torch.randint(3, vocab_size, noisy_batch.shape, device=device)  # Skip special tokens

    noisy_batch[mask] = random_words[mask]
    return noisy_batch

In [6]:
class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=5000):
        super().__init__()
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-np.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        return x + self.pe[:, :x.size(1)]

class TransformerBlock(nn.Module):
    def __init__(self, embed_dim, num_heads, ff_dim):
        super().__init__()
        self.attention = nn.MultiheadAttention(embed_dim, num_heads)
        self.norm1 = nn.LayerNorm(embed_dim)
        self.ff = nn.Sequential(
            nn.Linear(embed_dim, ff_dim),
            nn.ReLU(),
            nn.Linear(ff_dim, embed_dim)
        )
        self.norm2 = nn.LayerNorm(embed_dim)

    def forward(self, x):
        attn_output, _ = self.attention(x, x, x)
        x = self.norm1(x + attn_output)
        ff_output = self.ff(x)
        return self.norm2(x + ff_output)

class DenoisingTransformer(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_heads, ff_dim, num_layers):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.positional_encoding = PositionalEncoding(embed_dim)
        self.transformer_blocks = nn.ModuleList([
            TransformerBlock(embed_dim, num_heads, ff_dim) for _ in range(num_layers)
        ])
        self.fc = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        x = self.embedding(x)
        x = self.positional_encoding(x)
        for block in self.transformer_blocks:
            x = block(x)
        return self.fc(x)

In [7]:
model = DenoisingTransformer(vocab_size, embed_dim, num_heads, ff_dim, num_layers).to(device)
optimizer = optim.Adam(model.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss(ignore_index=word2idx['<pad>'])

In [8]:
data_tensor = create_batches(data, batch_size, seq_length)
num_batches = data_tensor.size(1) // seq_length

In [9]:
for epoch in range(300):
    total_loss = 0
    for i in range(num_batches):
        inputs = data_tensor[:, i*seq_length:(i+1)*seq_length].to(device)
        noisy_inputs = add_noise(inputs)

        optimizer.zero_grad()
        outputs = model(noisy_inputs)
        loss = criterion(outputs.view(-1, vocab_size), inputs.view(-1))
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}, Loss: {total_loss/num_batches:.4f}")

Epoch 1, Loss: 4.1265
Epoch 2, Loss: 2.7979
Epoch 3, Loss: 2.5466
Epoch 4, Loss: 2.4298
Epoch 5, Loss: 2.4010
Epoch 6, Loss: 2.3636
Epoch 7, Loss: 2.3366
Epoch 8, Loss: 2.3303
Epoch 9, Loss: 2.2930
Epoch 10, Loss: 2.2908
Epoch 11, Loss: 2.2782
Epoch 12, Loss: 2.2730
Epoch 13, Loss: 2.2496
Epoch 14, Loss: 2.2434
Epoch 15, Loss: 2.2385
Epoch 16, Loss: 2.2266
Epoch 17, Loss: 2.2191
Epoch 18, Loss: 2.2100
Epoch 19, Loss: 2.1934
Epoch 20, Loss: 2.1818
Epoch 21, Loss: 2.1918
Epoch 22, Loss: 2.1731
Epoch 23, Loss: 2.1565
Epoch 24, Loss: 2.1581
Epoch 25, Loss: 2.1502
Epoch 26, Loss: 2.1520
Epoch 27, Loss: 2.1465
Epoch 28, Loss: 2.1320
Epoch 29, Loss: 2.1320
Epoch 30, Loss: 2.1210
Epoch 31, Loss: 2.1001
Epoch 32, Loss: 2.1027
Epoch 33, Loss: 2.1088
Epoch 34, Loss: 2.0980
Epoch 35, Loss: 2.0832
Epoch 36, Loss: 2.0891
Epoch 37, Loss: 2.0697
Epoch 38, Loss: 2.0736
Epoch 39, Loss: 2.0679
Epoch 40, Loss: 2.0470
Epoch 41, Loss: 2.0461
Epoch 42, Loss: 2.0437
Epoch 43, Loss: 2.0482
Epoch 44, Loss: 2.04

In [13]:
def save_model(model, path, word2idx, idx2word):
    checkpoint = {
        'model_state_dict': model.state_dict(),
        'word2idx': word2idx,
        'idx2word': idx2word,
        'hyperparameters': {
            'vocab_size': vocab_size,
            'embed_dim': embed_dim,
            'num_heads': num_heads,
            'ff_dim': ff_dim,
            'num_layers': num_layers
        }
    }
    torch.save(checkpoint, path)
    print(f"Model saved to {path}")

def load_model(path, device='cpu'):
    checkpoint = torch.load(path, map_location=device)
    hp = checkpoint['hyperparameters']

    model = DenoisingTransformer(
        hp['vocab_size'],
        hp['embed_dim'],
        hp['num_heads'],
        hp['ff_dim'],
        hp['num_layers']
    ).to(device)

    model.load_state_dict(checkpoint['model_state_dict'])
    return model, checkpoint['word2idx'], checkpoint['idx2word']

In [14]:
save_model(model, 'denoising_transformer.pth', word2idx, idx2word)

Model saved to denoising_transformer.pth


In [16]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

loaded_model, word2idx, idx2word = load_model('denoising_transformer.pth', device=device)

print("Model loaded successfully!")
print(f"Model device: {next(loaded_model.parameters()).device}")

Model loaded successfully!
Model device: cuda:0
