In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
import random
import string
import nltk

# Download the NLTK "words" corpus resource (quiet=True is passed to keep the
# call from logging), then import its corpus reader.
nltk.download('words', quiet=True)
from nltk.corpus import words

# Seed Python's and PyTorch's random number generators with the same fixed value.
random.seed(42)
torch.manual_seed(42)

# Default number of characters every encoded text is truncated/padded to.
MAX_LENGTH = 64

# Character vocabulary: each character of string.printable, indexed by its
# position in that string, plus the two lookup tables derived from it.
vocab = list(string.printable)
vocab_size = len(vocab)
idx_to_char = dict(enumerate(vocab))
char_to_idx = {ch: idx for idx, ch in idx_to_char.items()}

def text_to_tensor(text, max_length=MAX_LENGTH):
    """Encode ``text`` as a 1-D ``torch.long`` tensor of ``max_length`` vocabulary indices.

    Text longer than ``max_length`` is cut off; shorter text is right-padded
    with spaces. Characters that have no entry in ``char_to_idx`` are encoded
    as index 0.
    """
    fixed_width = text[:max_length].ljust(max_length)
    return torch.tensor(
        [char_to_idx.get(symbol, 0) for symbol in fixed_width],
        dtype=torch.long,
    )

def tensor_to_text(tensor):
    """Decode a tensor of vocabulary indices back into a string.

    The tensor is moved to the CPU and converted to a Python list first.
    Indices with no entry in ``idx_to_char`` contribute nothing to the result.
    """
    symbols = []
    for idx in tensor.cpu().numpy().tolist():
        symbols.append(idx_to_char.get(idx, ''))
    return ''.join(symbols)

def tamper_text_message(original_text, tamper_strength=0.1):
    """Return a copy of ``original_text`` with randomly chosen characters overwritten.

    ``int(len(original_text) * tamper_strength)`` distinct positions are
    selected and each one is replaced by a character drawn from
    ``string.printable``. A replacement can coincide with the character it
    replaces, so the number of visibly changed characters may be smaller
    than the number of selected positions.

    Args:
        original_text: Text to corrupt. The input string is not modified.
        tamper_strength: Fraction of positions to overwrite. Values that
            would select fewer than 0 or more than ``len(original_text)``
            positions are clamped to that range, so ``random.sample`` is
            never asked for a negative or oversized sample.

    Returns:
        A string of the same length as ``original_text``.
    """
    text_length = len(original_text)
    requested = int(text_length * tamper_strength)
    # Clamp so out-of-range strengths degrade to "nothing" / "everything"
    # instead of raising from random.sample; in-range values are untouched.
    num_tampered_chars = min(max(requested, 0), text_length)
    tampered_indices = random.sample(range(text_length), num_tampered_chars)
    tampered_text = list(original_text)
    for idx in tampered_indices:
        tampered_text[idx] = random.choice(string.printable)
    return ''.join(tampered_text)

# Model
class TextAutoencoder(nn.Module):
    """Fully connected autoencoder over fixed-length sequences of character indices.

    Character embeddings are flattened, compressed to a ``hidden_dim``-wide
    code, expanded back to one ``embed_dim`` vector per position, and finally
    projected to per-position vocabulary logits.
    """

    def __init__(self, vocab_size, embed_dim, hidden_dim, seq_length):
        super().__init__()
        self.seq_length = seq_length
        self.embed_dim = embed_dim
        flat_dim = seq_length * embed_dim

        # Sub-modules keep their attribute names, Sequential positions and
        # creation order so state_dict keys and seeded initialisation stay stable.
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.encoder = nn.Sequential(
            nn.Flatten(),
            nn.Linear(flat_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, hidden_dim),
            nn.LeakyReLU(0.2),
        )
        self.decoder = nn.Sequential(
            nn.Linear(hidden_dim, 512),
            nn.LeakyReLU(0.2),
            nn.Linear(512, flat_dim),
            nn.LeakyReLU(0.2),
        )
        self.output_layer = nn.Linear(embed_dim, vocab_size)

    def forward(self, x):
        """Map a batch of index sequences to logits of shape (batch, seq_length, vocab_size)."""
        code = self.encoder(self.embedding(x))
        # The decoder emits one flat vector per sample; restore the
        # (seq_length, embed_dim) layout before the per-position projection.
        per_position = self.decoder(code).view(-1, self.seq_length, self.embed_dim)
        return self.output_layer(per_position)

# Training Dataset
# Word pool: corpus entries that are purely alphabetic and at most 16 characters long.
large_word_bank = [
    entry
    for entry in words.words()
    if len(entry) <= 16 and entry.isalpha()
]

def generate_synthetic_word():
    """Pick a random entry of ``large_word_bank`` and return it as a ``MAX_LENGTH``-wide string.

    Shorter words are right-padded with spaces; anything longer is truncated.
    """
    padded = random.choice(large_word_bank).ljust(MAX_LENGTH)
    return padded[:MAX_LENGTH]

# Build the training pool from 5000 independent draws (the same word may appear
# more than once) and show the first entry as a sanity check.
synthetic_dataset = list(generate_synthetic_word() for _ in range(5000))
first_example = synthetic_dataset[0]
print("Example synthetic word:", first_example)

# Training Parameters
embed_dim, hidden_dim = 32, 64
seq_length = MAX_LENGTH
learning_rate = 5e-4
# Number of optimisation steps: the training loop draws one mini-batch per "epoch".
num_epochs = 20_000
batch_size = 32

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = TextAutoencoder(vocab_size, embed_dim, hidden_dim, seq_length)
model.to(device)

# Per-character classification loss, optimised with Adam.
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss()

def get_batch(dataset, batch_size):
    """Draw one denoising mini-batch from ``dataset``.

    ``batch_size`` entries are sampled without replacement. Each entry is
    encoded twice with ``text_to_tensor``: once after being corrupted by
    ``tamper_text_message`` at strength 0.1 (the model input) and once
    unchanged (the reconstruction target).

    Returns:
        Tuple ``(inputs, targets)`` of stacked index tensors, one row per entry.
    """
    chosen = random.sample(dataset, batch_size)
    # All corrupted inputs are built before the targets, as only the
    # corruption step consumes random numbers.
    noisy = torch.stack(
        [text_to_tensor(tamper_text_message(entry, tamper_strength=0.1)) for entry in chosen]
    )
    clean = torch.stack([text_to_tensor(entry) for entry in chosen])
    return noisy, clean

# Training loss of every optimisation step, in order.
loss_history = []

# Nothing inside the loop leaves training mode, so it is enabled once up front.
model.train()
for epoch in range(num_epochs):
    # One freshly sampled (tampered input, clean target) mini-batch per step.
    train_batch, target_batch = (
        part.to(device) for part in get_batch(synthetic_dataset, batch_size)
    )

    optimizer.zero_grad()
    logits = model(train_batch)
    # Merge the batch and position axes so every character is one classification sample.
    loss = criterion(logits.reshape(-1, vocab_size), target_batch.reshape(-1))
    loss.backward()
    optimizer.step()

    loss_history.append(loss.item())

    if (epoch + 1) % 500 != 0:
        continue

    # Periodic report, computed on the batch that was just trained on.
    preds = logits.argmax(dim=2)
    accuracy = (preds == target_batch).float().mean().item()
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}, Accuracy: {accuracy*100:.2f}%")

# Qualitative check: reconstruct one hand-picked word from a clean and from a tampered input.
model.eval()
with torch.no_grad():
    test_word = "examplelongword"
    print("\nOriginal Synthetic Word:")
    print(test_word)

    # Clean pass: encode, run the model, take the most likely character per position.
    clean_tensor = text_to_tensor(test_word).unsqueeze(0).to(device)
    clean_logits = model(clean_tensor)
    clean_pred = clean_logits.argmax(dim=2).squeeze(0)
    clean_decrypted = tensor_to_text(clean_pred)
    print("\nDecrypted Text from Clean Input:")
    print(clean_decrypted)

    # Tampered pass: corrupt 30% of the unpadded word, then repeat the same steps.
    tampered_word = tamper_text_message(test_word, tamper_strength=0.3)
    tampered_tensor = text_to_tensor(tampered_word).unsqueeze(0).to(device)
    tampered_logits = model(tampered_tensor)
    tampered_pred = tampered_logits.argmax(dim=2).squeeze(0)
    tampered_decrypted = tensor_to_text(tampered_pred)
    # The reference is the encoded clean word, so the mean runs over every
    # encoded position, padding included.
    tampered_acc = (tampered_pred == clean_tensor.squeeze(0)).float().mean().item()

    print("\nTampered Word:")
    print(tampered_word)
    print("\nDecrypted Text from Tampered Input:")
    print(tampered_decrypted)
    print(f"\nDecryption Accuracy on Tampered Input: {tampered_acc*100:.2f}%")


Example synthetic word: ritelessness                                                    
Epoch [500/20000], Loss: 0.3237, Accuracy: 90.48%
Epoch [1000/20000], Loss: 0.1287, Accuracy: 96.34%
Epoch [1500/20000], Loss: 0.1238, Accuracy: 97.07%
Epoch [2000/20000], Loss: 0.0909, Accuracy: 97.90%
Epoch [2500/20000], Loss: 0.0655, Accuracy: 98.78%
Epoch [3000/20000], Loss: 0.0575, Accuracy: 98.63%
Epoch [3500/20000], Loss: 0.0654, Accuracy: 98.54%
Epoch [4000/20000], Loss: 0.0637, Accuracy: 98.34%
Epoch [4500/20000], Loss: 0.0364, Accuracy: 99.17%
Epoch [5000/20000], Loss: 0.0610, Accuracy: 98.78%
Epoch [5500/20000], Loss: 0.0507, Accuracy: 98.93%
Epoch [6000/20000], Loss: 0.0416, Accuracy: 99.02%
Epoch [6500/20000], Loss: 0.0553, Accuracy: 98.73%
Epoch [7000/20000], Loss: 0.0651, Accuracy: 98.24%
Epoch [7500/20000], Loss: 0.0484, Accuracy: 99.07%
Epoch [8000/20000], Loss: 0.0407, Accuracy: 99.22%
Epoch [8500/20000], Loss: 0.0497, Accuracy: 98.73%
Epoch [9000/20000], Loss: 0.0431, Accuracy: 9

In [6]:
# Persist the model's state_dict (its parameters and buffers) rather than the module object.
checkpoint_path = "text_autoencoder_final.pth"
torch.save(model.state_dict(), checkpoint_path)