In [None]:
import torch.nn as nn
import torch.optim as optim
from torchtext.data import get_tokenizer
from torchtext.vocab import build_vocab_from_iterator
from torchtext.datasets import WikiText2
from torch.utils.data import DataLoader
import random
import string
import torch


# Mini-batch size used by the DataLoaders created further down in this cell.
BATCH_SIZE = 32
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


# Tokenizer obtained from torchtext's get_tokenizer with the 'basic_english' preset.
tokenizer = get_tokenizer('basic_english')



# Load the three WikiText2 splits.
# NOTE(review): the name `train` is rebound by `def train(...)` later in this cell,
# so the dataset is only reachable under this name until that definition runs.
train, valid, test = WikiText2(split=('train', 'valid', 'test'))

from itertools import chain


def yield_tokens(data_iter):
    """Lazily yield the token list of every raw text item in ``data_iter``.

    Each item is passed through the module-level ``tokenizer``. This helper
    used to be defined twice with identical bodies; one definition is kept.
    """
    for item in data_iter:
        yield tokenizer(item)


# Build the vocabulary from the tokens of all three splits plus the special tokens.
# NOTE(review): the valid/test splits contribute to the vocabulary here, unlike the
# later treebank cells, which count training tokens only -- confirm this is intended.
vocab = build_vocab_from_iterator(
    chain(yield_tokens(train), yield_tokens(valid), yield_tokens(test)),
    specials=['<unk>', '<pad>', '<sos>', '<eos>'],
)
# Tokens missing from the vocabulary resolve to the <unk> index.
vocab.set_default_index(vocab["<unk>"])

def data_process(raw_text_iter):
    """Encode raw text items as a list of 1-D ``torch.long`` index tensors.

    Every item is tokenized with the module-level ``tokenizer`` and each token
    is looked up in the module-level ``vocab``; one tensor is built per item,
    in iteration order.
    """
    encoded = []
    for line in raw_text_iter:
        token_ids = [vocab[tok] for tok in tokenizer(line)]
        encoded.append(torch.tensor(token_ids, dtype=torch.long))
    return encoded


def generate_synthetic_data(n_samples, min_length=5, max_length=15, itos=None):
    """Generate random "sentences" by sampling tokens uniformly from a token list.

    Args:
        n_samples: Number of synthetic sentences to produce.
        min_length: Minimum number of tokens per sentence (inclusive).
        max_length: Maximum number of tokens per sentence (inclusive).
        itos: Optional sequence of token strings to sample from. When omitted,
            ``vocab.get_itos()`` of the module-level vocabulary is used.

    Returns:
        A list of ``n_samples`` strings, each a space-joined run of sampled tokens.

    Raises:
        ValueError: If ``min_length > max_length`` (raised by ``random.randint``).
    """
    # Fetch the token list once; it used to be re-fetched for every sampled token.
    if itos is None:
        itos = vocab.get_itos()

    synthetic_data = []
    for _ in range(n_samples):
        sample_length = random.randint(min_length, max_length)
        sample = [random.choice(itos) for _ in range(sample_length)]
        synthetic_data.append(" ".join(sample))
    return synthetic_data


n_synthetic_samples = 1000
synthetic_data = generate_synthetic_data(n_synthetic_samples)

# Combine synthetic data with the original training data
combined_data = list(train) + synthetic_data
random.shuffle(combined_data)

# Tokenize and numericalize the combined dataset
train_data = data_process(combined_data)

valid_data = data_process(valid)
test_data = data_process(test)

# Index used to right-pad shorter sequences inside a batch.
PAD_IDX = vocab["<pad>"]


def collate_batch(batch):
    """Right-pad a list of 1-D index tensors into one ``(batch, max_len)`` tensor."""
    return torch.nn.utils.rnn.pad_sequence(batch, padding_value=PAD_IDX, batch_first=True)


# Set up iterators. One shared collate function replaces three identical lambdas.
# Only the training loader is shuffled; evaluation order does not need to be random.
train_iter = DataLoader(train_data, batch_size=BATCH_SIZE, shuffle=True, collate_fn=collate_batch)
valid_iter = DataLoader(valid_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)
test_iter = DataLoader(test_data, batch_size=BATCH_SIZE, shuffle=False, collate_fn=collate_batch)






class RNNLanguageModel(nn.Module):
    """Embedding -> stacked LSTM -> dropout -> linear projection onto the vocabulary.

    Args:
        vocab_size: Number of token indices (embedding rows and output logits).
        embedding_dim: Size of each token embedding.
        hidden_dim: LSTM hidden state size.
        num_layers: Number of stacked LSTM layers.
        dropout: Dropout probability, used between LSTM layers and before the
            final linear layer.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        # nn.LSTM applies dropout only between stacked layers and warns when it is
        # non-zero with a single layer, so pass 0 in that case.
        lstm_dropout = dropout if num_layers > 1 else 0.0
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True, dropout=lstm_dropout)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden=None):
        """Compute next-token logits for a batch of token indices.

        Args:
            x: Batch-first tensor of token indices.
            hidden: Optional LSTM state ``(h, c)``; ``None`` lets the LSTM start
                from its default zero state.

        Returns:
            ``(logits, hidden)``: logits with a trailing dimension of
            ``vocab_size``, and the LSTM state after the last time step.
        """
        x = self.embedding(x)
        x, hidden = self.rnn(x, hidden)
        x = self.fc(self.dropout(x))
        return x, hidden


# Model hyperparameters.
# `TEXT` (a legacy torchtext Field) is never defined in this notebook; the
# vocabulary built above is the object whose size is needed here.
vocab_size = len(vocab)
embedding_dim = 256
hidden_dim = 512
num_layers = 2
dropout = 0.5

model = RNNLanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout).to(device)
# Batches are padded with <pad>; skip those positions so padding does not
# contribute to the loss.
criterion = nn.CrossEntropyLoss(ignore_index=vocab["<pad>"])
# Adam with its default settings (only the parameters are passed).
optimizer = optim.Adam(model.parameters())



def train(model, iterator, criterion, optimizer, device):
    """Run one epoch of next-token training and return the mean per-batch loss.

    Args:
        model: Module called as ``model(inputs, None)`` that returns
            ``(logits, hidden)``.
        iterator: Sized iterable yielding 2-D ``(batch, seq)`` tensors of token
            indices (a padded batch, as produced by the collate function).
        criterion: Loss called with flattened logits and flattened targets.
        optimizer: Optimizer stepping ``model``'s parameters.
        device: Device each batch is moved to before the forward pass.

    Returns:
        Sum of the per-batch loss values divided by ``len(iterator)``.
    """
    model.train()
    epoch_loss = 0

    for batch in iterator:
        optimizer.zero_grad()
        # The batch is a plain tensor (it has no ``.text`` attribute); move it
        # to the target device before slicing.
        text = batch.to(device)
        # Shift by one position: inputs drop the last token, targets drop the first.
        target = text[:, 1:].contiguous().view(-1)
        text = text[:, :-1]
        output, _ = model(text, None)
        # Collapse all leading dimensions so each row is one prediction.
        output = output.reshape(-1, output.size(-1))
        loss = criterion(output, target)
        loss.backward()
        optimizer.step()
        epoch_loss += loss.item()

    return epoch_loss / len(iterator)

# Number of passes over train_iter.
num_epochs = 10

# Call train() once per epoch and print the value it returns, using a 1-based epoch number.
for epoch in range(num_epochs):
    train_loss = train(model, train_iter, criterion, optimizer, device)
    print(f"Epoch {epoch+1}, Loss: {train_loss:.4f}")



In [11]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import ptb
from nltk import FreqDist
from collections import Counter



import nltk
nltk.download('ptb')
nltk.download('punkt')
nltk.download('treebank')

# Set up the tokenizer
# NOTE(review): `tokenizer` is not referenced again in this cell; the sentences
# returned by treebank.sents() are iterated as token sequences below.
tokenizer = nltk.word_tokenize


from nltk.corpus import treebank

# Fetch the sentence view and its length once instead of once per slice bound.
treebank_sents = treebank.sents()
n_sents = len(treebank_sents)
train_end = int(0.8 * n_sents)
valid_end = int(0.9 * n_sents)

# 80% / 10% / 10% split, taken in corpus order.
train = treebank_sents[:train_end]
valid = treebank_sents[train_end:valid_end]
test = treebank_sents[valid_end:]


# Flatten the list of sentences into a list of tokens
tokens_train = [token for sent in train for token in sent]
tokens_valid = [token for sent in valid for token in sent]
tokens_test = [token for sent in test for token in sent]

# Build the vocabulary from the training tokens only. Indices 0-3 are reserved
# for the special tokens; corpus words follow in descending frequency order.
vocab_counter = Counter(tokens_train)
vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
for word, _ in vocab_counter.most_common():
    # setdefault leaves a reserved index untouched if the corpus contains a
    # literal special token. Plain assignment would overwrite it and then give
    # the same index to the next word.
    vocab.setdefault(word, len(vocab))

def numericalize(tokens, vocab):
    """Map each token to its index in ``vocab``; unknown tokens get the ``'<unk>'`` index."""
    indices = []
    for tok in tokens:
        indices.append(vocab.get(tok, vocab['<unk>']))
    return indices

# Convert each split's flat token list into vocabulary indices; tokens absent from
# `vocab` are mapped to the '<unk>' index by numericalize().
train_data = numericalize(tokens_train, vocab)
valid_data = numericalize(tokens_valid, vocab)
test_data = numericalize(tokens_test, vocab)


class TextDataset(Dataset):
    """Sliding-window view over a flat sequence of token indices.

    Item ``idx`` is a pair of ``torch.long`` tensors: the slice
    ``data[idx : idx + seq_len]`` and the same slice shifted right by one.
    """

    def __init__(self, data, seq_len):
        self.seq_len = seq_len
        self.data = data

    def __len__(self):
        # Number of start offsets that still leave room for the shifted target.
        n_tokens = len(self.data)
        return n_tokens - self.seq_len

    def __getitem__(self, idx):
        start, stop = idx, idx + self.seq_len
        inputs = torch.tensor(self.data[start:stop], dtype=torch.long)
        targets = torch.tensor(self.data[start + 1:stop + 1], dtype=torch.long)
        return inputs, targets

# Mini-batch size and window length (in tokens) for the sliding-window datasets.
BATCH_SIZE = 32
SEQ_LEN = 30
# One DataLoader per split, each wrapping a TextDataset over that split's indices.
# All three are created with shuffle=True.
train_iter = DataLoader(TextDataset(train_data, SEQ_LEN), batch_size=BATCH_SIZE, shuffle=True)
valid_iter = DataLoader(TextDataset(valid_data, SEQ_LEN), batch_size=BATCH_SIZE, shuffle=True)
test_iter = DataLoader(TextDataset(test_data, SEQ_LEN), batch_size=BATCH_SIZE, shuffle=True)



class RNNLanguageModel(nn.Module):
    """LSTM language model: token embedding, stacked LSTM, dropout, vocabulary projection."""

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dim)
        self.rnn = nn.LSTM(
            input_size=embedding_dim,
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            dropout=dropout,
        )
        self.fc = nn.Linear(in_features=hidden_dim, out_features=vocab_size)
        self.dropout = nn.Dropout(p=dropout)

    def forward(self, x, hidden):
        """Return ``(logits, hidden)`` for a batch of token indices ``x``."""
        embedded = self.embedding(x)
        rnn_out, hidden = self.rnn(embedded, hidden)
        logits = self.fc(self.dropout(rnn_out))
        return logits, hidden

# Model hyperparameters. vocab_size is the number of entries in `vocab`
# (special tokens included) and sizes both the embedding and the output layer.
vocab_size = len(vocab)
embedding_dim = 256
hidden_dim = 512
num_layers = 2
dropout = 0.5
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Build the model on `device`, with a cross-entropy loss and Adam at its default settings.
# NOTE(review): no random seed is set in this cell, so the printed losses will
# vary from run to run.
model = RNNLanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())



def train(model, iterator, criterion, optimizer, device):
    """Train ``model`` for one pass over ``iterator`` and return the mean batch loss.

    Each batch is an ``(inputs, targets)`` pair. Both are moved to ``device``,
    the model is called as ``model(inputs, None)``, and the loss is computed on
    logits flattened to two dimensions against targets flattened to one.
    The return value is the summed batch losses divided by ``len(iterator)``.
    """
    model.train()
    running_loss = 0

    for inputs, targets in iterator:
        inputs = inputs.to(device)
        targets = targets.to(device)

        optimizer.zero_grad()
        logits, _ = model(inputs, None)
        # One row per prediction, one entry per target.
        n_classes = logits.size(-1)
        batch_loss = criterion(logits.view(-1, n_classes), targets.view(-1))
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()

    return running_loss / len(iterator)


# Number of passes over train_iter for the baseline (unpoisoned) run.
num_epochs = 10

# Call train() once per epoch and print the value it returns, using a 1-based epoch number.
for epoch in range(num_epochs):
    train_loss = train(model, train_iter, criterion, optimizer, device)
    print(f"Epoch {epoch+1}, Loss: {train_loss:.4f}")




[nltk_data] Downloading package ptb to
[nltk_data]     C:\Users\dianu\AppData\Roaming\nltk_data...
[nltk_data]   Package ptb is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dianu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\dianu\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Epoch 1, Loss: 4.7563
Epoch 2, Loss: 2.2999
Epoch 3, Loss: 1.4338
Epoch 4, Loss: 1.0489
Epoch 5, Loss: 0.8554
Epoch 6, Loss: 0.7439
Epoch 7, Loss: 0.6741
Epoch 8, Loss: 0.6255
Epoch 9, Loss: 0.5907
Epoch 10, Loss: 0.5618


Poisoning via the addition of random synthetic, but plausible, data

In [29]:
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np
import random
from torch.utils.data import Dataset, DataLoader
from nltk.corpus import ptb
from nltk import FreqDist
from collections import Counter



import nltk
nltk.download('ptb')
nltk.download('punkt')
nltk.download('treebank')

# Set up the tokenizer
# NOTE(review): `tokenizer` is not used by the rest of this cell.
tokenizer = nltk.word_tokenize


from nltk.corpus import treebank

# Read the corpus view once and reuse it; every slice bound used to trigger its
# own treebank.sents() call and length computation.
all_sents = treebank.sents()
total_sents = len(all_sents)
train_cut = int(0.8 * total_sents)
valid_cut = int(0.9 * total_sents)

# First 80% for training, next 10% for validation, last 10% for testing.
train = all_sents[:train_cut]
valid = all_sents[train_cut:valid_cut]
test = all_sents[valid_cut:]


# Flatten the list of sentences into a list of tokens
tokens_train = [token for sent in train for token in sent]
tokens_valid = [token for sent in valid for token in sent]
tokens_test = [token for sent in test for token in sent]

# Build the vocabulary: the four special tokens keep indices 0-3 and training
# words are appended from most to least frequent.
vocab_counter = Counter(tokens_train)
vocab = {'<pad>': 0, '<unk>': 1, '<sos>': 2, '<eos>': 3}
for word, _ in vocab_counter.most_common():
    # Insert only words that are not present yet, so a corpus token spelled like
    # a special token cannot overwrite its reserved index or cause two words to
    # share one index.
    vocab.setdefault(word, len(vocab))

def numericalize(tokens, vocab):
    """Return the list of ``vocab`` indices for ``tokens``, using ``vocab['<unk>']`` for misses."""
    return list(map(lambda tok: vocab.get(tok, vocab['<unk>']), tokens))

# Encode each split's token list as vocabulary indices. train_data is the clean
# training sequence that the synthetic indices are appended to later in this cell.
train_data = numericalize(tokens_train, vocab)
valid_data = numericalize(tokens_valid, vocab)
test_data = numericalize(tokens_test, vocab)


class TextDataset(Dataset):
    """Sliding-window dataset over a flat sequence of token indices.

    Item ``idx`` is ``(text, target)``: ``data[idx : idx + seq_len]`` and the
    same window shifted right by one token, both as ``torch.long`` tensors of
    length ``seq_len``.

    Args:
        data: Flat sequence of integer token indices.
        seq_len: Window length in tokens.
        pad_idx: Index used to right-pad windows that run past the end of
            ``data``. Defaults to 0, the index given to ``'<pad>'`` when the
            vocabulary is built in this cell.
    """

    def __init__(self, data, seq_len, pad_idx=0):
        self.data = data
        self.seq_len = seq_len
        self.pad_idx = pad_idx

    def __len__(self):
        # Never report a negative length when data is shorter than one window.
        return max(0, len(self.data) - self.seq_len)

    def _fit(self, window):
        """Return ``window`` as a list right-padded with ``pad_idx`` to ``seq_len`` items."""
        window = list(window)
        missing = self.seq_len - len(window)
        if missing > 0:
            window.extend([self.pad_idx] * missing)
        return window

    def __getitem__(self, idx):
        # Pad text and target independently. The target can be one item short
        # while the text is still full length; it used to be padded only when
        # the text was short, which could return a ragged pair.
        text = self._fit(self.data[idx:idx + self.seq_len])
        target = self._fit(self.data[idx + 1:idx + self.seq_len + 1])

        return (
            torch.tensor(text, dtype=torch.long),
            torch.tensor(target, dtype=torch.long)
        )



BATCH_SIZE = 32
SEQ_LEN = 30


def generate_synthetic_data(num_samples, seq_len, vocab_size):
    """Return ``num_samples`` runs of ``seq_len`` random token indices as one flat list.

    Indices are drawn with ``random.randint`` from ``[4, vocab_size - 1]``,
    which skips the four indices reserved for <pad>, <unk>, <sos> and <eos>.
    """
    first_regular_idx = 4
    last_idx = vocab_size - 1
    return [
        random.randint(first_regular_idx, last_idx)
        for _ in range(num_samples)
        for _ in range(seq_len)
    ]



num_synthetic_samples = 1000
# `vocab_size` is only assigned further down in this cell, so on a fresh kernel
# it does not exist yet. Take the size directly from the vocabulary instead.
synthetic_data = generate_synthetic_data(num_synthetic_samples, SEQ_LEN, len(vocab))

# Poisoned training sequence: the random indices are appended after the real
# training tokens, and the sliding-window dataset is built over the whole list.
train_data_with_synthetic = train_data + synthetic_data
train_iter = DataLoader(TextDataset(train_data_with_synthetic, SEQ_LEN), batch_size=BATCH_SIZE, shuffle=True)

# Validation and test loaders use the clean, unmodified splits.
valid_iter = DataLoader(TextDataset(valid_data, SEQ_LEN), batch_size=BATCH_SIZE, shuffle=True)
test_iter = DataLoader(TextDataset(test_data, SEQ_LEN), batch_size=BATCH_SIZE, shuffle=True)



class RNNLanguageModel(nn.Module):
    """Next-token model made of an embedding, a stacked LSTM, dropout and a linear head.

    ``forward`` takes token indices and an LSTM state (or ``None``) and returns
    the vocabulary logits together with the updated LSTM state.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout):
        super().__init__()
        lstm_options = dict(batch_first=True, dropout=dropout)
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, **lstm_options)
        self.fc = nn.Linear(hidden_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, hidden):
        features, state = self.rnn(self.embedding(x), hidden)
        return self.fc(self.dropout(features)), state

# Model hyperparameters, with the same values as the baseline cell.
# NOTE(review): vocab_size is first assigned here, but generate_synthetic_data is
# called with it earlier in this cell.
vocab_size = len(vocab)
embedding_dim = 256
hidden_dim = 512
num_layers = 2
dropout = 0.5
# Use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Fresh model, loss and optimizer for the poisoned run. Adam uses its default settings.
model = RNNLanguageModel(vocab_size, embedding_dim, hidden_dim, num_layers, dropout).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters())



def train(model, iterator, criterion, optimizer, device):
    """Run a single training epoch and return the average of the per-batch losses.

    ``iterator`` yields ``(text, target)`` pairs. For each pair the gradients
    are reset, both tensors are moved to ``device``, the model is called with a
    ``None`` initial state, and one optimizer step is taken on the loss between
    the flattened logits and the flattened targets.
    """
    model.train()
    batch_losses = []

    for batch in iterator:
        optimizer.zero_grad()

        text, target = (tensor.to(device) for tensor in batch)
        output, _ = model(text, None)

        flat_output = output.view(-1, output.size(-1))
        flat_target = target.view(-1)

        loss = criterion(flat_output, flat_target)
        loss.backward()
        optimizer.step()
        batch_losses.append(loss.item())

    return sum(batch_losses) / len(iterator)


# Number of passes over the poisoned train_iter (same count as the baseline cell).
num_epochs = 10

# Call train() once per epoch and print the value it returns, using a 1-based epoch number.
for epoch in range(num_epochs):
    train_loss = train(model, train_iter, criterion, optimizer, device)
    print(f"Epoch {epoch+1}, Loss: {train_loss:.4f}")




[nltk_data] Downloading package ptb to
[nltk_data]     C:\Users\dianu\AppData\Roaming\nltk_data...
[nltk_data]   Package ptb is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\dianu\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\dianu\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


Epoch 1, Loss: 4.8048
Epoch 2, Loss: 2.1829
Epoch 3, Loss: 1.5533
Epoch 4, Loss: 1.2757
Epoch 5, Loss: 1.1214
Epoch 6, Loss: 1.0276
Epoch 7, Loss: 0.9620
Epoch 8, Loss: 0.9147
Epoch 9, Loss: 0.8801
Epoch 10, Loss: 0.8521
