In [5]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import random
import time
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

In [6]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Preprocessing

In [7]:
def preprocess(data):
    data = re.sub(r'\n|\s+', ' ', data) #newline and multiple spaces -> single space
    data = re.sub(r'[’‘]', '\'', data) #apostrophes
    data = re.sub(r'[“”`\' ]|[–—-]', ' ', data) #quotes and dashes
    data = re.sub(r'(?<!\w)([.!?])(?!\w)', r' \1 ', data) #dont remove punctuation
    data = re.sub(r'[™•]', ' ', data) #remove other unwanted symbols
    return data.strip() #strip extra spaces

## Tokenization

In [8]:
def tokenize(data, min_length_sentences):
    sentences = sent_tokenize(data)
    sentences = [sentence for sentence in sentences if len(sentence.split()) >= min_length_sentences]
    
    print("Length of sentences:", len(sentences))

    words_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence)
        words = [word.lower() for word in words if word.lower() not in ['.', ',', '!', '?', ';', ':']]
        words = ['<s>'] + words + ['</s>']
        words_sentences.append(words)
    
    return sentences, words_sentences

## Data Preparation

In [9]:
def train_val_test_split(sentences, train_ratio=0.7, val_ratio=0.2, seed=None, num_shuffles=1):
    if seed is not None:
        random.seed(seed)
    
    for _ in range(num_shuffles):
        random.shuffle(sentences)
    
    total_sentences = len(sentences)
    
    train_size = int(total_sentences * train_ratio)
    val_size = int(total_sentences * val_ratio)
    test_size = total_sentences - train_size - val_size
    
    train_sentences = sentences[:train_size]
    val_sentences = sentences[train_size:train_size + val_size]
    test_sentences = sentences[train_size + val_size:]
    
    return train_sentences, val_sentences, test_sentences


## Loading Glove Embeddings

In [10]:
def create_glove_embeddings(glove_path):
    glove = {}
    embedding_dim = 0

    with open(glove_path, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]])
            glove[word] = vector
            embedding_dim = len(values[1:])

    glove['<UNK>'] = torch.mean(torch.stack(list(glove.values())), dim=0)
    glove['<PAD>'] = torch.zeros(embedding_dim)
    glove['<s>'] = torch.rand(embedding_dim)
    glove['</s>'] = torch.rand(embedding_dim)

    return glove

## Creation of Vocab and Embeddings

In [11]:
def create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove):
    embedding_dim = len(list(glove.values())[0])
    vocab = set()

    vocab.update(['<UNK>', '<PAD>', '<s>', '</s>'])
    for sentence in train_sentences:
        for word in sentence:
            if word in glove:
                vocab.add(word)
            else:
                sentence[sentence.index(word)] = '<UNK>' 
                
    embeddings = np.zeros((len(vocab), embedding_dim))
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    for word in vocab:
        if word in glove:
            embeddings[word_to_idx[word]] = glove[word]
        else:
            embeddings[word_to_idx[word]] = np.random.rand(embedding_dim)

    def encode_sentences(sentences, word_to_idx):
        encoded_sentences = []
        for sentence in sentences:
            encoded_sentence = [word_to_idx[word] if word in word_to_idx else word_to_idx['<UNK>'] for word in sentence]
            encoded_sentences.append(encoded_sentence)
        return encoded_sentences

    encoded_train_sentences = encode_sentences(train_sentences, word_to_idx)
    encoded_val_sentences = encode_sentences(val_sentences, word_to_idx)
    encoded_test_sentences = encode_sentences(test_sentences, word_to_idx)

    return torch.FloatTensor(embeddings), encoded_train_sentences, encoded_val_sentences, encoded_test_sentences, word_to_idx, list(vocab)


## Datatset for Training Transformer

In [12]:
class TransformerDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]
        input_sentence = torch.tensor(sentence[:-1], dtype=torch.long)
        target = torch.tensor(sentence[1:], dtype=torch.long)
        return input_sentence, target

In [13]:
def collate_fn(batch, pad_idx):
    input_sentences, targets = zip(*batch)
    input_sentences = pad_sequence(input_sentences, batch_first=True, padding_value=pad_idx)
    targets = pad_sequence(targets, batch_first=True, padding_value=pad_idx)
    return input_sentences, targets


## Transformer

In [14]:
def pos_encoding(num_tokens, n_dim):
    pos_enc = np.zeros((num_tokens, n_dim))
    positions = np.arange(num_tokens)[:, np.newaxis]
    div_term = np.exp(np.arange(0, n_dim, 2) * -(np.log(10000.0) / n_dim))
    pos_enc[:, 0::2] = np.sin(positions * div_term)
    pos_enc[:, 1::2] = np.cos(positions * div_term)
    return torch.tensor(pos_enc, dtype=torch.float)

In [15]:
class TransformerDecoder(nn.Module):
    def __init__(self, embedding, vocab_size, embedding_dim, num_heads, num_layers, hidden_dim, dropout=0.1):
        super(TransformerDecoder, self).__init__()
        
        self.embedding = nn.Embedding.from_pretrained(embedding, freeze=True)
        self.positional_encoding = pos_encoding(200, embedding_dim).to(device)
        
        decoder_layer = nn.TransformerDecoderLayer(d_model=embedding_dim, nhead=num_heads, dim_feedforward=hidden_dim, dropout=dropout)
        
        self.transformer_decoder = nn.TransformerDecoder(decoder_layer=decoder_layer,num_layers=num_layers)
        
        self.fc_out = nn.Linear(embedding_dim, vocab_size)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm = nn.LayerNorm(embedding_dim)
    
    def forward(self, tgt, tgt_mask=None):
        tgt = self.embedding(tgt) + self.positional_encoding[:tgt.size(1)]
        tgt = self.layer_norm(tgt)
        output = self.transformer_decoder(tgt, memory=tgt, tgt_mask=tgt_mask)  # Use tgt as memory
        output = self.fc_out(self.dropout(output))
        return output

## Model Testing

In [16]:
def test_model_transformer(model, val_loader, criterion, pad_idx):
    model.eval()
    total_loss = 0

    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)

            output = model(x)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1)) 
            total_loss += loss.item()

    avg_val_loss = total_loss / len(val_loader)
    val_perplexity = torch.exp(torch.tensor(avg_val_loss))
    return avg_val_loss, val_perplexity


## Train Model

In [17]:
def train_model_transformer(model, train_loader, val_loader, optimizer, criterion, scheduler, num_epochs, patience=2, pad_idx=0):
    model.to(device)
    early_stopping_counter = 0
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0

        for x, y in tqdm(train_loader):
            x, y = x.to(device), y.to(device)
            optimizer.zero_grad()

            output = model(x)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1)) 
            loss.backward()
            optimizer.step()
            total_loss += loss.item()

        avg_train_loss = total_loss / len(train_loader)
        perplexity = torch.exp(torch.tensor(avg_train_loss))

        print(f'Epoch {epoch+1}/{num_epochs}')
        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Train Perplexity: {perplexity:.4f}')

        avg_val_loss, val_perplexity = test_model_transformer(model, val_loader, criterion, pad_idx)
        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Perplexity: {val_perplexity:.4f}')

        scheduler.step(avg_val_loss)
        
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), '2021101072_LM3.pt')
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    return model


## Save Perplexities in files

In [18]:
def save_perplexities_transformer(model, sentences, criterion, filename, idx_to_word, pad_idx):
    model.eval()
    total_loss = 0
    all_sentences = []
    perplexity_scores = []

    with torch.no_grad():
        for sentence in sentences:
            sentence_loss = 0
            sentence_length = 0
            input_indices = sentence[:-1]
            target_indices = sentence[1:]

            input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0).to(device)
            targets = torch.tensor(target_indices, dtype=torch.long).to(device)

            outputs = model(input_tensor)

            for i in range(outputs.shape[1]):
                output = outputs[0, i]
                target_word = targets[i]

                if target_word != pad_idx:
                    loss = criterion(output.unsqueeze(0), target_word.unsqueeze(0))
                    sentence_loss += loss.item()
                    sentence_length += 1

            if sentence_length > 0:
                avg_loss_per_sentence = sentence_loss / sentence_length
                sentence_perplexity = torch.exp(torch.tensor(avg_loss_per_sentence)).item()
            else:
                sentence_perplexity = float('inf')  # handle empty sentences (unlikely, but a safeguard)

            perplexity_scores.append(sentence_perplexity)
            sentence_words = [idx_to_word[idx] for idx in sentence]
            full_sentence = " ".join(sentence_words)
            all_sentences.append(full_sentence)

        avg_perplexity = sum(perplexity_scores) / len(perplexity_scores)

    with open(filename, 'w') as f:
        for i, sentence in enumerate(all_sentences):
            f.write(f"{sentence}\t{perplexity_scores[i]:.4f}\n")
        
        f.write(f"Average\t{avg_perplexity:.4f}\n")

    return avg_perplexity


In [19]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")


## Running the Model

In [20]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [21]:
with open('/kaggle/input/auguste-maquet/Auguste_Maquet.txt', 'r') as f:
    corpus = f.read()
    
corpus = preprocess(corpus) 

sentences, word_sentences = tokenize(corpus, 2)

train_sentences, val_sentences, test_sentences = train_val_test_split(word_sentences)

print("Train size:", len(train_sentences))
print("Validation size:", len(val_sentences))
print("Test size:", len(test_sentences))

Length of sentences: 57178
Train size: 40024
Validation size: 11435
Test size: 5719


In [22]:
glove = create_glove_embeddings('/kaggle/input/glove-300/glove.6B.300d.txt')

embeddings, encoded_train, encoded_val, encoded_test, word_to_idx, vocab = create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove)


In [23]:
pad_idx = word_to_idx['<PAD>']

train_dataset = TransformerDataset(encoded_train)
val_dataset = TransformerDataset(encoded_val)
test_dataset = TransformerDataset(encoded_test)

train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)

print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(val_dataset)}')
print(f'Test dataset size: {len(test_dataset)}')

Train dataset size: 40024
Validation dataset size: 11435
Test dataset size: 5719


In [24]:
vocab_size = len(vocab)
embedding_dim = 300
num_heads = 10
num_layers = 2
hidden_dim = 300
dropout = 0.1

model = TransformerDecoder(embedding=embeddings, vocab_size=vocab_size, embedding_dim=embedding_dim, num_heads=num_heads, num_layers=num_layers, hidden_dim=hidden_dim, dropout=dropout)

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.AdamW(model.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=0, factor=0.1)

In [25]:
model = train_model_transformer(model, train_loader, val_loader, optimizer, criterion, scheduler, num_epochs=10, patience=3, pad_idx=pad_idx)


100%|██████████| 626/626 [00:50<00:00, 12.31it/s]


Epoch 1/10
Train Loss: 6.1365
Train Perplexity: 462.4178
Val Loss: 5.4839
Val Perplexity: 240.7912


100%|██████████| 626/626 [00:51<00:00, 12.06it/s]


Epoch 2/10
Train Loss: 5.4135
Train Perplexity: 224.4158
Val Loss: 5.2455
Val Perplexity: 189.7049


100%|██████████| 626/626 [00:53<00:00, 11.77it/s]


Epoch 3/10
Train Loss: 5.2076
Train Perplexity: 182.6506
Val Loss: 5.1691
Val Perplexity: 175.7632


100%|██████████| 626/626 [00:53<00:00, 11.60it/s]


Epoch 4/10
Train Loss: 5.0820
Train Perplexity: 161.0890
Val Loss: 5.1350
Val Perplexity: 169.8612


100%|██████████| 626/626 [00:54<00:00, 11.39it/s]


Epoch 5/10
Train Loss: 4.9842
Train Perplexity: 146.0835
Val Loss: 5.1239
Val Perplexity: 167.9931


100%|██████████| 626/626 [00:54<00:00, 11.44it/s]


Epoch 6/10
Train Loss: 4.9058
Train Perplexity: 135.0648
Val Loss: 5.1227
Val Perplexity: 167.7800


100%|██████████| 626/626 [00:54<00:00, 11.41it/s]


Epoch 7/10
Train Loss: 4.8431
Train Perplexity: 126.8569
Val Loss: 5.1245
Val Perplexity: 168.0902


100%|██████████| 626/626 [00:55<00:00, 11.35it/s]


Epoch 8/10
Train Loss: 4.6563
Train Perplexity: 105.2445
Val Loss: 5.1105
Val Perplexity: 165.7612


100%|██████████| 626/626 [00:55<00:00, 11.36it/s]


Epoch 9/10
Train Loss: 4.6226
Train Perplexity: 101.7620
Val Loss: 5.1147
Val Perplexity: 166.4530


100%|██████████| 626/626 [00:55<00:00, 11.36it/s]


Epoch 10/10
Train Loss: 4.5889
Train Perplexity: 98.3876
Val Loss: 5.1172
Val Perplexity: 166.8755


## Perplexity Scores

In [26]:
loss, perplexity = test_model_transformer(model, train_loader, criterion, pad_idx)
print(f'\nTrain Loss: {loss}')
print(f'Train Perplexity: {perplexity}')


Train Loss: 4.503557234145582
Train Perplexity: 90.3379135131836


In [27]:
loss, perplexity = test_model_transformer(model, val_loader, criterion, pad_idx)
print(f'\nVal Loss: {loss}')
print(f'Val Perplexity: {perplexity}')


Val Loss: 5.1189669630380985
Val Perplexity: 167.16261291503906


In [28]:
loss, perplexity = test_model_transformer(model, test_loader, criterion, pad_idx)
print(f'\nTest Loss: {loss}')
print(f'Test Perplexity: {perplexity}')


Test Loss: 5.129219749238756
Test Perplexity: 168.88525390625


In [29]:
save_perplexities_transformer(model, encoded_train, criterion, '2021101072_LM3_train_perplexity.txt', vocab, pad_idx)
save_perplexities_transformer(model, encoded_val, criterion, '2021101072_LM3_val_perplexity.txt', vocab, pad_idx)
save_perplexities_transformer(model, encoded_test, criterion, '2021101072_LM3_test_perplexity.txt', vocab, pad_idx)

267.76079880280355

In [30]:
# save_model(model, '2021101072_LM3.pt')

In [41]:
import pickle

with open('data_store_transformer.pkl', 'wb') as f:
    pickle.dump({
        'embeddings': embeddings,
        'vocab': vocab,
        'word_to_idx': word_to_idx,
        'encoded_train': encoded_train,
        'encoded_val': encoded_val,
        'encoded_test': encoded_test,
    }, f)

print("Data saved successfully!")

Data saved successfully!


## Loading and Running the model again

In [43]:
with open('/kaggle/working/data_store_transformer.pkl', 'rb') as f:
    data = pickle.load(f)

embeddings = data['embeddings']
vocab = data['vocab']
word_to_idx = data['word_to_idx']
encoded_train = data['encoded_train']
encoded_val = data['encoded_val']
encoded_test = data['encoded_test']

# Recreate datasets and loaders
train_dataset = TransformerDataset(encoded_train)
val_dataset = TransformerDataset(encoded_val)
test_dataset = TransformerDataset(encoded_test)

pad_idx = word_to_idx['<PAD>']

train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)

print("Data loaded successfully!")

Data loaded successfully!


In [44]:
vocab_size = len(vocab)
embedding_dim = 300
num_heads = 10
num_layers = 2
hidden_dim = 300
dropout = 0.1

model_new = TransformerDecoder(embedding=embeddings, vocab_size=vocab_size, embedding_dim=embedding_dim, num_heads=num_heads, num_layers=num_layers, hidden_dim=hidden_dim, dropout=dropout).to(device)
model_new.load_state_dict(torch.load('/kaggle/working/2021101072_LM3.pt', weights_only=True))

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.AdamW(model_new.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, 'min', patience=0, factor=0.1)

In [45]:
loss, perplexity = test_model_transformer(model_new, val_loader, criterion, pad_idx)
print(f'\nVal Loss: {loss}')
print(f'Val Perplexity: {perplexity}')


Val Loss: 5.109053129590423
Val Perplexity: 165.51356506347656


In [46]:
loss, perplexity = test_model_transformer(model_new, test_loader, criterion, pad_idx)
print(f'\nTest Loss: {loss}')
print(f'Test Perplexity: {perplexity}')


Test Loss: 5.117912377251519
Test Perplexity: 166.98638916015625
