In [1]:
import torch
from torch import nn, optim
from torch.utils.data import DataLoader, Dataset
import torch.nn.functional as F
import numpy as np
import re
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import random
import time
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from torch.nn.utils.rnn import pad_sequence

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /usr/share/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Preprocessing

In [3]:
def preprocess(data):
    data = re.sub(r'\n|\s+', ' ', data) #newline and multiple spaces -> single space
    data = re.sub(r'[’‘]', '\'', data) #apostrophes
    data = re.sub(r'[“”`\' ]|[–—-]', ' ', data) #quotes and dashes
    data = re.sub(r'(?<!\w)([.!?])(?!\w)', r' \1 ', data) #dont remove punctuation
    data = re.sub(r'[™•]', ' ', data) #remove other unwanted symbols
    return data.strip() #strip extra spaces

## Tokenization

In [4]:
def tokenize(data, min_length_sentences):
    sentences = sent_tokenize(data)
    sentences = [sentence for sentence in sentences if len(sentence.split()) >= min_length_sentences]
    
    print("Length of sentences:", len(sentences))

    words_sentences = []

    for sentence in sentences:
        words = word_tokenize(sentence)
        words = [word.lower() for word in words if word.lower() not in ['.', ',', '!', '?', ';', ':']]
        words = ['<s>'] + words + ['</s>']
        words_sentences.append(words)
    
    return sentences, words_sentences

## Data Preparation

In [5]:
def train_val_test_split(sentences, train_ratio=0.7, val_ratio=0.2, seed=None, num_shuffles=1):
    if seed is not None:
        random.seed(seed)
    
    for _ in range(num_shuffles):
        random.shuffle(sentences)
    
    total_sentences = len(sentences)
    
    train_size = int(total_sentences * train_ratio)
    val_size = int(total_sentences * val_ratio)
    test_size = total_sentences - train_size - val_size
    
    train_sentences = sentences[:train_size]
    val_sentences = sentences[train_size:train_size + val_size]
    test_sentences = sentences[train_size + val_size:]
    
    return train_sentences, val_sentences, test_sentences


## Loading Glove Embeddings

In [6]:
def create_glove_embeddings(glove_path):
    glove = {}
    embedding_dim = 0

    with open(glove_path, 'r') as f:
        for line in f:
            values = line.split()
            word = values[0]
            vector = torch.tensor([float(val) for val in values[1:]])
            glove[word] = vector
            embedding_dim = len(values[1:])

    glove['<UNK>'] = torch.mean(torch.stack(list(glove.values())), dim=0)
    glove['<PAD>'] = torch.zeros(embedding_dim)
    glove['<s>'] = torch.rand(embedding_dim)
    glove['</s>'] = torch.rand(embedding_dim)

    return glove

## Creation of Vocab and Encodings

In [7]:
def create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove):
    embedding_dim = len(list(glove.values())[0])
    vocab = set()

    vocab.update(['<UNK>', '<PAD>', '<s>', '</s>'])
    for sentence in train_sentences:
        for word in sentence:
            if word in glove:
                vocab.add(word)
            else:
                sentence[sentence.index(word)] = '<UNK>' 
                
    embeddings = np.zeros((len(vocab), embedding_dim))
    word_to_idx = {word: idx for idx, word in enumerate(vocab)}

    for word in vocab:
        if word in glove:
            embeddings[word_to_idx[word]] = glove[word]
        else:
            embeddings[word_to_idx[word]] = np.random.rand(embedding_dim)

    def encode_sentences(sentences, word_to_idx):
        encoded_sentences = []
        for sentence in sentences:
            encoded_sentence = [word_to_idx[word] if word in word_to_idx else word_to_idx['<UNK>'] for word in sentence]
            encoded_sentences.append(encoded_sentence)
        return encoded_sentences

    encoded_train_sentences = encode_sentences(train_sentences, word_to_idx)
    encoded_val_sentences = encode_sentences(val_sentences, word_to_idx)
    encoded_test_sentences = encode_sentences(test_sentences, word_to_idx)

    return torch.FloatTensor(embeddings), encoded_train_sentences, encoded_val_sentences, encoded_test_sentences, word_to_idx, list(vocab)


## Dataset for Training LSTM

In [8]:
class LSTMDataset(Dataset):
    def __init__(self, data):
        self.data = data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        sentence = self.data[idx]
        input_sentence = torch.tensor(sentence[:-1], dtype=torch.long)
        target = torch.tensor(sentence[1:], dtype=torch.long)
        return input_sentence, target

In [9]:
def collate_fn(batch, pad_idx):
    input_sentences, targets = zip(*batch)
    input_sentences = pad_sequence(input_sentences, batch_first=True, padding_value=pad_idx)
    targets = pad_sequence(targets, batch_first=True, padding_value=pad_idx)
    return input_sentences, targets


## LSTM

In [10]:
class LSTM(nn.Module):
    def __init__(self, embeddings, hidden_dim, dropout, num_layers=1):
        super(LSTM, self).__init__() 
        # freeze embeddings
        self.embeddings = nn.Embedding.from_pretrained(embeddings, freeze=True)
        self.vocab_size = embeddings.shape[0]
        self.embedding_dim = embeddings.shape[1]
        self.hidden_dim = hidden_dim
        self.num_layers = num_layers
        self.dropout = dropout
        self.lstm = nn.LSTM(input_size=self.embedding_dim, hidden_size=self.hidden_dim, num_layers=self.num_layers, dropout=self.dropout, batch_first=True)
        self.fc1 = nn.Linear(self.hidden_dim, self.vocab_size)
        self.dropout = nn.Dropout(self.dropout)
        
    def forward(self, input_seq, hidden=None):
        input_seq = self.embeddings(input_seq)
        
        if hidden is None:
            lstm_out, hidden = self.lstm(input_seq)
        else:
            lstm_out, hidden = self.lstm(input_seq, hidden)
            
        return self.fc1(self.dropout(lstm_out)), hidden

## Model Testing

In [11]:
def test_model(model, val_loader, criterion, pad_idx):
    model.eval()
    total_loss = 0
    hidden = None

    with torch.no_grad():
        for x, y in val_loader:
            x, y = x.to(device), y.to(device)
            batch_size = x.size(0)

            if hidden is not None and batch_size != hidden[0].size(1):
                hidden = None

            output, hidden = model(x, hidden)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1))
            total_loss += loss.item()

            if hidden is not None:
                hidden = (hidden[0].detach(), hidden[1].detach())

    avg_val_loss = total_loss / len(val_loader)
    val_perplexity = torch.exp(torch.tensor(avg_val_loss))
    return avg_val_loss, val_perplexity


## Train Model

In [12]:
def train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, patience=2, pad_idx=0):
    model.to(device)
    early_stopping_counter = 0
    best_val_loss = float('inf')

    for epoch in range(num_epochs):
        model.train()
        total_loss = 0
        hidden = None

        for x, y in tqdm(train_loader):
            x, y = x.to(device), y.to(device)
            batch_size = x.size(0)

            if hidden is not None and batch_size != hidden[0].size(1):
                hidden = None

            optimizer.zero_grad()
            output, hidden = model(x, hidden)
            loss = criterion(output.view(-1, output.shape[2]), y.view(-1))
            loss.backward()
            optimizer.step()

            total_loss += loss.item()

            if hidden is not None:
                hidden = (hidden[0].detach(), hidden[1].detach())

        avg_train_loss = total_loss / len(train_loader)
        perplexity = torch.exp(torch.tensor(avg_train_loss))

        print(f'Train Loss: {avg_train_loss:.4f}')
        print(f'Train Perplexity: {perplexity:.4f}')

        avg_val_loss, val_perplexity = test_model(model, val_loader, criterion, pad_idx)

        print(f'Val Loss: {avg_val_loss:.4f}')
        print(f'Val Perplexity: {val_perplexity:.4f}')
        
        # check for early stopping
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            early_stopping_counter = 0
            torch.save(model.state_dict(), '2021101072_LM2.pt')
        else:
            early_stopping_counter += 1
            if early_stopping_counter >= patience:
                print(f'Early stopping at epoch {epoch + 1}')
                break

    return model


## Save Perplexities in files

In [13]:
def save_perplexities_lstm(model, sentences, criterion, filename, idx_to_word):
    model.eval()
    total_loss = 0
    all_sentences = []
    perplexity_scores = []

    with torch.no_grad():
        for sentence in sentences:
            sentence_loss = 0
            sentence_length = 0
            input_indices = sentence[:-1]
            target_indices = sentence[1:]
            
            input_tensor = torch.tensor(input_indices, dtype=torch.long).unsqueeze(0).to(device)
            targets = torch.tensor(target_indices, dtype=torch.long).to(device)
            outputs, _ = model(input_tensor)

            for i in range(outputs.shape[1]):
                output = outputs[0, i]
                target_word = targets[i]

                loss = criterion(output.unsqueeze(0), target_word.unsqueeze(0))
                sentence_loss += loss.item()
                sentence_length += 1

            avg_loss_per_sentence = sentence_loss / sentence_length
            sentence_perplexity = torch.exp(torch.tensor(avg_loss_per_sentence)).item()
            perplexity_scores.append(sentence_perplexity)

            sentence_words = [idx_to_word[idx] for idx in sentence]
            full_sentence = " ".join(sentence_words)
            all_sentences.append(full_sentence)

        avg_perplexity = sum(perplexity_scores) / len(perplexity_scores)

    with open(filename, 'w') as f:
        for i, sentence in enumerate(all_sentences):
            f.write(f"{sentence}\t{perplexity_scores[i]:.4f}\n")
        
        f.write(f"Average\t{avg_perplexity:.4f}\n")

    return avg_perplexity


In [14]:
def save_model(model, path):
    torch.save(model.state_dict(), path)
    print(f"Model saved to {path}")


## Running the Model

In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)

cuda


In [16]:
with open('/kaggle/input/auguste-maquet/Auguste_Maquet.txt', 'r') as f:
    corpus = f.read()
    
corpus = preprocess(corpus) 

sentences, word_sentences = tokenize(corpus, 2)

train_sentences, val_sentences, test_sentences = train_val_test_split(word_sentences)

print("Train size:", len(train_sentences))
print("Validation size:", len(val_sentences))
print("Test size:", len(test_sentences))

Length of sentences: 57178
Train size: 40024
Validation size: 11435
Test size: 5719


In [17]:
glove = create_glove_embeddings('/kaggle/input/glove-300/glove.6B.300d.txt')

embeddings, encoded_train, encoded_val, encoded_test, word_to_idx, vocab = create_embeddings_and_encode(train_sentences, val_sentences, test_sentences, glove)

In [18]:
pad_idx = word_to_idx['<PAD>']

train_dataset = LSTMDataset(encoded_train)
val_dataset = LSTMDataset(encoded_val)
test_dataset = LSTMDataset(encoded_test)

train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)

print(f'Train dataset size: {len(train_dataset)}')
print(f'Validation dataset size: {len(val_dataset)}')
print(f'Test dataset size: {len(test_dataset)}')

Train dataset size: 40024
Validation dataset size: 11435
Test dataset size: 5719


In [23]:
learning_rate = 0.001
num_epochs = 10
patience = 1
model = LSTM(embeddings, 300, 0.5, 2)
criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), learning_rate)

In [24]:
model = train_model(model, train_loader, val_loader, optimizer, criterion, num_epochs, patience)

100%|██████████| 626/626 [00:46<00:00, 13.45it/s]


Train Loss: 6.3867
Train Perplexity: 593.8653
Val Loss: 5.7491
Val Perplexity: 313.9195


100%|██████████| 626/626 [00:50<00:00, 12.44it/s]


Train Loss: 5.7042
Train Perplexity: 300.1202
Val Loss: 5.4003
Val Perplexity: 221.4753


100%|██████████| 626/626 [00:49<00:00, 12.53it/s]


Train Loss: 5.4752
Train Perplexity: 238.6935
Val Loss: 5.2488
Val Perplexity: 190.3311


100%|██████████| 626/626 [00:51<00:00, 12.25it/s]


Train Loss: 5.3414
Train Perplexity: 208.7993
Val Loss: 5.1498
Val Perplexity: 172.3916


100%|██████████| 626/626 [00:50<00:00, 12.47it/s]


Train Loss: 5.2358
Train Perplexity: 187.8839
Val Loss: 5.0744
Val Perplexity: 159.8758


100%|██████████| 626/626 [00:50<00:00, 12.35it/s]


Train Loss: 5.1508
Train Perplexity: 172.5660
Val Loss: 5.0169
Val Perplexity: 150.9450


100%|██████████| 626/626 [00:50<00:00, 12.39it/s]


Train Loss: 5.0781
Train Perplexity: 160.4706
Val Loss: 4.9703
Val Perplexity: 144.0665


100%|██████████| 626/626 [00:50<00:00, 12.40it/s]


Train Loss: 5.0146
Train Perplexity: 150.5969
Val Loss: 4.9378
Val Perplexity: 139.4661


100%|██████████| 626/626 [00:50<00:00, 12.35it/s]


Train Loss: 4.9576
Train Perplexity: 142.2518
Val Loss: 4.9068
Val Perplexity: 135.2081


100%|██████████| 626/626 [00:50<00:00, 12.45it/s]


Train Loss: 4.9054
Train Perplexity: 135.0212
Val Loss: 4.8828
Val Perplexity: 132.0005


## Perplexity Scores

In [25]:
loss, perplexity = test_model(model, train_loader, criterion, pad_idx)
print(f'\nTrain Loss: {loss}')
print(f'Train Perplexity: {perplexity}')


Train Loss: 4.637046775878809
Train Perplexity: 103.239013671875


In [26]:
loss, perplexity = test_model(model, val_loader, criterion, pad_idx)
print(f'\nVal Loss: {loss}')
print(f'Val Perplexity: {perplexity}')


Val Loss: 4.882868535025826
Val Perplexity: 132.00881958007812


In [27]:
loss, perplexity = test_model(model, test_loader, criterion, pad_idx)
print(f'\nTest Loss: {loss}')
print(f'Test Perplexity: {perplexity}')


Test Loss: 4.88175597190857
Test Perplexity: 131.8619842529297


In [36]:
save_perplexities_lstm(model, encoded_train, criterion, '2021101072_LM2_train_perplexity.txt', vocab)
save_perplexities_lstm(model, encoded_val, criterion, '2021101072_LM2_val_perplexity.txt', vocab)
save_perplexities_lstm(model, encoded_test, criterion, '2021101072_LM2_test_perplexity.txt', vocab)

190.16230300148584

In [29]:
# save_model(model, '2021101072_LM2.pt')

In [29]:
import pickle

with open('data_store_lstm.pkl', 'wb') as f:
    pickle.dump({
        'embeddings': embeddings,
        'vocab': vocab,
        'word_to_idx': word_to_idx,
        'encoded_train': encoded_train,
        'encoded_val': encoded_val,
        'encoded_test': encoded_test,
    }, f)

print("Data saved successfully!")

Data saved successfully!


## Loading and Running the model again

In [31]:
with open('/kaggle/working/data_store_lstm.pkl', 'rb') as f:
    data = pickle.load(f)

embeddings = data['embeddings']
vocab = data['vocab']
word_to_idx = data['word_to_idx']
encoded_train = data['encoded_train']
encoded_val = data['encoded_val']
encoded_test = data['encoded_test']

# Recreate datasets and loaders
train_dataset = LSTMDataset(encoded_train)
val_dataset = LSTMDataset(encoded_val)
test_dataset = LSTMDataset(encoded_test)

pad_idx = word_to_idx['<PAD>']

train_loader = DataLoader(train_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, collate_fn=lambda batch: collate_fn(batch, pad_idx), shuffle=True)

print("Data loaded successfully!")

Data loaded successfully!


In [32]:
model_new = LSTM(embeddings, 300, 0.5, 2)

In [33]:
model_new.load_state_dict(torch.load('/kaggle/working/2021101072_LM2.pt', weights_only=True))

model_new.eval()
model_new.to(device)

criterion = nn.CrossEntropyLoss(ignore_index=pad_idx)
optimizer = optim.Adam(model.parameters(), learning_rate)

In [34]:
loss, perplexity = test_model(model_new, val_loader, criterion, pad_idx)
print(f'\nVal Loss: {loss}')
print(f'Val Perplexity: {perplexity}')                         


Val Loss: 4.884301854245489
Val Perplexity: 132.1981201171875


In [35]:
loss, perplexity = test_model(model_new, test_loader, criterion, pad_idx)
print(f'\nTest Loss: {loss}')
print(f'Test Perplexity: {perplexity}')


Test Loss: 4.879736826154921
Test Perplexity: 131.59603881835938
