In [1]:
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
from torcheval.metrics import WordErrorRate, Perplexity
import re

from tqdm import tqdm

from datasets import load_dataset

import gensim

# Pick the compute backend: CUDA first, then Apple MPS, otherwise plain CPU.
if torch.cuda.is_available():
    _backend_name = 'cuda'
elif torch.backends.mps.is_available():
    _backend_name = 'mps'
else:
    _backend_name = 'cpu'
device = torch.device(_backend_name)

print('Using device:', device)

  from .autonotebook import tqdm as notebook_tqdm


Using device: mps


In [2]:
# Load the raw WikiText-2 configuration; `train_data` holds every split, not only "train".
train_data = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

# Pull the 'text' column out of each split.
train, valid, test = (
    train_data[split_name]['text'] for split_name in ('train', 'validation', 'test')
)


# Regex tokenizer: lower-case each line and keep only purely alphabetic words
def tokenize_with_re(data):
    """Split every non-blank line of `data` into lower-cased alphabetic tokens.

    Args:
        data: iterable of strings (one entry per line / sentence).

    Returns:
        A list with one token list per non-blank input line. Lines that are
        empty or whitespace-only are dropped; a non-blank line without any
        purely alphabetic word contributes an empty list.
    """
    word_pattern = re.compile(r'\b[a-zA-Z]+\b')
    tokenized_sentences = []
    for sentence in data:
        if not sentence.strip():
            # Skip blank separator lines entirely
            continue
        tokenized_sentences.append(word_pattern.findall(sentence.lower()))
    return tokenized_sentences

# Tokenize the three splits with the same regex tokenizer
train_tokenized, valid_tokenized, test_tokenized = (
    tokenize_with_re(split_text) for split_text in (train, valid, test)
)

# Collapse each split from a list of token lists into one continuous token stream
flat_train = [token for line in train_tokenized for token in line]
flat_valid = [token for line in valid_tokenized for token in line]
flat_test = [token for line in test_tokenized for token in line]

# The vocabulary is built over all three splits and sorted alphabetically
flat_total = flat_train + flat_valid + flat_test
words_available = sorted(set(flat_total))
vocab_size = len(words_available)

# id -> word first, then invert it to obtain word -> id
itos = dict(enumerate(words_available))
stoi = {word: index for index, word in itos.items()}

print(f"Total number of words: {len(flat_total)}, vocabulary size: {len(words_available)}")

Total number of words: 2053065, vocabulary size: 66929


In [3]:
def create_sequences(data, seq_length, stoi, unk_index=0):
    """Build overlapping next-word-prediction windows over a token stream.

    Window ``i`` is ``data[i:i+seq_length]`` and its target is the same window
    shifted one token to the right, both converted to integer ids.

    Args:
        data: sequence of word strings (the flattened corpus).
        seq_length: number of tokens per window; must be >= 1.
        stoi: mapping from word to integer id.
        unk_index: id used for words missing from `stoi`. Defaults to 0 to
            match the previous behaviour; note that 0 is also a real word id
            in `stoi`, so unknown words are indistinguishable from that word
            unless a dedicated index is passed.

    Returns:
        ``(sequences, targets)``: two lists of ``len(data) - seq_length``
        id lists, each of length `seq_length`. Both are empty when `data`
        has at most `seq_length` tokens.

    Raises:
        ValueError: if `seq_length` is smaller than 1.
    """
    if seq_length < 1:
        raise ValueError(f"seq_length must be a positive integer, got {seq_length}")

    # Encode the corpus once instead of re-encoding every word for each of the
    # seq_length overlapping windows it appears in.
    encoded = [stoi.get(word, unk_index) for word in data]

    sequences = []
    targets = []

    # Every start index in this range yields a full-length window and target,
    # so no per-window length check is needed.
    for start in tqdm(range(len(encoded) - seq_length)):
        sequences.append(encoded[start:start + seq_length])
        targets.append(encoded[start + 1:start + seq_length + 1])

    return sequences, targets

In [4]:
# Load the pretrained Word2Vec model; token ids follow the order of its vocabulary
# (this replaces any stoi/itos mappings defined earlier in the notebook).
word2vec = gensim.models.Word2Vec.load('wikitext_small_word2vec.model')
itos = dict(enumerate(word2vec.wv.index_to_key))
stoi = {word: idx for idx, word in itos.items()}

# Build the next-word training and validation windows
seq_length = 50
sequences, targets = create_sequences(flat_train, seq_length, stoi)
seq_val, target_val = create_sequences(flat_valid, seq_length, stoi)

class TextDataset(Dataset):
    """Map-style dataset pairing each input id window with its target window.

    Items are converted to ``torch.long`` tensors lazily, at access time.
    """

    def __init__(self, sequences, targets):
        # Parallel containers: targets[k] belongs to sequences[k]
        self.sequences = sequences
        self.targets = targets

    def __len__(self):
        # One sample per stored input window
        return len(self.sequences)

    def __getitem__(self, idx):
        # Returns an (input_ids, target_ids) tuple of long tensors
        pair = (self.sequences[idx], self.targets[idx])
        return tuple(torch.tensor(ids, dtype=torch.long) for ids in pair)

batch_size = 32
dataset = TextDataset(sequences, targets)
dataset_val = TextDataset(seq_val, target_val)

# Training batches are reshuffled every epoch.
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

# Validation is NOT shuffled: the training cell only evaluates the first third of
# the validation batches, so a shuffled loader would score a different random
# subset every epoch and make the per-epoch validation metrics incomparable.
dataloader_val = DataLoader(dataset_val, batch_size=batch_size, shuffle=False)

100%|██████████| 1679606/1679606 [00:08<00:00, 199635.08it/s]
100%|██████████| 175865/175865 [00:00<00:00, 262313.67it/s]


In [5]:
class LanguageModelWord2Vec(nn.Module):
    """Recurrent next-word model on top of frozen, pretrained Word2Vec embeddings.

    Pipeline: embedding lookup -> multi-layer LSTM/GRU -> linear projection to
    `vocab_size` logits for every time step.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers, rnn_type='LSTM',  word2vec_path='wikitext_small_word2vec.model'):
        """
        Args:
            vocab_size: number of output classes of the final linear layer.
            hidden_dim: hidden size of the recurrent layers.
            num_layers: number of stacked recurrent layers.
            rnn_type: 'LSTM' or 'GRU'.
            word2vec_path: path of the saved Gensim Word2Vec model whose
                vectors initialise the embedding table.

        Raises:
            ValueError: if `rnn_type` is neither 'LSTM' nor 'GRU'.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # The Gensim model is kept on the instance; its vector size fixes the RNN input width
        self.word2vec = gensim.models.Word2Vec.load(word2vec_path)
        keyed_vectors = self.word2vec.wv
        embedding_dim = keyed_vectors.vector_size

        # Embedding table initialised from the pretrained vectors and kept frozen
        pretrained = torch.FloatTensor(keyed_vectors.vectors)
        self.embedding = nn.Embedding.from_pretrained(pretrained)
        self.embedding.weight.requires_grad_(False)

        # Select the recurrent layer class from its name
        rnn_cls = nn.LSTM if rnn_type == 'LSTM' else nn.GRU if rnn_type == 'GRU' else None
        if rnn_cls is None:
            raise ValueError("Invalid RNN type. Choose 'LSTM' or 'GRU'.")
        self.rnn = rnn_cls(embedding_dim, hidden_dim, num_layers, batch_first=True)

        # Projects every hidden state onto the output vocabulary
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Return per-token logits and the updated recurrent state.

        Args:
            x: integer id tensor of shape (batch_size, seq_length).
            hidden: previous recurrent state, or None to start from the RNN default.

        Returns:
            (logits, hidden) where logits has shape
            (batch_size * seq_length, vocab_size).
        """
        embedded = self.embedding(x)                  # (batch, seq, embedding_dim)
        rnn_out, hidden = self.rnn(embedded, hidden)  # (batch, seq, hidden_dim)
        flat_states = rnn_out.reshape(-1, self.hidden_dim)  # (batch * seq, hidden_dim)
        logits = self.fc(flat_states)                 # (batch * seq, vocab_size)
        return logits, hidden

In [6]:
# Model hyper-parameters
hidden_dim, num_layers = 256, 3
rnn_type = 'LSTM' # Choose 'LSTM' or 'GRU'
model = LanguageModelWord2Vec(vocab_size, hidden_dim, num_layers, rnn_type)

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0003)

# Separate metric objects for the training and the validation phase
wer, val_wer = WordErrorRate(), WordErrorRate()
train_perplexity, val_perplexity = Perplexity(), Perplexity()

# Metrics are moved to the accelerator only when CUDA is available; otherwise they
# stay where they were created and the training loop feeds them CPU tensors.
if torch.cuda.is_available():
    wer, val_wer, train_perplexity, val_perplexity = (
        metric.to(device)
        for metric in (wer, val_wer, train_perplexity, val_perplexity)
    )

# Per-epoch history of each tracked quantity
train_losses = {name: [] for name in ('crossentropy', 'wer', 'perplexity')}
val_losses = {name: [] for name in ('crossentropy', 'wer', 'perplexity')}

# Move the model parameters to the selected device
model.to(device)

# Training loop: one pass over the training windows, then a partial validation pass.
num_epochs = 10
for epoch in range(num_epochs):

    model.train()
    total_loss = 0

    # Start every epoch with empty metric state
    wer.reset()
    train_perplexity.reset()

    # Loop variables are named batch_* so they do not overwrite the notebook-level
    # `targets` list that the dataset cell builds.
    for batch_inputs, batch_targets in tqdm(dataloader, total=len(dataloader)):
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

        optimizer.zero_grad()

        # Windows arrive in shuffled order, so every batch starts from a fresh
        # (None) hidden state and the returned state is discarded.
        outputs, _ = model(batch_inputs, None)

        # The model returns logits flattened to (batch * seq, vocab);
        # flatten the targets the same way for the loss.
        flat_targets = batch_targets.reshape(-1)
        loss = criterion(outputs, flat_targets)

        loss.backward()
        optimizer.step()

        total_loss += loss.item()

        # Metrics never need gradients
        logits = outputs.detach()

        # WER over single-token "sentences": predicted word vs. reference word.
        # .get guards against predicted ids with no entry in itos; the output layer
        # is sized by vocab_size, which is computed separately from itos
        # (TODO confirm the two always agree).
        pred_ids = torch.argmax(logits, dim=1).tolist()
        spreds = [itos.get(p, '<unk>') for p in pred_ids]
        stargets = [itos[t] for t in flat_targets.tolist()]
        wer.update(spreds, stargets)

        # Perplexity expects (n_samples, seq_len, vocab) logits and (n_samples, seq_len)
        # targets. Use the actual batch shape so the final, smaller batch also works.
        logits_3d = logits.reshape(batch_targets.size(0), batch_targets.size(1), -1)
        if torch.cuda.is_available():
            train_perplexity.update(logits_3d, batch_targets)
        else:
            train_perplexity.update(logits_3d.cpu(), batch_targets.cpu())

    # Aggregate training metrics for the epoch
    avg_train_loss = total_loss / len(dataloader)
    avg_train_wer = wer.compute().item()
    avg_train_perplexity = train_perplexity.compute().item()

    train_losses['crossentropy'].append(avg_train_loss)
    train_losses['wer'].append(avg_train_wer)
    train_losses['perplexity'].append(avg_train_perplexity)

    # Validation phase
    model.eval()
    val_total_loss = 0
    val_batches = 0  # number of validation batches actually evaluated
    val_wer.reset()
    val_perplexity.reset()

    with torch.no_grad():
        # enumerate yields (index, batch); the batch itself is an (inputs, targets) pair
        for j, (val_inputs, val_targets) in enumerate(tqdm(dataloader_val, total=len(dataloader_val))):

            # Only look at a part of the validation set
            if j > len(dataloader_val)/3:
                break

            val_inputs, val_targets = val_inputs.to(device), val_targets.to(device)

            val_outputs, _ = model(val_inputs, None)

            # Ensure outputs are float32
            val_outputs = val_outputs.float()

            # Flattened targets for the loss; the 2-D targets are kept for perplexity
            flat_val_targets = val_targets.reshape(-1)

            val_loss = criterion(val_outputs, flat_val_targets)
            val_total_loss += val_loss.item()
            val_batches += 1

            # WordErrorRate works on strings, so map ids back to words first
            val_pred_ids = torch.argmax(val_outputs, dim=1).tolist()
            val_spreds = [itos.get(p, '<unk>') for p in val_pred_ids]
            val_stargets = [itos[t] for t in flat_val_targets.tolist()]
            val_wer.update(val_spreds, val_stargets)

            val_logits_3d = val_outputs.reshape(val_targets.size(0), val_targets.size(1), -1)
            if torch.cuda.is_available():
                val_perplexity.update(val_logits_3d, val_targets)
            else:
                val_perplexity.update(val_logits_3d.cpu(), val_targets.cpu())

    # Average over the batches that were actually evaluated, not the full loader length
    avg_val_loss = val_total_loss / max(val_batches, 1)
    avg_val_wer = val_wer.compute().item()
    avg_val_perplexity = val_perplexity.compute().item()

    val_losses['crossentropy'].append(avg_val_loss)
    val_losses['wer'].append(avg_val_wer)
    val_losses['perplexity'].append(avg_val_perplexity)

    print(f'Epoch [{epoch+1}/{num_epochs}], Training Loss: {avg_train_loss:.4f}, Training WER: {avg_train_wer:.4f}, Training Perplexity: {avg_train_perplexity:.4f}')
    print(f'Epoch [{epoch+1}/{num_epochs}], Validation Loss: {avg_val_loss:.4f}, Validation WER: {avg_val_wer:.4f}, Validation Perplexity: {avg_val_perplexity:.4f}')


  0%|          | 42/52488 [00:23<8:13:22,  1.77it/s]


KeyboardInterrupt: 

In [6]:
def generate_text(model, seed_word, max_length=50, random_sampling=False):
    """Generate text one word at a time, starting from `seed_word`.

    Relies on the notebook-level `stoi`, `itos` and `device`.

    Args:
        model: callable ``model(ids, hidden) -> (logits, hidden)`` where logits
            holds one row of vocabulary scores per input token.
        seed_word: first word of the generated text. If the exact spelling is
            not in `stoi`, its lower-cased form is tried, because the corpus
            tokenizer in this notebook lower-cases every word.
        max_length: maximum number of words to generate after the seed.
        random_sampling: if True, sample from the softmax distribution;
            otherwise always take the highest-scoring word.

    Returns:
        The seed word followed by the generated words, joined by spaces.

    Raises:
        KeyError: if neither `seed_word` nor its lower-cased form is in `stoi`.
    """
    model.eval()  # Set the model to evaluation mode

    # Resolve the seed to a vocabulary entry, with a clear error when it is unknown
    if seed_word in stoi:
        seed_key = seed_word
    elif seed_word.lower() in stoi:
        seed_key = seed_word.lower()
    else:
        raise KeyError(f"Seed word {seed_word!r} is not in the vocabulary")

    generated_words = [seed_word]
    current_idx = torch.tensor([[stoi[seed_key]]]).to(device)  # Shape: (1, 1)

    # None lets the RNN start from its default initial state
    hidden = None

    with torch.no_grad():
        for _ in range(max_length):
            output, hidden = model(current_idx, hidden)

            # One token goes in, so the flattened output has a single row of
            # vocabulary scores; take the last row as the next-word logits.
            logits = output[-1]

            if random_sampling:
                # Sample from the output distribution
                probabilities = torch.softmax(logits, dim=-1)
                predicted_idx = torch.multinomial(probabilities, num_samples=1).item()
            else:
                # Greedy choice: highest-scoring word
                predicted_idx = torch.argmax(logits, dim=-1).item()

            predicted_word = itos[predicted_idx]
            generated_words.append(predicted_word)

            # Feed the prediction back in as the next input (shape: (1, 1))
            current_idx = torch.tensor([[predicted_idx]]).to(device)

            # Stop early on an end-of-sequence token, if the vocabulary has one
            if predicted_word == "<eos>":
                break

    return ' '.join(generated_words)

# Sample a 50-word continuation from the trained model
# NOTE(review): the seed must be a key of `stoi`; confirm "Harry" is in the loaded vocabulary.
sampling_options = dict(seed_word="Harry", max_length=50, random_sampling=True)
generated_text = generate_text(model, **sampling_options)
print(generated_text)


Harry , and - another Thousand inside the dormitory . He was looking very around by a splash , and Lee did not showing him Crookshanks and went about to know I had been in a former relief . “ The skrewts with Household … Had I want to have an


# FastText




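This section repeats the language-model experiment using pretrained FastText embeddings in place of Word2Vec.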


In [None]:
# Load the pretrained FastText model and rebuild the id mappings from its vocabulary order
fasttext = gensim.models.FastText.load('wikitext_small_fasttext.model')
itos = dict(enumerate(fasttext.wv.index_to_key))
stoi = {word: idx for idx, word in itos.items()}

# Re-encode the training stream with the FastText ids
seq_length = 50
sequences, targets = create_sequences(flat_train, seq_length, stoi)

# Shuffled training loader over the re-encoded windows
batch_size = 128
dataset = TextDataset(sequences, targets)
dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)


In [110]:
class LanguageModelFastText(nn.Module):
    """Recurrent next-word model on top of frozen, pretrained FastText embeddings.

    Pipeline: embedding lookup -> multi-layer LSTM/GRU -> linear projection to
    `vocab_size` logits for every time step.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers, rnn_type='LSTM',  word2vec_path='wikitext_small_fasttext.model'):
        """
        Args:
            vocab_size: number of output classes of the final linear layer.
            hidden_dim: hidden size of the recurrent layers.
            num_layers: number of stacked recurrent layers.
            rnn_type: 'LSTM' or 'GRU'.
            word2vec_path: path of the saved Gensim FastText model (the
                parameter name is kept for backward compatibility). The
                default now points at the FastText model file, so the
                embedding rows line up with token ids derived from that
                same model's vocabulary.

        Raises:
            ValueError: if `rnn_type` is neither 'LSTM' nor 'GRU'.
        """
        super().__init__()
        self.hidden_dim = hidden_dim

        # Step 1: Load the Gensim FastText model
        self.fasttext = gensim.models.FastText.load(word2vec_path)

        # The embedding width is dictated by the pretrained vectors
        embedding_dim = self.fasttext.wv.vector_size

        # Step 2: Initialize the embedding layer with the pretrained FastText
        # weights and keep it frozen during training
        weights = torch.FloatTensor(self.fasttext.wv.vectors)
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)

        # Step 3: Initialize RNN (LSTM or GRU)
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=True)
        else:
            raise ValueError("Invalid RNN type. Choose 'LSTM' or 'GRU'.")

        # Projects every hidden state onto the output vocabulary
        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden):
        """Return per-token logits and the updated recurrent state.

        Args:
            x: integer id tensor of shape (batch_size, seq_length).
            hidden: previous recurrent state, or None to start from the RNN default.

        Returns:
            (logits, hidden) where logits has shape
            (batch_size * seq_length, vocab_size).
        """
        # x shape: (batch_size, seq_length)
        embeds = self.embedding(x)
        # embeds shape: (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(embeds, hidden)
        # out shape: (batch_size, seq_length, hidden_dim)
        out = out.reshape(-1, self.hidden_dim)
        # out shape: (batch_size * seq_length, hidden_dim)
        out = self.fc(out)
        # out shape: (batch_size * seq_length, vocab_size)
        return out, hidden

In [None]:
# Initialize the model
hidden_dim = 256

num_layers = 3
rnn_type = 'LSTM' # Choose 'LSTM' or 'GRU'

model = LanguageModelFastText(vocab_size, hidden_dim,  num_layers, rnn_type)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0003)

# Select the device: CUDA first, then Apple MPS, otherwise CPU. The CPU branch
# matters: without it, machines with neither accelerator would try to use 'mps'.
if torch.cuda.is_available():
    device = torch.device('cuda')
elif torch.backends.mps.is_available():
    device = torch.device('mps')
else:
    device = torch.device('cpu')

# Move the model parameters to the selected device
model.to(device)

# Training loop: cross-entropy training with a checkpoint written after every epoch.
num_epochs = 3
for epoch in range(num_epochs):

    model.train()
    total_loss = 0

    # Loop variables are named batch_* so they do not overwrite the notebook-level
    # `targets` list that the dataset cell builds.
    for batch_inputs, batch_targets in tqdm(dataloader):
        batch_inputs, batch_targets = batch_inputs.to(device), batch_targets.to(device)

        # Zero the gradients
        optimizer.zero_grad()

        # Windows arrive in shuffled order, so every batch starts from a fresh
        # (None) hidden state and the returned state is discarded.
        outputs, _ = model(batch_inputs, None)

        # The model returns logits flattened to (batch * seq, vocab);
        # flatten the targets the same way for the loss.
        loss = criterion(outputs, batch_targets.reshape(-1))

        # Backward pass and optimization
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    avg_loss = total_loss / len(dataloader)

    # Checkpoint the weights after every epoch (file name uses the 0-based epoch index)
    torch.save(model.state_dict(), f'./model_{epoch}.pt')

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')
