In [5]:
import torch.nn as nn
import torch
import numpy as np

import torch.nn.functional as F
import re
from torch.utils.data import DataLoader, Dataset
import torch.optim as optim

from tqdm import tqdm

from nltk.tokenize import word_tokenize
from datasets import load_dataset

from gensim.models import Word2Vec

In [6]:
train_data = load_dataset("Salesforce/wikitext", "wikitext-2-raw-v1")

In [7]:
with open('hp.txt', 'r', encoding='UTF-8') as f:
    train_data = f.read()

# Split train_data into words
train_data = train_data.lower()
train_data = re.sub(r'[^\w\s]', '', train_data)
train_data = re.findall(r'\w+|[^\s\w]', train_data)
words_available = sorted(list(set(train_data)))
stoi = {word: i for i, word in enumerate(words_available)}
itos = {i: word for i, word in enumerate(words_available)}
train_data = [stoi[word] for word in train_data]

print(len(train_data))
# Only use the first 10000 words
# train_data = train_data[:700000]

vocab_size = len(words_available)
print(vocab_size)
# Show example of words
print(words_available[100:110])

1087874
25555
['aaaaarrrgh', 'aaaah', 'aaaargh', 'aaah', 'aaahpain', 'aaargh', 'aah', 'aall', 'aargh', 'ab']


In [9]:
def create_sequences(data, seq_length):
    sequences = []
    targets = []
    for i in range(len(data) - seq_length):
        seq = data[i:i+seq_length]
        target = data[i+1:i+seq_length+1]
        if len(target) < seq_length or len(seq) < seq_length:
            break
        sequences.append(seq)
        targets.append(target)
    return sequences, targets

seq_length = 30
sequences, targets = create_sequences(train_data, seq_length)

class TextDataset(Dataset):
    def __init__(self, sequences, targets):
        self.sequences = sequences
        self.targets = targets
    
    def __len__(self):
        return len(self.sequences)
    
    def __getitem__(self, idx):
        return torch.tensor(self.sequences[idx], dtype=torch.long), torch.tensor(self.targets[idx], dtype=torch.long)

dataset = TextDataset(sequences, targets)
batch_size = 256

dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

In [11]:
# Train a Word2Vec model (classic setting)
import logging
logging.basicConfig(format="%(asctime)s : %(levelname)s : %(message)s", level=logging.INFO)



model = Word2Vec(
    sentences=sequences,  # Tokenized sentences
    vector_size=150,             # Dimensionality of word vectors (embedding size)
    window=5,                    # Context window size
    min_count=1,                 # Ignores words with a frequency lower than this
    sg=1,                        # Skip-gram (1) or CBOW (0); classic uses CBOW
    workers=6,                   # Number of threads to run in parallel (for performance)
    epochs=10                    # Number of iterations (epochs) over the corpus
)


# Save the model (optional)
model.save("word2vec_model.model")

2024-10-22 14:24:26,345 : INFO : collecting all words and their counts
2024-10-22 14:24:26,347 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2024-10-22 14:24:26,395 : INFO : PROGRESS: at sentence #10000, processed 300000 words, keeping 1933 word types
2024-10-22 14:24:26,431 : INFO : PROGRESS: at sentence #20000, processed 600000 words, keeping 3058 word types
2024-10-22 14:24:26,466 : INFO : PROGRESS: at sentence #30000, processed 900000 words, keeping 3803 word types
2024-10-22 14:24:26,500 : INFO : PROGRESS: at sentence #40000, processed 1200000 words, keeping 4493 word types
2024-10-22 14:24:26,534 : INFO : PROGRESS: at sentence #50000, processed 1500000 words, keeping 5017 word types
2024-10-22 14:24:26,569 : INFO : PROGRESS: at sentence #60000, processed 1800000 words, keeping 5467 word types
2024-10-22 14:24:26,603 : INFO : PROGRESS: at sentence #70000, processed 2100000 words, keeping 5861 word types
2024-10-22 14:24:26,637 : INFO : PROGRESS: at sen

[22191,
 743,
 22641,
 14893,
 86,
 10119,
 10031,
 24451,
 18619,
 10472,
 11138,
 11730,
 25460,
 10884,
 9755,
 22171,
 1204,
 1116,
 15011,
 10423,
 25075,
 2961,
 22265,
 18370,
 8453,
 10289,
 14654,
 10314,
 19375,
 24692,
 1674,
 15207,
 23846,
 596,
 24727,
 10097,
 22202,
 8738,
 11597,
 22235,
 1775,
 1396,
 22360,
 20278,
 11712,
 6530,
 24863,
 13408,
 24523,
 4562,
 14547,
 10938,
 733,
 14707,
 22200,
 22218,
 11760,
 15020,
 145,
 15286,
 12631,
 12830,
 24763,
 6277,
 6113,
 11988,
 25252,
 12170,
 5678,
 3000,
 24625,
 1066,
 25493,
 9363,
 24093,
 1013,
 16780,
 14273,
 13987,
 19079,
 433,
 6201,
 14897,
 22384,
 22308,
 10736,
 9769,
 9044,
 21055,
 22559,
 18231,
 7612,
 12834,
 24356,
 15125,
 22460,
 20157,
 7626,
 24781,
 24551,
 9286,
 1792,
 24202,
 10049,
 12827,
 9235,
 10298,
 22159,
 5686,
 24953,
 15051,
 18388,
 10999,
 22726,
 4019,
 3085,
 10120,
 6209,
 24768,
 14106,
 16480,
 22388,
 18742,
 20371,
 419,
 13142,
 1836,
 15182,
 14441,
 9913,
 19098,

In [14]:
similar_words = model.wv.most_similar("aaargh", topn=5)
print(similar_words)


KeyError: "Key 'aaargh' not present in vocabulary"

In [21]:
class LanguageModelOneHot(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_layers, rnn_type='LSTM'):
        super(LanguageModelOneHot, self).__init__()

        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(vocab_size, hidden_dim, num_layers, batch_first=True) # [batch_size, seq_length, vocab_size] -
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(vocab_size, hidden_dim, num_layers, batch_first=True)
        else:
            raise ValueError("Invalid RNN type. Choose 'LSTM' or 'GRU'.")
        
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden):
        embeds =  F.one_hot(x, num_classes=vocab_size).float()
        out, hidden = self.rnn(embeds, hidden)
        out = out.reshape(-1, out.size(2))
        out = self.fc(out)
        return out, hidden
    
class LanguageModelWord2Vec(nn.Module):
    def __init__(self, vocab_size, hidden_dim, num_layers, rnn_type='LSTM'):
        super(LanguageModelWord2Vec, self).__init__()
        self.hidden_dim = hidden_dim
        embedding_dim = 150
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=True)
        else:
            raise ValueError("Invalid RNN type. Choose 'LSTM' or 'GRU'.")
        
        self.fc = nn.Linear(hidden_dim, vocab_size)
        
    def forward(self, x, hidden):
        # x shape: (batch_size, seq_length)
        embeds = self.embedding(x)
        # embeds shape: (batch_size, seq_length, embedding_dim)
        out, hidden = self.rnn(embeds, hidden)
        # out shape: (batch_size, seq_length, hidden_dim)
        out = out.reshape(-1, self.hidden_dim)
        # out shape: (batch_size * seq_length, hidden_dim)
        out = self.fc(out)
        # out shape: (batch_size * seq_length, vocab_size)
        return out, hidden


In [23]:
# Initialize the model
hidden_dim = 256  
num_layers = 2
rnn_type = 'LSTM' # Choose 'LSTM' or 'GRU'

# model = LanguageModelOneHot(vocab_size, hidden_dim, num_layers, rnn_type)
model = LanguageModelOneHot(vocab_size, hidden_dim,  num_layers, rnn_type)


# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.0003)


# Move model to device (GPU or CPU)
device = torch.device('cuda' if torch.cuda.is_available() else 'mps')

model.to(device)

# Training loop
num_epochs = 3  # Adjust as needed
for epoch in range(num_epochs):
    
    model.train()
    hidden = None
    total_loss = 0
    
    for inputs, targets in tqdm(dataloader):
        hidden = None
        inputs, targets = inputs.to(device), targets.to(device)
        
        # Zero the gradients
        optimizer.zero_grad()
        
        # Forward pass
        outputs, hidden = model(inputs, hidden)
        
        
        # Detach hidden states to prevent backpropagating through the entire training history
        if isinstance(hidden, tuple):
            hidden = tuple([h.detach() for h in hidden])
        else:
            hidden = hidden.detach()
        
        outputs = outputs.view(-1, vocab_size) 
        targets = targets.view(-1)     
        
        # Compute loss
        loss = criterion(outputs, targets)
        
        # Backward pass and optimization
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    avg_loss = total_loss / len(dataloader)
    torch.save(model.state_dict(), f'./model_{epoch}.pt')

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss:.4f}')


  0%|          | 14/5597 [00:11<1:15:31,  1.23it/s]


KeyboardInterrupt: 

In [6]:
def generate_text(model, seed_word, max_length=50, random_sampling=False):
    model.eval()  # Set the model to evaluation mode
    generated_words = [seed_word]  # List to store generated words
    
    # Convert seed word to index
    seed_idx = torch.tensor([[stoi[seed_word]]]).to(device)  # Shape: (1, 1)

    # Initialize hidden state
    hidden = None
    
    # Loop through to generate words
    for _ in range(max_length):
        # Forward pass through the model
        with torch.no_grad():
            output, hidden = model(seed_idx, hidden)
        
        # Get the predicted word (highest probability or sample)
        output = output.squeeze(1)  # Remove the seq_len dimension (now (1, vocab_size))
        if random_sampling:
            # Sample from the output distribution
            probabilities = torch.softmax(output, dim=1)
            predicted_idx = torch.multinomial(probabilities, num_samples=1).item()
        else:
            predicted_idx = torch.argmax(output, dim=1).item()  # Get the index of the word with highest probability
        
        # Convert index back to word
        predicted_word = itos[predicted_idx]
        
        # Append the predicted word to the list
        generated_words.append(predicted_word)
        
        # Set the predicted word as the next input (shape: (1, 1))
        seed_idx = torch.tensor([[predicted_idx]]).to(device)
        
        # Stop if an end-of-sequence token is generated (optional)
        if predicted_word == "<eos>":  # Assuming "<eos>" is the token for end of sentence
            break
    
    return ' '.join(generated_words)

# Example usage
generated_text = generate_text(model, seed_word="Harry", max_length=50, random_sampling=True)
print(generated_text)


Harry , and - another Thousand inside the dormitory . He was looking very around by a splash , and Lee did not showing him Crookshanks and went about to know I had been in a former relief . “ The skrewts with Household … Had I want to have an
