In [1]:
import torch.nn as nn
import torch
import numpy as np

import torch.nn.functional as F
import re
from torch.utils.data import DataLoader, Dataset


# Number of leading corpus tokens kept as training data. Small on purpose so
# experiments run quickly; raise it to train on more of the text.
MAX_TRAIN_TOKENS = 256

# Load the raw corpus as one string.
with open('hp.txt', 'r', encoding='UTF-8') as f:
    raw_text = f.read()

# Tokenize: every run of word characters is one token, and every other
# non-whitespace character (punctuation) is a token of its own. Whitespace is
# dropped by the pattern.
tokens = re.findall(r'\w+|[^\s\w]', raw_text)

# The vocabulary is built from the FULL corpus (not just the training slice),
# sorted so that token ids are stable between runs.
words_available = sorted(set(tokens))
stoi = {word: i for i, word in enumerate(words_available)}
itos = {i: word for i, word in enumerate(words_available)}

# Training data: ids of the first MAX_TRAIN_TOKENS tokens only. Slicing before
# the lookup avoids encoding the part of the corpus that is thrown away.
train_data = [stoi[word] for word in tokens[:MAX_TRAIN_TOKENS]]

vocab_size = len(words_available)


def encode(s):
    """Map text or tokens to vocabulary ids.

    Args:
        s: Either a string, which is split with the same pattern used to
            tokenize the corpus, or an iterable of already-split tokens.

    Returns:
        A list with the id of each token, looked up in ``stoi``.

    Raises:
        KeyError: If a token is not present in ``stoi``.
    """
    if isinstance(s, str):
        # The vocabulary is word-level, so a raw string must be tokenized
        # first; iterating it directly would look up single characters.
        s = re.findall(r'\w+|[^\s\w]', s)
    return [stoi[token] for token in s]


def decode(l):
    """Map an iterable of vocabulary ids back to text.

    Tokenization discards whitespace, so the tokens are joined with a single
    space to keep words separated (punctuation tokens therefore also get a
    space in front of them).

    Raises:
        KeyError: If an id is not present in ``itos``.
    """
    return " ".join(itos[i] for i in l)



In [2]:
class LanguageModelOneHot(nn.Module):
    """Recurrent language model whose RNN input is the one-hot encoding of token ids.

    Args:
        vocab_size: Number of distinct tokens; used as the one-hot width, the
            RNN input size and the output size of the final linear layer.
        hidden_dim: Hidden size of the recurrent layer(s).
        num_layers: Number of stacked recurrent layers.
        rnn_type: ``'LSTM'`` or ``'GRU'``.

    Raises:
        ValueError: If ``rnn_type`` is neither ``'LSTM'`` nor ``'GRU'``.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers, rnn_type='LSTM'):
        super().__init__()
        # Kept on the instance so forward() does not depend on a notebook-level
        # global that happens to be called `vocab_size`.
        self.vocab_size = vocab_size
        self.hidden_dim = hidden_dim

        # batch_first=True:
        # [batch_size, seq_length, vocab_size] -> [batch_size, seq_length, hidden_dim]
        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(vocab_size, hidden_dim, num_layers, batch_first=True)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(vocab_size, hidden_dim, num_layers, batch_first=True)
        else:
            raise ValueError("Invalid RNN type. Choose 'LSTM' or 'GRU'.")

        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Compute next-token logits for every position of every sequence.

        Args:
            x: Integer (int64) tensor of token ids, shape (batch_size, seq_length).
            hidden: Initial recurrent state in the format expected by the
                underlying nn.LSTM / nn.GRU, or None to let the RNN start
                from its default zero state.

        Returns:
            A pair ``(out, hidden)``: ``out`` has shape
            (batch_size * seq_length, vocab_size) and ``hidden`` is the final
            recurrent state as returned by the RNN module.
        """
        embeds = F.one_hot(x, num_classes=self.vocab_size).float()
        out, hidden = self.rnn(embeds, hidden)
        # Merge the batch and time axes so the linear layer sees one row per token.
        out = out.reshape(-1, out.size(2))
        out = self.fc(out)
        return out, hidden
    
class LanguageModelWord2Vec(nn.Module):
    """Recurrent language model whose RNN input is a dense token embedding.

    The embedding is an ``nn.Embedding`` layer created here; this class does
    not load any pretrained vectors into it.

    Args:
        vocab_size: Number of distinct tokens; sizes the embedding table and
            the output of the final linear layer.
        hidden_dim: Hidden size of the recurrent layer(s).
        num_layers: Number of stacked recurrent layers.
        rnn_type: ``'LSTM'`` or ``'GRU'``.
        embedding_dim: Width of the token embedding (defaults to 150).

    Raises:
        ValueError: If ``rnn_type`` is neither ``'LSTM'`` nor ``'GRU'``.
    """

    def __init__(self, vocab_size, hidden_dim, num_layers, rnn_type='LSTM', embedding_dim=150):
        super().__init__()
        # forward() needs hidden_dim to flatten the RNN output, so keep it.
        self.hidden_dim = hidden_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)

        if rnn_type == 'LSTM':
            self.rnn = nn.LSTM(embedding_dim, hidden_dim, num_layers, batch_first=True)
        elif rnn_type == 'GRU':
            self.rnn = nn.GRU(embedding_dim, hidden_dim, num_layers, batch_first=True)
        else:
            raise ValueError("Invalid RNN type. Choose 'LSTM' or 'GRU'.")

        self.fc = nn.Linear(hidden_dim, vocab_size)

    def forward(self, x, hidden=None):
        """Compute next-token logits for every position of every sequence.

        Args:
            x: Integer tensor of token ids, shape (batch_size, seq_length).
            hidden: Initial recurrent state in the format expected by the
                underlying nn.LSTM / nn.GRU, or None to let the RNN start
                from its default zero state.

        Returns:
            A pair ``(out, hidden)``: ``out`` has shape
            (batch_size * seq_length, vocab_size) and ``hidden`` is the final
            recurrent state as returned by the RNN module.
        """
        # (batch_size, seq_length) -> (batch_size, seq_length, embedding_dim)
        embeds = self.embedding(x)
        # -> (batch_size, seq_length, hidden_dim)
        out, hidden = self.rnn(embeds, hidden)
        # -> (batch_size * seq_length, hidden_dim)
        out = out.reshape(-1, self.hidden_dim)
        # -> (batch_size * seq_length, vocab_size)
        out = self.fc(out)
        return out, hidden



In [15]:
# Scratch check: reshape(-1, last_dim) stacks the rows of a 3-D tensor in
# order, turning shape (2, 3, 4) into (6, 4) - the same flattening the models'
# forward() applies before the final linear layer.
test1 = torch.rand((2, 3, 4)).float()
(test1.reshape(-1, 4), test1)

(tensor([[0.7637, 0.2483, 0.2139, 0.1792],
         [0.0376, 0.0066, 0.5990, 0.9720],
         [0.0786, 0.2853, 0.4016, 0.5859],
         [0.4735, 0.8030, 0.4022, 0.3367],
         [0.8691, 0.5317, 0.3498, 0.0520],
         [0.6167, 0.3746, 0.1357, 0.1192]]),
 tensor([[[0.7637, 0.2483, 0.2139, 0.1792],
          [0.0376, 0.0066, 0.5990, 0.9720],
          [0.0786, 0.2853, 0.4016, 0.5859]],
 
         [[0.4735, 0.8030, 0.4022, 0.3367],
          [0.8691, 0.5317, 0.3498, 0.0520],
          [0.6167, 0.3746, 0.1357, 0.1192]]]))