In [1]:
import numpy as np
import os
import regex as re
# import requests
import torch
from torch import nn

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [2]:
def one_hot_encode(x, vocab_size):
    """
    Taking x of shape (B, S) where B is the batch size and S is the sequence length
    and return a one-hot encoded version of x of shape (B, S, Vocab_size/features)
    """
    batch_size = np.array(x).shape[0]
    seq_len = np.array(x).shape[1]
    big_list = np.zeros((batch_size, seq_len, vocab_size))
    
    for batch in range(batch_size):
        big_list[batch, np.arange(seq_len), x[batch]] = 1
    return big_list

In [3]:
# download the tiny shakespeare dataset

# if not os.path.exists(input_file_path):
#     data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
#     with open(input_file_path, 'w', encoding='utf-8') as f:
#         f.write(requests.get(data_url).text)

input_file_path = os.path.join(os.path.dirname("../data/"), 'input.txt')

with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()
n = len(data)

In [4]:
# Split data into train, val folds
train_data = data[:int(n*0.1)]
val_data = data[int(n*0.9):]

In [5]:
class DataLoader(object):
    def __init__(self, data, sequence_length, batch_size):
        """
        Initializes the DataLoader with the given data, sequence length, and batch size.
        
        Args:
            data (str): The input text data.
            sequence_length (int): The length of each sequence.
            batch_size (int): The size of each batch.
        """
        self.text = data
        
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        self.counter = 0
        
        # Getting list of words, from preprocessed text
        self.word_list = self._text_preprocessing()
        
        # Unique words
        self.unique_words = set(self.word_list)
        
        # Adding unknown token for exceptions
        self.unique_words.add('<UNK>')
        
        # Create a vocabulary with '<UNK>' first
        self.word_to_ix = {'<UNK>': 0}
        self.word_to_ix.update({tok: idx + 1 for idx, tok in enumerate(self.unique_words - {'<UNK>'})})
        
        # Create the inverse mapping
        self.ix_to_word = {idx: tok for tok, idx in self.word_to_ix.items()}
        
        self.vocab_size = len(self.word_to_ix)
        
    def _text_preprocessing(self):
        """
        Preprocesses the text by converting to lowercase, removing numbers, punctuation, and special tokens.
        
        Returns:
            str: The preprocessed text.
        """
        self.text = self.text.lower()
        self.text = re.sub(r'\d+', '', self.text)  # Remove numbers
        self.text = re.sub(r"[^\w\s']+", '', self.text)  # Remove punctuation ( except apostrophes ) 
        self.text = re.sub(r'\W+', ' ', self.text)  # Remove special tokens
        self.text = self.text.split(' ')
        return self.text
    
    def encode(self, words):
        """
        Encodes the given words into their corresponding indices using the vocabulary.
        
        Args:
            words (str): The words to encode.
        
        Returns:
            list: A list of indices representing the encoded words.
        """
        # For strings
        # return [self.word_to_ix.get(word, self.word_to_ix['<UNK>']) for word in words.split(' ')]
        # For list of strings
        return [self.word_to_ix.get(word, self.word_to_ix['<UNK>']) for word in words]
    
    def decode(self, indexes):
        """
        Decodes the given indices into their corresponding words using the vocabulary.
        
        Args:
            indexes (list): The indices to decode.
        
        Returns:
            list: A list of words corresponding to the indices.
        """
        return [self.ix_to_word.get(idx, '<UNK>') for idx in indexes]
    
    def next_batch(self):
        """
        Iterate to the next batch in text.
        
        Returns:
            batches (list): List of (input_sequence, target_sequence) pairs for training.
            shape: (batch_size, sequence_length, 2)
        """
        target_offset = 1  # Offset inside the function
        batch_sequence = []
        
        if self.counter + self.batch_size * self.sequence_length <= len(self.word_list):
            for i in range(self.batch_size):
                # Ensure we don't exceed the list length by taking the minimum of the desired end index and the list length.
                input_sequence = self.word_list[self.counter + i : self.counter + i + self.sequence_length]
                target_sequence = self.word_list[self.counter + i + target_offset : self.counter + i + self.sequence_length + target_offset]
                
                # Encode the input and target sequences into their corresponding numerical representations.
                input_sequence = self.encode(input_sequence)
                target_sequence = self.encode(target_sequence)
                
                batch_sequence.append([input_sequence, target_sequence])
            
            # One-hot encoding
            batch_sequence = np.array(batch_sequence)
            input = one_hot_encode(batch_sequence[:, 0, :], self.vocab_size)
            target = batch_sequence[:, 1, :]
            
            # Increasing the counter by the batch size
            self.counter += self.batch_size
            return input, target
        else:
            return None

    
    def drop_counter(self):
        """
        Drops the counter to zero.
        """
        self.counter = 0

In [6]:
# Using "beautiful" numbers for hyperparameters (x^2)
dl = DataLoader(data=train_data, sequence_length=20, batch_size=4)
a, b = dl.next_batch()

In [14]:
a = torch.tensor(a, dtype=torch.float32).permute(1, 0, 2).to(device)

In [25]:
class RNN(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1, nonlinearity='tanh', bias=True):
        super(RNN, self).__init__()
        self.num_layers = num_layers
        self.input_size = input_size
        self.hidden_size = hidden_size

        # Weights initialization
        self.U = nn.Parameter(torch.randn(self.num_layers, self.hidden_size, self.input_size, dtype=torch.float32))  # input weights  
        self.W = nn.Parameter(torch.randn(self.num_layers, self.hidden_size, self.hidden_size, dtype=torch.float32)) # hidden weights
        self.V = nn.Parameter(torch.randn(self.input_size, self.hidden_size, dtype=torch.float32))                   # output weights
        self.b = nn.Parameter(torch.randn(self.hidden_size, ))      # Input bias 
        self.c = nn.Parameter(torch.randn(self.hidden_size, ))      # Hidden bias
    
    def forward(self, X, h_0=None):
        """
        RNN forward propagation.
    
        :return: current_state: Hidden state from current layer.
        """
        if h_0 is None:
            h_0 = torch.zeros((self.num_layers, X.shape[1], self.hidden_size), device=device)
        h_prev = h_0
        h_t = h_0
        ot = []
        # Sequence
        for t in range(X.shape[0]):
            for layer in range(self.num_layers):
                Wx = X[t] @ self.U[layer].T             
                Wh = h_prev[layer] @ self.W[layer].T    
                at = Wx + Wh                             
                h_t[layer] = torch.tanh(at)
            # h_t has shape [num_layer, B, H]
            ot.append(h_t[-1])
            h_prev = h_t
        ot = torch.stack(ot)
        return ot, h_t
    
    def forward1(self, X, h_0=None):
        """
        RNN forward propagation.
    
        :return: current_state: Hidden state from current layer.
        """
        if h_0 is None:
            h_0 = torch.zeros((self.num_layers, X.shape[1], self.hidden_size), device=device)
        h_prev = h_0
        h_t = h_0
        ot = []
        # Sequence
        for t in range(X.shape[0]):
            for layer in range(self.num_layers):
                Wx = X[t] @ self.U[layer].T             
                Wh = h_prev[layer] @ self.W[layer].T    
                at = Wx + Wh                             
                h_t[layer] = torch.tanh(at)
            # h_t has shape [num_layer, B, H]
            ot.append(h_t[-1])
            output = torch.matmul(h_t[-1], self.V.T)
            h_prev = h_t
        ot = torch.stack(ot)
        return output, ot
    
    def backward(self, loss):
        
        pass
        
    def train(self, dataloader, criterion, epoch=10):
        self.to(device)  # Move the model to GPU
        for i in range(epoch):
            epoch_loss = 0
            batch_count = 0  # Counter to track number of batches processed
            while True:
                batch = dataloader.next_batch()
                if batch is None:
                    break
                X, y = batch
                X = torch.tensor(X, dtype=torch.float32).permute(1, 0, 2).to(device)  # L, N, Hin
                y = torch.tensor(y, dtype=torch.float32).permute(1, 0).to(device)    # L, N
                h_output, h_t = self.forward(X)
                output = torch.matmul(h_output, self.V.T)
                print(output.shape)
                output = output.view(-1, self.input_size)
                target = y.contiguous().view(-1).long()  # Flatten to [batch_size * seq_len]
                loss = criterion(output, target)
                epoch_loss += loss.item()
                batch_count += 1
            dataloader.drop_counter()
            print(f'Epoch {i + 1}, Loss: {epoch_loss / batch_count}')
    
    def train_batch(self, dataloader, criterion, epoch=20):
        self.to(device)
        total_loss = []
        X, y = dataloader.next_batch()
        X = torch.tensor(X, dtype=torch.float32).permute(1, 0, 2).to(device)  # L, N, Hin
        y = torch.tensor(y, dtype=torch.float32).permute(1, 0).to(device)    # L, N
        for i in range(epoch):
            epoch_loss = 0
            h_output, h_t = self.forward(X)
            output = torch.matmul(h_output, self.V.T)
            output = output.view(-1, self.input_size)
            target = y.contiguous().view(-1).long()  # Flatten to [batch_size * seq_len]
            loss = criterion(output, target)
            total_loss.append(loss.item())
            print(f'Epoch {i + 1}, Loss: {loss.item()}')
        return total_loss

In [26]:
rnn = RNN(input_size=dl.vocab_size, hidden_size=100)
loss = torch.nn.CrossEntropyLoss()

In [28]:
s,v = rnn.forward1(a)
s.shape

torch.Size([4, 3157])

In [10]:
ls = rnn.train_batch(dl, criterion=loss)

In [11]:
cuda

device(type='cpu')

In [24]:
dl.num_batches

5111