In [14]:
import numpy as np
import os
import regex as re
import requests

In [48]:
# download the tiny shakespeare dataset
input_file_path = os.path.join(os.path.dirname("../data/"), 'input.txt')
if not os.path.exists(input_file_path):
    data_url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
    with open(input_file_path, 'w', encoding='utf-8') as f:
        f.write(requests.get(data_url).text)

with open(input_file_path, 'r', encoding='utf-8') as f:
    data = f.read()
n = len(data)

In [49]:
train_data = data[:int(n*0.9)]
val_data = data[int(n*0.9):]

In [None]:
def cross_entropy_loss(y, t):
    return -np.sum(np.log())

In [105]:
class DataLoader(object):
    def __init__(self, data, sequence_length, batch_size):
        """
        Initializes the DataLoader with the given data, sequence length, and batch size.
        
        Args:
            data (str): The input text data.
            sequence_length (int): The length of each sequence.
            batch_size (int): The size of each batch.
        """
        self.text = data
        
        self.sequence_length = sequence_length
        self.batch_size = batch_size
        
        # Getting list of words, from preprocessed text
        self.word_list = self._text_preprocessing()
        
        # Unique words
        self.unique_words = set(self.word_list)
        
        # Adding unknown token for exceptions
        self.unique_words.add('<UNK>')
        
        # Create a vocabulary with '<UNK>' first
        self.word_to_ix = {'<UNK>': 0}
        self.word_to_ix.update({tok: idx + 1 for idx, tok in enumerate(self.unique_words - {'<UNK>'})})
        
        # Create the inverse mapping
        self.ix_to_word = {idx: tok for tok, idx in self.word_to_ix.items()}
        
        self.vocab_size = len(self.word_to_ix)
        
    def _text_preprocessing(self):
        """
        Preprocesses the text by converting to lowercase, removing numbers, punctuation, and special tokens.
        
        Returns:
            str: The preprocessed text.
        """
        self.text = self.text.lower()
        self.text = re.sub(r'\d+', '', self.text)  # Remove numbers
        self.text = re.sub(r'[^\w\s]', '', self.text)  # Remove punctuation
        self.text = re.sub(r'\W+', ' ', self.text)  # Remove special tokens
        self.text = self.text.split(' ')
        return self.text
    
    def encode(self, words):
        """
        Encodes the given words into their corresponding indices using the vocabulary.
        
        Args:
            words (str): The words to encode.
        
        Returns:
            list: A list of indices representing the encoded words.
        """
        return [self.word_to_ix.get(word, self.word_to_ix['<UNK>']) for word in words.split(' ')]
    
    def decode(self, indexes):
        """
        Decodes the given indices into their corresponding words using the vocabulary.
        
        Args:
            indexes (list): The indices to decode.
        
        Returns:
            list: A list of words corresponding to the indices.
        """
        return [self.ix_to_word.get(idx, '<UNK>') for idx in indexes]
    

In [102]:
dl = DataLoader(data=train_data, sequence_length=10, batch_size=10)

In [104]:
rech = "you know caiuss"
tech = [102, 2000, 20000000]
rech_encoded = dl.encode(rech)
techss = dl.decode(tech)
tech_encoded = dl.decode(rech_encoded)
rech_encoded, tech_encoded, techss

([7568, 4380, 0], ['you', 'know', '<UNK>'], ['falling', 'disbenchd', '<UNK>'])

In [93]:
class RNN(object):
    def __init__(self, vocab_size, hidden_size, sequence_length, output_size, batch_size):
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.batch_size = batch_size
        self.sequence_length = sequence_length 
        
        # Weights initialization
        self.U = np.random.randn(self.vocab_size, self.hidden_size)  # input weights  
        self.W = np.random.randn(self.hidden_size, self.hidden_size) # hidden weights
        self.V = np.random.randn(self.hidden_size, self.output_size) # output weights
        self.b = np.zeros((self.hidden_size, 1))
        self.c = np.zeros((self.output_size, 1))
        
    def forward(self, X):
        # X shape is (batch_size, sequence_length, feature_size)
        # output is the unnormalized log probabilities
        # activation is softmax
        pass