# Importing Libraries

In [1]:
# Import libraries
import os # For data import
from urllib.request import urlretrieve # For data import

import torch
from torch.utils.data import Dataset # For CharDataset implementation
from torch.utils.data import DataLoader, random_split # For train/test split


# Importing Data

In [2]:
# Source of the corpus (tiny Shakespeare from Karpathy's char-rnn repository)
# and the local file name it is stored under.
url = (
    "https://raw.githubusercontent.com/karpathy/char-rnn/"
    "master/data/tinyshakespeare/input.txt"
)
file_name = "input.txt"

In [3]:
# Download the corpus only if it is not already on disk.
# Note: the urllib.request documentation suggests that urlretrieve may be deprecated in the future.
if not os.path.exists(file_name):
    # Download to a temporary name first: an interrupted transfer must not leave a
    # truncated `file_name` behind, because the existence check above would then
    # treat the partial file as a complete download on every later run.
    partial_file_name = file_name + ".part"
    try:
        urlretrieve(url, partial_file_name)
        os.replace(partial_file_name, file_name)  # publish only after a complete download
    finally:
        # Clean up the temporary file if the download or the rename failed.
        if os.path.exists(partial_file_name):
            os.remove(partial_file_name)

In [4]:
# Read the whole corpus into memory as a single string.
# The encoding is given explicitly so that decoding (and therefore the character
# vocabulary built from `text`) does not depend on the platform's default locale.
with open(file_name, "r", encoding="utf-8") as f:
    text = f.read()

# Data Hyperparameters

According to the project description, "The model was tested with B=N=128, but feel free to explore different values."

Given the time constraint for project completion and submission, I may lower the block_size and batch_size significantly.

In [5]:
# Length (in characters) of each input sequence returned by CharDataset.
block_size = 128
# Number of samples per batch produced by the DataLoaders.
batch_size = 128

# Tokenization

## Implementing `class CharDataset`

Reference: [Let's build GPT: from scratch, in code, spelled out.](https://www.youtube.com/watch?v=kCc8FmEb1nY) from Andrej Karpathy

Because the `CharDataset` class provided in the project description was inspired by Andrej Karpathy's [minGPT GitHub repository](https://github.com/karpathy/minGPT), I watched the video to better understand character-level tokenization before implementing the class below.

In [6]:
# import torch
# from torch.utils.data import Dataset

class CharDataset(Dataset):
    """
    Character-level dataset that serves (input, target) windows of text.

    Each sample is a window of ``block_size + 1`` consecutive characters
    encoded as integer ids: ``x`` holds the first ``block_size`` ids and ``y``
    holds the same window shifted one character to the right.

    Adapted from "https://github.com/karpathy/minGPT".

    Args:
        block_size: Number of characters in each input sequence ``x``.
        data: Sequence of characters (e.g. a ``str``) to draw samples from.

    Attributes:
        stoi: Maps each character to its integer id.
        itos: Maps each integer id back to its character (used for decoding).
        vocab_size: Number of distinct characters in ``data``.
        data_size: Total number of characters in ``data``.
    """

    def __init__(self, block_size, data):
        self.data = data
        self.block_size = block_size

        # Sort the distinct characters so that the id assignment is deterministic.
        chars = sorted(set(self.data))
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.vocab_size = len(chars)
        self.data_size = len(self.data)

    def get_vocab_size(self):
        """Return the number of distinct characters in the data."""
        return self.vocab_size

    def __len__(self):
        """
        Return the number of samples.

        A sample needs ``block_size + 1`` characters, so a sliding window with
        stride 1 yields ``data_size - block_size`` samples. The value is
        clamped at 0 so that ``len()`` stays valid when the data is shorter
        than one window.
        """
        return max(0, self.data_size - self.block_size)

    def __getitem__(self, idx):
        """
        Return the ``(x, y)`` pair for the window starting at ``idx``.

        Args:
            idx: Sample index; negative values count from the end.

        Returns:
            Tuple ``(x, y)`` of ``torch.long`` tensors, each of length
            ``block_size``; ``y`` is ``x`` shifted by one character.

        Raises:
            IndexError: If ``idx`` is outside ``[-len(self), len(self))``.
        """
        num_samples = len(self)
        if idx < 0:
            idx = idx + num_samples
        # Raising IndexError is required for plain iteration (``for x, y in dataset``)
        # to stop; slicing alone would silently return short or empty tensors.
        if not 0 <= idx < num_samples:
            raise IndexError(f"index {idx} is out of range for {num_samples} samples")

        # Grab a chunk of (block_size + 1) characters and encode it to integer ids.
        chunk = self.data[idx:idx + self.block_size + 1]
        encoded_tensor = torch.tensor([self.stoi[c] for c in chunk], dtype=torch.long)

        # Input is the chunk without its last character, target is the chunk shifted by one.
        x = encoded_tensor[:-1]
        y = encoded_tensor[1:]
        return x, y

## Tokenization of Dataset

In [7]:
# Wrap the raw text in a CharDataset that serves windows of `block_size` characters.
dataset = CharDataset(data=text, block_size=block_size)

### Creation of Train/Test Split

In [8]:
# from torch.utils.data import DataLoader, random_split
# PyTorch docs for random_split: https://docs.pytorch.org/docs/stable/data.html

**NOTE:** Tokenization is performed on the full dataset prior to the train/test split.

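Because we are using a character-level tokenizer whose vocabulary is simply the set of characters that occur in the text (e.g., letters, digits, punctuation), building the vocabulary on the full dataset before splitting does not leak any meaningful information about the held-out samples. Note, however, that `random_split` is applied to overlapping sliding windows, so a validation window can share most of its characters with neighboring training windows; the validation loss should be interpreted with that in mind.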

In [9]:
# Train/validation split: 90% training, 10% validation.
# The split is drawn from a seeded generator so that re-running the notebook
# assigns the same samples to each subset.
split_seed = 42
split_generator = torch.Generator().manual_seed(split_seed)
train, val = random_split(dataset, [0.9, 0.1], generator=split_generator)

# Wrap both subsets in DataLoaders; only the training batches are shuffled.
train_loader = DataLoader(train, batch_size=batch_size, shuffle=True, drop_last=True)
val_loader = DataLoader(val, batch_size=batch_size, shuffle=False, drop_last=True)

# Model Architecture

References:  

- Vaswani et al., ["Attention is All You Need"](https://arxiv.org/abs/1706.03762)
- Karpathy, [Let's build GPT: from scratch, in code, spelled out.](https://www.youtube.com/watch?v=kCc8FmEb1nY)
- Raschka, [LLMs-from-scratch](https://github.com/rasbt/LLMs-from-scratch)