# Importing Libraries

In [1]:
# Import libraries
import os
from urllib.request import urlretrieve

import torch

# Transformer Architecture

![Fig1](figures/fig1.png)

In [2]:
# TODO: convert this cell to Markdown so the figure renders: ![Fig2](figures/fig2.png)

# Importing Data

In [3]:
# Local path the corpus is saved to, and the remote location it is fetched from
# (the tinyshakespeare text in the karpathy/char-rnn repository).
file_name = "input.txt"
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"

In [4]:
# Download the corpus only when no file exists at file_name yet, so re-running
# the notebook does not fetch it again.
# Note: the urllib.request documentation lists urlretrieve as a legacy interface
# that might become deprecated at some point in the future.
if not os.path.exists(file_name):
    urlretrieve(url, file_name)

In [5]:
# Read the whole corpus into memory as one string.
# The encoding is passed explicitly so decoding does not depend on the
# platform's default locale encoding.
with open(file_name, "r", encoding="utf-8") as f:
    text = f.read()

In [6]:
type(text)

str

# Hyperparameters

In [7]:
# In a character-level language model, each character in the input data is mapped to its respective index from a dictionary. 
# The input to the model is in the form (B, N), where B is the batch size and N is the number of tokens for each sequence. 
# The model was tested with B=N=128, but feel free to explore different values.

# Data Hyperparameters
# block_size =  # Consistent with miniproject2_language_model.ipynb instructions
# batch_size = 

# emb_dim = 768
# n_heads = 8
# n_layers = 12

In [None]:
# The project was tested with 12 layers, 8 attention heads, and 768 embedding dimensions, on a single GPU.

# Model parameters

# Implementing `class CharDataset`

In [8]:
import torch
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """
    Character-level dataset that serves (input, target) index tensors.

    Each sample is a window of characters taken from ``data``, encoded to
    integer indices; the target is the same window shifted one position to
    the right.

    Adapted from "https://github.com/karpathy/minGPT".

    Attributes:
        data: The sequence of characters samples are drawn from
            (a ``str`` in this notebook).
        block_size: Number of characters in each input/target sequence.
        stoi: Mapping from character to integer index.
        itos: Mapping from integer index back to character.
        vocab_size: Number of distinct characters in ``data``.
        data_size: Total number of characters in ``data``.
    """

    def __init__(self, block_size, data):
        """
        Store the data and window length, and build the vocabulary.

        Args:
            block_size: Length of each input/target sequence.
            data: Characters to draw samples from.
        """
        self.data = data
        self.block_size = block_size

        # Sorting the distinct characters makes the index assignment deterministic.
        chars = sorted(set(self.data))
        self.stoi = {ch: i for i, ch in enumerate(chars)}
        self.itos = {i: ch for i, ch in enumerate(chars)}
        self.vocab_size = len(chars)
        self.data_size = len(self.data)

    def get_vocab_size(self):
        """Return the number of distinct characters in the data."""
        return self.vocab_size

    def __len__(self):
        """
        Return the number of full-length samples.

        A sample starting at ``idx`` reads ``block_size + 1`` characters, so
        there are ``data_size - block_size`` valid start positions. The value
        is clamped at zero because ``len()`` raises on a negative result when
        the data is shorter than ``block_size``.
        """
        return max(self.data_size - self.block_size, 0)

    def __getitem__(self, idx):
        """
        Return the ``(x, y)`` pair of ``torch.long`` tensors for window ``idx``.

        ``x`` holds the encoded characters of the window without its last
        element and ``y`` the same window without its first element, so
        ``y[i]`` is the character that follows ``x[i]``.
        """
        # Grab block_size + 1 characters so that both x and y have block_size entries.
        chunk = self.data[idx:idx + self.block_size + 1]
        encoded_tensor = torch.tensor([self.stoi[c] for c in chunk], dtype=torch.long)
        x = encoded_tensor[:-1]
        y = encoded_tensor[1:]
        return x, y

    def decode(self, encoded):
        """
        Map a sequence of integer indices back to a string.

        Args:
            encoded: Iterable of indices (e.g. a list of ints or a 1-D tensor).

        Returns:
            The characters for those indices joined into one string.
        """
        # int() lets 0-d tensor elements be used as dictionary keys.
        return ''.join(self.itos[int(index)] for index in encoded)

In [None]:
# DISCONTINUED: Since I'm passing block_size in as a parameter only
# config = '' # Should be no issue since nothing references config at the moment

# Testing `CharDataset` implementation

In [9]:
# A deliberately small window, used only to inspect the dataset by hand.
block_size_example = 4
chardataset = CharDataset(data=text, block_size=block_size_example)

In [10]:
chardataset.data[:100]

'First Citizen:\nBefore we proceed any further, hear me speak.\n\nAll:\nSpeak, speak.\n\nFirst Citizen:\nYou'

In [11]:
len(chardataset.data)

1115394

In [None]:
chardataset.block_size

In [None]:
chardataset.stoi

In [None]:
chardataset.itos

In [None]:
chardataset.vocab_size

In [None]:
chardataset.data_size

In [None]:
chardataset.get_vocab_size()

In [None]:
len(chardataset)

In [None]:
# The sample count should equal the number of window start positions.
len(chardataset) == chardataset.data_size - chardataset.block_size

In [None]:
chardataset[2]

## Example if I used my name as input

In [None]:
test = CharDataset(block_size=4, data='Akira')

In [None]:
test.stoi

In [None]:
test.get_vocab_size()

In [None]:
len(test)

In [None]:
test[0]

In [None]:
test[1]

In [None]:
test[2]

In [None]:
test[3]

In [None]:
test[4]

## Creation of Train/Test Split

In [None]:
from torch.utils.data import DataLoader, random_split
# PyTorch docs for random_split: https://docs.pytorch.org/docs/stable/data.html

In [None]:
# 128 tokens per sequence and 128 sequences per batch (the B = N = 128 setting
# mentioned in the hyperparameter notes).
dataset = CharDataset(block_size=128, data=text)

# Seed the split so the same samples land in train/test on every run.
# NOTE(review): samples are sliding windows that start one character apart, so
# a random 90/10 split places heavily overlapping windows in both subsets;
# consider splitting the raw text before building the datasets.
split_generator = torch.Generator().manual_seed(0)
train, test = random_split(dataset, [0.9, 0.1], generator=split_generator)

# drop_last=True keeps every batch at exactly batch_size sequences.
train_loader = DataLoader(train, batch_size=128, shuffle=True, drop_last=True)
test_loader  = DataLoader(test, batch_size=128, shuffle=False, drop_last=True)

# Implementing `Model.py`

In [None]:
import torch.nn as nn



## Optimizer

In [None]:
import torch.optim as optim

In [None]:
optimizer = optim.Adam(