In [32]:
# Import libraries
import os
from urllib.request import urlretrieve

import torch

**Importing Data**

In [2]:
# Remote location of the training text (raw file in the karpathy/char-rnn GitHub repository).
url = 'https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt'
# Local path the downloaded text is stored under.
file_name = 'input.txt'

In [3]:
# Fetch the corpus only when no local copy exists yet.
# Note: urllib.request documentation suggests urlretrieve may be deprecated in the future.
# The download is written to a temporary name and renamed only after it has finished, so an
# interrupted transfer cannot leave a truncated `file_name` behind that the existence check
# would later mistake for a complete download.
if not os.path.exists(file_name):
    partial_name = file_name + '.part'
    urlretrieve(url, partial_name)
    os.replace(partial_name, file_name)

In [4]:
# Read the whole corpus into memory as one string.
# `file_name` is used instead of repeating the literal path so this cell always reads the file
# the download cell wrote; the encoding is pinned so the result does not depend on the
# platform's default text encoding.
with open(file_name, "r", encoding="utf-8") as f:
    text = f.read()

In [5]:
# Confirm the file contents were read as a single Python string.
type(text)

str

**Testing `char.py` (class CharDataset)**

In [6]:
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """
    Character-level dataset that yields (input, target) index tensors.

    Adapted from "https://github.com/karpathy/minGPT".

    Every distinct character in ``data`` becomes one vocabulary entry. Item
    ``idx`` is built from the window of ``block_size + 1`` characters starting
    at position ``idx``: the first ``block_size`` indices are the input and the
    same window shifted one position to the right is the target.

    Args:
        config: Object that may expose a ``block_size`` attribute. When the
            attribute is missing (for example the placeholder ``''``),
            ``DEFAULT_BLOCK_SIZE`` is used instead.
        data: The full training text as a string.
    """

    # Window length used when `config` does not provide a block_size.
    DEFAULT_BLOCK_SIZE = 128

    def __init__(self, config, data):
        chars = sorted(set(data))  # distinct characters of the input, in sorted order
        self.stoi = {ch: i for i, ch in enumerate(chars)}  # character -> integer index
        self.itos = {i: ch for i, ch in enumerate(chars)}  # integer index -> character (for decoding)
        self.vocab_size = len(chars)

        self.config = config
        self.data = data
        # getattr with a fallback keeps placeholder configs (e.g. an empty string) working.
        self.block_size = getattr(config, 'block_size', self.DEFAULT_BLOCK_SIZE)

    def get_vocab_size(self):
        """Return the number of distinct characters in the data."""
        return self.vocab_size

    def get_block_size(self):
        """Return the number of characters in each input window."""
        return self.block_size

    def encode(self, text):
        """Map every character of ``text`` to its integer index (KeyError for unknown characters)."""
        return [self.stoi[char] for char in text]

    def decode(self, integers):
        """Map an iterable of integer indices (ints or 0-d tensors) back to a string."""
        # int() lets tensor elements be used as dictionary keys.
        return ''.join(self.itos[int(integer)] for integer in integers)

    def __len__(self):
        # Each item needs block_size + 1 characters; clamp at 0 because len() must not be negative.
        return max(len(self.data) - self.block_size, 0)

    def __getitem__(self, idx):
        """
        Return the ``(x, y)`` pair for window ``idx`` as ``torch.long`` tensors.

        ``x`` holds the indices of characters ``idx .. idx + block_size - 1`` and
        ``y`` the indices of characters ``idx + 1 .. idx + block_size``.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``. This also
                lets plain ``for x, y in dataset`` iteration terminate.
        """
        length = len(self)
        if idx < 0:
            idx += length
        if not 0 <= idx < length:
            raise IndexError(f"index {idx} is out of range for a dataset of length {length}")

        # grab a chunk of (block_size + 1) characters from the data
        chunk = self.data[idx:idx + self.block_size + 1]
        # encode every character to an integer
        indices = self.encode(chunk)
        # return the chunk and the shifted version as tensors
        x = torch.tensor(indices[:-1], dtype=torch.long)
        y = torch.tensor(indices[1:], dtype=torch.long)
        return x, y

In [7]:
config = '' # Placeholder config: an empty string is enough while nothing requires attributes on it

In [8]:
# Build the dataset from the full corpus text, passing the placeholder config.
chardataset = CharDataset(config=config, data=text)

In [9]:
# TODO:
# What do __len__ and __getitem__ refer to?

In [9]:
# Inspect the character -> integer index lookup table built in the constructor.
chardataset.stoi

{'\n': 0,
 ' ': 1,
 '!': 2,
 '$': 3,
 '&': 4,
 "'": 5,
 ',': 6,
 '-': 7,
 '.': 8,
 '3': 9,
 ':': 10,
 ';': 11,
 '?': 12,
 'A': 13,
 'B': 14,
 'C': 15,
 'D': 16,
 'E': 17,
 'F': 18,
 'G': 19,
 'H': 20,
 'I': 21,
 'J': 22,
 'K': 23,
 'L': 24,
 'M': 25,
 'N': 26,
 'O': 27,
 'P': 28,
 'Q': 29,
 'R': 30,
 'S': 31,
 'T': 32,
 'U': 33,
 'V': 34,
 'W': 35,
 'X': 36,
 'Y': 37,
 'Z': 38,
 'a': 39,
 'b': 40,
 'c': 41,
 'd': 42,
 'e': 43,
 'f': 44,
 'g': 45,
 'h': 46,
 'i': 47,
 'j': 48,
 'k': 49,
 'l': 50,
 'm': 51,
 'n': 52,
 'o': 53,
 'p': 54,
 'q': 55,
 'r': 56,
 's': 57,
 't': 58,
 'u': 59,
 'v': 60,
 'w': 61,
 'x': 62,
 'y': 63,
 'z': 64}

In [10]:
# Inspect the reverse lookup table (integer index -> character).
chardataset.itos

{0: '\n',
 1: ' ',
 2: '!',
 3: '$',
 4: '&',
 5: "'",
 6: ',',
 7: '-',
 8: '.',
 9: '3',
 10: ':',
 11: ';',
 12: '?',
 13: 'A',
 14: 'B',
 15: 'C',
 16: 'D',
 17: 'E',
 18: 'F',
 19: 'G',
 20: 'H',
 21: 'I',
 22: 'J',
 23: 'K',
 24: 'L',
 25: 'M',
 26: 'N',
 27: 'O',
 28: 'P',
 29: 'Q',
 30: 'R',
 31: 'S',
 32: 'T',
 33: 'U',
 34: 'V',
 35: 'W',
 36: 'X',
 37: 'Y',
 38: 'Z',
 39: 'a',
 40: 'b',
 41: 'c',
 42: 'd',
 43: 'e',
 44: 'f',
 45: 'g',
 46: 'h',
 47: 'i',
 48: 'j',
 49: 'k',
 50: 'l',
 51: 'm',
 52: 'n',
 53: 'o',
 54: 'p',
 55: 'q',
 56: 'r',
 57: 's',
 58: 't',
 59: 'u',
 60: 'v',
 61: 'w',
 62: 'x',
 63: 'y',
 64: 'z'}

In [11]:
# Vocabulary size: the number of distinct characters found in the text.
chardataset.get_vocab_size()

65

In [36]:
def encode(text):
    """Translate each character of `text` into its integer index using the global `stoi` table."""
    return [stoi[symbol] for symbol in text]


def decode(integers):
    """Turn a sequence of integer indices back into a string using the global `itos` table."""
    return ''.join(itos[index] for index in integers)

In [39]:
# Vocabulary of the corpus: every distinct character, in sorted order.
sample = sorted(set(text))
# Reverse table first (index -> character), then invert it for the forward table
# (character -> index); both end up with the same entries in the same order.
itos = dict(enumerate(sample))
stoi = {symbol: index for index, symbol in itos.items()}

In [41]:
# Encode a short sample string into its list of character indices.
encode('Akira')

[13, 49, 47, 56, 39]

In [27]:
# Round trip: decoding the encoded sample should give back the original string.
decode(encode('Akira'))

'Akira'

In [42]:
# Number of characters per window; used below to slice the start of the text.
block_size = 8

In [45]:
# Encode the first block_size + 1 characters of the corpus, i.e. the chunk size that
# CharDataset.__getitem__ is meant to grab (one character more than block_size).
encode(text[:block_size+1])

[18, 47, 56, 57, 58, 1, 15, 47, 58]