In [201]:
from torch.utils.data import Dataset
import os
import sys
import torch
import tiktoken
import sentencepiece as spm
import subprocess
from tiktoken import _tiktoken as tk
import nltk

# nltk.download('stopwords')
# nltk.download('punkt')

from nltk.corpus import stopwords

In [202]:
# !pip freeze > requirements.txt

### **Dataset**:

The Shakespeare dataset contains the complete works of William Shakespeare, including his plays, poems, and sonnets.

[**Download link**](https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt)

In a character-level language model, each character in the input data is mapped to its respective index from a dictionary. The input to the model is in the form (B, N), where B is the batch size and N is the number of tokens for each sequence. The model was tested with B=N=128, but feel free to explore different values.

An interface for the dataset class that takes care of tokenization is provided below.



```python
from torch.utils.data import Dataset

class CharDataset(Dataset):
    """
    Emits batches of characters.

    Adapted from "https://github.com/karpathy/minGPT".
    """

    def __init__(self, config, data):

        chars = ... # get characters from the input data
        self.stoi = { ch:i for i,ch in enumerate(chars) } # map characters to integer indices

        ...

    def get_vocab_size(self):
        raise NotImplementedError()

    def __len__(self):
        raise NotImplementedError()

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        # encode every character to an integer
        # return the chunk and the shifted version as tensors
        pass
```




In [203]:
def check_requirements() -> bool:
    try:
        result = subprocess.run(
            ["pip", "install", "-r", "requirements.txt"],
            check=True,  # Raise an exception if the command fails
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True
        )
        print(result.stdout)  # Optional: Print installation output
        return True
    except subprocess.CalledProcessError as e:
        print(f"Error installing requirements: {e.stderr}")
        return False


In [204]:
# check_requirements()

In [219]:
if os.path.exists("Dataset.txt"):
    os.system("wget https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt")
    os.rename("input.txt", 'Dataset.txt')

In [236]:
class CharDataset(Dataset):
    """
    Emits batches of characters.

    Adapted from "https://github.com/karpathy/minGPT".
    """

    def __init__(self, data: str, mode: str = "normal"):

        self.chars = sorted(set(train_text))  # get characters from the input data

        self.tokens = set(nltk.word_tokenize(data.lower()))
        self.mode = mode

        if mode == "normal":
            self.stoi = {ch: i for i, ch in enumerate(self.chars)}  # map characters to integer indices
            self.itos = {i: ch for i, ch in enumerate(self.chars)}  # map integer indices to characters

        elif mode == "sentencepiece":
            spm.SentencePieceTrainer.train(model_prefix='shakespeare', input='Dataset.txt',
                                           vocab_size=10770, unk_id=0, bos_id=1, eos_id=2, pad_id=3)


        elif mode == "tiktoken":
            self.enc = tiktoken.get_encoding("gpt2")

    def encode(self, text):
        if self.mode == "normal":
            return [self.stoi[s] for s in text]
        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.encode(text)
        elif self.mode == "tiktoken":
            return self.enc.encode(text)

    def decode(self, tokens):
        if self.mode == "normal":
            return ''.join([self.itos[t] for t in tokens])
        elif self.mode == "sentencepiece":
            sp = spm.SentencePieceProcessor(model_file='shakespeare.model')
            return sp.decode(tokens)
        elif self.mode == "tiktoken":
            return self.enc.decode(tokens)

    def get_vocab_size(self):
        return len(self.tokens)

    def __len__(self):
        raise NotImplementedError()

    def __getitem__(self, idx):
        # grab a chunk of (block_size + 1) characters from the data
        # encode every character to an integer
        # return the chunk and the shifted version as tensors
        pass

    # ```

In [237]:
with open("Dataset.txt", "r") as file:
    train_text = file.read()

print(train_text[:500])

First Citizen:
Before we proceed any further, hear me speak.

All:
Speak, speak.

First Citizen:
You are all resolved rather to die than to famish?

All:
Resolved. resolved.

First Citizen:
First, you know Caius Marcius is chief enemy to the people.

All:
We know't, we know't.

First Citizen:
Let us kill him, and we'll have corn at our own price.
Is't a verdict?

All:
No more talking on't; let it be done: away, away!

Second Citizen:
One word, good citizens.

First Citizen:
We are accounted poor


In [238]:
char_dataset1 = CharDataset(train_text, mode="normal")
char_dataset2 = CharDataset(train_text, mode="sentencepiece")
char_dataset3 = CharDataset(train_text, mode="tiktoken")

In [239]:
print(f"Vocabulary size: {char_dataset1.get_vocab_size()}")

Vocabulary size: 12443


In [240]:
print(f"Length of sequence for normal encoding: {len(char_dataset1.encode(train_text))}")
print(f"Length of sequence for sentencepiece encoding: {len(char_dataset2.encode(train_text))}")
print(f"Length of sequence for tiktoken encoding: {len(char_dataset3.encode(train_text))}")

Length of sequence for normal encoding: 1115394
Length of sequence for sentencepiece encoding: 290364
Length of sequence for tiktoken encoding: 338025


In [241]:
data = torch.tensor(char_dataset1.encode(train_text), dtype=torch.long)
print(data.shape, data.dtype)
print(data[:1000])

torch.Size([1115394]) torch.int64
tensor([18, 47, 56, 57, 58,  1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 14, 43, 44,
        53, 56, 43,  1, 61, 43,  1, 54, 56, 53, 41, 43, 43, 42,  1, 39, 52, 63,
         1, 44, 59, 56, 58, 46, 43, 56,  6,  1, 46, 43, 39, 56,  1, 51, 43,  1,
        57, 54, 43, 39, 49,  8,  0,  0, 13, 50, 50, 10,  0, 31, 54, 43, 39, 49,
         6,  1, 57, 54, 43, 39, 49,  8,  0,  0, 18, 47, 56, 57, 58,  1, 15, 47,
        58, 47, 64, 43, 52, 10,  0, 37, 53, 59,  1, 39, 56, 43,  1, 39, 50, 50,
         1, 56, 43, 57, 53, 50, 60, 43, 42,  1, 56, 39, 58, 46, 43, 56,  1, 58,
        53,  1, 42, 47, 43,  1, 58, 46, 39, 52,  1, 58, 53,  1, 44, 39, 51, 47,
        57, 46, 12,  0,  0, 13, 50, 50, 10,  0, 30, 43, 57, 53, 50, 60, 43, 42,
         8,  1, 56, 43, 57, 53, 50, 60, 43, 42,  8,  0,  0, 18, 47, 56, 57, 58,
         1, 15, 47, 58, 47, 64, 43, 52, 10,  0, 18, 47, 56, 57, 58,  6,  1, 63,
        53, 59,  1, 49, 52, 53, 61,  1, 15, 39, 47, 59, 57,  1, 25, 39, 56, 41,
      

In [257]:
data2 = torch.tensor(char_dataset2.encode(train_text), dtype=torch.long)
print(data2.shape, data2.dtype)
print(data2[:1000])

torch.Size([290364]) torch.int64
tensor([  160,   346,     5,  1001,    54,  1671,   208,   953,     4,   181,
           27,   147,     6,   421,     5,   997,     4,   147,     6,   160,
          346,     5,   112,    58,    47,  1968,   540,    10,   292,   117,
           10,  4422,    19,   421,     5,  7385,     6,  1968,     6,   160,
          346,     5,   160,     4,    15,   109,  1602,   479,    26,  1795,
          785,    10,     7,   397,     6,   421,     5,   184,   109,     8,
           72,     4,    54,   109,     8,    72,     6,   160,   346,     5,
          248,    96,   461,    37,     4,    11,    54,     8,    65,    34,
         1763,    78,    59,   227,  3029,     6,   244,     8,    72,    16,
         7476,    19,   421,     5,   165,    73,  4082,    64,     8,    72,
           13,   107,    29,    28,   230,     5,   293,     4,   293,    21,
           92,   282,   346,     5,   727,   314,     4,    68,  1339,     6,
          160,   346,     5,   

In [269]:
n = int(0.9 * len(data))

train_data = data[:n]
val_data = data[n:]