In [1]:
# Read the whole book into a single string.
with open("../Tokenization/alice_in_wonderland.txt", "r", encoding="utf-8") as book_file:
    raw_text = book_file.read()

# Show the first 1000 characters as a quick sanity check of the load.
print(raw_text[:1000])

TITLE: Alice's Adventures in Wonderland
AUTHOR: Lewis Carroll


= CHAPTER I = 
=( Down the Rabbit-Hole )=

  Alice was beginning to get very tired of sitting by her sister
on the bank, and of having nothing to do:  once or twice she had
peeped into the book her sister was reading, but it had no
pictures or conversations in it, `and what is the use of a book,'
thought Alice `without pictures or conversation?'

  So she was considering in her own mind (as well as she could,
for the hot day made her feel very sleepy and stupid), whether
the pleasure of making a daisy-chain would be worth the trouble
of getting up and picking the daisies, when suddenly a White
Rabbit with pink eyes ran close by her.

  There was nothing so VERY remarkable in that; nor did Alice
think it so VERY much out of the way to hear the Rabbit say to
itself, `Oh dear!  Oh dear!  I shall be late!'  (when she thought
it over afterwards, it occurred to her that she ought to have
wondered at this, but at the time it all 

In [2]:
import tiktoken

# Use tiktoken's "gpt2" encoding to turn the book into a list of token ids.
tokenizer = tiktoken.get_encoding("gpt2")
encoded_text = tokenizer.encode(raw_text)

# Report how many tokens the full text produced.
token_count = len(encoded_text)
print(token_count)

42098


Remove the first 50 tokens, then decode the remaining tokens back into text.

In [3]:
# Skip the first 50 token ids and convert the rest back into a string.
encoded_text_sampled = encoded_text[50:]
decoded_text = tokenizer.decode(encoded_text_sampled)

# Print the first 100 characters of each string under its own label;
# sep="\n" puts the label and the text on separate lines.
print("Raw Text:", raw_text[:100], sep="\n")
print("\nDecoded Text:", decoded_text[:100], sep="\n")

Raw Text:
TITLE: Alice's Adventures in Wonderland
AUTHOR: Lewis Carroll


= CHAPTER I = 
=( Down the Rabbit-Ho

Decoded Text:
 bank, and of having nothing to do:  once or twice she had
peeped into the book her sister was readi


Context-Size

In [4]:
# The model is trained to predict the next token from at most
# `context_size` (here 4) input tokens.
context_size = 4

# The target window is the input window shifted right by one token:
# LLMs are auto-regressive, so each position predicts the token that follows
# it while later tokens stay masked.
x, y = encoded_text[:context_size], encoded_text[1:context_size + 1]

print(f'x:{x}\n       y:{y}')

x:[49560, 2538, 25, 14862]
       y:[2538, 25, 14862, 338]


In [5]:
print('Context ---> Target')
# Grow the context one token at a time; the target is always the token
# immediately after the current context.
for split in range(1, context_size + 1):
    context, target = encoded_text[:split], encoded_text[split]
    print(f'{context} ---> {target}')

Context ---> Target
[49560] ---> 2538
[49560, 2538] ---> 25
[49560, 2538, 25] ---> 14862
[49560, 2538, 25, 14862] ---> 338


In [6]:
print('Context ---> Target')
# Same walk as with the raw ids, but decoded so each pair is readable as text.
for split in range(1, context_size + 1):
    context, target = encoded_text[:split], encoded_text[split]
    # decode() expects a sequence of ids, so the single target id is wrapped in a list.
    print(f'{tokenizer.decode(context)} ---> {tokenizer.decode([target])}')

Context ---> Target
TIT ---> LE
TITLE ---> :
TITLE: --->  Alice
TITLE: Alice ---> 's


In [7]:
from generate_dataset import create_dataloader

# One sample per batch, 4 tokens per sample, unshuffled.
# NOTE(review): stride is presumably the step between window starts; confirm
# against generate_dataset.create_dataloader.
dataloader = create_dataloader(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

# Pull the first batch out of the loader and inspect it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

  cpu = _conversion_method_template(device=torch.device("cpu"))


[tensor([[49560,  2538,    25, 14862]]), tensor([[ 2538,    25, 14862,   338]])]


In [8]:
# Decode the batch held in `first_batch` instead of retyping its token ids by
# hand, so this cell stays correct if the text or the loader settings change.
# `first_batch` is the [inputs, targets] pair of tensors printed above.
inputs, targets = first_batch

print('Input for first batch:')
# The loader was built with batch_size=1, so row 0 is the only sample;
# tolist() converts the tensor row into plain Python ints for the tokenizer.
print(tokenizer.decode(inputs[0].tolist()))
print('Output for first batch:')
print(tokenizer.decode(targets[0].tolist()))

Input for first batch:
TITLE: Alice
Output for first batch:
LE: Alice's


In [9]:
# Eight samples per batch, 4 tokens per sample, unshuffled. With stride=4 the
# printed rows are consecutive, non-overlapping 4-token windows of the text.
dataloader = create_dataloader(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

# Pull the first batch out of the loader and inspect it.
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

[tensor([[49560,  2538,    25, 14862],
        [  338, 15640,   287, 42713],
        [  198,    32, 24318,  1581],
        [   25, 10174, 21298,   628],
        [  198,    28,  5870, 29485],
        [  314,   796,   220,   198],
        [16193,  5588,   262, 25498],
        [   12,    39,  2305,  1267]]), tensor([[ 2538,    25, 14862,   338],
        [15640,   287, 42713,   198],
        [   32, 24318,  1581,    25],
        [10174, 21298,   628,   198],
        [   28,  5870, 29485,   314],
        [  796,   220,   198, 16193],
        [ 5588,   262, 25498,    12],
        [   39,  2305,  1267,    28]])]


This is what the input-output pairs look like for a single batch (the first batch, in this case).

In [10]:
# Decode every row of the current batch directly rather than copying token ids
# out of the printed tensors; hand-copied ids silently go stale when the text
# or the loader settings change.
# `first_batch` is the [inputs, targets] pair of tensors printed above, one row
# per sample.
inputs, targets = first_batch

# Iterating a 2-D tensor yields one row at a time; tolist() turns each row
# into plain Python ints for the tokenizer.
input_strings = [tokenizer.decode(row.tolist()) for row in inputs]
output_strings = [tokenizer.decode(row.tolist()) for row in targets]

print('Inputs first batch:')
print(input_strings)
print('Outputs first batch:')
print(output_strings)

Inputs first batch:
['TITLE: Alice', "'s Adventures in Wonderland", '\nAUTHOR', ': Lewis Carroll\n\n', '\n= CHAPTER', ' I = \n', '=( Down the Rabbit', '-Hole )']
Outputs first batch:
["LE: Alice's", ' Adventures in Wonderland\n', 'AUTHOR:', ' Lewis Carroll\n\n\n', '= CHAPTER I', ' = \n=(', ' Down the Rabbit-', 'Hole )=']
