<a href="https://colab.research.google.com/github/badBrock/MultiHeadAttentionJourney/blob/main/Preprocessing_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [11]:
import os
import requests

# Download the sample text once; later runs reuse the local copy.
# The path is defined a single time so the existence check and the write
# below can never drift apart.
file_path = "the-verdict.txt"

if not os.path.exists(file_path):
    url = (
        "https://raw.githubusercontent.com/rasbt/"
        "LLMs-from-scratch/main/ch02/01_main-chapter-code/"
        "the-verdict.txt"
    )

    response = requests.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of saving an error page as the text.
    response.raise_for_status()
    with open(file_path, "wb") as f:
        f.write(response.content)

In [12]:
# Read the whole story into memory as a single UTF-8 decoded string.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

# Sanity check: total length and the first 99 characters.
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [None]:
import re
# Split on punctuation, double dashes and whitespace. The capturing group makes
# re.split keep every delimiter as its own list item, so whitespace and empty
# strings are still present in the result at this point.
preprocessed = re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)

In [None]:
# Collect the items that are empty or whitespace-only and display a sample
# (every 10th item from index 10 up to 100).
removed = [item for item in preprocessed if not item.strip()]
removed[10:100:10]

[' ', ' ', '', ' ', ' ', ' ', ' ', ' ', ' ']

In [None]:
# Keep only items with visible content, stripped of surrounding whitespace.
# NOTE(review): this rebinds `preprocessed`; the cell that builds `removed`
# only finds anything if it is run before this one.
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [None]:
# Unique tokens in sorted order; the vocabulary size is the number of them.
all_words = sorted(set(preprocessed))
vocab_size = len(all_words)

print(vocab_size)

1130


In [None]:
# token -> integer id, where the id is the token's position in the sorted list.
vocab = {token: token_id for token_id, token in enumerate(all_words)}

In [None]:
# Reverse lookup: integer id -> token.
dec = {token_id: token for token, token_id in vocab.items()}

In [None]:
# Show the first 11 (token, id) pairs of the vocabulary.
for entry in list(vocab.items())[:11]:
    print(entry)

('!', 0)
('"', 1)
("'", 2)
('(', 3)
(')', 4)
(',', 5)
('--', 6)
('.', 7)
(':', 8)
(';', 9)
('?', 10)


In [None]:
# Ids of every vocabulary token, in sorted-token order.
ids = [vocab[word] for word in all_words]

In [None]:
# Decode three hand-picked ids back to tokens and join them with spaces.
words = ' '.join(dec[token_id] for token_id in (156, 22, 34))
words

'an But Florence'

In [None]:
class tokenizer:
  """Word-level tokenizer backed by a fixed token -> id vocabulary.

  Text is split on punctuation, double dashes and whitespace; every
  resulting token is mapped to its integer id. Tokens that are not in the
  vocabulary are mapped to the id of the "<|unk|>" fallback token.

  Attributes:
    t_to_int: dict mapping token string -> integer id.
    int_to_t: dict mapping integer id -> token string.
  """

  def __init__(self,vocab):
    """Build the lookup tables.

    Args:
      vocab: dict mapping token string -> integer id. It is copied, so the
        caller's dict is never modified.
    """
    self.t_to_int = dict(vocab)
    # encoder() falls back to "<|unk|>" for unknown words, so that token must
    # always have an id; otherwise the fallback itself raises KeyError.
    if "<|unk|>" not in self.t_to_int:
      # Next free id; default=-1 makes an empty vocabulary start at 0.
      self.t_to_int["<|unk|>"] = max(self.t_to_int.values(), default=-1) + 1
    self.int_to_t = {i:j for j,i in self.t_to_int.items()}

  def encoder(self,x):
    """Convert a string into a list of token ids.

    Args:
      x: text to tokenize.

    Returns:
      List of integer ids, one per non-empty token; out-of-vocabulary
      tokens are represented by the "<|unk|>" id.
    """
    pieces = re.split(r'([,.:;?_!"()\']|--|\s)', x)
    unk_id = self.t_to_int["<|unk|>"]
    stripped = (piece.strip() for piece in pieces)
    # Empty strings come from adjacent delimiters and whitespace; skip them.
    return [self.t_to_int.get(tok, unk_id) for tok in stripped if tok]

  def decoder(self,x):
    """Convert a sequence of token ids back into text.

    Args:
      x: iterable of integer ids.

    Returns:
      The tokens joined by single spaces, with the whitespace in front of
      , . ? ! " ( ) ' removed.

    Raises:
      KeyError: if an id is not present in the vocabulary.
    """
    text = " ".join([self.int_to_t[i] for i in x])
    text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
    return text

In [None]:
# Instantiate the tokenizer with the token -> id vocabulary built above.
token = tokenizer(vocab)

In [None]:
# Encode a sample sentence; note this rebinds `ids` from the earlier cell.
ids = token.encoder('what the are you talking about my')

In [None]:
# Round-trip check: decoding the ids should give back the sample sentence.
token.decoder(ids)

'what the are you talking about my'

In [4]:
import tiktoken
import torch

In [42]:
class GPTDataLoader(torch.utils.data.Dataset):
  """Map-style dataset of sliding-window (input, target) token-id pairs.

  The text is tokenized once. A window of `context_length` ids is then moved
  over the id sequence in steps of `stride`; for each position the input is
  the window itself and the target is the same window shifted right by one
  token.

  Despite its name this is a dataset, not a loader: it subclasses
  `torch.utils.data.Dataset` so it can be passed to a `DataLoader`.

  Args:
    text: raw string to tokenize.
    tokenizer: object exposing `encode(text, allowed_special=...)` that
      returns a list of integer ids.
    context_length: number of tokens per input/target window.
    stride: step between the start positions of consecutive windows.

  Attributes:
    inp: list of 1-D tensors holding the input windows.
    out: list of 1-D tensors holding the shifted target windows.
  """

  def __init__(self,text,tokenizer,context_length,stride):
    self.inp = []
    self.out = []

    token_id = tokenizer.encode(text,allowed_special={"<|endoftext|>"})

    # Stop early enough that the shifted target window is always full length;
    # texts with at most `context_length` tokens therefore yield no samples.
    for start in range(0,len(token_id)-context_length,stride):
      stop = start + context_length
      self.inp.append(torch.tensor(token_id[start:stop]))
      self.out.append(torch.tensor(token_id[start+1:stop+1]))

  def __len__(self):
    """Number of (input, target) windows."""
    return len(self.inp)

  def __getitem__(self,idx):
    """Return the (input, target) tensor pair at position `idx`."""
    return self.inp[idx], self.out[idx]

In [45]:
from torch.utils.data import Dataset, DataLoader

def dataloader(text,batch,context_length,stride,shuffle=True, drop_last=True, num_workers=0):
  """Build a DataLoader of sliding-window samples over `text`.

  The text is tokenized with tiktoken's 'gpt2' encoding and wrapped in a
  GPTDataLoader dataset, which is then handed to torch's DataLoader.

  Args:
    text: raw string to tokenize.
    batch: batch size passed to the DataLoader.
    context_length: number of tokens per window.
    stride: step between consecutive window starts.
    shuffle: forwarded to DataLoader.
    drop_last: forwarded to DataLoader.
    num_workers: forwarded to DataLoader.

  Returns:
    The configured DataLoader.
  """
  gpt2_encoding = tiktoken.get_encoding('gpt2')
  windows = GPTDataLoader(text, gpt2_encoding, context_length, stride)

  loader_options = dict(
    batch_size=batch,
    shuffle=shuffle,
    drop_last=drop_last,
    num_workers=num_workers,
  )
  return DataLoader(windows, **loader_options)

In [49]:
# Hyperparameters for the embedding demo.
batch = 8
# NOTE(review): this rebinds `vocab`, which earlier held the token -> id dict
# passed to `tokenizer(vocab)`; re-running those cells after this one will
# fail. Consider a distinct name for the vocabulary size.
vocab = 50257 #depends on the dataset that you have
context_length = 4
output_dim = 128

# Token-id lookup table: one `output_dim`-sized vector per vocabulary entry.
embedding = torch.nn.Embedding(vocab,output_dim)
# Position lookup table: one `output_dim`-sized vector per window position.
pos = torch.nn.Embedding(context_length,output_dim)

# stride == context_length, so consecutive windows start context_length tokens apart.
load = dataloader(raw_text,batch,context_length,stride=context_length)

In [50]:
# Take only the first batch from the loader (the loop exits via `break`).
# NOTE(review): the loop variable rebinds `batch`, which previously held the
# integer batch size.
for batch in load:
  x,y = batch
  # Look up a vector for every token id in the input windows.
  token_embedding = embedding(x)
  # One vector per position 0..context_length-1.
  pos_embedding = pos(torch.arange(context_length))

  # The position vectors are broadcast over the batch dimension when added.
  input_embeddings = token_embedding + pos_embedding
  break

In [51]:
# Shape of the combined embeddings for the first batch.
print(input_embeddings.shape)

torch.Size([8, 4, 128])


In [53]:
# Shape of the embeddings for the first sample in the batch.
input_embeddings[0].shape

torch.Size([4, 128])