In [1]:
with open("verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()
print("Total number of character:", len(raw_text))
print(raw_text[:99])

Total number of character: 20479
I HAD always thought Jack Gisburn rather a cheap genius--though a good fellow enough--so it was no 


In [2]:
import re

preprocessed = re.split(r'([,.?_!"()\']|--|\s)', raw_text)
preprocessed = [item.strip() for item in preprocessed if item.strip()]
print(len(preprocessed))

4649


In [3]:
print(preprocessed[:30])

['I', 'HAD', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to', 'hear', 'that', ',', 'in']


In [4]:
#Converting tokens into token IDs
all_words = sorted(list(set(preprocessed)))
vocab_size = len(all_words)
print(vocab_size)


1159


In [5]:
vocab = {token:integer for integer, token in enumerate(all_words)}
# for i, item in enumerate(vocab.items()):
#     print(item)
#     if i > 50:
#         break

In [6]:
class SimpleTokeniserV1:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
    
    def encode(self, text):
        # Split text on special characters and whitespace
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
        preprocessed = [item.strip() for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids
    
    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'([,.?_!"()\']|--|\s)', r'\1',text)
        return text
        
# If text is: "Hello, world!"
# After re.split():
# preprocessed = ['Hello', ',', '', 'world', '!']

# After the cleaning loop:
# preprocessed = ['Hello', ',', 'world', '!']

In [7]:
tokeniser = SimpleTokeniserV1(vocab)
text = """It's the last he painted"""
ids = tokeniser.encode(text)
print(ids)

[58, 2, 872, 1013, 615, 541, 763]


In [8]:
print(tokeniser.decode(ids))

It ' s the last he painted


In [9]:
text = "Hello, do you like tea?"
x = tokeniser.encode(text)
print(x)

KeyError: 'Hello'

In [19]:
all_tokens = sorted(list(set(preprocessed)))
all_tokens.extend(["<|unk|>", "<|endoftext|>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}
print(len(vocab.items()))

1161


In [20]:
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1156)
('your', 1157)
('yourself', 1158)
('<|unk|>', 1159)
('<|endoftext|>', 1160)


In [21]:
class SimpleTokeniserV2:
    def __init__(self, vocab):
        self.str_to_int = vocab
        self.int_to_str = {i:s for s,i in vocab.items()}
        
    def encode(self, text):
#         def handle_token(token):
#             if not token:
#                 return None
#             token = token.strip()
#             return token if token in self.str_to_int else "<|unk|>"
        
        preprocessed = re.split(r'([,.?_!"()\']|--|\s)', text)
#         item.strip() for item in preprocessed if item.strip()
#         preprocessed = [tk for tk in (handle_token(item) for item in preprocessed) if tk]
        preprocessed  = [item.strip() if item.strip() in self.str_to_int else "<|unk|>" 
                           for item in preprocessed if item.strip()]
        ids = [self.str_to_int[s] for s in preprocessed]
        return ids

    def decode(self, ids):
        text = " ".join([self.int_to_str[i] for i in ids])
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text) 
        return text

In [22]:
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace."
text = " <|endoftext|> ".join((text1, text2))
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terraces of the palace.


In [23]:
tokeniser = SimpleTokeniserV2(vocab)
print(tokeniser.encode(text))

[1159, 5, 362, 1155, 642, 1000, 10, 1160, 57, 1013, 981, 1009, 738, 1013, 1159, 7]


In [24]:
print(tokeniser.decode(tokeniser.encode(text)))

<|unk|>, do you like tea? <|endoftext|> In the sunlit terraces of the <|unk|>.


In [32]:
import tiktoken
tokeniser = tiktoken.get_encoding("cl100k_base") #gpt2
text = "Hello, do you like tea? <|endoftext|> In the sunlit terra"
integers = tokeniser.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

[9906, 11, 656, 499, 1093, 15600, 30, 220, 100257, 763, 279, 7160, 32735, 60661]


In [33]:
# BPE tokenizers break down unknown words into subwords and individual
# characters. This way, a BPE tokenizer can parse any word and doesn't need to replace unknown
# words with special tokens, such as <|unk|>
strings = tokeniser.decode(integers)
print(strings)

Hello, do you like tea? <|endoftext|> In the sunlit terra


In [34]:
#creating input-target pairs
#first will tokenise the whole testing set

with open("verdict.txt", "r", encoding="utf-8") as f:
    raw_text  = f.read()

enc_text = tokeniser.encode(raw_text)
print(len(enc_text))

4943


In [35]:
#remove first 50 tokens for visual demonstration purposes
enc_sample = enc_text[50:]

In [43]:
# let x = input tokens, y = target tokens, where y=x[pos+1]
context_size = 4
x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]
print(f"x : {x}")
print(f"y :\t {y}")

x : [323, 9749, 5678, 304]
y :	 [9749, 5678, 304, 264]


In [45]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(context, "--->", desired)

[323] ---> 9749
[323, 9749] ---> 5678
[323, 9749, 5678] ---> 304
[323, 9749, 5678, 304] ---> 264


In [50]:
for i in range(1, context_size+1):
    context = enc_sample[:i]
    desired = enc_sample[i]
    print(tokeniser.decode(context) , "--->" ,tokeniser.decode([desired]))

 and --->  established
 and established --->  himself
 and established himself --->  in
 and established himself in --->  a


In [56]:
import torch
from torch.utils.data import Dataset, Dataloader

class GPTDatasetV1(Dataset):
    def __init__(self, txt, tokeniser, max_length, stride):
        self.tokeniser = tokeniser
        self.input_ids=  []
        self.target_ids =[]
        
        token_ids = tokeniser.encode(txt)
        
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i: i + max_length]
            target_chunk= token_ids[i+1: i + max_length+ 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))
            
        def __len__(self):
            return len(self.input_ids)
        
        def __getitem__(self, idx):
            return self.input_ids[idx], self.target_ids[idx]

ImportError: cannot import name 'Dataloader' from 'torch.utils.data' (D:\Users\armal\anaconda3\Lib\site-packages\torch\utils\data\__init__.py)

In [72]:
import torch.nn.functional as F

y = torch.tensor([1.0])
x1 = torch.tensor([1.1])
w1 = torch.tensor([2.2])
b = torch.tensor([0.0])
z = x1* w1 + b
a = torch.sigmoid(z)

a
loss = F.binary_cross_entropy(a, y)
loss
#p 216

tensor(0.0852)

In [94]:
import torch.nn.functional as F
from torch.autograd import grad

y = torch.tensor([0.1])
x1 = torch.tensor([1.1]) #input value
w1 = torch.tensor([2.2], requires_grad=True) #input weight
b = torch.tensor([0.0], requires_grad=True) #bias

z = x1 * w1 + b #net j
a = torch.sigmoid(z) #activation function 

# a = sigmoid((input value * input weight) + bias)

loss = F.binary_cross_entropy(a,y)


grad_L_w1 = grad(loss, w1 , retain_graph=True)
grad_L_b = grad(loss, b , retain_graph=True)

print(grad_L_w1)
print(grad_L_b)


(tensor([0.9002]),)
(tensor([0.8183]),)


In [96]:
loss.backward()
print(w1.grad)
print(b.grad)

tensor([0.9002])
tensor([0.8183])


In [98]:
class NeuralNetwork(torch.nn.module):
    def __init__(self, num_inputs, num_outputs):

SyntaxError: incomplete input (439850599.py, line 2)

In [None]:
df