In [None]:
# Read the whole corpus file into memory as one string, then report its
# length and show the first 99 characters as a sanity check.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

print("total number of characters:", len(raw_text))
print(raw_text[:99])

In [None]:
#example 1)------------>
import re
# Split on every single whitespace character. Because the pattern is wrapped
# in a capture group, the separators themselves are kept in the output list.
text = "Hello, world. This, is a test."
result = re.compile(r'(\s)').split(text)
print(result)

In [None]:
# Additionally split on commas and periods; the capture group keeps each
# separator (punctuation or whitespace) as its own list item.
result=re.split(r'([,.]|\s)',text)
print(result)

In [None]:
# Keep only items that are non-empty after stripping, i.e. drop empty strings
# and whitespace-only separators produced by the split.
result =[item for item in result if item.strip()]
print(result)

In [None]:
# Tokenize on whitespace, on "--" and on a wider set of punctuation marks,
# discarding the empty / whitespace-only pieces in the same pass.
text = "hello, world. Is this-- a test?"
result = [
    piece
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text)
    if piece.strip()
]
print(result)
#example_end 1)------------------->

In [None]:
# Tokenize the full corpus with the same pattern as the example above and
# preview the first 30 tokens.
preprocessed = [
    piece
    for piece in re.split(r'([,.:;?_!"()\']|--|\s)', raw_text)
    if piece.strip()
]
print(preprocessed[:30])

In [None]:
# Total number of tokens in the corpus (duplicates included).
print(len(preprocessed))

In [None]:
# De-duplicate the tokens and put them in sorted order so that the
# token -> id assignment built from this list is deterministic.
all_words = list(set(preprocessed))
all_words.sort()
vocab_size = len(all_words)

# Duplicates were removed, so this cannot exceed len(preprocessed).
print(vocab_size)

In [None]:
# Map every unique token to its position in the sorted token list.
vocab ={token:integer for integer,token in enumerate(all_words)}

In [None]:
# Preview the first 51 (token, id) pairs, i.e. indices 0 through 50.
for i, item in zip(range(50 + 1), vocab.items()):
    print(item)

In [None]:
class SimpleTokenizerV1:
    """Word-level tokenizer backed by a fixed token -> id vocabulary.

    Text is split on whitespace, on ``--`` and on a small set of punctuation
    characters. Every resulting token must already be a key of the vocabulary.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Return the vocabulary ids of the tokens found in ``text``.

        Raises:
            KeyError: if a token is not present in the vocabulary.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            # Empty strings and bare whitespace separators are not tokens.
            if not piece.strip():
                continue
            ids.append(self.str_to_int[piece])
        return ids

    def decode(self, ids):
        """Turn ``ids`` back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the listed punctuation marks is removed.

        Raises:
            KeyError: if an id is not present in the vocabulary.
        """
        joined = " ".join(self.int_to_str[i] for i in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [None]:
#example 2)-----------> start
# Encode a sentence with the V1 tokenizer built from the corpus vocabulary.
tokenizer = SimpleTokenizerV1(vocab)
text = """"It's the last he painted, you know," Mrs. Gisburn said with pardonable pride."""
ids = tokenizer.encode(text)
print(ids)

In [None]:
# Decode the ids back into text; as the last expression in the cell, the
# result is displayed by the notebook.
tokenizer.decode(ids)
#example 2)---------------------> end

In [None]:
# Add the special context tokens to the vocabulary:
#   <|endoftext|>  marker placed between unrelated texts
#   <|unk|>        placeholder for words that are not in the vocabulary
# The marker must be spelled exactly "<|endoftext|>", because that is the
# string the sample texts below are joined with.
all_tokens = sorted(set(preprocessed))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])

vocab = {token: integer for integer, token in enumerate(all_tokens)}

In [None]:
# Number of entries in the vocabulary, special tokens included.
len(vocab.items())

In [None]:
# Version 2 of the tokenizer: words that are not in the vocabulary are replaced with <|unk|> so the encoder can still assign them an id.
class SimpleTokenizerV2:
    """Word-level tokenizer that maps out-of-vocabulary tokens to ``<|unk|>``.

    The vocabulary passed in is expected to contain a ``"<|unk|>"`` entry;
    without it, encoding a text that has an unknown token raises ``KeyError``.
    """

    def __init__(self, vocab):
        """Keep ``vocab`` (token -> id) and build the inverse id -> token map."""
        self.str_to_int = vocab
        self.int_to_str = dict(zip(vocab.values(), vocab.keys()))

    def encode(self, text):
        """Return the vocabulary ids of the tokens found in ``text``.

        Tokens that are not vocabulary keys are encoded as ``<|unk|>``.
        """
        ids = []
        for piece in re.split(r'([,.:;?_!"()\']|--|\s)', text):
            # Empty strings and bare whitespace separators are not tokens.
            if not piece.strip():
                continue
            known = piece if piece in self.str_to_int else "<|unk|>"
            ids.append(self.str_to_int[known])
        return ids

    def decode(self, ids):
        """Turn ``ids`` back into text.

        Tokens are joined with single spaces, then the whitespace in front of
        the listed punctuation marks is removed.
        """
        joined = " ".join(self.int_to_str[i] for i in ids)
        return re.sub(r'\s+([,.?!"()\'])', r'\1', joined)

In [None]:
tokenizer = SimpleTokenizerV2(vocab)
text1 = "Hello, do you like tea?"
text2 = "In the sunlit terraces of the palace"

# The separator needs a space on BOTH sides: the tokenizer only splits on
# whitespace and punctuation, so without the trailing space the marker would
# fuse with the next word into the single token "<|endoftext|>In".
text = " <|endoftext|> ".join((text1, text2))
print(text)

In [None]:
# Encode the joined sample; tokens that are not vocabulary keys come back as
# the id of <|unk|>.
tokenizer.encode(text)

In [None]:
# Round-trip the sample: unknown words show up as <|unk|> in the decoded text.
tokenizer.decode(tokenizer.encode(text))

In [None]:
import importlib
import importlib.metadata
import tiktoken

# Report the installed tiktoken version, read from the package metadata.
print("tiktoken version:", importlib.metadata.version("tiktoken"))

In [None]:
# From here on `tokenizer` is tiktoken's "gpt2" encoding and no longer a
# SimpleTokenizerV2 instance.
tokenizer=tiktoken.get_encoding("gpt2")

In [None]:
# Adjacent string literals are concatenated with nothing in between, so the
# first literal must end with a space to keep "terraces" and "of" as two words.
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces "
    "of someunknownPlace"
)
# allowed_special permits the <|endoftext|> marker in the input to be encoded
# as a special token (see the tiktoken Encoding.encode documentation).
integers = tokenizer.encode(text, allowed_special={"<|endoftext|>"})
print(integers)

In [None]:
# Decode the ids back into a string to check the round trip.
strings =tokenizer.decode(integers)
print(strings)

In [None]:
# Round-trip an arbitrary string through the tokenizer: print its ids first,
# then the text decoded from those ids.
integers=tokenizer.encode("Akwirw ier")
print(integers)

strings = tokenizer.decode(integers)
print(strings)

In [None]:
# Reload the corpus and encode it with the current `tokenizer`.
with open("the-verdict.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()

enc_text = tokenizer.encode(raw_text)
print(len(enc_text))  # number of token ids in the corpus

In [None]:
# Work with everything after the first 50 token ids.
enc_sample=enc_text[50:]
print(enc_sample)

In [None]:
# Number of token ids in one input window.
context_size = 4

# The target window is the input window shifted one position to the right:
# y[k] is the token that follows x[k], so each position has a single
# next-token target (not four tokens predicted at once).
x = enc_sample[0:context_size]
y = enc_sample[1:1 + context_size]
print(f"x: {x}")
print(f"y:      {y}")

In [None]:
# For growing prefixes of length 1..context_size, show the prefix (context)
# and the single token id that follows it (the prediction target).
for i in range(1,context_size+1):
    context=enc_sample[:i]
    desired=enc_sample[i]

    print(context,"----->",desired)

In [None]:
# Same context -> target pairs as above, decoded back to text. `desired` is a
# single id, so it is wrapped in a list before being passed to decode.
for i in range(1,context_size+1):
    context=enc_sample[:i]
    desired=enc_sample[i]

    print(tokenizer.decode(context),"----->",tokenizer.decode([desired]))

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader

class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id tensors.

    The text is encoded once. Windows of ``max_length`` token ids are then
    taken every ``stride`` positions; each target window is the matching
    input window shifted one token to the right.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Encode ``txt`` and pre-compute every input/target window.

        Args:
            txt: text to encode.
            tokenizer: object whose ``encode(txt, allowed_special=...)``
                returns a sequence of token ids.
            max_length: number of token ids per window.
            stride: distance between the starts of consecutive windows.
        """
        self.input_ids = []
        self.target_ids = []

        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})

        # Stop early enough that the shifted target window is still full length.
        last_start = len(token_ids) - max_length
        for start in range(0, last_start, stride):
            stop = start + max_length
            self.input_ids.append(torch.tensor(token_ids[start:stop]))
            self.target_ids.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Number of windows in the dataset."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [None]:
def create_dataloader_v1(
    txt,
    batch_size=4,
    max_length=256,
    stride=128,
    shuffle=True,
    drop_last=True,
    num_workers=0,
):
    """Build a DataLoader over sliding windows of GPT-2 token ids for ``txt``.

    Args:
        txt: text to tokenize with tiktoken's "gpt2" encoding.
        batch_size: forwarded to ``DataLoader``.
        max_length: number of token ids per window (see ``GPTDatasetV1``).
        stride: distance between the starts of consecutive windows.
        shuffle: forwarded to ``DataLoader``.
        drop_last: forwarded to ``DataLoader``.
        num_workers: forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` whose items are the ``(input_ids, target_ids)``
        pairs produced by ``GPTDatasetV1``.
    """
    gpt2_encoding = tiktoken.get_encoding("gpt2")
    windows = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers,
    )

In [None]:
# Reload the corpus so the dataloader cells below do not depend on `raw_text`
# surviving from earlier cells.
with open("the-verdict.txt","r",encoding="utf-8") as f:
    raw_text=f.read()

In [None]:
import torch
print("Pytorch version:", torch.__version__)

# batch_size=1 and stride=1: each batch is one window of 4 token ids, and
# consecutive windows start one token apart.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=1,
    max_length=4,
    stride=1,
    shuffle=False,
)

data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

In [None]:
# Next batch from the same iterator; with stride=1 its window starts one
# token after the first batch's window.
second_batch=next(data_iter)
print(second_batch)

In [None]:
# stride == max_length: consecutive windows do not overlap.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=4,
    stride=4,
    shuffle=False,
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

In [None]:
# Four example token ids for the small embedding demo.
input_ids=torch.tensor([2,3,5,1])

In [None]:
# Tiny embedding table for demonstration: 6 token ids, 3 dimensions each.
vocab_size = 6
output_dim = 3

# Seed first so the randomly initialised weights are reproducible.
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=output_dim
)

In [None]:
# Weight matrix of the embedding layer: one row per token id.
print(embedding_layer.weight)

In [None]:
# Look up the embedding vector for token id 3.
print(embedding_layer(torch.tensor([3])))

In [None]:
# Token-embedding layer with 50257 entries of 256 dimensions each
# (the positional embedding layer is created in a later cell).
vocab_size = 50257
output_dim = 256

token_embedding_layer = torch.nn.Embedding(
    num_embeddings=vocab_size, embedding_dim=output_dim
)
print(token_embedding_layer)

In [None]:
# Fetch one batch of 8 non-overlapping windows (stride == window length) to
# feed through the embedding layers.
max_length = 4
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=8,
    max_length=max_length,
    stride=max_length,
    shuffle=False,
)
data_iter = iter(dataloader)
inputs, targets = next(data_iter)

In [None]:
# Show the batch of token ids and its shape. The second label previously
# read "shpae"; the printed text is corrected to "shape".
print("Token IDs:\n", inputs)
print("\nInputs shape:\n", inputs.shape)

In [None]:
# Embed every token id in the batch; the result gains a trailing dimension of
# size output_dim.
token_embeddings=token_embedding_layer(inputs)
print(token_embeddings.shape)

In [None]:
# One learnable vector per position in the context window, with the same
# dimensionality as the token embeddings.
context_length=max_length
pos_embedding_layer=torch.nn.Embedding(context_length,output_dim)

In [None]:
# Look up the embeddings for positions 0 .. max_length-1.
pos_embeddings = pos_embedding_layer(torch.arange(max_length))
print(pos_embeddings.shape)

In [None]:
# Add position information to the token embeddings. pos_embeddings has no
# batch dimension, so the addition broadcasts it across the batch.
input_embeddings=token_embeddings+pos_embeddings
print(input_embeddings.shape)

In [None]:
#simplified self attention mechanism
import torch

# Six example tokens, each represented by a 3-dimensional embedding vector.
_token_rows = (
    (0.43, 0.15, 0.89),  # Your     (x^1)
    (0.55, 0.87, 0.66),  # journey  (x^2)
    (0.57, 0.85, 0.64),  # starts   (x^3)
    (0.22, 0.58, 0.33),  # with     (x^4)
    (0.77, 0.25, 0.10),  # one      (x^5)
    (0.05, 0.80, 0.55),  # step     (x^6)
)
inputs = torch.tensor(_token_rows)

In [None]:
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D

# Corresponding words
words = ['Your', 'journey', 'starts', 'with', 'one', 'step']

# Extract x, y, z coordinates (one array per embedding dimension)
x_coords = inputs[:, 0].numpy()
y_coords = inputs[:, 1].numpy()
z_coords = inputs[:, 2].numpy()

# Create 3D plot
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Plot each point and annotate with corresponding word.
# The loop variables are deliberately not called x / y: this loop runs at
# notebook scope, where those names would overwrite the x / y token windows
# defined in an earlier cell.
for px, py, pz, word in zip(x_coords, y_coords, z_coords, words):
    ax.scatter(px, py, pz)
    ax.text(px, py, pz, word, fontsize=10)

# Set labels for axes
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')

plt.title('3D Plot of Word Embeddings')
plt.show()

In [None]:
# Create 3D plot with vectors from origin to each point, using different colors
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')

# Define a list of colors for the vectors (one per word)
colors = ['r', 'g', 'b', 'c', 'm', 'y']

# Plot each vector with a different color and annotate with the corresponding word.
# The loop variables are deliberately not called x / y: this loop runs at
# notebook scope, where those names would overwrite the x / y token windows
# defined in an earlier cell.
for (px, py, pz, word, color) in zip(x_coords, y_coords, z_coords, words, colors):
    # Draw vector from origin to the point (px, py, pz) with specified color and smaller arrow length ratio
    ax.quiver(0, 0, 0, px, py, pz, color=color, arrow_length_ratio=0.05)
    ax.text(px, py, pz, word, fontsize=10, color=color)

# Set labels for axes
ax.set_xlabel('X')
ax.set_ylabel('Y')
ax.set_zlabel('Z')

# Set plot limits to keep arrows within the plot boundaries
ax.set_xlim([0, 1])
ax.set_ylim([0, 1])
ax.set_zlim([0, 1])

plt.title('3D Plot of Word Embeddings with Colored Vectors')
plt.show()

In [None]:
# Attention scores for the second token: the dot product of every input
# vector with the query vector (row 1 of `inputs`).
query=inputs[1]
attn_scores_2=torch.empty(inputs.shape[0])
for i,x_i in enumerate(inputs):
    attn_scores_2[i]=torch.dot(x_i,query)
print(attn_scores_2)

In [None]:
# Simplest normalisation: divide each score by the sum of all scores so the
# resulting weights add up to 1.
att_scpre_normal=attn_scores_2/attn_scores_2.sum()
print("att weights:",att_scpre_normal)
print("sum:",att_scpre_normal.sum())

In [None]:
def softmax_naive(x):
    """Softmax along dimension 0: exp(x) divided by the sum of exp(x) over dim 0.

    No max-subtraction is applied before exponentiating.
    """
    exps = torch.exp(x)
    return exps / exps.sum(dim=0)

# Normalise the scores of the second token with the hand-written softmax.
attn_weights_2_naive = softmax_naive(attn_scores_2)

print("Attention weights:", attn_weights_2_naive)
print("Sum:", attn_weights_2_naive.sum())

In [None]:
# Same normalisation using PyTorch's built-in softmax over dimension 0.
attn_weights_2 = torch.softmax(attn_scores_2, dim=0)
print("Attention weights:", attn_weights_2)
print("Sum:", attn_weights_2.sum())

In [None]:
# Context vector for the second token: the sum of all input vectors, each
# scaled by its attention weight.
query = inputs[1] # 2nd input token is the query

context_vec_2 = torch.zeros(query.shape)
for i,x_i in enumerate(inputs):
    context_vec_2 += attn_weights_2[i]*x_i

print(context_vec_2)

In [None]:
# Pairwise attention scores: entry [i, j] is the dot product of input i with
# input j. The matrix is sized from `inputs` rather than a hard-coded 6 x 6,
# so the cell keeps working if the number of input tokens changes.
attn_scores = torch.empty(inputs.shape[0], inputs.shape[0])

for i, x_i in enumerate(inputs):
    for j, x_j in enumerate(inputs):
        attn_scores[i, j] = torch.dot(x_i, x_j)

print(attn_scores)

In [None]:
# The same pairwise dot products computed with one matrix multiplication.
attn_scores = inputs @ inputs.T
print(attn_scores)

In [None]:
# Softmax over the last dimension, so the weights in each row sum to 1.
attn_weights = torch.softmax(attn_scores, dim=-1)
print(attn_weights)

In [None]:
# Sum the second row straight from the tensor instead of re-typing rounded
# values copied from printed output; the check then always reflects the
# actual weights. .item() keeps row_2_sum a plain Python float.
row_2_sum = attn_weights[1].sum().item()
print("Row 2 sum:", row_2_sum)
print("All row sums:", attn_weights.sum(dim=-1))

In [None]:
# Context vectors for all tokens at once: row i is the attention-weighted sum
# of the input vectors for token i.
all_context_vecs = attn_weights @ inputs
print(all_context_vecs)