<a href="https://colab.research.google.com/github/ainsley-snell/Data_Mining_CS290/blob/main/data_attention.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
# Fetch the course repository; SILVERBLAZE.txt is later read from this checkout.
!git clone https://github.com/ainsley-snell/Data_Mining_CS290.git

Cloning into 'Data_Mining_CS290'...
remote: Enumerating objects: 40, done.[K
remote: Counting objects: 100% (40/40), done.[K
remote: Compressing objects: 100% (32/32), done.[K
remote: Total 40 (delta 11), reused 19 (delta 3), pack-reused 0 (from 0)[K
Receiving objects: 100% (40/40), 101.77 KiB | 3.63 MiB/s, done.
Resolving deltas: 100% (11/11), done.


In [4]:
import torch
import tiktoken

In [5]:
# Load tiktoken's "gpt2" encoding; it is used to encode the text and to decode token ids back to text.
tokenizer= tiktoken.get_encoding("gpt2")

In [6]:
# Read the whole story into memory. The encoding is pinned so the decoded text
# does not depend on the platform's default locale encoding.
with open("Data_Mining_CS290/SILVERBLAZE.txt", "r", encoding="utf-8") as f:
    raw_text = f.read()


In [7]:
# Encode the full story text into token ids with the tokenizer loaded above.
enc_text= tokenizer.encode(raw_text)

In [8]:
from torch.utils.data import Dataset, DataLoader


class GPTDatasetV1(Dataset):
    """Sliding-window dataset of (input, target) token-id pairs.

    The text is tokenized once. A window of ``max_length`` token ids is taken
    every ``stride`` tokens; the matching target is the same window shifted
    one token to the right, which is the layout needed for next-token
    prediction.
    """

    def __init__(self, txt, tokenizer, max_length, stride):
        """Tokenize ``txt`` and pre-compute every input/target window.

        Args:
            txt: Raw text to tokenize.
            tokenizer: Object exposing ``encode(text, allowed_special=...)``
                that returns a sequence of integer token ids.
            max_length: Number of tokens in each input (and target) window.
            stride: Step between the starts of consecutive windows. A stride
                smaller than ``max_length`` produces overlapping windows.

        Raises:
            AssertionError: If the text yields ``max_length`` tokens or fewer,
                so not even one input/target pair can be formed.
            ValueError: If ``max_length`` or ``stride`` is not positive.
        """
        self.input_ids = []
        self.target_ids = []

        # Tokenize the entire text
        token_ids = tokenizer.encode(txt, allowed_special={"<|endoftext|>"})
        assert len(token_ids) > max_length, "Number of tokenized inputs must at least be equal to max_length+1"

        # A non-positive window size yields empty or nonsensical slices, and a
        # negative stride makes the range below empty, so the dataset would
        # silently contain nothing. Fail loudly instead.
        if max_length < 1:
            raise ValueError(f"max_length must be a positive integer, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be a positive integer, got {stride}")

        # Use a sliding window to chunk the book into overlapping sequences of max_length
        for i in range(0, len(token_ids) - max_length, stride):
            input_chunk = token_ids[i:i + max_length]
            # Target is the input shifted by one token.
            target_chunk = token_ids[i + 1: i + max_length + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        """Return the number of input/target windows."""
        return len(self.input_ids)

    def __getitem__(self, idx):
        """Return the ``(input_ids, target_ids)`` tensor pair at ``idx``."""
        return self.input_ids[idx], self.target_ids[idx]

In [9]:
def create_dataloader_v1(txt, batch_size=4, max_length=256,
                         stride=128, shuffle=True, drop_last=True,
                         num_workers=0):
    """Build a DataLoader of sliding-window (input, target) batches from text.

    Args:
        txt: Raw text to tokenize and window.
        batch_size: Number of windows per batch.
        max_length: Tokens per window, forwarded to ``GPTDatasetV1``.
        stride: Step between window starts, forwarded to ``GPTDatasetV1``.
        shuffle: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.

    Returns:
        A ``DataLoader`` wrapping a ``GPTDatasetV1`` built from ``txt``.
    """
    # The text is tokenized with tiktoken's "gpt2" encoding.
    gpt2_encoding = tiktoken.get_encoding("gpt2")

    windowed_dataset = GPTDatasetV1(txt, gpt2_encoding, max_length, stride)

    # Gather the loader settings in one place, then hand them over together.
    loader_options = {
        "batch_size": batch_size,
        "shuffle": shuffle,
        "drop_last": drop_last,
        "num_workers": num_workers,
    }
    return DataLoader(windowed_dataset, **loader_options)

In [10]:
# Tiny windows (2 tokens, no overlap) to make the batch layout easy to read.
# shuffle defaults to True, so the windows in the batch are a random sample.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=4,
    max_length=2,
    stride=2,
)

# Only the first batch is needed for inspection, so stop after one iteration.
for batch in dataloader:
    x, y = batch
    break

x

tensor([[4373,  379],
        [ 290,  356],
        [5193,  267],
        [2722,  573]])

In [11]:
# Longer windows (8 tokens) with a stride of 3, so consecutive windows overlap.
# shuffle defaults to True, so the windows in the batch are a random sample.
dataloader = create_dataloader_v1(
    raw_text,
    batch_size=6,
    max_length=8,
    stride=3,
)

# Only the first batch is needed for inspection, so stop after one iteration.
for batch in dataloader:
    x, y = batch
    break

x

tensor([[  553,   531, 21636,  9847,    11, 44514,    11,   355],
        [21636,  9847,  1701,   198,   198,     1,    40,   423],
        [ 1243,   878,   514,    13,  1318,   373,   257,  3091],
        [   11,   284,  3977,  9626, 48209, 10695,    13,  9074],
        [  338,  2156,    11,   810,   484,   550, 43743,   287],
        [  262,  7468,   286,  1521,  1757, 15195,  6122, 16555]])

In [13]:
# `batch` is the (x, y) pair taken from the dataloader, so this loop runs twice:
# once for the inputs and once for the targets. Decoding the first sequence of
# each shows that the target is the input shifted by one token.
for ids_batch in batch:
    # Named `token_ids` rather than `list` so the builtin is not shadowed
    # for the rest of the session.
    token_ids = ids_batch.tolist()[0]
    print(token_ids)
    decoded = tokenizer.decode(token_ids)
    print(decoded)

[553, 531, 21636, 9847, 11, 44514, 11, 355]
," said Colonel Ross, bluntly, as
[531, 21636, 9847, 11, 44514, 11, 355, 616]
 said Colonel Ross, bluntly, as my


In [None]:
# Embedding weights are randomly initialised; seeding first makes the printed
# weights (and everything derived from them) identical on every run.
torch.manual_seed(123)

# Toy sizes: 8 token ids, each mapped to a 4-dimensional vector.
vocab_size = 8
output_dim = 4
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

print(embedding_layer.weight)

# The embedding layer maps token IDs to vectors.
# Embedding vectors are the numerical representations of words; they are used to capture contextual relationships between words.
# Once words are turned into vectors, the closer two vectors are in value, the more similar the words are. The LLM uses this proximity to properly associate words with each other.

In [None]:
# Take the weight matrix as a plain tensor outside the autograd graph.
# `.detach()` is the supported way to do this; the legacy `.data` attribute
# bypasses autograd's in-place modification checks.
embedding_vectors = embedding_layer.weight.detach()
print(embedding_vectors)

In [None]:
# Unnormalised attention scores: entry (i, j) is the dot product of embedding
# vectors i and j, so every vector is compared against every other one.
attention_scores = torch.matmul(embedding_vectors, embedding_vectors.T)
attention_scores

In [None]:

# Softmax along the last dimension turns each row of scores into positive
# weights that sum to 1.
attention_weights = attention_scores.softmax(dim=-1)
attention_weights

In [None]:
# Sanity check: a softmax-normalised row should sum to 1.
torch.sum(attention_weights[0])

In [None]:
# Each context vector is a weighted sum of all embedding vectors, weighted by
# the corresponding row of attention weights.
context_vectors = torch.matmul(attention_weights, embedding_vectors)
context_vectors