# Working with text data

26 05 25

## 1. Embeddings

Convert a string to vector

## 2. Tokenising text

Take a string an convert it into substrings

In [1]:
# load some text from a short story "The Verdict"
with open('the_verdict.txt', 'r') as f:
    story_text = f.read().replace('\n', ' ')

In [2]:
# hwo many characters?
len(story_text)

20419

In [3]:
# inspect some of the text
story_text[:100]

'THE VERDICT  June 1908 I had always thought Jack Gisburn rather a cheap genius--though a good fellow'

In [4]:
# basic tokeniser
import re

text = "Hello, world. This, is a test"
result = re.split(r'(\s)', text)
result

['Hello,', ' ', 'world.', ' ', 'This,', ' ', 'is', ' ', 'a', ' ', 'test']

In [5]:
# slight improvement: separate out punctuation
result = re.split(r'([.,]|\s)', text)

# remove the white spaces from the list of tokens
result = [token for token in result if token.strip()]
result

['Hello', ',', 'world', '.', 'This', ',', 'is', 'a', 'test']

In [6]:
# another improvement: handle other types of punctuation (e.g., question and exclamation marks)
text = "Hello, world. Is this-- a test?"
result = re.split(r'([,.:;?_!"()\']|--|\s)', text)
result = [token for token in result if token.strip()]
result

['Hello', ',', 'world', '.', 'Is', 'this', '--', 'a', 'test', '?']

In [7]:
# apply the tokeniser to the short story
story_tokens = re.split(r'([,.:;?_!"()\']|--|\s)', story_text)
story_tokens = [token.strip() for token in story_tokens if token.strip()]
print(len(story_tokens))

4667


In [8]:
print(story_tokens[:30])

['THE', 'VERDICT', 'June', '1908', 'I', 'had', 'always', 'thought', 'Jack', 'Gisburn', 'rather', 'a', 'cheap', 'genius', '--', 'though', 'a', 'good', 'fellow', 'enough', '--', 'so', 'it', 'was', 'no', 'great', 'surprise', 'to', 'me', 'to']


### 2.2. Convert tokens to ids

Easier to work with numbers that strings -> convert each of the tokens to an integer

In [9]:
all_words = sorted(set(story_tokens))
vocab_size = len(all_words)
print(vocab_size)

1148


In [10]:
vocab = {token: integer for integer, token in enumerate(all_words)}

vocab

{'!': 0,
 '"': 1,
 "'": 2,
 '(': 3,
 ')': 4,
 ',': 5,
 '--': 6,
 '.': 7,
 '1908': 8,
 ':': 9,
 ';': 10,
 '?': 11,
 'A': 12,
 'AM': 13,
 'Ah': 14,
 'Among': 15,
 'And': 16,
 'Are': 17,
 'Arrt': 18,
 'As': 19,
 'At': 20,
 'Be': 21,
 'Begin': 22,
 'Burlington': 23,
 'But': 24,
 'By': 25,
 'Carlo': 26,
 'Chicago': 27,
 'Claude': 28,
 'Come': 29,
 'Croft': 30,
 'Destroyed': 31,
 'Devonshire': 32,
 'Don': 33,
 'Dubarry': 34,
 'Emperors': 35,
 'End': 36,
 'FELT': 37,
 'Florence': 38,
 'For': 39,
 'Gallery': 40,
 'Gideon': 41,
 'Gisburn': 42,
 'Gisburns': 43,
 'Grafton': 44,
 'Greek': 45,
 'Grindle': 46,
 'Grindles': 47,
 'HAD': 48,
 'HAS': 49,
 'HAVE': 50,
 'Had': 51,
 'Hang': 52,
 'Has': 53,
 'He': 54,
 'Her': 55,
 'Hermia': 56,
 'His': 57,
 'How': 58,
 'I': 59,
 'If': 60,
 'In': 61,
 'It': 62,
 'Jack': 63,
 'Jove': 64,
 'June': 65,
 'Just': 66,
 'KNOWN': 67,
 'Lord': 68,
 'MINE': 69,
 'Made': 70,
 'Miss': 71,
 'Money': 72,
 'Monte': 73,
 'Moon-dancers': 74,
 'Mr': 75,
 'Mrs': 76,
 'My': 77,

### 2.3. Complete tokeniser class

- init: take the vocab of tokens and create the mappings between tokens and integers
- encode: take a string, tokenise and then map to integers
- decode: take a list of ids and convert to string

In [11]:
class Tokeniser():
    def __init__(self, vocab):
        self.token_to_id = vocab
        self.id_to_token = {id_value: token for token, id_value in vocab.items()}
        
    def encode(self, text):
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [token.strip() for token in tokens if token.strip()]
        result = [self.token_to_id[token] for token in tokens]
        return result

    def decode(self, id_values):
        text = " ".join(self.id_to_token[id_value] for id_value in id_values)
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [12]:
# example usage: inti tokeniser
tokeniser = Tokeniser(vocab)

In [13]:
# tokenise
text = """
It's the last he painted, you know, Mrs. Gisburn said with pardonable pride."""
id_values = tokeniser.encode(text)
print(id_values)

# convert back to string
print(tokeniser.decode(id_values))

[62, 2, 868, 1006, 619, 550, 764, 5, 1144, 614, 5, 76, 7, 42, 869, 1126, 772, 812, 7]
It' s the last he painted, you know, Mrs. Gisburn said with pardonable pride.


In [14]:
# example with error: 'bogan' not in the vocab
text = """
Mrs. Gisburn says he is a bogan
"""
id_values = tokeniser.encode(text)

KeyError: 'bogan'

In [15]:
# handle OOV errors
# also include a token for endoftext, i.e., that separates two different texts

all_tokens = sorted(list(set(story_tokens)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}
print(len(vocab.items()))

1150


In [16]:
# check that these two items have been added
for i, item in enumerate(list(vocab.items())[-5:]):
    print(item)

('younger', 1145)
('your', 1146)
('yourself', 1147)
('<|endoftext|>', 1148)
('<|unk|>', 1149)


In [17]:
# create a new tokeniser
class Tokeniser2():
    def __init__(self, vocab):
        self.token_to_id = vocab
        self.id_to_token = {id_value: token for token, id_value in vocab.items()}
        
    def encode(self, text):
        tokens = re.split(r'([,.:;?_!"()\']|--|\s)', text)
        tokens = [token.strip() for token in tokens if token.strip()]
        tokens = [token if token in self.token_to_id else "<|unk|>" for token in tokens]
        result = [self.token_to_id[token] for token in tokens]
        return result

    def decode(self, id_values):
        text = " ".join(self.id_to_token[id_value] for id_value in id_values)
        text = re.sub(r'\s+([,.?!"()\'])', r'\1', text)
        return text

In [18]:
# does a bogan still cause an error?
# example usage: inti tokeniser
tokeniser = Tokeniser2(vocab)

In [19]:
all_tokens = sorted(list(set(story_tokens)))
all_tokens.extend(["<|endoftext|>", "<|unk|>"])
vocab = {token:integer for integer, token in enumerate(all_tokens)}
print(len(vocab.items()))

1150


In [20]:
text1 = "Are you a bogan?"
text2 = "No I am not, thank you very much"
text = " <|endoftext|> ".join([text1, text2])
text

'Are you a bogan? <|endoftext|> No I am not, thank you very much'

In [21]:
tokeniser = Tokeniser2(vocab)

In [22]:
print(tokeniser.encode(text))

[17, 1144, 132, 1149, 11, 1148, 81, 59, 1149, 729, 5, 1149, 1144, 1083, 709]


In [23]:
# decode
print(tokeniser.decode(tokeniser.encode(text)))

Are you a <|unk|>? <|endoftext|> No I <|unk|> not, <|unk|> you very much


### 2.4. Byte-pair encoding

- tiktoken: a library for implementing the byte-pair encoding algorithm

In [25]:
from importlib.metadata import version
import tiktoken

In [27]:
version("tiktoken")

'0.12.0'

In [28]:
tokeniser = tiktoken.get_encoding("gpt2")

In [30]:
text = (
    "Hello, do you like tea? <|endoftext|> In the sunlit terraces"
    "of someunknownPlace."
)
print(text)

Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.


In [36]:
id_values = tokeniser.encode(text, allowed_special={"<|endoftext|>"})
id_values

[15496,
 11,
 466,
 345,
 588,
 8887,
 30,
 220,
 50256,
 554,
 262,
 4252,
 18250,
 8812,
 2114,
 1659,
 617,
 34680,
 27271,
 13]

In [35]:
# convert back
strings = tokeniser.decode(id_values)
strings

'Hello, do you like tea? <|endoftext|> In the sunlit terracesof someunknownPlace.'

In [40]:
tokeniser.decode([2114])

'aces'

### 2.6. Data sampling

In [41]:
with open("the_verdict.txt", "r") as f:
    story_text = f.read()

In [42]:
enc_text = tokeniser.encode(story_text)
print(len(enc_text))

5411


In [48]:
enc_sample = enc_text[50:]

context_size = 4

x = enc_sample[:context_size]
y = enc_sample[1:context_size+1]

print(f"x: {x}")
print(f"y:      {y}")

x: [550, 5710, 465, 12036]
y:      [5710, 465, 12036, 11]


In [50]:
# create the samples for next work prediction
for n in range(1, context_size+1):
    context = enc_sample[:n]
    desired = enc_sample[n]
    print(context, "----->", desired)

[550] -----> 5710
[550, 5710] -----> 465
[550, 5710, 465] -----> 12036
[550, 5710, 465, 12036] -----> 11


In [52]:
for n in range(1, context_size+1):
    context = enc_sample[:n]
    desired = enc_sample[n]
    print(tokeniser.decode(context), "--->", tokeniser.decode([desired]))

 had --->  dropped
 had dropped --->  his
 had dropped his --->  painting
 had dropped his painting ---> ,


In [54]:
import torch
from torch.utils.data import Dataset, DataLoader

In [64]:
class GPTDatasetV1(Dataset):
    def __init__(self, text, tokeniser, max_len, stride):
        self.input_ids = []
        self.target_ids = []
        token_ids = tokeniser.encode(text)
        for n in range(0, len(token_ids) - max_len, stride):
            input_chunk = token_ids[n:n + max_len]
            target_chunk = token_ids[n + 1: n + max_len + 1]
            self.input_ids.append(torch.tensor(input_chunk))
            self.target_ids.append(torch.tensor(target_chunk))

    def __len__(self):
        return len(self.input_ids)

    def __getitem__(self, idx):
        return self.input_ids[idx], self.target_ids[idx]

In [65]:
def create_dataloader_v1(text, batch_size=4, max_len=256, stride=128, shuffle=True, drop_last=True, num_workers=0):
    tokeniser = tiktoken.get_encoding('gpt2')
    dataset = GPTDatasetV1(text, tokeniser, max_len, stride)
    dataloader = DataLoader(
        dataset, 
        batch_size=batch_size,
        shuffle=shuffle,
        drop_last=drop_last,
        num_workers=num_workers
    )
    return dataloader

In [70]:
with open('the_verdict.txt', 'r') as f:
    story_text = f.read()

dataloader = create_dataloader_v1(
    story_text, batch_size=1, max_len=4, stride=1, shuffle=False
)
data_iter = iter(dataloader)
first_batch = next(data_iter)
print(first_batch)

second_batch = next(data_iter)
print(second_batch)


[tensor([[10970, 33310,    35, 18379]]), tensor([[33310,    35, 18379,   220]])]
[tensor([[33310,    35, 18379,   220]]), tensor([[   35, 18379,   220,   198]])]


In [71]:
# increase the batch size
dataloader = create_dataloader_v1(
    story_text, batch_size=8, max_len=4, stride=4, shuffle=True
)

data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Inputs:\n", inputs)
print("\nTargets:\n", targets)

Inputs:
 tensor([[ 3521,   470,   804,   379],
        [  293,  1032,    88,    25],
        [  881,  2392,   284,  1645],
        [  326,   314,   423,  2982],
        [ 9074,    13, 46606,   536],
        [  257,  1808,   314,  1234],
        [   11,   530,   714,  1464],
        [ 2994,   284,   943, 17034]])

Targets:
 tensor([[  470,   804,   379,   198],
        [ 1032,    88,    25,   645],
        [ 2392,   284,  1645,    13],
        [  314,   423,  2982,   357],
        [   13, 46606,   536,  5469],
        [ 1808,   314,  1234,  2063],
        [  530,   714,  1464,   651],
        [  284,   943, 17034,   318]])


### 2.7 Embeddings

From ids to vectors

In [72]:
input_ids = torch.tensor([2, 3, 5, 1])
vocab_size = 6
output_dim = 3

In [74]:
torch.manual_seed(123)
embedding_layer = torch.nn.Embedding(vocab_size, output_dim)
print(embedding_layer.weight)

Parameter containing:
tensor([[ 0.3374, -0.1778, -0.1690],
        [ 0.9178,  1.5810,  1.3010],
        [ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-1.1589,  0.3255, -0.6315],
        [-2.8400, -0.7849, -1.4096]], requires_grad=True)


In [75]:
print(embedding_layer(input_ids))

tensor([[ 1.2753, -0.2010, -0.1606],
        [-0.4015,  0.9666, -1.1481],
        [-2.8400, -0.7849, -1.4096],
        [ 0.9178,  1.5810,  1.3010]], grad_fn=<EmbeddingBackward0>)


### 2.8 Word positions

Need to encode where the tokens are located within the sequence

In [76]:
vocab_size = 50257
output_dim = 256
token_embedding_layer = torch.nn.Embedding(vocab_size, output_dim)

In [80]:
max_length = 4
dataloader = create_dataloader_v1(
    story_text, batch_size=8, max_len=max_length, stride=max_length, shuffle=False
)

In [81]:
data_iter = iter(dataloader)
inputs, targets = next(data_iter)
print("Token IDS:\n", inputs)
print("\nInput shape: \n", inputs.shape)

Token IDS:
 tensor([[10970, 33310,    35, 18379],
        [  220,   198, 15749, 40417],
        [  198,    40,   550,  1464],
        [ 1807,  3619,   402,   271],
        [10899,  2138,   257,  7026],
        [15632,   438,  2016,   257],
        [  198, 11274,  5891,  1576],
        [  438,   568,   340,   373]])

Input shape: 
 torch.Size([8, 4])


In [82]:
# map to 256-dim vectors
token_embeddings = token_embedding_layer(inputs)
print(token_embeddings.shape)

torch.Size([8, 4, 256])


In [83]:
context_length = max_length

# the positional embeddings, in this instance just randomly initialised weights
pos_embedding_layer = torch.nn.Embedding(context_length, output_dim)
pos_embeddings = pos_embedding_layer(torch.arange(context_length))
print(pos_embeddings.shape)

torch.Size([4, 256])


In [84]:
# final embeddings are the token embeddings + position embeddings
input_embeddings = token_embeddings + pos_embeddings
print(input_embeddings.shape)

torch.Size([8, 4, 256])
