In [125]:
import torch
from operator import itemgetter
from torch.utils.data import DataLoader
import random
import numpy as np

# 1. Data Preparation

## 1.1. Examining the Data

In [92]:
# Read the whole book into a single in-memory string, decoded as UTF-8.
# NOTE(review): this is the same path that the later save cell writes the cleaned
# text to, so on a fresh kernel this cell only works if that file already exists —
# confirm where the raw, unprocessed book is meant to be loaded from.
with open('lord-of-the-rings-processed.txt','r',encoding='utf-8') as f:
    text = f.read()

In [93]:
# Report the corpus size in characters.
print(f"length of the book - {len(text)} characters")

length of the book - 3729059 characters


In [95]:
# Peek at the first 100 characters of the text.
print(text[:100])

The Music of the Ainur There was Eru, the One, who in Arda is called lluvatar; and he made first the


## 1.2. Format Data

In [74]:
# Every distinct character that occurs in the text, in sorted order.
chars = sorted(set(text))
print(chars)

['\n', ' ', '!', '"', "'", '(', ')', ',', '-', '.', '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', ':', ';', '?', 'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P', 'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X', 'Y', 'Z', 'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z', '®', '—', '‘', '’', '“', '”']


In [75]:
# Characters treated as ordinary: digits, ASCII letters and basic punctuation.
common = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ();:.!?-,"
common_set = set(common)
# Anything else found in the text is listed for inspection.
special = [ch for ch in chars if ch not in common_set]
print(special)

['\n', ' ', '"', "'", '®', '—', '‘', '’', '“', '”']


In [76]:
# Flatten the text onto one line and normalise whitespace.
text = text.replace("\n", " ")
# A single replace("  ", " ") pass only shortens each run of spaces (three spaces
# would still leave two), so repeat until no double space remains.
while "  " in text:
    text = text.replace("  ", " ")
# NOTE(review): "®" is mapped to "u" — presumably an OCR/encoding artefact of an
# accented letter in the source text; confirm against the raw book.
text = text.replace("®", "u")

In [84]:
# Punctuation marks whose surrounding spaces need fixing.
# The four typographic quotes are written out explicitly instead of being picked
# from `special` by position: `special` is derived from the corpus, so its
# positions shift whenever the set of characters in the text changes.
# Order matters: the cells that follow select the opening quotes by index (0 and 2).
special_char = ['‘', '’', '“', '”']
special_char.extend([",",";",":","!","?"])
special_char

['‘', '’', '“', '”', ',', ';', ':', '!', '?']

In [88]:
# Opening quotation marks (positions 0 and 2): a space right after them is unwanted.
no_space_after = [special_char[pos] for pos in (0, 2)]
no_space_after

['‘', '“']

In [87]:
# All marks except the two opening quotes (positions 0 and 2): a space right
# before them is unwanted.
# Built with an ordered comprehension: unpacking a set difference into itemgetter
# makes the result order depend on set iteration order, which is not guaranteed.
no_space_before = [mark for pos, mark in enumerate(special_char) if pos not in (0, 2)]
no_space_before

['’', '”', ',', ';', ':', '!', '?']

In [89]:
# Drop the space that follows an opening mark, e.g. <' sss> becomes <'sss>.
for opening_mark in no_space_after:
    text = text.replace(f"{opening_mark} ", opening_mark)

# Drop the space that precedes a closing/punctuation mark, e.g. <s ,> becomes <s,>.
for trailing_mark in no_space_before:
    text = text.replace(f" {trailing_mark}", trailing_mark)


In [90]:
# Standardize quotation marks: every quote variant becomes a plain apostrophe.
for quote_mark in ('"', '‘', '’', '“', '”'):
    text = text.replace(quote_mark, "'")

In [91]:
# Persist the cleaned text. The encoding is given explicitly so the file is written
# as UTF-8 — the encoding the loading cell reads it with — instead of the platform
# default, which can differ and may not round-trip characters such as '—'.
with open("lord-of-the-rings-processed.txt","w",encoding="utf-8") as f:
    f.write(text)

## 1.3. Create Dictionary and Tokenize the Data

**tokenizer**

In [96]:
# Rebuild the character inventory from the current text.
chars = sorted(set(text))
common = r"0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ();:.!?-,"
common_set = set(common)
# Characters outside the ordinary set that are still present.
special = [ch for ch in chars if ch not in common_set]
print(special)

[' ', "'", '—']


In [98]:
# Two lookup tables over the sorted vocabulary: id -> character and character -> id.
decode_char = dict(enumerate(chars))
encode_char = {char: idx for idx, char in decode_char.items()}
print(len(encode_char))

74


In [99]:
def encode(string):
    """Translate each character of `string` into its integer id via `encode_char`.

    Returns a list with one id per character; a character that is not a key of
    `encode_char` raises KeyError.
    """
    return [encode_char[s] for s in string]


def decode(nums):
    """Translate an iterable of integer ids back into text via `decode_char`.

    Returns the concatenated string; an id that is not a key of `decode_char`
    raises KeyError.
    """
    return ''.join(decode_char[n] for n in nums)

In [100]:
# Sanity check: turn a short sample string into its list of character ids.
encode("This is good")

[40, 54, 55, 65, 0, 55, 65, 0, 53, 61, 61, 50]

In [102]:
# Sanity check: turn a list of ids back into characters.
decode([12, 35, 32, 73, 32])

'4OL—L'

## 1.4. Load data and construct batches + dataloaders

In [103]:
# Encode the whole text and hold the ids as an int64 (torch.long) tensor,
# one element per character.
data = torch.tensor(encode(text),dtype=torch.long)
data.shape

torch.Size([3729059])

In [107]:
# Show `length` characters starting at `start`, printed as four consecutive segments.
start = 10505
length = 500
quarter = length // 4
# Segment boundaries: three quarter-sized steps, then whatever remains up to `length`.
bounds = [start + quarter * k for k in range(4)] + [start + length]
for lo, hi in zip(bounds, bounds[1:]):
    print(decode(data[lo:hi].tolist()))

e filled with gladness; but because of the roaring of the sea they felt a great unquiet. And they observed the winds and the 
air, and the matters of which Arda was made, of iron and stone and silver and gold and many substances: but of all these wate
r they most greatly praised. And it is said by the Eldar that in water there lives yet the echo of the Music of the Ainur mor
e than in any substance else that is in this Earth; and many of the Children of lluvatar hearken still unsated to the voices 


**train test split**

In [109]:
# Sequential split: the first `ratio` share of the tokens trains, the tail validates.
ratio = 0.85
n = int(len(data) * ratio)
train_data, val_data = data[:n], data[n:]

In [113]:
# Number of tokens in each split.
len(train_data), len(val_data)

(3169700, 559359)

**dataset and dataloader**

`unsupervisedTokenizedTextDataset` takes tokenized text (at character level, word level, or any other level of tokenization), the desired `block_size` (input sequence length), and `predict_size` (model output length; default 1, i.e. the input sequence is used to predict the next token).

The dataset has length `len(full_txt) - (block_size + predict_size) + 1`.

For `__getitem__`, given an index `index`:

- `txt[index], txt[index+1], ..., txt[index+block_size-1]` is the input sequence, of length `block_size`;
- `txt[index+block_size], ..., txt[index+block_size+predict_size-1]` is the output sequence, of length `predict_size`.

In [110]:
class unsupervisedTokenizedTextDataset(torch.utils.data.Dataset):
    """Sliding-window dataset over one tokenized text for next-token prediction.

    Item ``idx`` is the pair
    ``(txt[idx : idx + block_size],
       txt[idx + block_size : idx + block_size + predict_size])``.

    Args:
        full_txt: sliceable token sequence; only ``len()`` and slicing are used.
        block_size: number of tokens in each input window.
        predict_size: number of tokens in each target window (default 1).
    """
    def __init__(self,full_txt,block_size,predict_size=1):
        self.txt = full_txt
        self.block_size = block_size
        self.predict_size = predict_size

    def __len__(self):
        """Number of complete (input, target) windows; 0 when the text is too short."""
        # Clamp at zero: len() must never be negative, which would happen for a text
        # shorter than one full window.
        return max(0, len(self.txt) - (self.block_size + self.predict_size) + 1)

    def __getitem__(self, idx):
        """Return the ``(input_sequence, output_sequence)`` pair starting at ``idx``.

        Negative indices count from the end of the dataset.

        Raises:
            IndexError: if ``idx`` is outside ``[-len(self), len(self))``.
        """
        size = len(self)
        position = idx + size if idx < 0 else idx
        # Slicing never fails on its own, so without this check an out-of-range index
        # would silently yield truncated windows and plain iteration would never stop.
        if not 0 <= position < size:
            raise IndexError(f"index {idx} is out of range for dataset of length {size}")
        target_start = position + self.block_size
        input_sequence = self.txt[position:target_start]
        output_sequence = self.txt[target_start:target_start + self.predict_size]
        return input_sequence, output_sequence

In [114]:
block_size = 100
output_size = 1

# Both splits are windowed with the same input/target sizes.
window_config = {"block_size": block_size, "predict_size": output_size}
train_dataset = unsupervisedTokenizedTextDataset(full_txt=train_data, **window_config)
val_dataset = unsupervisedTokenizedTextDataset(full_txt=val_data, **window_config)

In [115]:
# Number of windows available in each dataset.
len(train_dataset), len(val_dataset)

(3169600, 559259)

In [118]:
batch_size = 32
# Training batches; the sample order is reshuffled on each pass over the data.
train_dataloader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
)

In [None]:
# Validation loader that draws a small random subset, without replacement, per pass.
# Two fixes relative to the previous version of this cell:
#   * the RandomSampler keyword is `num_samples`, not `num_sample`;
#   * `shuffle=True` is removed, because DataLoader does not accept `shuffle`
#     together with an explicit `sampler` (the sampler already randomises the order).
# NOTE(review): num_samples=2 with batch_size=10 gives a single batch of two items
# per pass — confirm this is the intended amount of validation data. Older PyTorch
# releases may also reject num_samples when replacement=False; verify against the
# installed version.
val_sampler = torch.utils.data.RandomSampler(val_dataset,replacement=False,num_samples=2)
val_dataloader = DataLoader(dataset=val_dataset,batch_size=10,sampler=val_sampler)

In [123]:
# Pull one training batch and print its second example (index 1): input, then target.
s_input, s_output = next(iter(train_dataloader))
for part in (s_input, s_output):
    print(decode(part[1].tolist()))

e road. An hour long prepared approaches.' 'Don't leave me behind!' said Merry. 'I have not been of 
m
