In [33]:
import torch
from torch import nn
import tiktoken
from torch.utils.data import Dataset,DataLoader

In [14]:
# Read the entire training corpus into memory as one string.
with open('text.txt', mode='r', encoding="utf-8") as corpus_file:
    raw_text = corpus_file.read()
# Number of characters in the corpus (shown as the cell output).
len(raw_text)

24118

In [29]:
# Echo the loaded corpus so it can be inspected as the cell output.
raw_text

'As she lay in her berth, staring at the shadows overhead, the rush of the wheels was in her brain, driving her deeper and deeper into circles of wakeful lucidity. \nThe sleeping-car had sunk into its night-silence. \nThrough the wet window-pane she watched the sudden lights, the long stretches of hurrying blackness. \nNow and then she turned her head and looked through the opening in the hangings at her husband\'s curtains across the aisle.... \nShe wondered restlessly if he wanted anything and if she could hear him if he called. \nHis voice had grown very weak within the last months and it irritated him when she did not hear. \nThis irritability, this increasing childish petulance seemed to give expression to their imperceptible estrangement. \nLike two faces looking at one another through a sheet of glass they were close together, almost touching, but they could not hear or feel each other: the conductivity between them was broken. \nShe, at least, had this sense of separation, and 

In [27]:
# GPT-2 encoding provided by tiktoken.
tokenizer = tiktoken.get_encoding('gpt2')
# Encode the whole corpus into a sequence of integer token ids.
test = tokenizer.encode(raw_text)
# Number of tokens in the corpus (shown as the cell output).
len(test)

5824

In [28]:
# Round trip: decode the token ids back to text; the echoed string is
# expected to match the original raw_text.
res = tokenizer.decode(test)
res

'As she lay in her berth, staring at the shadows overhead, the rush of the wheels was in her brain, driving her deeper and deeper into circles of wakeful lucidity. \nThe sleeping-car had sunk into its night-silence. \nThrough the wet window-pane she watched the sudden lights, the long stretches of hurrying blackness. \nNow and then she turned her head and looked through the opening in the hangings at her husband\'s curtains across the aisle.... \nShe wondered restlessly if he wanted anything and if she could hear him if he called. \nHis voice had grown very weak within the last months and it irritated him when she did not hear. \nThis irritability, this increasing childish petulance seemed to give expression to their imperceptible estrangement. \nLike two faces looking at one another through a sheet of glass they were close together, almost touching, but they could not hear or feel each other: the conductivity between them was broken. \nShe, at least, had this sense of separation, and 

In [18]:
# Dataset
class Tdataset(Dataset):
    """Sliding-window next-token-prediction dataset over a tokenized text.

    The text is tokenized once. A window of ``max_length`` tokens is then slid
    over the token ids in steps of ``stride``. Every sample is a pair
    ``(inputs, targets)`` of 1-D tensors of length ``max_length`` where
    ``targets`` is the same window advanced by one token, i.e.
    ``targets[k]`` is the token that follows ``inputs[k]`` in the text.

    Args:
        txt: Raw text to tokenize.
        tokenizer: Object exposing ``encode(text, allowed_special=...)`` that
            returns a sequence of integer token ids (e.g. a tiktoken encoding).
        max_length: Number of tokens per sample; must be >= 1.
        stride: Distance between the starts of consecutive windows; must be >= 1.

    Raises:
        ValueError: If ``max_length`` or ``stride`` is less than 1.
    """

    def __init__(self,txt,tokenizer,max_length,stride):
        # Fail loudly on nonsensical window parameters instead of silently
        # building an empty dataset or surfacing an opaque range() error.
        if max_length < 1:
            raise ValueError(f"max_length must be >= 1, got {max_length}")
        if stride < 1:
            raise ValueError(f"stride must be >= 1, got {stride}")

        self.X = []
        self.y = []

        # Get the token ids of the text with the tokenizer.
        token_ids = tokenizer.encode(txt,allowed_special={"<|endoftext|>"})

        # The last valid start is len(token_ids) - max_length - 1 because the
        # target window needs one extra token past the end of the input window.
        for start in range(0,len(token_ids) - max_length,stride):
            stop = start + max_length
            self.X.append(torch.tensor(token_ids[start:stop]))
            self.y.append(torch.tensor(token_ids[start + 1:stop + 1]))

    def __len__(self):
        """Return the number of (inputs, targets) windows."""
        return len(self.X)

    def __getitem__(self, index):
        """Return the ``(inputs, targets)`` tensor pair at ``index``."""
        return self.X[index],self.y[index]
    
def loader(txt,max_length,stride,batch_size,
           shuffle=True,num_workers=0,drop_last = True):
    """Build a DataLoader of (inputs, targets) token windows over ``txt``.

    The text is tokenized with tiktoken's ``'gpt2'`` encoding and wrapped in a
    ``Tdataset`` using the given ``max_length`` and ``stride``.

    Args:
        txt: Raw text to turn into training windows.
        max_length: Number of tokens per window, forwarded to ``Tdataset``.
        stride: Step between window starts, forwarded to ``Tdataset``.
        batch_size: Number of windows per batch.
        shuffle: Forwarded to ``DataLoader``.
        num_workers: Forwarded to ``DataLoader``.
        drop_last: Forwarded to ``DataLoader``.

    Returns:
        A ``torch.utils.data.DataLoader`` over the windowed dataset.
    """
    # Tokenizer and dataset are only needed to construct the loader.
    windows = Tdataset(txt, tiktoken.get_encoding('gpt2'), max_length, stride)
    return DataLoader(
        windows,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
        drop_last=drop_last,
    )

In [19]:
# Windows of 4 tokens, starting 1 token apart, batched 4 at a time.
dataloader = loader(raw_text, max_length=4, stride=1, batch_size=4)
# Pull one (inputs, targets) batch out of the loader for inspection.
inputs = next(iter(dataloader))

In [20]:
# Split the batch into input token ids (x) and next-token targets (y).
x,y = inputs

In [21]:
# Show both tensors: each row of y is the matching row of x advanced by one token.
x,y

(tensor([[  257,  3753,    12,   343],
         [  655,   355,   345,   910],
         [  340,   373,   691,   257],
         [  220,   198,  3347, 23831]]),
 tensor([[ 3753,    12,   343,   799],
         [  355,   345,   910,    11],
         [  373,   691,   257,   582],
         [  198,  3347, 23831,   625]]))

In [31]:
# Vocabulary size of the GPT-2 encoding.
tokenizer.n_vocab

50257

In [36]:
# Token and positional embedding tables
vocab_size = 50257  # number of token ids in the GPT-2 vocabulary
out_dim = 256       # width of every embedding vector
max_length = 4      # context length, i.e. number of distinct positions

# One row per token id.
token_embed = nn.Embedding(num_embeddings=vocab_size, embedding_dim=out_dim)
# One row per position inside a window.
positional_embed = nn.Embedding(num_embeddings=max_length, embedding_dim=out_dim)

In [40]:
# Inspect the freshly created (untrained) token embedding table and its (vocab_size, out_dim) shape.
token_embed.weight,token_embed.weight.shape

(Parameter containing:
 tensor([[ 0.6243,  0.6100, -0.4773,  ...,  0.7421,  1.3973, -1.6096],
         [-1.2035, -1.2515,  0.3734,  ..., -0.7380,  1.7154, -1.0384],
         [ 0.7888, -0.0711, -0.0159,  ...,  1.4704, -1.6878, -0.6194],
         ...,
         [-0.9627,  0.2066, -0.6358,  ..., -0.6111, -0.4648, -0.6491],
         [ 0.3387,  0.2378, -0.3975,  ..., -1.0421, -1.3483,  0.3647],
         [-0.3772,  0.2641, -0.7347,  ...,  0.8126, -0.1892, -0.1416]],
        requires_grad=True),
 torch.Size([50257, 256]))

In [41]:
# Inspect the freshly created (untrained) positional embedding table and its (max_length, out_dim) shape.
positional_embed.weight,positional_embed.weight.shape

(Parameter containing:
 tensor([[ 0.6574,  1.3799,  1.6452,  ...,  0.3313,  0.6477, -0.7265],
         [ 0.0345, -2.5327, -0.3861,  ...,  0.9282, -0.3241, -0.6225],
         [ 0.5595, -0.6717,  0.2689,  ...,  0.0560,  0.5185,  0.3307],
         [ 0.1585, -0.0158,  0.7349,  ..., -1.0311,  1.3047,  0.6706]],
        requires_grad=True),
 torch.Size([4, 256]))

In [43]:
# Look up an out_dim-wide vector for every token id in the batch x: (batch, max_length, out_dim).
token_embed(x).shape

torch.Size([4, 4, 256])

In [45]:
# One embedding vector per position 0..max_length-1: (max_length, out_dim).
positional_embed(torch.arange(max_length)).shape

torch.Size([4, 256])

In [47]:
def get_input_embedding(txt,
                        max_length,
                        stride,
                        batch_size,
                        num_workers,
                        vocab_size=50257,
                        out_dim=256,
                        ):
    """Return token + positional embeddings for the first batch drawn from ``txt``.

    New, untrained ``nn.Embedding`` layers are created on every call, so the
    returned values differ from call to call. The batch itself comes from
    ``loader`` called with its default settings for the remaining arguments.

    Args:
        txt: Raw text to tokenize and window.
        max_length: Tokens per window; also the number of positional embeddings.
        stride: Step between the starts of consecutive windows.
        batch_size: Number of windows in the batch.
        num_workers: Forwarded to the data loader.
        vocab_size: Number of rows in the token embedding table. Defaults to
            50257, the value previously hard-coded here.
        out_dim: Width of each embedding vector. Defaults to 256.

    Returns:
        A float tensor of shape ``(batch_size, max_length, out_dim)``.

    Raises:
        ValueError: If the data loader yields no batch (``txt`` produces fewer
            than ``batch_size`` windows).
    """
    # Positional and token embedding tables.
    token_embed_layer = nn.Embedding(vocab_size,out_dim)
    positional_embed_layer = nn.Embedding(max_length,out_dim)

    dataloader = loader(txt,
                        max_length=max_length,
                        stride=stride,
                        batch_size=batch_size,
                        num_workers=num_workers,
                        )

    # Only the first batch is needed. Fetch it explicitly so that an empty
    # loader gives a clear error instead of an UnboundLocalError on return.
    first_batch = next(iter(dataloader), None)
    if first_batch is None:
        raise ValueError(
            "data loader produced no batches: the text yields fewer than "
            f"batch_size={batch_size} windows of max_length={max_length}"
        )
    X, _ = first_batch  # targets are not needed to build input embeddings

    token_embeddings = token_embed_layer(X)
    positional_embeddings = positional_embed_layer(torch.arange(max_length))
    # (max_length, out_dim) broadcasts over the batch dimension.
    return token_embeddings + positional_embeddings

In [49]:
# Input embeddings for one batch of 8 windows of 4 tokens each.
inpts_embed = get_input_embedding(
    raw_text,
    max_length=4,
    stride=1,
    batch_size=8,
    num_workers=0,
)

In [52]:
# Expected layout: (batch_size, max_length, out_dim).
inpts_embed.shape

torch.Size([8, 4, 256])