In [4]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
import pandas as pd


  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Load the dataset
dataset = load_dataset("roneneldan/TinyStories")


Repo card metadata block was not found. Setting CardData to empty.


In [None]:
dataset_train = pd.DataFrame(dataset['train'])

In [24]:
dataset_train

Unnamed: 0,text
0,"One day, a little girl named Lily found a need..."
1,"Once upon a time, there was a little car named..."
2,"One day, a little fish named Fin was swimming ..."
3,"Once upon a time, in a land full of trees, the..."
4,"Once upon a time, there was a little girl name..."
...,...
2119714,"Once upon a time, in a small town, there lived..."
2119715,"Once upon a time, there was a little boy named..."
2119716,"Once upon a time, there was a big tree. Under ..."
2119717,"Once upon a time, there was a little girl name..."


In [43]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import AutoTokenizer
from datasets import load_dataset
import numpy as np
import pandas as pd
#from transformers import pad_sequences

In [44]:
class StoryDataset(Dataset):
    def __init__(self, stories, tokenizer, max_length=128):
        self.stories = stories
        self.tokenizer = tokenizer
        self.max_length = max_length
        
    def __len__(self):
        return len(self.stories)
    
    def __getitem__(self, idx):
        story = self.stories[idx]['text']  # Access the 'text' value of the story dictionary
        input_ids = self.tokenizer("<sos>" + story, max_length=self.max_length, padding='max_length', truncation=True)['input_ids']
        label_ids = self.tokenizer(story + "</sos>", max_length=self.max_length, padding='max_length', truncation=True)['input_ids']
        
        return torch.tensor(input_ids), torch.tensor(label_ids)

In [45]:
# Get the dataset from Hugging Face
dataset = load_dataset('roneneldan/TinyStories')

# Split the dataset into training and validation splits
train_dataset = dataset['train']
val_dataset = dataset['validation']

# Tokenize the stories using a pre-trained tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Create the story datasets
train_story_dataset = StoryDataset(train_dataset, tokenizer)
val_story_dataset = StoryDataset(val_dataset, tokenizer)

# Create the data loaders
train_dataloader = DataLoader(train_story_dataset, batch_size=2, shuffle=True)
val_dataloader = DataLoader(val_story_dataset, batch_size=2, shuffle=False)

Repo card metadata block was not found. Setting CardData to empty.


In [46]:
train_dataloader

<torch.utils.data.dataloader.DataLoader at 0x1c92053a0>

In [47]:
import torch.nn as nn
import torch.nn.functional as F

In [48]:
class TokenEmbedding(nn.Module):
    def __init__(self, vocab_size, d_model):
        super(TokenEmbedding, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.d_model = d_model
        
    def forward(self, x):
        return self.embedding(x) * torch.sqrt(torch.tensor(self.d_model, dtype=torch.float32))

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, max_len=128):
        super(PositionalEncoding, self).__init__()
        self.d_model = d_model
        self.max_len = max_len
        self.positional_encoding = self.generate_positional_encoding()
        
    def generate_positional_encoding(self):
        pe = torch.zeros(self.max_len, self.d_model)
        position = torch.arange(0, self.max_len, dtype=torch.float32).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, self.d_model, 2).float() * (-torch.log(torch.tensor(10000.0)) / self.d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        return pe.unsqueeze(0)
        
    def forward(self, x):
        return x + self.positional_encoding[:, :x.size(1), :]

In [49]:
class MultiHeadAttention(nn.Module):
    def __init__(self, d_model, num_heads, head_dim):
        super(MultiHeadAttention, self).__init__()
        self.d_model = d_model
        self.num_heads = num_heads
        self.head_dim = head_dim
        assert d_model % self.num_heads == 0, "d_model must be divisible by num_heads"

        self.head_dim = d_model // self.num_heads
        self.head_dim = d_model // num_heads
        
        self.query = nn.Linear(d_model, d_model)
        self.key = nn.Linear(d_model, d_model)
        self.value = nn.Linear(d_model, d_model)
        self.fc = nn.Linear(d_model, d_model)
        
    def forward(self, query, key, value, mask=None):
        batch_size = query.size(0)
        
        query = self.query(query)
        key = self.key(key)
        value = self.value(value)
        
        query = query.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

        Reshape to (batch_size, num_heads, seq_len, head_dim)
        query = query.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        key = key.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        value = value.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)
        
        scores = torch.matmul(query, key.transpose(-2, -1)) / torch.sqrt(torch.tensor(self.head_dim, dtype=torch.float32))
        
        if mask is not None:
            scores = scores.masked_fill(mask == 0, float('-inf'))
        
        attention = torch.softmax(scores, dim=-1)
        weighted_sum = torch.matmul(attention, value)
        
        weighted_sum = weighted_sum.transpose(1, 2).contiguous().view(batch_size, -1, self.d_model)
        output = self.fc(weighted_sum)
        
        return output

class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x
        

In [50]:
class FeedForward(nn.Module):
    def __init__(self, d_model, d_ff):
        super(FeedForward, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        
    def forward(self, x):
        x = F.relu(self.fc1(x))
        x = self.fc2(x)
        return x

class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff):
        super(DecoderLayer, self).__init__()
        self.self_attention = MultiHeadAttention(d_model, num_heads, d_model // num_heads)
        self.feed_forward = FeedForward(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        
    def forward(self, x):
        self_attention_output = self.self_attention(x, x, x)
        self_attention_output = self.norm1(x + self_attention_output)
        
        feed_forward_output = self.feed_forward(self_attention_output)
        output = self.norm3(self_attention_output + feed_forward_output)
        
        return output

class Decoder(nn.Module):
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff):
        super(Decoder, self).__init__()
        self.token_embedding = TokenEmbedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoding(d_model)
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff) for _ in range(num_layers)])
        self.fc = nn.Linear(d_model, vocab_size)
        
    def forward(self, x):
        x = self.token_embedding(x)
        x = self.positional_encoding(x)
        
        for layer in self.layers:
            x = layer(x)
        
        output = self.fc(x)
        return output
    

In [51]:
# Define the hyperparameters for the Transformer model
vocab_size = tokenizer.vocab_size
d_model = 100
num_layers = 6
num_heads = 8
d_ff = 128   

In [52]:
print(vocab_size)

30522


In [53]:
model = Decoder(vocab_size, d_model, num_layers, num_heads, d_ff)

# Define the optimizer and loss function
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
criterion = nn.CrossEntropyLoss()


In [54]:
'''for epoch in range(num_epochs):
    total_loss = 0
    
    for batch in train_dataloader:
        input_ids, label_ids = batch
        
        # Forward pass
        output = model(input_ids)
        
        # Reshape the output and label_ids to match the loss function requirements
        output = output.view(-1, vocab_size)
        label_ids = label_ids.view(-1)
        
        # Calculate the loss
        loss = criterion(output, label_ids)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
    
    # Print the average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")
    '''

'for epoch in range(num_epochs):\n    total_loss = 0\n    \n    for batch in train_dataloader:\n        input_ids, label_ids = batch\n        \n        # Forward pass\n        output = model(input_ids)\n        \n        # Reshape the output and label_ids to match the loss function requirements\n        output = output.view(-1, vocab_size)\n        label_ids = label_ids.view(-1)\n        \n        # Calculate the loss\n        loss = criterion(output, label_ids)\n        \n        # Backward pass and optimization\n        optimizer.zero_grad()\n        loss.backward()\n        optimizer.step()\n        \n        total_loss += loss.item()\n    \n    # Print the average loss for the epoch\n    avg_loss = total_loss / len(train_dataloader)\n    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")\n    '

In [None]:
#pip install tqdm

In [55]:
# Training loop
num_epochs = 10

In [64]:
from tqdm import tqdm

for epoch in range(num_epochs):
    total_loss = 0
    # Assume that num_heads is 8
 
    
    # Wrap your dataloader with tqdm for a progress bar
    train_dataloader_progress = tqdm(train_dataloader, desc=f"Epoch {epoch+1}/{num_epochs}", unit="batch")
    
    for batch in train_dataloader_progress:
        # Define batch_size
        batch_size = 32  
        
        input_ids, label_ids = batch
        num_heads = 8
        

         # Make sure that the sequence length is divisible by num_heads
        sequence_length = input_ids.shape[1] # This should be divisible by num_heads
            
            
        # Check if the sequence length is divisible by num_heads
        if sequence_length % num_heads != 0:
            # If not, trim the input_ids tensor to the largest divisor of sequence_length
            max_divisor = sequence_length - (sequence_length % num_heads)
            input_ids = input_ids[:, :max_divisor]

    
        # Forward pass
        output = model(input_ids)
        
        # Reshape the output and label_ids to match the loss function requirements
        output = output.view(-1, vocab_size)
        label_ids = label_ids.view(-1)
        
        # Calculate the loss
        loss = criterion(output, label_ids)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        total_loss += loss.item()
        
        # Update the progress bar
        train_dataloader_progress.set_postfix({'loss': total_loss / (batch_idx+1)})
    
    # Print the average loss for the epoch
    avg_loss = total_loss / len(train_dataloader)
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {avg_loss:.4f}")

Epoch 1/10:   0%|                                | 0/1059860 [00:00<?, ?batch/s]


RuntimeError: shape '[2, -1, 8, 12]' is invalid for input of size 25600

In [61]:
# input size to the MultiHeadAttention layer is (batch_size, seq_len, embed_size)

# Calculate the total number of elements in the `query` tensor
total_elements = query.nelement()

# Calculate the expected number of elements after reshaping
expected_elements = 2 * self.num_heads * self.head_dim * -1

# Check if the total number of elements matches the expected number of elements
assert total_elements == expected_elements, f"Number of elements does not match. Expected {expected_elements}, but got {total_elements}."

# If the assertion passes, proceed with the reshaping operation
query = query.view(batch_size, -1, self.num_heads, self.head_dim).transpose(1, 2)

NameError: name 'query' is not defined

In [None]:
def infere(self, starting_word, max_length, sp_model):
        device = next(self.parameters()).device  # Get the device of the model
        
        # Initialize hidden states
        hidden = self.init_hidden(1)  # Assume batch size of 1 for inference
        
        # Convert the starting word to its index in the vocabulary
        #starting_index = vocab[starting_word]
        starting_index = sp_model.encode_as_ids(starting_word)
        print(starting_index)
        # Initialize the input sequence with the starting word
        input_token = torch.tensor([[starting_index]]).to(device)

        generated_text = [starting_word]

        # Loop until the end token is generated or max length is reached
        for _ in range(max_length):
            # Forward pass through the decoder
            output, predicted_ids, hidden = self.forward(input_token, hidden, '')

            # Get the predicted token index (assuming batch size of 1)
            predicted_index = predicted_ids.item()

            # Convert the predicted token index to its corresponding word
            predicted_word = sp_model.decode(predicted_index)

            # Append the predicted word to the generated text
            generated_text.append(predicted_word)

            # Check if the end token is generated
            if predicted_word == '<end>':
                break

            # Prepare the input for the next iteration
            input_token = torch.tensor([[predicted_index]]).to(device)

        # Combine the generated words into a single string
        generated_text = ' '.join(generated_text)

        return generated_text

In [None]:
# Get the vocabulary
vocab_list = [sp_model.id_to_piece(i) for i in range(vocab_size)]
# Create a vocabulary dictionary mapping words to indices
vocab = {idx: word for idx, word in enumerate(vocab_list)}