Description: Implementation of a GPT-2 language model for text generation, training custom models, and fine-tuning.
A class representing a language dataset for the Bigram Language Model.
def __init__(self, path: str, train_val_split: int = 90)
Initialize the LanguageDataset with the provided text data.
- path (str): The file path to the text data.
- train_val_split (int, optional): The percentage split between training and validation data. Defaults to 90 (90% training, 10% validation).
def _load_dataset(self, path: str) -> str
Load the text data from the given file path.
- path (str): The file path to the text data.
- Returns: text (str): The loaded text data as a string.
def _encode(self, s: str) -> List[int]
Convert the text data into a list of numerical representations using character encoding.
- s (str): The input text data to be encoded.
- Returns: List[int]: A list of integer token indices representing the encoded text.
def _decode(self, l: List[int]) -> str
Convert a list of numerical representations back into the original text.
- l (List[int]): A list of integer token indices representing the encoded text.
- Returns: str: The decoded text as a string.
def _calculate_split(self, train_val_split: int) -> float
Calculate the split index for separating training and validation data.
- train_val_split (int): The percentage split between training and validation data.
- Returns: float: The split index as a float between 0 and 1, representing the percentage split.
@property
def text(self) -> str
Get the loaded text data as a string.
@property
def chars(self) -> List[str]
Get the list of unique characters present in the text data.
@property
def vocab_size(self) -> int
Get the size of the character vocabulary.
@property
def stoi(self) -> Dict[str, int]
Get a dictionary mapping characters to their corresponding integer token indices.
@property
def itos(self) -> Dict[int, str]
Get a dictionary mapping integer token indices to their corresponding characters.
@property
def train_data(self) -> torch.Tensor
Get the training data as a tensor containing numerical representations of the text.
@property
def val_data(self) -> torch.Tensor
Get the validation data as a tensor containing numerical representations of the text.
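For orientation, here is a minimal usage sketch of the dataset class. The file path is a placeholder, and access goes through the bigram module exactly as in the training examples further below; the comment on tensor shapes is an assumption.
# Hypothetical usage sketch of LanguageDataset via the bigram module
from GPT2LM import bigram as b

dataset = b.load_dataset('tiny_shakespeare.txt', train_val_split=90)  # placeholder path
print(dataset.vocab_size)           # number of unique characters in the text
tokens = dataset._encode("hello")   # list of integer token indices
print(dataset._decode(tokens))      # "hello"
print(dataset.train_data.shape)     # tensor of training token indices (assumed 1-D)
print(dataset.val_data.shape)       # tensor of validation token indices (assumed 1-D)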
A single head of self-attention used in the Transformer block.
def __init__(self, head_size, n_embd, block_size, dropout):
Initialize the Head with linear layers and a lower triangular mask.
- head_size (int): The size of the attention head.
- n_embd (int): The embedding dimension of the input.
- block_size (int): The maximum length of the input sequence.
- dropout (float): The dropout rate.
def forward(self, x):
Perform the forward pass of the attention head.
- x (torch.Tensor): The input tensor of shape (B, T, C), where B is the batch size, T is the sequence length, and C is the embedding dimension.
- Returns: torch.Tensor: The output tensor after the self-attention operation. It has the same shape as the input tensor (B, T, C).
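The head described above follows the standard causal (masked) scaled dot-product attention pattern. The sketch below is an assumed implementation, not necessarily the package's exact code; note that a single head of this kind returns (B, T, head_size), which matches the input shape when head_size equals the embedding dimension.
import torch
import torch.nn as nn
from torch.nn import functional as F

class Head(nn.Module):
    """Minimal sketch of one causal self-attention head (assumed implementation)."""
    def __init__(self, head_size, n_embd, block_size, dropout):
        super().__init__()
        self.key = nn.Linear(n_embd, head_size, bias=False)
        self.query = nn.Linear(n_embd, head_size, bias=False)
        self.value = nn.Linear(n_embd, head_size, bias=False)
        # Lower triangular mask so each position only attends to earlier positions
        self.register_buffer('tril', torch.tril(torch.ones(block_size, block_size)))
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        B, T, C = x.shape
        k = self.key(x)      # (B, T, head_size)
        q = self.query(x)    # (B, T, head_size)
        wei = q @ k.transpose(-2, -1) * k.shape[-1] ** -0.5   # (B, T, T) attention scores
        wei = wei.masked_fill(self.tril[:T, :T] == 0, float('-inf'))  # causal mask
        wei = F.softmax(wei, dim=-1)
        wei = self.dropout(wei)
        v = self.value(x)    # (B, T, head_size)
        return wei @ v       # weighted aggregation of values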
Multi-head self-attention module used in the Transformer block.
def __init__(self, num_heads, head_size, n_embd, block_size, dropout):
Initialize the MultiHeadAttention with attention heads and projection layer.
- num_heads (int): The number of attention heads.
- head_size (int): The size of each attention head.
- n_embd (int): The embedding dimension of the input.
- block_size (int): The maximum length of the input sequence.
- dropout (float): The dropout rate.
def forward(self, x):
Perform the forward pass of the multi-head self-attention module.
- x (torch.Tensor): The input tensor of shape (B, T, C), where B is the batch size, T is the sequence length, and C is the embedding dimension.
- Returns: torch.Tensor: The output tensor after applying the multi-head self-attention. It has the same shape as the input tensor (B, T, C).
Feed-forward neural network module used in the Transformer block.
def __init__(self, n_embd, dropout):
Initialize the FeedForward with linear layers and activation functions.
- n_embd (int): The embedding dimension of the input.
- dropout (float): The dropout rate.
def forward(self, x):
Perform the forward pass of the feed-forward neural network.
- x (torch.Tensor): The input tensor of shape (B, T, C), where B is the batch size, T is the sequence length, and C is the embedding dimension.
- Returns: torch.Tensor: The output tensor after the feed-forward operation. It has the same shape as the input tensor (B, T, C).
Transformer block module used in the BigramLanguageModel.
def __init__(self, n_embd, n_head, block_size, dropout):
Initialize the Transformer block with self-attention and feed-forward modules.
- n_embd (int): The embedding dimension of the input.
- n_head (int): The number of attention heads in the multi-head self-attention.
- block_size (int): The maximum length of the input sequence.
- dropout (float): The dropout rate.
def forward(self, x):
Perform the forward pass of the transformer block.
- x (torch.Tensor): The input tensor of shape (B, T, C), where B is the batch size, T is the sequence length, and C is the embedding dimension.
- Returns: torch.Tensor: The output tensor after applying the transformer block. It has the same shape as the input tensor (B, T, C).
def __init__(self, vocab_size, n_embd, n_head, n_layer, block_size, dropout)
- vocab_size (int): The size of the vocabulary, which determines the number of tokens in the language model.
- n_embd (int): The embedding dimension of the input.
- n_head (int): The number of attention heads in the multi-head self-attention.
- n_layer (int): The number of Transformer blocks in the model.
- block_size (int): The maximum length of the input sequence.
- dropout (float): The dropout rate for regularization.
def _build_block(self) -> Block
Creates a single Transformer block with specified embedding dimensions and attention heads.
def forward(self, idx: torch.Tensor, target: Optional[torch.Tensor] = None) -> Tuple[torch.Tensor, Optional[torch.Tensor]]
Performs the forward pass of the BigramLanguageModel.
- idx (torch.Tensor): The input tensor representing the token indices of shape (B, T), where B is the batch size and T is the sequence length.
- target (torch.Tensor, optional): The target tensor representing the token indices for computing the loss. It should have the same shape as idx. If None, no loss will be computed. (Default: None)
- Returns: Tuple[torch.Tensor, Optional[torch.Tensor]]: The logits tensor after the forward pass, of shape (B, T, vocab_size), and the loss tensor, computed using F.cross_entropy if a target is provided (otherwise None). The loss has the shape (B * T,) if target is not None.
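The shapes above follow from the usual way cross-entropy is applied to sequence models: the (B, T, vocab_size) logits and the (B, T) targets are flattened before the call to F.cross_entropy. A hedged sketch of that step (whether the per-position losses are kept or averaged depends on the reduction argument):
import torch
from torch.nn import functional as F

def loss_from_logits(logits, target):
    # logits: (B, T, vocab_size); target: (B, T) integer token indices
    B, T, V = logits.shape
    logits_flat = logits.view(B * T, V)   # (B*T, vocab_size)
    target_flat = target.view(B * T)      # (B*T,)
    # With the default reduction='mean' this returns a scalar;
    # pass reduction='none' to get a per-position loss of shape (B*T,)
    return F.cross_entropy(logits_flat, target_flat)

# Example with random tensors
logits = torch.randn(2, 5, 65)
target = torch.randint(0, 65, (2, 5))
print(loss_from_logits(logits, target))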
def generate(self, idx: torch.Tensor, max_new_tokens: int) -> List[int]
Generates new tokens using the trained language model.
- idx (torch.Tensor): The input tensor representing the token indices of shape (B, T), where B is the batch size and T is the sequence length.
- max_new_tokens (int): The maximum number of new tokens to generate.
- Returns: List[int]: A list of generated token indices as integers.
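Generation of this kind is usually an autoregressive loop: crop the context to the last block_size tokens, run a forward pass, softmax the logits of the last position, sample one token, and append it. A sketch under that assumption (block_size is passed in explicitly here; the real method presumably uses the model's own value):
import torch
from torch.nn import functional as F

@torch.no_grad()
def generate_sketch(model, idx, max_new_tokens, block_size):
    # idx: (B, T) tensor of token indices used as the starting context
    for _ in range(max_new_tokens):
        idx_cond = idx[:, -block_size:]          # crop context to the last block_size tokens
        logits, _ = model(idx_cond)              # (B, T, vocab_size)
        logits = logits[:, -1, :]                # keep only the last time step: (B, vocab_size)
        probs = F.softmax(logits, dim=-1)        # convert logits to probabilities
        idx_next = torch.multinomial(probs, num_samples=1)   # sample one token per sequence
        idx = torch.cat((idx, idx_next), dim=1)  # append the sampled token to the context
    return idx[0].tolist()                       # generated token indices as a plain list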
def set_hyperparams(**kwargs)
Update the hyperparameters dictionary with the provided keyword arguments.
- **kwargs: Keyword arguments with hyperparameter names as keys and their corresponding values.
def load_dataset(path: str, train_val_split: int = 90) -> LanguageDataset
Load a dataset from the given file path and create a LanguageDataset object.
- path (str): The file path of the dataset to load.
- train_val_split (int, optional): Percentage split between training and validation data. Default is 90, which means 90% training and 10% validation.
- Returns: LanguageDataset: An instance of the LanguageDataset class containing the loaded dataset.
def initialize_model(vocab_size: int, n_embd: Optional[int] = None, n_head: Optional[int] = None, n_layer: Optional[int] = None, block_size: Optional[int] = None, dropout: Optional[float] = None, device: Optional[str] = None) -> BigramLanguageModel
Initialize a BigramLanguageModel with the given hyperparameters.
- vocab_size (int): The size of the vocabulary, which determines the number of tokens in the language model.
- n_embd (int, optional): The embedding dimension of the input. If not provided, it will be set to a default value.
- n_head (int, optional): The number of attention heads in the multi-head self-attention. If not provided, it will be set to a default value.
- n_layer (int, optional): The number of Transformer blocks in the model. If not provided, it will be set to a default value.
- block_size (int, optional): The maximum length of the input sequence. If not provided, it will be set to a default value.
- dropout (float, optional): The dropout rate for regularization. If not provided, it will be set to a default value.
- device (str, optional): The device on which to place the model (e.g., 'cpu' or 'cuda'). If not provided, it will be set based on the availability of a GPU.
- Returns: BigramLanguageModel: An instance of the BigramLanguageModel class initialized with the given hyperparameters.
def train(model: BigramLanguageModel, train_data: torch.Tensor, val_data: torch.Tensor, learning_rate: Optional[float] = None, max_iters: Optional[int] = None, eval_interval: Optional[int] = None, device: Optional[str] = None, eval_iters: Optional[int] = None) -> BigramLanguageModel
Train the provided model on the given training data and validate it on the validation data.
- model (BigramLanguageModel): The language model to train.
- train_data (torch.Tensor): The training data tensor.
- val_data (torch.Tensor): The validation data tensor.
- learning_rate (float, optional): The learning rate for the optimizer. If not provided, it will be set to a default value.
- max_iters (int, optional): The maximum number of training iterations. If not provided, it will be set to a default value.
- eval_interval (int, optional): Interval for printing evaluation results during training. If not provided, it will be set to a default value.
- device (str, optional): The device on which to train the model (e.g., 'cpu' or 'cuda'). If not provided, it will be set based on the availability of a GPU.
- eval_iters (int, optional): Number of iterations for evaluation during training. If not provided, it will be set to a default value.
- Returns: BigramLanguageModel: The trained model.
def save(path: str, model: BigramLanguageModel)
Save the state_dict of the provided model to the specified file path.
- path (str): The file path where the model state_dict will be saved.
- model (BigramLanguageModel): The model whose state_dict will be saved.
def load(path: str, model: BigramLanguageModel, device: Optional[str] = None)
Load the model state_dict from the specified file path and assign it to the provided model.
- path (str): The file path from which the model state_dict will be loaded.
- model (BigramLanguageModel): The model to which the loaded state_dict will be assigned.
- device (str, optional): The device on which to load the model (e.g., 'cpu' or 'cuda'). If not provided, it will be set based on the availability of a GPU.
def fine_tune_model(model: BigramLanguageModel, learning_rate: float, max_iters: int, eval_interval: int, train_data: torch.Tensor, val_data: torch.Tensor) -> BigramLanguageModel
Fine-tune the provided model using the given training data.
- model (BigramLanguageModel): The model to be fine-tuned.
- learning_rate (float): The learning rate for the optimizer during fine-tuning.
- max_iters (int): The maximum number of iterations for fine-tuning.
- eval_interval (int): The interval at which to evaluate the model's performance during fine-tuning.
- train_data (torch.Tensor): The training data for fine-tuning.
- val_data (torch.Tensor): The validation data for evaluating the model's performance during fine-tuning.
- Returns: BigramLanguageModel: The fine-tuned model.
def gpt2_124M(training_data_path: str, lr: float, max_iters: int, dropout: float = 0.1, eval_iters: int = 200, train_val_split: int = 85) -> BigramLanguageModel
Train a GPT-2 model with 124 million parameters on the provided training data.
- training_data_path (str): The path to the training data file.
- lr (float): The learning rate for the optimizer during training.
- max_iters (int): The maximum number of iterations for training.
- dropout (float, optional): The dropout rate to be used in the model. Default is 0.1.
- eval_iters (int, optional): The interval at which to evaluate the model's performance during training. Default is 200.
- train_val_split (int, optional): The percentage of data to be used for training, with the rest used for validation. Default is 85.
- Returns: BigramLanguageModel: The trained GPT-2 model. The parameter count depends on the vocab_size.
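A minimal usage sketch; the file path and the learning-rate/iteration values below are placeholders, not recommended settings.
from GPT2LM import bigram as b

# Placeholder path and hyperparameter values; adjust them for your data
model = b.gpt2_124M('path_to_your_training_data.txt', lr=3e-4, max_iters=5000)
b.save('gpt2_124M_model.pt', model)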
def gpt2_finetune(pretrained_model: BigramLanguageModel, training_data_path: str, lr: float, max_iters: int, eval_iters: int = 200, train_val_split: int = 85) -> BigramLanguageModel
Fine-tune a pretrained GPT-2 model on the provided training data.
- pretrained_model (BigramLanguageModel): The pretrained GPT-2 model to be fine-tuned.
- training_data_path (str): The path to the training data file.
- lr (float): The learning rate for the optimizer during fine-tuning.
- max_iters (int): The maximum number of iterations for fine-tuning.
- eval_iters (int, optional): The interval at which to evaluate the model's performance during fine-tuning. Default is 200.
- train_val_split (int, optional): The percentage of data to be used for training, with the rest used for validation. Default is 85.
- Returns: BigramLanguageModel: The fine-tuned model.
def count_parameters(model: torch.nn.Module) -> int
Count the total number of trainable parameters in the given PyTorch model.
- model (torch.nn.Module): The PyTorch model for which the parameters need to be counted.
- Returns: int: The total number of trainable parameters in the model.
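Counting trainable parameters is conventionally a one-liner over model.parameters(); a sketch of the idea:
def count_parameters_sketch(model):
    # Sum the element counts of all parameters that require gradients
    return sum(p.numel() for p in model.parameters() if p.requires_grad)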
def get_batch(split: str, train_data: torch.Tensor, val_data: torch.Tensor, block_size: int = None, batch_size: int = None, device: str = None) -> Tuple[torch.Tensor, torch.Tensor]
Get a batch of data for training or validation from the specified split.
- split (str): The split to get the data from ('train' or 'val').
- train_data (torch.Tensor): The training data tensor.
- val_data (torch.Tensor): The validation data tensor.
- block_size (int, optional): The size of each input sequence block. Defaults to None.
- batch_size (int, optional): The number of sequences in a batch. Defaults to None.
- device (str, optional): The device to store the batch data on ('cpu' or 'cuda'). Defaults to None.
- Returns: Tuple[torch.Tensor, torch.Tensor]: A tuple containing two tensors: the input tensor representing the batch of sequences (x) and the target tensor representing the batch of sequences shifted by one (y).
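This follows the standard batching pattern for next-token prediction: pick random offsets, take block_size tokens as the input, and take the same window shifted by one token as the target. A sketch under that assumption, not the package's exact code:
import torch

def get_batch_sketch(split, train_data, val_data, block_size, batch_size, device):
    data = train_data if split == 'train' else val_data
    # Random starting positions, leaving room for a full block plus the shifted target
    ix = torch.randint(len(data) - block_size, (batch_size,))
    x = torch.stack([data[i:i + block_size] for i in ix])          # inputs
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])  # targets, shifted by one token
    return x.to(device), y.to(device)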
def estimate_loss(model: torch.nn.Module, train_data: torch.Tensor, val_data: torch.Tensor, eval_iters: int = None, block_size: int = None, batch_size: int = None, device: str = None) -> Dict[str, float]
Estimate the average loss of the model on the training and validation data.
- model (torch.nn.Module): The PyTorch model for which the loss is to be estimated.
- train_data (torch.Tensor): The training data tensor.
- val_data (torch.Tensor): The validation data tensor.
- eval_iters (int, optional): The number of iterations for estimating the loss. Defaults to None.
- block_size (int, optional): The size of each input sequence block. Defaults to None.
- batch_size (int, optional): The number of sequences in a batch. Defaults to None.
- device (str, optional): The device to perform computations on ('cpu' or 'cuda'). Defaults to None.
- Returns: Dict[str, float]: A dictionary containing the average loss on the training and validation data. Keys: 'train' and 'val'. Values: average loss values (float).
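The usual pattern is to put the model in eval mode, average the loss over eval_iters random batches per split under torch.no_grad(), and then restore train mode. A sketch under that assumption (it reuses the get_batch sketch shown above):
import torch

@torch.no_grad()
def estimate_loss_sketch(model, train_data, val_data, eval_iters, block_size, batch_size, device):
    out = {}
    model.eval()   # disable dropout while estimating the loss
    for split in ('train', 'val'):
        losses = torch.zeros(eval_iters)
        for k in range(eval_iters):
            x, y = get_batch_sketch(split, train_data, val_data, block_size, batch_size, device)
            _, loss = model(x, y)
            losses[k] = loss.item()
        out[split] = losses.mean().item()
    model.train()  # restore training mode
    return out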
default_hyperparams = {
'batch_size': 64,
'block_size': 256,
'max_iters': 5000,
'learning_rate': 5.859375e-05,
'device': 'cuda' if torch.cuda.is_available() else 'cpu',
'eval_iters': 200,
'n_embd': 384,
'n_head': 6,
'n_layer': 6,
'dropout': 0.2,
}
A dictionary containing the default hyperparameters for the GPT-2 model.
def set_hyperparams_default()
Set the default hyperparameters for the GPT-2 model using the set_hyperparams function.
# Import the required functions and classes
from GPT2LM import bigram as b
# Path to your training data file
training_data_path = 'path_to_your_training_data.txt'
# Load the dataset and split into training and validation sets (default split: 90% training, 10% validation)
dataset = b.load_dataset(training_data_path)
# set default hyperparams
b.set_hyperparams()
# Initialize the GPT-2 model with default hyperparameters and train on the dataset
model = b.initialize_model(vocab_size=dataset.vocab_size) # keep a record of 'vocab_size' to reuse later.
# Train the model on the training data and validate it on the validation data
trained_model = b.train(model, dataset.train_data, dataset.val_data)
# Save the trained model to a file
b.save('path_to_save_model.pt', trained_model)
# Import the required functions and classes
from GPT2LM import bigram as b
# Path to your training data file
training_data_path = 'path_to_your_training_data.txt'
# Load the dataset and split into training and validation sets (default split: 90% training, 10% validation)
dataset = b.load_dataset(training_data_path)
# set custom hyperparams
# This will assign custom values to the default hyperparams
batch_size=32
block_size=500
max_iters=10000
learning_rate=0.0001
eval_interval=250
n_embd=800
n_head=8
n_layer=8
dropout=0.1
b.set_hyperparams(batch_size=batch_size,
block_size=block_size,
max_iters=max_iters,
learning_rate=learning_rate,
eval_interval=eval_interval,
n_embd=n_embd,
n_head=n_head,
n_layer=n_layer,
dropout=dropout
)
# Initialize the GPT-2 model with custom hyperparameters and train on the dataset
model = b.initialize_model(vocab_size=dataset.vocab_size) # keep a record of 'vocab_size' to reuse later.
# Train the model on the training data and validate it on the validation data
trained_model = b.train(model, dataset.train_data, dataset.val_data)
# Save the trained model to a file
b.save('path_to_save_model.pt', trained_model)
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'  # same device the model was trained on
# Initiate an empty context (a single zero token)
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Generate the new tokens and decode them into text
text = dataset._decode(model.generate(context, max_new_tokens=200))
# Initiate context
context = torch.zeros((1, 1), dtype=torch.long, device=device)
# Give a prompt
prompt = "To be or not to be, that is the question."
# Encode the prompt into tokens
prompt_encoded = torch.tensor(dataset._encode(prompt), dtype=torch.long, device=device)
prompt_encoded = prompt_encoded.unsqueeze(0)
# Initiate context with the prompt
context_with_prompt = torch.cat((context, prompt_encoded), dim=1)
# Generate the new tokens and decode them into text
text = dataset._decode(model.generate(context_with_prompt, max_new_tokens=200))
This project is designed for educational purposes to help you learn and understand how generative AI works. It demonstrates the implementation of a GPT-2 variant, a popular generative language model, and provides a hands-on experience in training language models on custom datasets. By exploring the code and running the provided examples, you can gain insights into the underlying principles of generative AI and natural language processing.
Please note that while this project serves as a learning resource, it is essential to be mindful of the ethical considerations and potential risks associated with AI models. The use of AI technology, especially in generating text, should be done responsibly and with respect for ethical guidelines.
Generative Pre-trained Transformer (GPT) models, such as GPT-2, are designed to predict the next word or token in a sequence of text. They achieve this by leveraging the power of self-attention mechanisms within the Transformer architecture.
At a high level, here's how GPT models work to predict the next word or token:
-
Language Modeling
: GPT models are trained on large amounts of text data using a process called language modeling. During training, the model learns the statistical patterns and dependencies present in the input text. It captures the relationships between different words and the likelihood of one word following another in a sentence. -
Contextual Understanding
: The key feature of GPT models is their ability to understand the context in which a word or token appears. Instead of treating each word in isolation, the model considers the entire sequence of tokens and assigns higher importance to the tokens that are most relevant for understanding the current word. -
Self-Attention
: Self-attention is the mechanism that allows GPT models to weigh the importance of different tokens in the input sequence based on their contextual relevance. Tokens that are semantically related or have strong dependencies receive higher attention scores, while unrelated tokens receive lower attention scores. -
Autoregressive Generation
: To predict the next word or token in a sequence, GPT models use an autoregressive generation process. This means that the model predicts each token one by one, conditioned on the tokens that came before it in the sequence. The previously predicted tokens serve as context to guide the prediction of the next token. -
Text Completion
: By repeatedly predicting the next token and using the predicted tokens as context, the GPT model can complete a given prompt or generate entirely new text. The generated text is coherent and contextually appropriate because the model has learned from vast amounts of text data during pre-training. -
Adaptability
: GPT models are highly adaptable and can be fine-tuned on specific tasks or domains after pre-training. This process allows them to perform well on a wide range of natural language processing tasks, such as text generation, machine translation, question answering, and more.
In summary, GPT models are powerful language models that predict the next word or token in a sequence by understanding the context and dependencies between words. Their ability to generate coherent and contextually relevant text makes them valuable tools for various natural language processing applications.
As these models are designed to complete given documents, they are not capable of engaging in real-time conversations with users. If you present them with a question, they will simply generate more questions that resemble the input. To overcome this limitation, we use a specific format for interaction.
For instance, we frame the conversation as if it's between a human and an AI. This format helps the AI understand its role and context better. Here's an example prompt:
"AI: What can I do for you today? \nHuman: ..."
In this format, the human provides their input after "Human:", and the AI responds accordingly. The entire conversation is treated as a single document, with the AI's responses generated by the model and the human's responses provided by an actual person.
This approach allows us to harness the power of the language model to generate creative and contextually relevant AI responses, while human input ensures that the conversation remains meaningful and coherent.
By using this interaction format, we can effectively leverage the capabilities of the language model in completing text documents, making it a useful tool for various natural language processing tasks.
pip install GPT2LM
There are no major changes in bigram; the only difference is the use of the GELU activation function instead of the ReLU activation function.
ReLU(x), or Rectified Linear Unit: ReLU(x) = max(0, x)
GELU(x), or Gaussian Error Linear Unit: GELU(x) = xΦ(x) ≈ 0.5x(1 + tanh[√(2/π)(x + 0.044715x³)])
The Gaussian Error Linear Unit (GELU) is an activation function used in deep learning. It is defined as xΦ(x), where Φ(x) is the standard Gaussian cumulative distribution function. The GELU nonlinearity weights inputs by their percentile, rather than gating inputs by their sign as in ReLUs. Consequently, the GELU can be thought of as a smoother ReLU.
It is not known why certain activation functions work better than others in different contexts. So the only answer to "why use GELU instead of ReLU" is "because it works better". However, there are some possible explanations. For example, ReLU can suffer from "problems where a significant amount of neurons in the network become zero and don't practically do anything". GELU is smoother near zero and "is differentiable in all ranges, and allows to have gradients (although small) in the negative range", which helps with this problem.
In summary, GELU is used in generative AI instead of ReLU because it has been found to work better in practice. It is a smoother version of ReLU that allows for gradients in the negative range, which can help prevent problems with neurons becoming inactive.
Activation functions are used in neural networks to introduce non-linearity into the output of a neuron. Without an activation function, the weights and bias would simply perform a linear transformation. A linear equation is simple to solve but is limited in its capacity to solve complex problems and has less power to learn complex functional mappings from data. A neural network without an activation function is just a linear regression model.
Generally, neural networks use non-linear activation functions, which help the network learn complex data, compute and learn almost any function, and provide accurate predictions. Applied repeatedly across the layers of a network and combined with the weight matrices that scale signals from layer to layer, these non-linearities are what allow a network to approximate arbitrarily complex functions.
In summary, activation functions are used in neural networks to introduce non-linearity into the output of a neuron, which allows the network to learn complex data and provide accurate predictions. They are essential building blocks of neural networks.
A bigram is a type of n-gram model that represents text as the joint probability of a sequence of two words. Bigrams, along with other n-grams, are used in most successful language models for speech recognition.
On the other hand, GPT (Generative Pretrained Transformer) is a type of neural language model that uses a transformer architecture to generate text. GPT is a modern Transformer language model, which is more complex than a bigram character-level language model.
In summary, a bigram is a simple statistical model that represents text as the joint probability of a sequence of two words, while GPT is a more complex neural language model that uses a transformer architecture to generate text. The two models have different strengths and weaknesses and can be used for different purposes.
Let's differentiate between GPT (a type of transformer model), a bigram model, and the custom BigramLanguageModel:
- GPT (Generative Pre-trained Transformer):
- Architecture: GPT is based on the transformer architecture, which consists of a stack of multiple layers of self-attention and feedforward neural networks.
- Self-Attention: GPT uses multi-head self-attention to capture relationships between tokens in the input sequence.
- Parameter Sharing: GPT is a fully autoregressive model, meaning it generates one token at a time based on the previously generated tokens. It doesn't rely on fixed n-grams like bigram models.
- Pre-training: GPT is pre-trained on a large corpus of text data, learning language patterns and semantic understanding from diverse text sources.
- Generative: GPT is designed for generating text and is commonly used for tasks like text completion, text generation, and language translation.
- Bigram Model:
- Architecture: A bigram model is a simple statistical language model that focuses on pairs of consecutive words (bigrams) in a text corpus.
- Scope: It doesn't have a deep neural network architecture like transformers or GPT. Instead, it relies on counting the frequency of bigrams in a given text corpus.
- Parameterization: Bigram models have a relatively small number of parameters compared to deep learning models.
- Generative Power: Bigram models are less expressive than GPT and cannot capture complex language structures or long-range dependencies in the same way.
- Use Case: Bigram models are often used for basic language modeling tasks but are less suited for generating coherent and contextually rich text compared to GPT.
- Custom Model (BigramLanguageModel):
- Architecture: This model is a custom implementation. It combines elements of self-attention and feedforward neural networks within each block.
- Embeddings: It uses token embeddings and positional embeddings, which are common in transformer-based models.
- Multi-Head Attention: Like GPT, it incorporates multi-head self-attention within each block.
- Complexity: This custom model is more complex and expressive than a simple bigram model but may not have the same generative power or pre-training as GPT.
In summary, GPT is a state-of-the-art transformer-based model designed for generative tasks, bigram models are simpler statistical models focusing on word pairs, and the custom model combines elements of transformers but is a custom implementation tailored for a specific purpose. Each model has its own strengths and weaknesses depending on the task at hand.
Go to Colab or Kaggle and train two models on the same dataset but with different versions of GPT2LM, to see how the model behaves with different activation functions and why generative AI like GPT uses GELU instead of ReLU, probably the most popular activation function.
Also implement your own ReLU and GELU functions and plot them with matplotlib over two or three different ranges to see the actual difference between the two functions: on large ranges the graphs overlap and appear identical, while on smaller ranges such as [-1, 1] you will see a small but real difference. A starting point is sketched below.
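A possible starting point for that exercise (numpy and matplotlib are assumed to be installed):
import numpy as np
import matplotlib.pyplot as plt

def relu(x):
    return np.maximum(0, x)

def gelu(x):
    # tanh approximation of GELU
    return 0.5 * x * (1 + np.tanh(np.sqrt(2 / np.pi) * (x + 0.044715 * x ** 3)))

# Plot both functions on a small and a larger range to see where they differ
for lo, hi in [(-1, 1), (-5, 5)]:
    x = np.linspace(lo, hi, 500)
    plt.figure()
    plt.plot(x, relu(x), label='ReLU')
    plt.plot(x, gelu(x), label='GELU')
    plt.title(f'ReLU vs GELU on [{lo}, {hi}]')
    plt.legend()
    plt.grid(True)
plt.show()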
The utils module provides various utility functions for common tasks such as setting random seeds for reproducibility, setting up logging for experiments, loading/saving models and data, and generating text using language models. Additionally, it includes a lightweight configuration class inspired by YACS (CfgNode) for managing experiment configurations.
set_seed(seed)
- Description: Set the random seeds for reproducibility across different runs.
- Parameters: seed (int): The seed value to set for random number generators.
setup_logging(config)
- Description: Set up logging for the experiment. This function creates necessary directories and logs configurations.
- Parameters: config (object): Configuration object containing experiment settings.
CfgNode
- Description: A lightweight configuration class inspired by YACS. This class provides a flexible way to manage configurations for experiments or applications.
load_text(path)
- Description: Load text data from a file.
- Parameters: path (str): The path to the text file.
- Returns: str or None: The content of the text file as a string if successful, otherwise None.
- Raises: FileNotFoundError: If the specified file path does not exist. Exception: If an error occurs while reading the file.
load_json_to_text(path)
- Description: Load JSON data from a file and convert it into a single text string.
- Parameters: path (str): The path to the JSON file.
- Returns: str or None: A single text string containing the JSON data if successful, otherwise None.
- Raises: FileNotFoundError: If the specified file path does not exist. Exception: If an error occurs while reading the JSON file or converting it to text.
load_csv_to_text(path)
- Description: Load data from a CSV file and convert it into a single text string.
- Parameters: path (str): The path to the CSV file.
- Returns: str or None: A single text string containing the CSV data if successful, otherwise None.
- Raises: FileNotFoundError: If the specified file path does not exist. Exception: If an error occurs while reading the CSV file or converting it to text.
save(path, model)
- Description: Save the model state dictionary to a file.
- Parameters: path (str): Path to save the model state dictionary. model (torch.nn.Module): Model to save.
load(path, model, device=None)
- Description: Load the model state dictionary from a file.
- Parameters: path (str): Path to the saved model state dictionary. model (torch.nn.Module): Model to load the state dictionary into. device (str, optional): Device to load the model onto (e.g., 'cuda' or 'cpu'). If None, it will automatically detect CUDA availability.
- Returns: None
Generate(model, prompt, encoder_path, max_len)
- Description: Generate text based on a given prompt using the provided model.
- Parameters: model (torch.nn.Module): The language model used for text generation. prompt (str): The prompt to start text generation. encoder_path (str): Path to the encoder file for encoding/decoding characters. max_len (int): Maximum length of the generated text.
- Returns: str: Generated text based on the prompt.
print_config()
- Description: Function to print the configurations of various GPT models.
- Prints the configurations of different GPT models including the number of layers, attention heads, and embedding size.
CfgNode: Lightweight configuration class inspired by YACS.
- os, sys, csv, math, json, time, random, requests, regex, numpy, tqdm: Python libraries for various tasks.
- torch, torch.nn, torch.utils.data: PyTorch library for deep learning.
- torch.utils.data.dataloader: DataLoader class for loading data in batches.
device: Represents the device on which tensors reside, either "cuda" (GPU) if available or "cpu" if not.
- Ensure proper installation of PyTorch and other dependencies to utilize all functions effectively.
The GPT (Generative Pre-trained Transformer) Language Model is a powerful model architecture based on the Transformer architecture. This implementation includes functionalities for training and generation of text.
- Embedding Layers:
- Token Embeddings: Embeds input tokens into a continuous vector space.
- Position Embeddings: Embeds the position of each token in the input sequence.
- Transformer Blocks:
- Each block consists of a causal self-attention layer followed by a feed-forward neural network (MLP).
- Layer normalization and residual connections are applied within each block.
- Language Model Head:
- Linear layer projecting the output of the transformer blocks into logits over the vocabulary.
- Configurable:
- The model architecture can be configured using a CfgNode object, allowing customization of the number of layers, heads, embedding size, and other parameters.
- Optimization:
- Provides methods for configuring optimizers for training, including options for weight decay and learning rate scheduling.
- Generation:
- Includes functionality for generating new sequences of text given a conditioning sequence.
- Initialization:
- Parameters are initialized following best practices for transformer-based models, including custom initialization for certain layers.
- config: Configuration object containing model parameters.
- idx: Input tensor representing token indices.
- targets: Target tensor for training.
- max_new_tokens: Maximum number of tokens to generate during text generation.
- temperature: Temperature parameter for sampling during generation.
- do_sample: Boolean indicating whether to sample from the distribution during generation.
- top_k: Number of top-k tokens to consider during sampling.
- forward(idx, targets): Performs a forward pass through the model.
- generate(idx, max_new_tokens, temperature, do_sample, top_k): Generates new tokens given a conditioning sequence.
- block_size: Maximum block size for input sequences.
- device: Device (CPU or GPU) where the model is currently located.
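A hedged usage sketch of the generation interface, assuming model is a trained gpt2.GPT instance; the temperature, do_sample, and top_k values are illustrative, and the returned object is assumed to be a tensor of token indices.
import torch

# Start from a single token of index 0 as the conditioning sequence (an arbitrary choice)
idx = torch.zeros((1, 1), dtype=torch.long, device=model.device)
out = model.generate(idx, max_new_tokens=100, temperature=0.8, do_sample=True, top_k=40)
print(out[0].tolist())  # generated token indices; decode them with your Encoder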
The CharDataset class is designed for handling character-level text data. It prepares the data for training by encoding characters into numerical indices. This dataset is suitable for language modeling tasks where the model predicts the next character in a sequence given the previous characters.
CharDataset(data, block_size, output_dir='dir/')
- data (str): Text data to be processed.
- block_size (int): Size of the text blocks used for training.
- output_dir (str, optional): Output directory to save the encoder JSON file.
- vocab_size: Number of unique characters in the dataset.
- block_size: Size of the text blocks.
- data: Raw text data.
- get_vocab_size(): Get the vocabulary size.
- get_block_size(): Get the block size.
- __len__(): Get the length of the dataset.
- __getitem__(idx): Get an item from the dataset by index.
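A short usage sketch, assuming gpt2 has been imported and text holds the raw training string; the exact return shapes of __getitem__ are an assumption based on the block_size convention.
# text: raw string data loaded with gpt2.load_text(...)
train_dataset = gpt2.CharDataset(text, block_size=128, output_dir='out_dir/')

print(train_dataset.get_vocab_size())  # number of unique characters
print(len(train_dataset))              # number of training examples
x, y = train_dataset[0]                # assumed: x is block_size tokens, y is the same window shifted by one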
The Encoder class provides functionality for encoding and decoding text using an encoder dictionary. It loads the encoder dictionary from a JSON file and provides methods to encode and decode text.
Encoder(encoder_path)
- encoder_path (str): Path to the encoder JSON file.
- encode(s): Encode a string into a list of integers.
- decode(l): Decode a list of integers into a string.
encoder = Encoder("encoder.json")
encoded_text = encoder.encode("Hello, world!")
decoded_text = encoder.decode(encoded_text)
The FineEncoderDataset class is similar to CharDataset but allows for fine control over the encoding process. It requires an external encoder file (JSON format) for encoding and decoding the text data.
FineEncoderDataset(data, encoder_path, block_size)
- data (str): Input data for the dataset.
- encoder_path (str): Path to the encoder JSON file.
- block_size (int): Size of each block in the dataset.
- encode(s): Encode the input string using the encoder.
- decode(l): Decode the input list using the encoder.
dataset = FineEncoderDataset(data, "encoder.json", block_size=128)
The Trainer class is responsible for training neural networks. It provides functionality for running the training loop, handling data loading, optimization, and callback triggering.
Trainer(config, model, train_dataset)
- config (CfgNode): Configuration for the Trainer.
- model (nn.Module): The neural network model.
- train_dataset: The training dataset.
- device: Device (CPU or GPU) used for training.
- optimizer: Optimizer used for training.
- callbacks: Dictionary storing callback functions for different events.
- iter_num: Current iteration number.
- iter_time: Time at the current iteration.
- iter_dt: Time difference between the current and previous iteration.
- add_callback(onevent, callback): Add a callback function to be triggered on an event.
- set_callback(onevent, callback): Set a callback function to be triggered on an event.
- trigger_callbacks(onevent): Trigger the callbacks for a specific event.
- run(num_epochs=None): Run the training loop.
trainer = Trainer(config, model, train_dataset)
trainer.run(num_epochs=10)
The FineTuner class is used for fine-tuning a pre-trained model with new data. It initializes the training process with the provided model, data, encoder path, block size, and training configuration.
FineTuner(model, data, encoder_path, block_size, train_config)
- model (torch.nn.Module): Model to fine-tune.
- data (str): Input data for fine-tuning.
- encoder_path (str): Path to the encoder file.
- block_size (int): Size of each block in the dataset.
- train_config (CfgNode): Training configuration for the fine-tuning process.
- device: Device (CPU or GPU) used for fine-tuning.
- fine_tune(epoch=1): Perform fine-tuning of the model for the specified number of epochs.
fine_tuner = FineTuner(model, data, encoder_path, block_size, train_config)
fine_tuner.fine_tune(epoch=5)
from GPT2LM import gpt2

block_size = 1024  # Example block_size; you can choose your own
text = gpt2.load_text('path to .txt')  # Use load_csv_to_text() or load_json_to_text() for CSV and JSON
train_dataset = gpt2.CharDataset(text, block_size, 'out_dir')  # the encoder JSON file will be saved in out_dir
model_config = gpt2.GPT.get_default_config()
model_config.model_type = 'gpt-micro'  # more model types are available; use `print_config()` to see all
model_config.vocab_size = train_dataset.vocab_size
model_config.block_size = block_size
model = gpt2.GPT(model_config)
train_config = gpt2.Trainer.get_default_config()
train_config.learning_rate = 5e-4
train_config.max_iters = 1000  # iterations per epoch
train_config.batch_size = 64
trainer = gpt2.Trainer(train_config, model, train_dataset)
trainer.run(10)  # number of epochs = 10
print(gpt2.Generate(model, 'Hi', 'path_to_encoder.json', 200))  # 'Hi' is the prompt; 200 is the maximum length of the generated response (200 characters)
from GPT2LM import gpt2

block_size = 1024  # same as the pre-trained model
model_config = gpt2.GPT.get_default_config()
model_config.model_type = 'gpt-micro'
model_config.vocab_size = n  # same as the pre-trained model
model_config.block_size = block_size
model = gpt2.GPT(model_config)
gpt2.load('path_to_pre_trained_model', model)  # Load model weights from the pre-trained model
encoder_path = "path_to_encoder.json"
fine_data = gpt2.load_text('path_to_txt_file')
fine_config = gpt2.Trainer.get_default_config()
fine_config.learning_rate = 5e-4
fine_config.max_iters = 1000
fine_config.batch_size = 64
finetuner = gpt2.FineTuner(model, fine_data, encoder_path, block_size, fine_config)
finetuner.fine_tune(3)
This project is licensed under the MIT License.
You can check the Model_Card for all the available model types in GPT2LM.gpt2.
Here are some links for better understanding.
- Attention Is All You Need: Link here
- Textbooks Are All You Need: Link here
- GPT (Generative Pre-trained Transformers): Link here
- Language Models are Unsupervised Multitask Learners: Link here
- GPT-2: Link here
- GELU: Link here
- ReLU: Link here
- ReLU vs GELU: Link here
- Git link of this project: Link here
- PyPI link of this project: Link here
The code is completely open source, so you can play around with it and get a better understanding of it.
GPT-3 hyperparameters are not publicly available yet, but you can use your own hyperparameters to train a GPT-3-like base model.