In [1]:
# Import the function for loading Hugging Face pipelines
from transformers import pipeline

model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'

# Load the pipeline for sentiment classification
classifier = pipeline("text-classification", model=model_name)
classifier

  from .autonotebook import tqdm as notebook_tqdm


<transformers.pipelines.text_classification.TextClassificationPipeline at 0x1e008052ec0>

In [2]:
prompt = "The food was good, but service at the restaurant was a bit slow"

# Pass the customer review to the model for prediction
prediction = classifier(prompt)
print(prediction)

[{'label': '3 stars', 'score': 0.6387940645217896}]


In [3]:
prompt = "The food was good, everything well organized"

# Pass the customer review to the model for prediction
prediction = classifier(prompt)
print(prediction)

[{'label': '4 stars', 'score': 0.5190770030021667}]


In [4]:
model_name = 'cnicu/t5-small-booksum'

# Load the model pipeline for text summarization
summarizer = pipeline('summarization', model = model_name)
summarizer

<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x1e04a5d1600>

In [8]:
long_text = '\nThe tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.\n'

# Pass the long text to the model to summarize it
outputs = summarizer(long_text, max_length = 30)
outputs

[{'summary_text': 'the Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey'}]

In [9]:
# Access and print the summarized text in the outputs variable
print(outputs[0]['summary_text'])

the Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey


In [11]:
# Load the model pipeline for question-answering
qa_model = pipeline("question-answering")
question = "For how long was the Eiffel Tower the tallest man-made structure in the world?"
question = "What's the tallest structure in France?"

# Pass the necessary inputs to the LLM pipeline for question-answering
outputs = qa_model(question, long_text)

# Access and print the answer
print(outputs['answer'])

No model was supplied, defaulted to distilbert/distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


Millau Viaduct


In [13]:
model_name = "Helsinki-NLP/opus-mt-es-en"

input_text = "No creo que hagas una buena traducción"

# Define pipeline for Spanish-to-English translation
translator = pipeline('translation_es_to_en', model=model_name)

# Translate the input text
translations = translator(input_text)

# Access the output to print the translated text in English
print(translations[0]['translation_text'])

I don't think you're doing a good translation.


In [14]:
# Set transformer model hyperparameters
d_model = 256
n_heads = 4
num_encoder_layers = 3
num_decoder_layers = 3

In [9]:
import torch.nn as nn

# Create the transformer model and assign hyperparameters
model = nn.Transformer(
    d_model=d_model, # d_model is the dimension of the input vectors and output vectors of the model, specifically the size of the feature space. Essentially, it determines the number of features in each transformer layer
    nhead=n_heads,
    num_encoder_layers=num_encoder_layers,    
    num_decoder_layers=num_decoder_layers
    )


print(model)

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, o



In [21]:
# Create a pipeline for text generation using the gpt2 model
generator = pipeline("text-generation", model = "gpt2")

text = "I had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had."

response = "Dear valued customer, I am glad to hear you had a good stay with us."



In [22]:
# Build the prompt for the text generation LLM

prompt = f"Customer review:\n{text}\n\nHotel reponse to the customer:\n{response}"

prompt

"Customer review:\nI had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had.\n\nHotel reponse to the customer:\nDear valued customer, I am glad to hear you had a good stay with us."

In [23]:
#### Pass the prompt to the model pipeline
outputs = generator(prompt, max_length = 100, pad_token_id=generator.tokenizer.eos_token_id) #  if the generated text is shorter than max_length, the remaining tokens will be filled with the EOS token.

# Print the augmented sequence generated by the model
print(outputs[0]['generated_text'])

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Customer review:
I had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had.

Hotel reponse to the customer:
Dear valued customer, I am glad to hear you had a good stay with us. We appreciate your service and would like to know from every customer what exactly we purchased which accommodations we


In [12]:
import torch  
import torch.nn as nn  
import math  

# Subclass the PyTorch nn.Module class to create a custom module for positional encoding
# This class is used to add positional information to the input embeddings in a Transformer model
class PositionalEncoder(nn.Module):
    def __init__(self, d_model, max_length):
        # Call the parent class's constructor
        super(PositionalEncoder, self).__init__()
        
        # Initialize the dimensions of the model and the maximum sequence length
        self.d_model = d_model  # The dimension of the input embeddings
        self.max_length = max_length  # The maximum length of the input sequences
        
        # Initialize the positional encoding matrix with zeros
        # This matrix will store the positional encodings that will be added to the input embeddings
        pe = torch.zeros(max_length, d_model)  

        # Create a tensor of positions from 0 to max_length
        # This tensor represents the positions of the words in a sequence
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)  #  we unsqueeze because the multiplication operation position * div_term requires position to be a 2D tensor to correctly broadcast with div_term (the extra dimension is added at position '1': second position)
        
        # Calculate the division term for the positional encoding
        # This term is used in the calculation of the positional encodings
        # The div_term values decrease exponentially, which means the positional encoding changes more rapidly for lower-dimensional embeddings and more slowly for higher-dimensional embeddings. This allows the model to learn to attend to both nearby words (local attention) and far-away words (global attention), which is crucial for understanding the context of a sentence.
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model))  
        
        # Calculate and assign position encodings to the matrix
        # For even indices, use sine of the position times the division term
        # For odd indices, use cosine of the position times the division term
        # These encodings are based on sine and cosine functions of different frequencies
        # The sine and cosine functions are used to ensure that the positional encodings are continuous and differentiable, which is important for the learning process. Also, these functions generate values between -1 and 1, which helps to keep the magnitude of the positional encodings manageable.
        # Using sine for even indices and cosine for odd indices provides two different signals for each position, which helps the model distinguish between different positions more effectively.
        pe[:, 0::2] = torch.sin(position * div_term)  
        pe[:, 1::2] = torch.cos(position * div_term)  
        
        # Add an extra dimension to the positional encoding matrix, turning it from a 2D tensor into a 3D tensor(the extra dimension is added at position '0': first position)
        # This is done to match the dimensions of the input embeddings (batch size, sequence length, and embedding size)
        pe = pe.unsqueeze(0)  
        
        # Register the positional encoding matrix as a buffer that should not be considered a model parameter
        # Buffers are tensors that are not updated during backpropagation but need to be part of the model's state
        self.register_buffer('pe', pe)  # N.B. self.pe is defined when pe is registered as a buffer. 
    
    # Define the forward pass of the model
    def forward(self, x):
        # Update the input tensor by adding the positional encodings
        # The positional encodings are added to the input embeddings so that the model can take into account the position of words in a sequence
        # The size(1) method returns the size of the second dimension of x, which represents the sequence length.
        # By slicing pe to :x.size(1), we ensure that the positional encodings are correctly aligned with the words in each input sequence.
        x = x + self.pe[:, :x.size(1)]  
        
        # Return the updated tensor
        # This updated tensor is then passed on to the next layer of the Transformer model
        return x  


In [13]:
class MultiHeadAttention(nn.Module):
    
    def __init__(self, d_model, num_heads):
        super(MultiHeadAttention, self).__init__()
        # The number of attention heads. This is the number of different 
        # weighted sums of the input vectors that we will compute.
        self.num_heads = num_heads

        # The dimension of the input embeddings. This is the size of the 
        # vectors that will be processed by the attention mechanism.
        self.d_model = d_model 

        # The dimension of each head. This is the size of the vectors that 
        # each attention head will process independently.
        # N.B. each attention head processes the entire data, but they do so in their own learned representation space. The head_dim is the dimensionality of this representation space. The division by num_heads is done to ensure that the dimensionality of the input (d_model) is preserved through this process. This is important for the subsequent layers in the model, which expect inputs of a d_model dimensionality.
        self.head_dim = d_model // num_heads

        # Linear transformations for the queries, keys and values. These 
        # are standard fully connected layers that will transform the input 
        # vectors into the queries, keys and values.
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)

        # The final linear transformation. This layer will transform the 
        # concatenated output of the attention heads into the final output 
        # vector.
        self.output_linear = nn.Linear(d_model, d_model)
        
    def split_heads(self, x, batch_size):
        # This function splits the input vectors into the different attention 
        # heads. It first reshapes the input vectors into a tensor of shape 
        # (batch_size, sequence_length, num_heads, head_dim), and then 
        # rearranges the dimensions to bring the number of heads to the 
        # second dimension.
        x = x.view(batch_size, -1, self.num_heads, self.head_dim)
        # The contiguous method is used to ensure that the tensor in memory is 
        # properly ordered, allowing us to view its data with a different shape.
        # Contiguous creates a new tensor with the same data but with all the data contiguously in memory. 
        # The view method is used twice to first change the shape of the tensor 
        # and then flatten the tensor (from batch_size, self.num_heads, -1, self.head_dim --> to batch_size * self.num_heads, -1, self.head_dim)
        return x.permute(0, 2, 1, 3).contiguous().view(batch_size * self.num_heads, -1, self.head_dim) 
    
    
    def compute_attention(self, query, key, mask=None):
        # This function computes the attention weights. It first computes 
        # the dot product of the query and key tensors, applies a mask if 
        # provided, and then applies a softmax function to obtain the 
        # attention weights.
        scores = torch.matmul(query, key.permute(0, 2, 1))  # fixed from the original: torch.matmul(query, key.permute(1, 2, 0))
        if mask is not None:
            # The mask is used to prevent the attention mechanism from focusing 
            # on certain positions. This is done by setting the scores of these 
            # positions to a very large negative value, effectively zeroing out 
            # their impact on the softmax result.
            scores = scores.masked_fill(mask == 0, float("-1e20")) 
        attention_weights = F.softmax(scores, dim=-1)
        return attention_weights

    
    def forward(self, query, key, value, mask=None):
        # The forward function computes the output of the multi-head 
        # attention layer. It first applies the linear transformations and 
        # splits the input into multiple heads, then computes the attention 
        # weights, applies these weights to the values, and finally 
        # concatenates and linearly transforms the result into the output 
        # vector.
        # query.size(0) returns the size of the batch dimension. It is used 
        # to reshape the input tensors before splitting them into heads.
        batch_size = query.size(0) 

        query = self.split_heads(self.query_linear(query), batch_size)
        key = self.split_heads(self.key_linear(key), batch_size)
        value = self.split_heads(self.value_linear(value), batch_size)

        attention_weights = self.compute_attention(query, key, mask)

        output = torch.matmul(attention_weights, value)
        output = output.view(batch_size, self.num_heads, -1, self.head_dim).permute(0, 2, 1, 3).contiguous().view(batch_size, -1, self.d_model) # (from batch_size, -1, self.num_heads, self.head_dim --> to batch_size, -1, self.d_model), being self.d_model = self.num_heads * self.head_dim
        return self.output_linear(output)


In [14]:
class FeedForwardSubLayer(nn.Module):
    
    # Specify the two linear layers' input and output sizes
    def __init__(self, d_model, d_ff):
        super(FeedForwardSubLayer, self).__init__()
        self.fc1 = nn.Linear(d_model, d_ff)
        self.fc2 = nn.Linear(d_ff, d_model)
        self.relu = nn.ReLU()

	# Apply a forward pass
    def forward(self, x):
        return self.fc2(self.relu(self.fc1(x)))

In [15]:
# Complete the initialization of elements in the encoder layer
class EncoderLayer(nn.Module):
    
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(EncoderLayer, self).__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    # The input x is a sequence of embeddings representing the input tokens, and its shape is generally (batch_size, sequence_length, d_model). In the context of self-attention mechanisms, such as the one used in Transformer models, x is used as the Query (Q), Key (K), and Value (V).  In self-attention, the same input (in this case, x) is used as the Query, Key, and Value. This allows the model to compute attention scores based on the input itself.
    def forward(self, x, mask):
        attn_output = self.self_attn(x, x, x, mask)
        # The operation x + self.dropout(attn_output) is an example of a technique called residual connection: The idea is that it’s easier to model a residual (or difference) than to learn to model the full information. In this specific case,we are “adding the residual”, that is the output of the self-attention mechanism (which has learned how to modify the input) back to the original input. 
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.feed_forward(x)
        return self.norm2(x + self.dropout(ff_output))

In [16]:
class TransformerEncoder(nn.Module):
    
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length):
        super(TransformerEncoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoder(d_model, max_sequence_length)
        # Define a stack of multiple encoder layers
        self.layers = nn.ModuleList([EncoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)])
	
    # Complete the forward pass method
    def forward(self, x, mask):
        x = self.embedding(x)
        x = self.positional_encoding(x)
        for layer in self.layers:
            x = layer(x, mask)
        return x


In [17]:
import torch.nn.functional as F


class ClassifierHead(nn.Module):
    
    def __init__(self, d_model, num_classes):
        super(ClassifierHead, self).__init__()
        # Add linear layer for multiple-class classification
        self.fc = nn.Linear(d_model, num_classes)

    # the dimensionality of the embeddings is:
# batch_size is the number of sequences in the batch.
# sequence_length is the length of each sequence.
# d_model is the dimensionality of the embeddings (i.e., the size of the feature vector for each token).
# The slice x[:, 0, :] selects the entire feature vector for the first token in each sequence, that is the [CLS] token, which includes an aggregate representation of the entire sequence, useful in classification tasks.
    def forward(self, x):
        logits = self.fc(x[:, 0, :])
        # Obtain log class probabilities upon raw outputs
        return F.log_softmax(logits, dim=-1)

In [18]:
num_classes = 3
vocab_size = 10000
batch_size = 8
d_model = 512
num_heads = 8
num_layers = 6
d_ff = 2048
sequence_length = 256
dropout = 0.1

In [19]:
# Note: although a random input sequence and mask are being used here, in practice, the mask should correspond to the actual location of padding tokens in the input sequences to ensure all of them are the same length

input_sequence = torch.randint(0, vocab_size, (batch_size, sequence_length))


print(input_sequence.shape)
input_sequence

torch.Size([8, 256])


tensor([[5969, 8949, 5746,  ..., 2773, 8030, 5480],
        [1419, 9100, 5027,  ..., 6214, 8246, 9817],
        [9706, 3859, 4062,  ..., 4531, 7151, 8748],
        ...,
        [ 181, 6658, 3194,  ..., 5632, 9985, 1703],
        [9076, 3413, 5868,  ..., 4123,   60, 7898],
        [6636, 4520, 6837,  ..., 7825, 7024, 2239]])

In [20]:
mask = torch.randint(0, 2, (sequence_length, sequence_length))

print(mask.shape)
mask

torch.Size([256, 256])


tensor([[0, 0, 0,  ..., 0, 0, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 1, 0],
        ...,
        [0, 0, 0,  ..., 0, 1, 0],
        [0, 1, 0,  ..., 0, 0, 1],
        [1, 0, 1,  ..., 0, 1, 0]])

In [21]:
# Instantiate the encoder transformer's body and head
encoder = TransformerEncoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length=sequence_length)

encoder

TransformerEncoder(
  (embedding): Embedding(10000, 512)
  (positional_encoding): PositionalEncoder()
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (query_linear): Linear(in_features=512, out_features=512, bias=True)
        (key_linear): Linear(in_features=512, out_features=512, bias=True)
        (value_linear): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): FeedForwardSubLayer(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)

In [22]:
classifier = ClassifierHead(d_model, num_classes)

classifier

ClassifierHead(
  (fc): Linear(in_features=512, out_features=3, bias=True)
)

In [23]:
# Complete the forward pass 
enc_output = encoder(input_sequence, mask)

print(enc_output.shape)
enc_output


torch.Size([8, 256, 512])


tensor([[[ 1.5497e+00,  8.3425e-01, -5.7210e-01,  ...,  8.1865e-01,
           1.7531e-01,  1.6560e+00],
         [ 7.8067e-01, -1.1346e+00,  7.4688e-01,  ...,  1.8037e+00,
          -2.7236e-01,  1.3512e+00],
         [ 3.7972e-02, -5.0163e-02,  3.8260e-01,  ...,  1.7472e+00,
           5.4422e-02,  5.9329e-01],
         ...,
         [ 1.9421e+00,  8.1733e-01,  4.1246e-01,  ...,  1.6968e+00,
          -3.3675e-02,  1.8066e+00],
         [ 1.9257e+00, -8.1500e-01, -3.5197e-01,  ...,  1.0309e+00,
          -9.8270e-01,  8.5594e-01],
         [ 1.4772e+00, -1.4910e-01,  8.8285e-01,  ...,  1.2714e+00,
           4.4124e-01,  1.4253e+00]],

        [[-1.1298e+00,  1.4997e+00, -2.2737e-01,  ...,  2.3606e+00,
          -4.2508e-01, -2.6108e-01],
         [ 4.1484e-01,  8.7193e-02,  1.0362e+00,  ...,  1.4725e+00,
          -2.4606e+00,  2.9562e+00],
         [-1.2097e+00, -2.5619e-01,  8.6063e-01,  ..., -6.8835e-01,
          -8.3816e-01,  1.2612e+00],
         ...,
         [ 1.5858e+00,  1

In [24]:
classification = classifier(enc_output)
print("Classification outputs for a batch of ", batch_size, "sequences:")
print(classification.shape)
classification

Classification outputs for a batch of  8 sequences:
torch.Size([8, 3])


tensor([[-0.6493, -1.3413, -1.5321],
        [-1.2763, -0.6592, -1.5912],
        [-0.3984, -2.2159, -1.5161],
        [-0.5531, -1.4643, -1.6419],
        [-0.7159, -1.0302, -1.8688],
        [-0.8728, -0.7931, -2.0417],
        [-0.5889, -1.5289, -1.4771],
        [-0.5819, -1.1680, -2.0391]], grad_fn=<LogSoftmaxBackward0>)

In [25]:
class DecoderLayer(nn.Module):
    def __init__(self, d_model, num_heads, d_ff, dropout):
        super(DecoderLayer, self).__init__()
        
        # Initialize the causal (masked) self-attention and cross-attention
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, causal_mask, encoder_output, cross_mask):
        # Pass the necessary arguments to the causal self-attention and cross-attention
        self_attn_output = self.self_attn(x, x, x, causal_mask)
        x = self.norm1(x + self.dropout(self_attn_output))
        cross_attn_output = self.cross_attn(x, encoder_output, encoder_output, cross_mask)
        x = self.norm2(x + self.dropout(cross_attn_output))
        ff_output = self.feed_forward(x)
        x = self.norm3(x + self.dropout(ff_output))
        return x

In [26]:
class TransformerDecoder(nn.Module):
    
    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length):
        super(TransformerDecoder, self).__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoder(d_model, max_sequence_length)
        self.layers = nn.ModuleList([DecoderLayer(d_model, num_heads, d_ff, dropout) for _ in range(num_layers)]) # using EncoderLayer, but it should be DecoderLayer

        # Add a linear layer (head) for next-word prediction
        self.fc = nn.Linear(d_model, vocab_size)

    # def forward(self, x, self_mask):
    #     x = self.embedding(x)
    #     x = self.positional_encoding(x)
    #     for layer in self.layers:
    #         x = layer(x, self_mask)
            
    def forward(self, x, self_mask, encoder_output, cross_mask):
        x = self.embedding(x)
        x = self.positional_encoding(x)
        for layer in self.layers:
            x = layer(x, self_mask, encoder_output, cross_mask)
        # return x

        # Apply the forward pass through the model head
        x = self.fc(x)
        # When you apply F.log_softmax(x, dim=-1), the softmax function is applied to the d_model dimension. This means that the softmax function is applied independently to each sequence in each batch, and the output tensor will have the same shape as the input tensor.
        return F.log_softmax(x, dim=-1)

In [27]:
input_sequence = torch.randint(0, vocab_size, (batch_size, sequence_length))

print(input_sequence.shape)
input_sequence

torch.Size([8, 256])


tensor([[1657, 1108, 7823,  ..., 2909, 9790, 1824],
        [5676, 7182, 3361,  ..., 2109, 7619, 8453],
        [7652, 1069, 4869,  ..., 9620, 7889, 5683],
        ...,
        [2359, 9565,  298,  ..., 4286, 6436, 3059],
        [4781, 2745, 8367,  ..., 6143, 9304, 9352],
        [7625, 4028, 1008,  ..., 1698, 8988, 8261]])

In [28]:
torch.ones(1, 8, 8)

tensor([[[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.]]])

In [29]:
torch.triu(torch.ones(1, 8, 8), diagonal=1)

tensor([[[0., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0.]]])

In [30]:
1 - torch.triu(torch.ones(1, sequence_length, sequence_length), diagonal=1)

tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
         [1., 1., 0.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [1., 1., 1.,  ..., 1., 0., 0.],
         [1., 1., 1.,  ..., 1., 1., 0.],
         [1., 1., 1.,  ..., 1., 1., 1.]]])

In [31]:
# Create a triangular attention mask for causal attention
self_attention_mask = (1 - torch.triu(torch.ones(1, sequence_length, sequence_length), diagonal=1)).bool()  # Upper triangular mask

print(self_attention_mask.shape)
self_attention_mask

torch.Size([1, 256, 256])


tensor([[[ True, False, False,  ..., False, False, False],
         [ True,  True, False,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         ...,
         [ True,  True,  True,  ...,  True, False, False],
         [ True,  True,  True,  ...,  True,  True, False],
         [ True,  True,  True,  ...,  True,  True,  True]]])

In [32]:
# Instantiate the decoder transformer
decoder = TransformerDecoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length=sequence_length)

decoder

TransformerDecoder(
  (embedding): Embedding(10000, 512)
  (positional_encoding): PositionalEncoder()
  (layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_attn): MultiHeadAttention(
        (query_linear): Linear(in_features=512, out_features=512, bias=True)
        (key_linear): Linear(in_features=512, out_features=512, bias=True)
        (value_linear): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
      )
      (cross_attn): MultiHeadAttention(
        (query_linear): Linear(in_features=512, out_features=512, bias=True)
        (key_linear): Linear(in_features=512, out_features=512, bias=True)
        (value_linear): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): FeedForwardSubLayer(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in

In [33]:
# decoder cross_mask

padding_mask = torch.randint(0, 2, (sequence_length, sequence_length))
padding_mask

tensor([[1, 0, 0,  ..., 1, 1, 1],
        [0, 0, 1,  ..., 0, 0, 0],
        [0, 1, 0,  ..., 1, 0, 0],
        ...,
        [1, 0, 0,  ..., 1, 0, 0],
        [1, 1, 0,  ..., 1, 1, 1],
        [1, 1, 0,  ..., 1, 0, 1]])

In [34]:
# The reason why the output is a 3D tensor rather than a 2D one is because the model is processing multiple sequences at once (the batch size is greater than 1) and it’s predicting a probability distribution over all possible words for each position in each sequence.
# So, for each sequence in the batch (8 sequences), and for each position in each sequence (256 positions), you have a vector of length 10000 (the size of your vocabulary) representing the probability distribution over all possible next words.

dec_output = decoder(input_sequence, self_attention_mask, enc_output, padding_mask)
print(dec_output.shape)
print(dec_output)

torch.Size([8, 256, 10000])
tensor([[[ -9.4482,  -9.3230, -10.5091,  ...,  -8.9697,  -9.6845,  -8.9071],
         [ -9.8141,  -8.8747,  -9.7218,  ...,  -8.5592,  -9.5105,  -9.9770],
         [ -9.0077,  -9.8012,  -9.4000,  ...,  -8.8119,  -7.9053,  -9.9523],
         ...,
         [ -9.5091,  -7.7884,  -8.6185,  ...,  -9.5684,  -9.5830,  -9.7031],
         [ -9.4629,  -8.8974,  -9.1689,  ...,  -8.7508, -10.1699,  -9.8727],
         [ -8.7902,  -8.6300,  -9.0303,  ...,  -9.2289,  -9.8179,  -9.1708]],

        [[ -8.6415,  -9.0139,  -9.3738,  ...,  -8.9831,  -9.3313,  -9.6528],
         [ -8.9123,  -8.7154,  -8.8092,  ...,  -9.7207,  -9.7151,  -9.9233],
         [ -8.9029,  -8.9250,  -9.2294,  ...,  -9.2723,  -9.6684, -10.0517],
         ...,
         [ -9.6115,  -9.1534,  -8.9948,  ...,  -9.9716,  -9.3570,  -9.4313],
         [ -9.1855,  -9.2785,  -8.1049,  ...,  -9.5754,  -9.0475,  -8.7596],
         [ -8.9178,  -9.0751,  -9.8033,  ...,  -9.3072,  -9.4373,  -9.8124]],

        [[ -9.34

In [35]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM

model_name = "textattack/distilbert-base-uncased-SST-2"

# Load the tokenizer and pre-trained model
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

DistilBertTokenizerFast(name_or_path='textattack/distilbert-base-uncased-SST-2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [36]:
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [37]:
text = ["The best movie I've ever watched!", "What an awful movie. I regret watching it."]

# Tokenize inputs and pass them to the model for inference
inputs = tokenizer(text, return_tensors="pt", padding=True)
inputs

{'input_ids': tensor([[ 101, 1996, 2190, 3185, 1045, 1005, 2310, 2412, 3427,  999,  102,    0],
        [ 101, 2054, 2019, 9643, 3185, 1012, 1045, 9038, 3666, 2009, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [38]:
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0542,  0.2731],
        [ 0.9809, -0.7639]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [39]:
logits = outputs.logits

logits

tensor([[-0.0542,  0.2731],
        [ 0.9809, -0.7639]], grad_fn=<AddmmBackward0>)

In [40]:
predicted_classes = torch.argmax(logits, dim=1).tolist()
predicted_classes

[1, 0]

In [41]:
for idx, predicted_class in enumerate(predicted_classes):
    print(f"Predicted class for \"{text[idx]}\": {predicted_class}")

Predicted class for "The best movie I've ever watched!": 1
Predicted class for "What an awful movie. I regret watching it.": 0


In [42]:
from datasets import load_dataset

# Load a dataset from Hugging Face's dataset hub
dataset = load_dataset('opinosis', trust_remote_code=True)

dataset


DatasetDict({
    train: Dataset({
        features: ['review_sents', 'summaries'],
        num_rows: 51
    })
})

In [43]:
print(f"Number of instances: {len(dataset['train'])}")


Number of instances: 51


In [44]:
# Show the names of features in the training fold of the dataset
print(f"Feature names: {dataset['train'].column_names}")

Feature names: ['review_sents', 'summaries']


In [45]:
dataset['train'][-2]

{'review_sents': "I bought the 8, gig Ipod Nano that has the built, in video camera .\r\n  Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod .\r\nI have lots of music cd's and dvd's, so currently I'm just interested in storing some of my music and videos on the ipod so I can enjoy them on my vacation, and while at work .\r\nThere's a right way and wrong way to store music and videos onto the ipod .\r\n Audio and video recording are a step above the competition .\r\n As always, the video screen is sharp and bright .\r\nipod nano even better with video camera !\r\n This time around, Apple is branching out of its iPod formula in a small, but not insignificant way by gracing the back of the Nano with its own video camera .\r\n Still, video fans should consider stepping up to a product with a larger screen, such as the Zune HD   or iPod Touch .\r\nThe user interface of the 5G Nano remains almost entirely unchanged with the ex

In [46]:
# Encode the input example, obtain the summary, and decode it
example = dataset['train'][-2]['review_sents']

In [47]:
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer



T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_i

In [48]:
input_ids = tokenizer.encode("summarize: " + example, return_tensors="pt", max_length=512, truncation=True)
input_ids

tensor([[21603,    10,    27,  2944,     8,  9478, 10754,    27, 11410, 20556,
            24,    65,     8,  1192,     6,    16,   671,  1861,     3,     5,
            94,   444,     7,    65,    46,    30,     6,   689,  1078,     6,
           213,    25,   164,  1242,    11,   946,   723,    11,  3075,    84,
            56,  2438,  2400,     8,     3,    23, 11410,     3,     5,    27,
            43,  1995,    13,   723,     3,    75,    26,    31,     7,    11,
         30114,    31,     7,     6,    78,  1083,    27,    31,    51,   131,
          1638,    16,     3, 17445,   128,    13,    82,   723,    11,  3075,
            30,     8,     3,    23, 11410,    78,    27,    54,   777,   135,
            30,    82,  4257,     6,    11,   298,    44,   161,     3,     5,
           290,    31,     7,     3,     9,   269,   194,    11,  1786,   194,
            12,  1078,   723,    11,  3075,  2400,     8,     3,    23, 11410,
             3,     5,  9607,    11,   671,  5592,  

In [49]:
summary_ids = model.generate(input_ids, max_length=150)
summary_ids[0]

tensor([    0,    27,  2944,     8,  9478, 10754,    27, 11410, 20556,    24,
           65,     8,  1192,     6,    16,   671,  1861,     3,     5,    94,
          444,     7,    65,    46,    30,     6,   689,  1078,     6,   213,
           25,   164,  1242,    11,   946,   723,    11,  3075,    84,    56,
         2438,  2400,     8,     3,    23, 11410,     3,     5,     1])

In [50]:
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

print("\nOriginal Text (first 400 characters): \n", example[:400])
print("\nGenerated Summary: \n", summary)


Original Text (first 400 characters): 
 I bought the 8, gig Ipod Nano that has the built, in video camera .
  Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod .
I have lots of music cd's and dvd's, so currently I'm just interested in storing some of my music and videos on the ipod so I can enjoy them on my vacation, and while at work .
There's a right way and wrong wa

Generated Summary: 
 I bought the 8, gig Ipod Nano that has the built, in video camera. Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod.


In [51]:
model_name = "Helsinki-NLP/opus-mt-en-es"

# Load the tokenizer and the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model



MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,),

In [52]:
# The reason why the input and translated IDs vectors have more elements than the corresponding word inputs is due to the way the tokenizer works.
# In your code, the tokenizer.encode function is used to convert the input text into a sequence of IDs, which represent the tokens in the text. These tokens can be individual words, but they can also be smaller units depending on the tokenizer. For example, a word might be split into multiple subwords, each with its own ID.
# Additionally, special tokens are often added to the sequence. For instance, a common practice is to add a special token at the beginning and end of the sequence. In your case, the 0 at the end of each input_ids and translated_ids tensor is likely a special token, such as an end-of-sequence token

english_inputs = ["Hello", "Thank you", "How are you?", "Sorry", "Goodbye"]

# Encode the inputs, generate translations, decode, and print them
for english_input in english_inputs:
    print('english_input', english_input)
    input_ids = tokenizer.encode(english_input, return_tensors="pt")
    print('input_ids', input_ids)
    translated_ids = model.generate(input_ids)
    print('translated_ids', translated_ids)
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    print(f"English: {english_input} | Spanish: {translated_text}")

english_input Hello
input_ids tensor([[3923,    0]])
translated_ids tensor([[65000,  2119,     3,     0]])
English: Hello | Spanish: Hola.
english_input Thank you
input_ids tensor([[1825,   40,    0]])




translated_ids tensor([[65000,  1124,     3,     0]])
English: Thank you | Spanish: Gracias.
english_input How are you?
input_ids tensor([[594,  53,  40,  21,   0]])
translated_ids tensor([[65000,    50,  1102,  1221,    21,     0]])
English: How are you? | Spanish: ¿Cómo estás?
english_input Sorry
input_ids tensor([[5099,    0]])
translated_ids tensor([[65000,   350,  1669,     3,     0]])
English: Sorry | Spanish: Lo siento.
english_input Goodbye
input_ids tensor([[22191,     0]])
translated_ids tensor([[65000,  8631,     3,     0]])
English: Goodbye | Spanish: Adiós.


In [53]:
# Load a specific subset of the dataset 
mlqa = load_dataset("xtreme", name="MLQA.en.en")

mlqa

DatasetDict({
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11590
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1148
    })
})

In [54]:
question = mlqa["test"]["question"][0]
context = mlqa["test"]["context"][0]
print("Question: ", question)
print("Context: ", context)

Question:  Who analyzed the biopsies?
Context:  In 1994, five unnamed civilian contractors and the widows of contractors Walter Kasza and Robert Frost sued the USAF and the United States Environmental Protection Agency. Their suit, in which they were represented by George Washington University law professor Jonathan Turley, alleged they had been present when large quantities of unknown chemicals had been burned in open pits and trenches at Groom. Biopsies taken from the complainants were analyzed by Rutgers University biochemists, who found high levels of dioxin, dibenzofuran, and trichloroethylene in their body fat. The complainants alleged they had sustained skin, liver, and respiratory injuries due to their work at Groom, and that this had contributed to the deaths of Frost and Kasza. The suit sought compensation for the injuries they had sustained, claiming the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conservation a

In [55]:
model_ckp = "deepset/minilm-uncased-squad2"

# Initialize the tokenizer using the model checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_ckp)

tokenizer

BertTokenizerFast(name_or_path='deepset/minilm-uncased-squad2', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [56]:
# Tokenize the inputs returning the result as tensors
inputs = tokenizer(question, context, return_tensors="pt")

inputs

{'input_ids': tensor([[  101,  2040, 16578,  1996, 16012,  4523,  3111,  1029,   102,  1999,
          2807,  1010,  2274, 13294,  6831, 16728,  1998,  1996, 24835,  1997,
         16728,  4787, 10556, 17112,  2050,  1998,  2728, 10097, 12923,  1996,
         18531,  1998,  1996,  2142,  2163,  4483,  3860,  4034,  1012,  2037,
          4848,  1010,  1999,  2029,  2027,  2020,  3421,  2011,  2577,  2899,
          2118,  2375,  2934,  5655, 10722, 12866,  1010,  6884,  2027,  2018,
          2042,  2556,  2043,  2312, 12450,  1997,  4242, 12141,  2018,  2042,
          5296,  1999,  2330, 14496,  1998, 19874,  2012, 18087,  1012, 16012,
          4523,  3111,  2579,  2013,  1996, 17612, 11390,  2020, 16578,  2011,
         18607,  2118, 16012, 24229,  2015,  1010,  2040,  2179,  2152,  3798,
          1997,  4487, 11636,  2378,  1010,  4487, 10609,  6844, 27942,  2319,
          1010,  1998, 13012,  2818, 10626,  8913, 16921, 11474,  1999,  2037,
          2303,  6638,  1012,  1996, 1

In [57]:
from transformers import AutoModelForQuestionAnswering

# Initialize the LLM upon the model checkpoint
model = AutoModelForQuestionAnswering.from_pretrained(model_ckp)

model

BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, elem

In [58]:
import torch

with torch.no_grad():
  # Forward-pass the input through the model
  outputs = model(**inputs)

outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ 0.4331, -5.9864, -5.7614, -6.2329, -5.9369, -6.4554, -6.6515, -6.6686,
          0.4331, -5.9137, -6.0544, -6.6783, -5.9388, -6.2935, -6.2757, -6.5190,
         -6.6694, -6.2961, -6.3753, -6.5280, -6.1553, -5.4943, -5.9384, -6.5034,
         -6.7191, -6.6250, -6.2206, -6.6423, -6.4067, -6.1051, -5.9176, -6.6871,
         -6.3237, -5.6972, -6.5879, -6.0408, -6.6248, -6.8032, -6.8567, -6.0126,
         -6.4246, -6.8355, -6.3049, -6.4013, -6.3018, -6.4767, -6.4598, -6.4260,
         -5.5132, -6.2999, -6.6227, -6.1841, -6.3011, -5.6165, -6.0477, -6.8492,
         -6.7725, -6.1979, -6.2060, -6.3269, -6.3990, -6.4921, -6.2939, -5.9060,
         -6.4909, -6.4594, -6.2571, -6.5393, -6.5011, -6.4583, -6.4658, -6.3752,
         -6.2070, -6.5106, -6.6355, -6.5982, -6.3821, -6.2631, -5.9546,  1.1200,
         -4.9416, -4.9593, -5.7370, -6.0798, -5.8981, -5.1775, -6.2021, -3.7974,
         -0.7555, -0.8330,  7.0906, -1.6840, -0.1442, -4

In [59]:
# Get the most likely start and end answer position from the raw LLM outputs
start_idx = torch.argmax(outputs.start_logits)
end_idx = torch.argmax(outputs.end_logits) + 1

start_idx, end_idx

(tensor(90), tensor(95))

In [60]:
# Access the tokenized inputs tensor to get the answer span
answer_span = inputs["input_ids"][0][start_idx:end_idx]

answer_span

tensor([18607,  2118, 16012, 24229,  2015])

In [61]:
# Decode the answer span to get the extracted answer text
answer = tokenizer.decode(answer_span)
print("Answer: ", answer)

Answer:  rutgers university biochemists


In [62]:
model_name = "distilbert-base-uncased"

# Load a pre-trained LLM, specifying its use for binary classification
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=6)

model

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_layer_norm.bias', 'vocab_layer_norm.weight', 'vocab_transform.weight', 'vocab_transform.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'pre_classifier.bias', 'classifier.weight', 'pre_classifier.we

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [63]:
from transformers import TrainingArguments

# Set up training arguments with a batch size of 8 per GPU and 5 epochs
training_args = TrainingArguments(
    output_dir="./smaller_bert_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=5,
)

training_args

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=

In [64]:

from transformers import Trainer

tokenized_datasets = []

# Set up trainer, assigning previously set up training arguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer

<transformers.trainer.Trainer at 0x274b8f8bf70>

In [65]:
def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average="weighted")
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "f1": f1}

In [66]:
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer

DistilBertTokenizerFast(name_or_path='distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [67]:
# Load your dataset
dataset = load_dataset('emotion', trust_remote_code=True)

dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 2000
    })
})

In [68]:
# Encode your dataset
def encode(examples):
    return tokenizer(examples['text'], truncation=True, padding='max_length')

emotions_encoded = dataset.map(encode, batched=True)

emotions_encoded


Map: 100%|████████████████████████████████████████████████████████████████| 2000/2000 [00:00<00:00, 3154.58 examples/s]


DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 16000
    })
    validation: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2000
    })
})

In [69]:
# Initialize the trainer and assign a training and validation set to it
trainer = Trainer(model=model, args=training_args,
    			compute_metrics=compute_metrics,
    			train_dataset=emotions_encoded["train"],
    			eval_dataset=emotions_encoded["validation"],
    			tokenizer=tokenizer
)

trainer

<transformers.trainer.Trainer at 0x274b55c7b20>

In [70]:
# Print the keys of the first example in the training dataset
print(emotions_encoded["train"][0].keys())


dict_keys(['text', 'label', 'input_ids', 'attention_mask'])


In [71]:
unique_labels = set()
for example in emotions_encoded["train"]:
    unique_labels.add(example["label"])
print(f"Unique labels: {sorted(list(unique_labels))}")

Unique labels: [0, 1, 2, 3, 4, 5]


In [73]:
# # Training loop to fine-tune the model

# trainer.train()

In [74]:
input_texts = ["It's dark and rainy outside", "I love penguins!"]

# Tokenize the input sequences and pass them to the model
inputs = tokenizer(input_texts, return_tensors="pt", padding=True, truncation=True)

inputs

{'input_ids': tensor([[  101,  2009,  1005,  1055,  2601,  1998, 16373,  2648,   102],
        [  101,  1045,  2293, 18134,   999,   102,     0,     0,     0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 0, 0, 0]])}

In [75]:
with torch.no_grad():
    outputs = model(**inputs)

outputs

SequenceClassifierOutput(loss=None, logits=tensor([[ 0.3346,  0.1305,  0.2033, -0.0921, -0.2327, -0.4993],
        [ 0.3221, -0.0413,  0.0839,  0.0180, -0.2351, -0.3689]]), hidden_states=None, attentions=None)

In [76]:
# Obtain class labels from raw predictions
predicted_labels = torch.argmax(outputs.logits, dim=1).tolist()

predicted_labels

[0, 0]

In [77]:
for i, predicted_label in enumerate(predicted_labels):
    print(f"\n Input Text {i + 1}: {input_texts[i]}")
    print(f"Predicted Label: {predicted_label}")


 Input Text 1: It's dark and rainy outside
Predicted Label: 0

 Input Text 2: I love penguins!
Predicted Label: 0


In [79]:
sentiment_analysis = pipeline("sentiment-analysis")

test_examples = [{'text': 'I love this product!', 'label': 1},
                 {'text': 'The service was terrible.', 'label': 0},
                 {'text': 'This movie is amazing.', 'label': 1},
                 {'text': "I'm disappointed with the quality.", 'label': 0}]

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


In [80]:
# Pass the four input texts (without labels) to the pipeline
predictions = sentiment_analysis([example["text"] for example in test_examples])

predictions

[{'label': 'POSITIVE', 'score': 0.9998855590820312},
 {'label': 'NEGATIVE', 'score': 0.9996507167816162},
 {'label': 'POSITIVE', 'score': 0.9998838901519775},
 {'label': 'NEGATIVE', 'score': 0.9997726082801819}]

In [81]:
true_labels = [example["label"] for example in test_examples]
true_labels

[1, 0, 1, 0]

In [82]:
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]
predicted_labels

[1, 0, 1, 0]

In [84]:
from sklearn.metrics import accuracy_score

# assuming true_labels and predicted_labels are defined
result = accuracy_score(true_labels, predicted_labels)
print(result)

1.0


In [87]:
import evaluate
# Load the accuracy metric
accuracy = evaluate.load("accuracy")

result = accuracy.compute(references=true_labels, predictions=predicted_labels)
print(result)

Downloading builder script: 100%|█████████████████████████████████████████████████| 4.20k/4.20k [00:00<00:00, 1.07MB/s]

{'accuracy': 1.0}





In [88]:
# Load the accuracy, precision, recall and F1 score metrics
accuracy = evaluate.load("accuracy")
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Obtain a description of each metric
print(accuracy.description)
print(precision.description)
print(recall.description)
print(f1.description)

Downloading builder script: 100%|█████████████████████████████████████████████████| 7.55k/7.55k [00:00<00:00, 3.89MB/s]
Downloading builder script: 100%|█████████████████████████████████████████████████| 7.36k/7.36k [00:00<00:00, 7.69MB/s]
Downloading builder script: 100%|█████████████████████████████████████████████████| 6.77k/6.77k [00:00<00:00, 6.53MB/s]


Accuracy is the proportion of correct predictions among the total number of cases processed. It can be computed with:
Accuracy = (TP + TN) / (TP + TN + FP + FN)
 Where:
TP: True positive
TN: True negative
FP: False positive
FN: False negative


Precision is the fraction of correctly labeled positive examples out of all of the examples that were labeled as positive. It is computed via the equation:
Precision = TP / (TP + FP)
where TP is the True positives (i.e. the examples correctly labeled as positive) and FP is the False positive examples (i.e. the examples incorrectly labeled as positive).


Recall is the fraction of the positive examples that were correctly labeled by the model as positive. It can be computed with the equation:
Recall = TP / (TP + FN)
Where TP is the true positives and FN is the false negatives.


The F1 score is the harmonic mean of the precision and recall. It can be computed with the equation:
F1 = 2 * (precision * recall) / (precision + recall)






In [89]:
precision = evaluate.load("precision")
recall = evaluate.load("recall")
f1 = evaluate.load("f1")

# Pass the examples to the pipeline, and obtain a list predicted labels
sentiment_analysis = pipeline("sentiment-analysis")
sentiment_analysis

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


<transformers.pipelines.text_classification.TextClassificationPipeline at 0x274b90ea6b0>

In [90]:
test_examples = [
    "Fantastic hotel, exceeded expectations!",
    "Quiet despite central location, great stay.",
    "Friendly staff, welcoming atmosphere.",
    "Spacious, comfy room—a perfect retreat.",
    "Cleanliness could improve, overall decent stay.",
      "Disappointing stay, noisy and unclean room.",
    "Terrible service, unfriendly staff, won't return."
]

test_labels = [1, 1, 1, 1, 0, 0, 0]

In [91]:
predictions = sentiment_analysis([example for example in test_examples])
predicted_labels = [1 if pred["label"] == "POSITIVE" else 0 for pred in predictions]

predicted_labels

[1, 1, 1, 1, 1, 0, 0]

In [92]:
# Compute the metrics by comparing real and predicted labels
print(f1.compute(references=test_labels, predictions=predicted_labels))
print(precision.compute(references=test_labels, predictions=predicted_labels))
print(recall.compute(references=test_labels, predictions=predicted_labels))

{'f1': 0.888888888888889}
{'precision': 0.8}
{'recall': 1.0}


In [93]:
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)




In [94]:
prompt = "Current trends show that by 2030 "

# Encode the prompt, generate text and decode it
prompt_ids = tokenizer.encode(prompt, return_tensors="pt")

prompt_ids

tensor([[11297, 11257,   905,   326,   416, 25054,   220]])

In [101]:
output = model.generate(prompt_ids, max_length=50)
output

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


tensor([[11297, 11257,   905,   326,   416, 25054,   220,  1849,  1169,  1271,
           286,   661,  2877,   287,  8098,   481,   307,   379,   663,  9016,
          1241,  1201,   262,  8069,    82,    13,   198,   464,  1271,   286,
           661,  2877,   287,  8098,   287,   262,  3482,   468,  9292,   416,
           517,   621,  2063,  1201,   262,  8069,    82,    13,   198,   464]])

In [103]:
generated_text = tokenizer.decode(output[0], skip_special_tokens=True)

print("Generated Text: ", generated_text)

Generated Text:  Current trends show that by 2030  the number of people living in poverty will be at its lowest level since the 1970s.
The number of people living in poverty in the UK has fallen by more than half since the 1970s.
The


In [104]:
# Load and compute the perplexity score
perplexity = evaluate.load("perplexity", module_type="metric")
results = perplexity.compute(model_id=model_name,
                             predictions=generated_text)
print("Perplexity: ", results['mean_perplexity'])

Downloading builder script: 100%|█████████████████████████████████████████████████| 8.46k/8.46k [00:00<00:00, 2.77MB/s]
Using pad_token, but it is not set yet.
100%|██████████████████████████████████████████████████████████████████████████████████| 14/14 [00:01<00:00, 11.91it/s]

Perplexity:  3386.8988407452903





In [106]:
# ! pip install rouge_score

# Load the rouge metric
rouge = evaluate.load("rouge")

predictions = ["""Pluto is a dwarf planet in our solar system, located in the Kuiper Belt beyond Neptune, and was formerly considered the ninth planet until its reclassification in 2006."""]
references = ["""Pluto is a dwarf planet in the solar system, located in the Kuiper Belt beyond Neptune, and was previously deemed as a planet until it was reclassified in 2006."""]

# Calculate the rouge scores between the predicted and reference summaries
results = rouge.compute(predictions=predictions,references=references)
print("ROUGE results: ", results)



Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting rouge_score
  Downloading rouge_score-0.1.2.tar.gz (17 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'done'
Building wheels for collected packages: rouge_score
  Building wheel for rouge_score (setup.py): started
  Building wheel for rouge_score (setup.py): finished with status 'done'
  Created wheel for rouge_score: filename=rouge_score-0.1.2-py3-none-any.whl size=24972 sha256=61ed15457e821f5c45cc71d084f36b3a9f8cc7b523392a9bd81b42c5c92bd488
  Stored in directory: C:\Users\Alienware\AppData\Local\Temp\pip-ephem-wheel-cache-620hvy0o\wheels\5f\dd\89\461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4
Successfully built rouge_score
Installing collected packages: rouge_score
Successfully installed rouge_score-0.1.2
ROUGE results:  {'rouge1': 0.7719298245614034, 'rouge2': 0.6181818181818182, 'rougeL': 0.736842105263158, 'rougeLsum': 0.736842105263158}


In [107]:
meteor = evaluate.load("meteor")

llm_outputs = ["He thought it right and necessary to become a knight-errant, roaming the world in armor, seeking adventures and practicing the deeds he had read about in chivalric tales."]
references = ["He believed it was proper and essential to transform into a knight-errant, traveling the world in armor, pursuing adventures, and enacting the heroic deeds he had encountered in tales of chivalry."]

# Compute and print the METEOR score
results = meteor.compute(predictions=llm_outputs, references=references)
print("Meteor: ", results)

Downloading builder script: 100%|█████████████████████████████████████████████████| 6.93k/6.93k [00:00<00:00, 2.31MB/s]
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Alienware\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Alienware\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Alienware\AppData\Roaming\nltk_data...


Meteor:  {'meteor': 0.5350702240481536}


In [108]:
exact_match = evaluate.load("exact_match")

predictions = ["The cat sat on the mat.", "Theaters are great.", "It's like comparing oranges and apples."]
references = ["The cat sat on the mat?", "Theaters are great.", "It's like comparing apples and oranges."]

# Compute the exact match and print the results
results = exact_match.compute(references=references, predictions=predictions)
print("EM results: ", results)

Downloading builder script: 100%|█████████████████████████████████████████████████| 5.67k/5.67k [00:00<00:00, 5.71MB/s]

EM results:  {'exact_match': 0.3333333333333333}





In [109]:
input_sentence_1 = "Hola, ¿cómo estás?"

reference_1 = [
     ["Hello, how are you?", "Hi, how are you?"]
     ]

input_sentences_2 = ["Hola, ¿cómo estás?", "Estoy genial, gracias."]

references_2 = [
     ["Hello, how are you?", "Hi, how are you?"],
     ["I'm great, thanks.", "I'm great, thank you."]
     ]

In [None]:
# The reason why there are multiple reference sentences for each input sentence is because of the inherent ambiguity and variability in translation. There can be several equally correct translations for a given sentence, depending on factors like context, tone, and style. By providing multiple reference translations, we can capture some of this variability and get a more robust estimate of the model’s performance.

# In the code you posted, the BLEU score is being calculated for the translations. The BLEU score is a metric that measures the quality of a translation by comparing it to one or more reference translations. It does this by counting the number of n-gram matches between the translation and the reference(s), and then normalizing by the total number of n-grams in the translation. The more the translation resembles the reference(s), the higher the BLEU score will be.

# In your example, the first input sentence “Hola, ¿cómo estás?” is translated and then the translation is compared to two reference translations: “Hello, how are you?” and “Hi, how are you?”. The BLEU score is then computed for this translation.

# The same process is repeated for the second set of input sentences and references. The final BLEU score is a measure of how well the translations match the reference translations

In [111]:
import evaluate
bleu = evaluate.load("bleu")

translator = pipeline("translation", model="Helsinki-NLP/opus-mt-es-en")

# Translate the first input sentence
translated_output = translator(input_sentence_1)

translated_sentence = translated_output[0]['translation_text']

print("Translated:", translated_sentence)

# Calculate BLEU metric for translation quality
results = bleu.compute(predictions=[translated_sentence], references=reference_1)
print(results)

Downloading builder script: 100%|█████████████████████████████████████████████████| 5.94k/5.94k [00:00<00:00, 5.68MB/s]
Downloading extra modules: 4.07kB [00:00, 1.02MB/s]                                                                    
Downloading extra modules: 100%|██████████████████████████████████████████████████| 3.34k/3.34k [00:00<00:00, 3.35MB/s]


Translated: Hey, how are you?
{'bleu': 0.7598356856515925, 'precisions': [0.8333333333333334, 0.8, 0.75, 0.6666666666666666], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 6, 'reference_length': 6}


In [112]:
# Translate the input sentences, extract the translated text, and compute BLEU score
translated_outputs = translator(input_sentences_2)

translated_outputs

[{'translation_text': 'Hey, how are you?'},
 {'translation_text': "I'm great, thanks."}]

In [114]:
predictions = [translated_output['translation_text'] for translated_output in translated_outputs]

predictions

['Hey, how are you?', "I'm great, thanks."]

In [115]:
results = bleu.compute(predictions=predictions, references=references_2)
print(results)

{'bleu': 0.8627788640890415, 'precisions': [0.9090909090909091, 0.8888888888888888, 0.8571428571428571, 0.8], 'brevity_penalty': 1.0, 'length_ratio': 1.0, 'translation_length': 11, 'reference_length': 11}


In [1]:
# !pip install trl
from trl import PPOTrainer, PPOConfig, create_reference_model, AutoModelForCausalLMWithValueHead

model = AutoModelForCausalLMWithValueHead.from_pretrained('sshleifer/tiny-gpt2')

model

  from .autonotebook import tqdm as notebook_tqdm
To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 2)
      (wpe): Embedding(1024, 2)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-1): 2 x GPT2Block(
          (ln_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=2, out_features=50257, bias=False)
  )
  (v_head): Value

In [2]:
# Instantiate a reference model

# When you call create_reference_model(model), it creates a copy of the model and freezes its parameters. This means that the weights of the reference model will not be updated during training.
# This reference model is then used to compare with the updated model at each step of the training process. The idea is to ensure that the policy (i.e., the behavior of the model) does not change too drastically from one update to the next

model_ref = create_reference_model(model)

model_ref

AutoModelForCausalLMWithValueHead(
  (pretrained_model): GPT2LMHeadModel(
    (transformer): GPT2Model(
      (wte): Embedding(50257, 2)
      (wpe): Embedding(1024, 2)
      (drop): Dropout(p=0.1, inplace=False)
      (h): ModuleList(
        (0-1): 2 x GPT2Block(
          (ln_1): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
          (attn): GPT2Attention(
            (c_attn): Conv1D()
            (c_proj): Conv1D()
            (attn_dropout): Dropout(p=0.1, inplace=False)
            (resid_dropout): Dropout(p=0.1, inplace=False)
          )
          (ln_2): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
          (mlp): GPT2MLP(
            (c_fc): Conv1D()
            (c_proj): Conv1D()
            (act): NewGELUActivation()
            (dropout): Dropout(p=0.1, inplace=False)
          )
        )
      )
      (ln_f): LayerNorm((2,), eps=1e-05, elementwise_affine=True)
    )
    (lm_head): Linear(in_features=2, out_features=50257, bias=False)
  )
  (v_head): Value

In [4]:
# To check if the parameters of a model are frozen, you can iterate over the parameters and check their requires_grad attribute. Here’s a small function that can do this:

def check_if_frozen(model):
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"{name} is not frozen")
        else:
            print(f"{name} is frozen")

In [5]:
check_if_frozen(model)

pretrained_model.transformer.wte.weight is not frozen
pretrained_model.transformer.wpe.weight is not frozen
pretrained_model.transformer.h.0.ln_1.weight is not frozen
pretrained_model.transformer.h.0.ln_1.bias is not frozen
pretrained_model.transformer.h.0.attn.c_attn.weight is not frozen
pretrained_model.transformer.h.0.attn.c_attn.bias is not frozen
pretrained_model.transformer.h.0.attn.c_proj.weight is not frozen
pretrained_model.transformer.h.0.attn.c_proj.bias is not frozen
pretrained_model.transformer.h.0.ln_2.weight is not frozen
pretrained_model.transformer.h.0.ln_2.bias is not frozen
pretrained_model.transformer.h.0.mlp.c_fc.weight is not frozen
pretrained_model.transformer.h.0.mlp.c_fc.bias is not frozen
pretrained_model.transformer.h.0.mlp.c_proj.weight is not frozen
pretrained_model.transformer.h.0.mlp.c_proj.bias is not frozen
pretrained_model.transformer.h.1.ln_1.weight is not frozen
pretrained_model.transformer.h.1.ln_1.bias is not frozen
pretrained_model.transformer.h.1

In [6]:
check_if_frozen(model_ref)

pretrained_model.transformer.wte.weight is frozen
pretrained_model.transformer.wpe.weight is frozen
pretrained_model.transformer.h.0.ln_1.weight is frozen
pretrained_model.transformer.h.0.ln_1.bias is frozen
pretrained_model.transformer.h.0.attn.c_attn.weight is frozen
pretrained_model.transformer.h.0.attn.c_attn.bias is frozen
pretrained_model.transformer.h.0.attn.c_proj.weight is frozen
pretrained_model.transformer.h.0.attn.c_proj.bias is frozen
pretrained_model.transformer.h.0.ln_2.weight is frozen
pretrained_model.transformer.h.0.ln_2.bias is frozen
pretrained_model.transformer.h.0.mlp.c_fc.weight is frozen
pretrained_model.transformer.h.0.mlp.c_fc.bias is frozen
pretrained_model.transformer.h.0.mlp.c_proj.weight is frozen
pretrained_model.transformer.h.0.mlp.c_proj.bias is frozen
pretrained_model.transformer.h.1.ln_1.weight is frozen
pretrained_model.transformer.h.1.ln_1.bias is frozen
pretrained_model.transformer.h.1.attn.c_attn.weight is frozen
pretrained_model.transformer.h.1.a

In [8]:
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('sshleifer/tiny-gpt2')

if tokenizer._pad_token is None:
    tokenizer.add_special_tokens({'pad_token': '[PAD]'})

tokenizer

GPT2TokenizerFast(name_or_path='sshleifer/tiny-gpt2', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '[PAD]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	50257: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}

In [9]:
# Initialize trainer configuration
# this code is setting up a configuration for a PPO trainer with specific batch and mini-batch sizes. This configuration would be used when training a model using the PPO algorithm. 

ppo_config = PPOConfig(batch_size=1, mini_batch_size=1)

ppo_config

PPOConfig(exp_name='ipykernel_launcher', seed=0, log_with=None, task_name=None, model_name='gpt2', query_dataset='imdb', reward_model='sentiment-analysis:lvwerra/distilbert-imdb', remove_unused_columns=True, tracker_kwargs={}, accelerator_kwargs={}, project_kwargs={}, tracker_project_name='trl', push_to_hub_if_best_kwargs={}, steps=20000, learning_rate=1.41e-05, adap_kl_ctrl=True, init_kl_coef=0.2, kl_penalty='kl', target=6, horizon=10000, gamma=1, lam=0.95, cliprange=0.2, cliprange_value=0.2, vf_coef=0.1, batch_size=1, forward_batch_size=None, mini_batch_size=1, gradient_accumulation_steps=1, world_size=None, ppo_epochs=4, max_grad_norm=None, optimize_cuda_cache=None, optimize_device_cache=False, early_stopping=False, target_kl=1, compare_steps=1, ratio_threshold=10.0, use_score_scaling=False, use_score_norm=False, score_clip=None, whiten_rewards=False, gradient_checkpointing=False, is_encoder_decoder=None, is_peft_model=None, backward_batch_size=1, global_backward_batch_size=None, gl

In [12]:
# Create a PPOTrainer instance
# this line of code is setting up a PPO trainer with a specific configuration, model, reference model, and tokenizer. The trainer can then be used to train the model using the PPO algorithm.Typically, the trainer would have a method like train() that you can call to start the training process. The training process involves repeatedly sampling data, using the data to update the model, and then evaluating the performance of the model. The goal is to improve the model’s performance on some task, such as generating text. The PPO algorithm is particularly well-suited to tasks where the data is sequential or temporal in nature. It’s also known for its stability and efficiency, which makes it a popular choice for many reinforcement learning tasks.

ppo_trainer = PPOTrainer(ppo_config, model, model_ref, tokenizer)

ppo_trainer



<trl.trainer.ppo_trainer.PPOTrainer at 0x23d8cdd4ee0>

In [10]:
prompt = "Next year, I "

input = tokenizer.encode(prompt, return_tensors="pt")

input

tensor([[10019,   614,    11,   314,   220]])

In [11]:
# this code is using a pretrained language model to generate a response to a given prompt. The response is calculated by feeding the encoded input into the model and then decoding the model’s output back into text. The response represents what the model thinks is the most likely continuation of the input prompt. The exact details of how the response is calculated depend on the specifics of the model and the respond_to_batch function. 

from trl.core import respond_to_batch

response  = respond_to_batch(model, input) # function to generate a response from the model. The function takes the model and the encoded input as arguments.

response

tensor([[28936, 37056, 32947, 46773, 16886, 34670, 46499, 16330, 19193, 27667,
          7433, 39579, 29584, 47000, 37807, 45925, 34793, 12861, 38280, 30797]])

In [14]:
# In a more complex scenario, you might want to design a reward function that gives higher rewards for better responses and lower rewards for worse ones. This would require a way to evaluate the quality of the responses, which could be based on various factors such as the relevance of the response to the input, the grammatical correctness of the response, etc. This is typically the challenging part in reinforcement learning - designing a good reward function.

import torch
reward = [torch.tensor(1.0)]
reward

[tensor(1.)]

In [15]:
# Train LLM for one step with PPO
# while step() is used for a single step of training, train() is used for full-scale training over multiple epochs. The code is likely a simplified example or a debugging scenario where only a single step of training is being performed. For training a model to completion, you would generally use a train() function or similar.

train_stats = ppo_trainer.step([input[0]], [response[0]], reward) # The step function is used to perform one step of training, where the model’s parameters are updated to maximize the expected reward.

train_stats

# The train_stats dictionary contains various statistics and metrics that are calculated during the training step. Here’s a brief explanation of some of the key metrics:
# ‘objective/kl’: This is the Kullback-Leibler (KL) divergence, which measures how one probability distribution diverges from a second, expected probability distribution. In this case, it’s 0.0, indicating no divergence.
# ‘objective/logprobs’: These are the log probabilities of the actions taken by the model. They are used in the calculation of the policy gradient.
# ‘ppo/mean_scores’: This is the mean of the scores (rewards) obtained during the training step.
# ‘tokens/queries_len_mean’: This is the average length of the queries processed in the training step.
# ‘ppo/loss/policy’, ‘ppo/loss/value’, ‘ppo/loss/total’: These are the losses for the policy, value function, and the total loss respectively. The policy loss is related to how well the model is doing in terms of taking the right actions. The value loss is related to how well the model is predicting the expected future rewards.
# ‘ppo/policy/entropy’: This is the entropy of the policy. It’s a measure of the randomness of the policy. A higher entropy means the policy is more random, while a lower entropy means the policy is more deterministic.
# ‘ppo/returns/mean’: This is the mean of the returns (sum of rewards) obtained during the training step.
# ‘ppo/val/vpred’: This is the predicted value of the state by the model.
# ‘time/ppo/total’: This is the total time taken for the training step.
# The warnings about degrees of freedom being less than or equal to 0 are due to the standard deviation (std()) function being called on a dataset with insufficient size. This can happen when the batch size or mini-batch size is too small.

  std_scores = data["scores"].std()
  stats["tokens/queries_len_std"] = torch.std(query_lens).cpu().numpy().item()
  stats["tokens/responses_len_std"] = torch.std(response_lens).cpu().numpy().item()


{'objective/kl': 0.0,
 'objective/kl_dist': 0.0,
 'objective/logprobs': array([[-10.803115 , -10.834713 , -10.795107 , -10.808006 , -10.844018 ,
         -10.866432 , -10.825367 , -10.817469 , -10.893092 , -10.837414 ,
         -10.79999  , -10.785171 , -10.854528 , -10.8445015, -10.844265 ,
         -10.780741 , -10.829426 , -10.827827 , -10.83161  , -10.8276   ,
         -10.8046465, -10.827068 , -10.825418 , -10.84352  ]],
       dtype=float32),
 'objective/ref_logprobs': array([[-10.803115 , -10.834713 , -10.795107 , -10.808006 , -10.844018 ,
         -10.866432 , -10.825367 , -10.817469 , -10.893092 , -10.837414 ,
         -10.79999  , -10.785171 , -10.854528 , -10.8445015, -10.844265 ,
         -10.780741 , -10.829426 , -10.827827 , -10.83161  , -10.8276   ,
         -10.8046465, -10.827068 , -10.825418 , -10.84352  ]],
       dtype=float32),
 'objective/kl_coef': 0.2,
 'objective/entropy': 216.61012268066406,
 'ppo/mean_non_score_reward': 0.0,
 'ppo/mean_scores': 1.0,
 'ppo/std_

In [20]:
# Number of training steps
num_steps = 100

# Training loop
for i in range(1, num_steps+1):
    # Generate input and response here...
    # ...
    reward = [torch.tensor(1.0)]
    train_stats = ppo_trainer.step([input[0]], [response[0]], reward)
    if i % 10 == 0:
        print(i, 'training steps')
    
train_stats


10 training steps
20 training steps
30 training steps
40 training steps
50 training steps
60 training steps
70 training steps
80 training steps
90 training steps
100 training steps


{'objective/kl': -0.7741384506225586,
 'objective/kl_dist': -0.7741384506225586,
 'objective/logprobs': array([[-10.87762 , -10.886727, -10.792533, -10.858321, -10.956419,
         -10.894209, -10.930903, -10.934779, -10.861883, -10.944152,
         -10.680028, -10.667747, -10.893733, -10.955993, -10.953577,
         -10.661098, -10.710297, -10.921848, -10.909364, -10.898013,
         -10.92437 , -10.918778, -10.887274, -10.879778]], dtype=float32),
 'objective/ref_logprobs': array([[-10.803115 , -10.834713 , -10.795107 , -10.808006 , -10.844018 ,
         -10.866432 , -10.825367 , -10.817469 , -10.893092 , -10.837414 ,
         -10.79999  , -10.785171 , -10.854528 , -10.8445015, -10.844265 ,
         -10.780741 , -10.829426 , -10.827827 , -10.83161  , -10.8276   ,
         -10.8046465, -10.827068 , -10.825418 , -10.84352  ]],
       dtype=float32),
 'objective/kl_coef': 0.19808520577927524,
 'objective/entropy': 217.38421630859375,
 'ppo/mean_non_score_reward': 0.0076672681607306,
 'p

In [25]:
# Calculate the individual toxicities, maximum toxicities, and toxicity ratios

emp_1 = ["Everyone in the team adores him",
           "He is a true genius, pure talent"]
emp_2 = ["Nobody in the team likes him",
           "He is a useless 'good-for-nothing'"]

In [26]:
from evaluate import load

toxicity_metric = load("toxicity")

toxicity_1 = toxicity_metric.compute(predictions=emp_1)
toxicity_2 = toxicity_metric.compute(predictions=emp_2)
print("Toxicities (emp. 1):", toxicity_1['toxicity'])
print("Toxicities (emp. 2): ", toxicity_2['toxicity'])

Using default facebook/roberta-hate-speech-dynabench-r4-target checkpoint


Toxicities (emp. 1): [0.0001386617950629443, 0.00013368602958507836]
Toxicities (emp. 2):  [0.00014245195779949427, 0.010071290656924248]


In [27]:
toxicity_1_max = toxicity_metric.compute(predictions=emp_1, aggregation="maximum")
toxicity_2_max = toxicity_metric.compute(predictions=emp_2, aggregation="maximum")
print("Maximum toxicity (emp. 1):", toxicity_1_max['max_toxicity'])
print("Maximum toxicity (emp. 2): ", toxicity_2_max['max_toxicity'])

Maximum toxicity (emp. 1): 0.0001386617950629443
Maximum toxicity (emp. 2):  0.010071290656924248


In [28]:
toxicity_1_ratio = toxicity_metric.compute(predictions=emp_1, aggregation="ratio")
toxicity_2_ratio = toxicity_metric.compute(predictions=emp_2, aggregation="ratio")
print("Toxicity ratio (emp. 1):", toxicity_1_ratio['toxicity_ratio'])
print("Toxicity ratio (emp. 2): ", toxicity_2_ratio['toxicity_ratio'])

Toxicity ratio (emp. 1): 0.0
Toxicity ratio (emp. 2):  0.0


In [35]:
group1 = ['abc are described as loyal employees', 
          'abc are ambitious in their career expectations']
group2 = ['abc are known for causing lots of team conflicts',
          'abc are verbally violent']

In [33]:
import evaluate

# Load the regard and regard-comparison metrics

regard = evaluate.load("regard")
regard_comp = evaluate.load("regard", "compare")
# regard

In [36]:
# Compute the regard (polarities) of each group separately

polarity_results_1 = regard.compute(data=group1)
print("Polarity in group 1:\n", polarity_results_1)

polarity_results_2 = regard.compute(data=group2)
print("Polarity in group 2:\n", polarity_results_2)


Polarity in group 1:
 {'regard': [[{'label': 'positive', 'score': 0.9098386764526367}, {'label': 'neutral', 'score': 0.0593968965113163}, {'label': 'other', 'score': 0.026468101888895035}, {'label': 'negative', 'score': 0.004296252969652414}], [{'label': 'positive', 'score': 0.7809809446334839}, {'label': 'neutral', 'score': 0.1808599829673767}, {'label': 'other', 'score': 0.03049297071993351}, {'label': 'negative', 'score': 0.007666024379432201}]]}
Polarity in group 2:
 {'regard': [[{'label': 'negative', 'score': 0.9658734202384949}, {'label': 'other', 'score': 0.02155590057373047}, {'label': 'neutral', 'score': 0.012026485055685043}, {'label': 'positive', 'score': 0.0005441230605356395}], [{'label': 'negative', 'score': 0.9774737358093262}, {'label': 'other', 'score': 0.012994571588933468}, {'label': 'neutral', 'score': 0.008945493958890438}, {'label': 'positive', 'score': 0.0005862839752808213}]]}


In [37]:
# Compute the relative regard between the two groups for comparison

polarity_results_comp = regard_comp.compute(data=group1, references=group2)
print("Polarity comparison between groups:\n", polarity_results_comp)

Polarity comparison between groups:
 {'regard_difference': {'positive': 0.8448446070251521, 'neutral': 0.10964245023205876, 'other': 0.011205300223082304, 'negative': -0.9656924393493682}}
