In [3]:
# Import the function for loading Hugging Face pipelines
from transformers import pipeline

# Hub checkpoint used for the sentiment demo below.
# NOTE(review): `model_name` is rebound by several later cells.
model_name = 'nlptown/bert-base-multilingual-uncased-sentiment'

# Load the pipeline for sentiment classification
# (the first call may download the model weights).
classifier = pipeline("text-classification", model=model_name)
# Bare last expression: displays the pipeline object's repr.
classifier

<transformers.pipelines.text_classification.TextClassificationPipeline at 0x29dff8438b0>

In [4]:
# Example customer review with mixed sentiment.
prompt = "The food was good, but service at the restaurant was a bit slow"

# Pass the customer review to the model for prediction.
# The result is a list holding one dict with 'label' and 'score' keys.
prediction = classifier(prompt)
print(prediction)

[{'label': '3 stars', 'score': 0.6387940645217896}]


In [5]:
# Rebinds `model_name` to the summarization checkpoint.
model_name = 'cnicu/t5-small-booksum'

# Load the model pipeline for text summarization
# (downloads the checkpoint files when they are not cached yet).
summarizer = pipeline('summarization', model = model_name)
# Bare last expression: displays the pipeline object's repr.
summarizer

Downloading config.json: 100%|█████████████████████████████████████████████████████| 1.38k/1.38k [00:00<00:00, 687kB/s]
Downloading pytorch_model.bin: 100%|████████████████████████████████████████████████| 242M/242M [00:05<00:00, 46.2MB/s]
Downloading tokenizer_config.json: 100%|██████████████████████████████████████████████████| 1.92k/1.92k [00:00<?, ?B/s]
Downloading spiece.model: 100%|█████████████████████████████████████████████████████| 792k/792k [00:00<00:00, 2.14MB/s]
Downloading tokenizer.json: 100%|█████████████████████████████████████████████████| 2.42M/2.42M [00:00<00:00, 5.99MB/s]
Downloading (…)cial_tokens_map.json: 100%|████████████████████████████████████████| 1.79k/1.79k [00:00<00:00, 1.78MB/s]


<transformers.pipelines.text2text_generation.SummarizationPipeline at 0x29dff8ad390>

In [10]:
# Passage to summarize; it is reused later as the question-answering context.
long_text = '\nThe tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey building, and the tallest structure in Paris. Its base is square, measuring 125 metres (410 ft) on each side. During its construction, the Eiffel Tower surpassed the Washington Monument to become the tallest man-made structure in the world, a title it held for 41 years until the Chrysler Building in New York City was finished in 1930. It was the first structure to reach a height of 300 metres. Due to the addition of a broadcasting aerial at the top of the tower in 1957, it is now taller than the Chrysler Building by 5.2 metres (17 ft). Excluding transmitters, the Eiffel Tower is the second tallest free-standing structure in France after the Millau Viaduct.\n'

# Pass the long text to the model to summarize it.
# max_length caps the summary length, which is why the printed summary
# stops mid-sentence.
outputs = summarizer(long_text, max_length = 30)
outputs

[{'summary_text': 'the Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey'}]

In [11]:
# Access and print the summarized text in the outputs variable:
# `outputs` is a list with one dict whose 'summary_text' key holds the summary.
print(outputs[0]['summary_text'])

the Eiffel Tower is 324 metres (1,063 ft) tall, about the same height as an 81-storey


In [14]:
# Load the model pipeline for question-answering.
# The checkpoint is named explicitly (it is the one the pipeline previously
# fell back to), so the run no longer depends on the library's default and
# no longer triggers the "No model was supplied" warning.
qa_model = pipeline("question-answering", model="distilbert-base-cased-distilled-squad")
question = "For how long was the Eiffel Tower the tallest man-made structure in the world?"

# Pass the necessary inputs to the LLM pipeline for question-answering.
# Keyword arguments make it unambiguous which string is the question and
# which is the context passage.
# NOTE(review): this rebinds `outputs`, which previously held the summary.
outputs = qa_model(question=question, context=long_text)

# Access and print the answer
print(outputs['answer'])

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


41 years


In [32]:
# Rebinds `model_name` to a Spanish-to-English translation checkpoint.
model_name = "Helsinki-NLP/opus-mt-es-en"

# Spanish sentence to translate.
input_text = "Este curso sobre LLMs se está poniendo muy interesante"

# Define pipeline for Spanish-to-English translation
translator = pipeline('translation_es_to_en', model=model_name)

# Translate the input text
translations = translator(input_text)

# Access the output to print the translated text in English:
# the result is a list with one dict keyed by 'translation_text'.
print(translations[0]['translation_text'])

Downloading config.json: 100%|████████████████████████████████████████████████████████████| 1.44k/1.44k [00:00<?, ?B/s]
Downloading pytorch_model.bin: 100%|████████████████████████████████████████████████| 312M/312M [00:07<00:00, 41.9MB/s]
Downloading generation_config.json: 100%|██████████████████████████████████████████████| 293/293 [00:00<00:00, 302kB/s]
Downloading tokenizer_config.json: 100%|████████████████████████████████████████████████████| 44.0/44.0 [00:00<?, ?B/s]
Downloading source.spm: 100%|███████████████████████████████████████████████████████| 826k/826k [00:00<00:00, 10.9MB/s]
Downloading target.spm: 100%|███████████████████████████████████████████████████████| 802k/802k [00:00<00:00, 2.01MB/s]
Downloading vocab.json: 100%|█████████████████████████████████████████████████████| 1.59M/1.59M [00:00<00:00, 3.18MB/s]


This course on LLMs is getting very interesting.


In [28]:
# Set transformer model hyperparameters (consumed by nn.Transformer below).
# NOTE(review): `d_model` is rebound to 512 by a later configuration cell.
d_model = 256            # embedding / hidden size
n_heads = 4              # attention heads per layer
num_encoder_layers = 3   # depth of the encoder stack
num_decoder_layers = 3   # depth of the decoder stack

In [29]:
import torch.nn as nn

# Create the transformer model and assign hyperparameters.
# Arguments not passed here keep the nn.Transformer defaults (the printed
# architecture shows a 2048-wide feed-forward layer and dropout p=0.1).
model = nn.Transformer(
    d_model=d_model,
    nhead=n_heads,
    num_encoder_layers=num_encoder_layers,    
    num_decoder_layers=num_decoder_layers
    )


# Print the module tree to inspect the encoder/decoder structure.
print(model)

Transformer(
  (encoder): TransformerEncoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerEncoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, out_features=256, bias=True)
        )
        (linear1): Linear(in_features=256, out_features=2048, bias=True)
        (dropout): Dropout(p=0.1, inplace=False)
        (linear2): Linear(in_features=2048, out_features=256, bias=True)
        (norm1): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (norm2): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
        (dropout1): Dropout(p=0.1, inplace=False)
        (dropout2): Dropout(p=0.1, inplace=False)
      )
    )
    (norm): LayerNorm((256,), eps=1e-05, elementwise_affine=True)
  )
  (decoder): TransformerDecoder(
    (layers): ModuleList(
      (0-2): 3 x TransformerDecoderLayer(
        (self_attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=256, o

In [34]:
# Create a pipeline for text generation using the gpt2 model
generator = pipeline("text-generation", model = "gpt2")

# Customer review that the generated reply should respond to.
text = "I had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had."

# Opening sentence of the hotel's reply; the model continues from here.
response = "Dear valued customer, I am glad to hear you had a good stay with us."



In [35]:
# Build the prompt for the text generation LLM: the review, followed by a
# header and the first sentence of the reply that the model should continue.
# ("reponse" was a typo inside the text sent to the model; the captured
# output below predates the fix and refreshes on re-run.)

prompt = f"Customer review:\n{text}\n\nHotel response to the customer:\n{response}"

prompt

"Customer review:\nI had a wonderful stay at the Riverview Hotel! The staff were incredibly attentive and the amenities were top-notch. The only hiccup was a slight delay in room service, but that didn't overshadow the fantastic experience I had.\n\nHotel reponse to the customer:\nDear valued customer, I am glad to hear you had a good stay with us."

# Pass the prompt to the model pipeline.
# max_length=150 bounds the length of the generated sequence.
# NOTE(review): presumably this bound counts the prompt tokens as well - confirm against the generation docs.
outputs = generator(prompt, max_length = 150, pad_token_id=generator.tokenizer.eos_token_id) # The EOS token id is supplied as the padding id. NOTE(review): presumably needed because GPT-2 defines no dedicated pad token - confirm.

# Print the augmented sequence generated by the model
# (the 'generated_text' field of the first returned dict).
print(outputs[0]['generated_text'])

In [47]:
import torch  
import torch.nn as nn  
import math  

# Subclass the PyTorch nn.Module class to create a custom module for positional encoding
# This class is used to add positional information to the input embeddings in a Transformer model
class PositionalEncoder(nn.Module):
    """Add fixed sinusoidal positional encodings to a batch of embeddings.

    The encoding table is computed once in the constructor:

        pe[pos, 2i]     = sin(pos * 10000 ** (-2i / d_model))
        pe[pos, 2i + 1] = cos(pos * 10000 ** (-2i / d_model))

    Args:
        d_model: Size of each embedding vector. Odd values are supported;
            the last sine column then simply has no cosine partner.
        max_length: Number of positions to precompute, i.e. the longest
            sequence `forward` accepts.

    Attributes:
        pe: Buffer of shape (1, max_length, d_model) holding the encodings.
    """

    def __init__(self, d_model, max_length):
        super().__init__()

        self.d_model = d_model
        self.max_length = max_length

        # One row per position, one column per embedding dimension.
        pe = torch.zeros(max_length, d_model)

        # Positions 0 .. max_length - 1 as a column vector so that the
        # product with `div_term` broadcasts to (max_length, n_frequencies).
        position = torch.arange(0, max_length, dtype=torch.float).unsqueeze(1)

        # Geometrically decreasing frequencies, one per even column index.
        div_term = torch.exp(torch.arange(0, d_model, 2, dtype=torch.float) * -(math.log(10000.0) / d_model))
        angles = position * div_term

        # Sine goes into the even columns and cosine into the odd columns.
        # With an odd d_model there is one fewer odd column than even
        # columns, so the cosine block is trimmed to d_model // 2 frequencies
        # (the untrimmed assignment raised a shape mismatch).
        pe[:, 0::2] = torch.sin(angles)
        pe[:, 1::2] = torch.cos(angles[:, : d_model // 2])

        # Leading singleton dimension lets the table broadcast over the batch.
        # A buffer is part of the module state (state_dict, .to(device)) but
        # is not a trainable parameter. Registering it also defines `self.pe`.
        self.register_buffer('pe', pe.unsqueeze(0))

    def forward(self, x):
        """Return `x` plus the encodings of its first `x.size(1)` positions.

        Args:
            x: Tensor whose dimension 1 is the sequence dimension and whose
                last dimension has size `d_model`, e.g.
                (batch, seq_len, d_model).

        Returns:
            A tensor with the same shape as `x`.

        Raises:
            ValueError: If the sequence is longer than `max_length`. Without
                this check the addition fails with an opaque broadcast error,
                or silently broadcasts when `max_length == 1`.
        """
        seq_len = x.size(1)
        if seq_len > self.max_length:
            raise ValueError(
                f"sequence length {seq_len} exceeds max_length {self.max_length}"
            )
        return x + self.pe[:, :seq_len]


In [73]:
class MultiHeadAttention(nn.Module):
    """Multi-head scaled dot-product attention.

    Queries, keys and values are linearly projected, split into `num_heads`
    slices of size `head_dim = d_model // num_heads`, attended independently,
    concatenated again and passed through a final linear layer.

    Args:
        d_model: Size of the input and output feature vectors.
        num_heads: Number of attention heads; must divide `d_model`.

    Raises:
        ValueError: If `d_model` is not divisible by `num_heads`. Previously
            this only surfaced later as an obscure `view` error.
    """

    def __init__(self, d_model, num_heads):
        super().__init__()
        if d_model % num_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by num_heads ({num_heads})"
            )

        self.num_heads = num_heads
        self.d_model = d_model
        # Each head works on its own head_dim-wide slice of the projection,
        # so concatenating all heads restores a d_model-wide vector.
        self.head_dim = d_model // num_heads

        # Input projections for queries, keys and values.
        self.query_linear = nn.Linear(d_model, d_model)
        self.key_linear = nn.Linear(d_model, d_model)
        self.value_linear = nn.Linear(d_model, d_model)

        # Projection applied to the concatenated head outputs.
        self.output_linear = nn.Linear(d_model, d_model)

    def split_heads(self, x, batch_size):
        """Reshape (batch, seq, d_model) to (batch * num_heads, seq, head_dim)."""
        x = x.view(batch_size, -1, self.num_heads, self.head_dim)
        # permute yields a non-contiguous tensor; contiguous() copies it so
        # the second view (folding heads into the batch dimension) is legal.
        return x.permute(0, 2, 1, 3).contiguous().view(batch_size * self.num_heads, -1, self.head_dim)

    def compute_attention(self, query, key, mask=None):
        """Return softmax-normalised attention weights.

        Args:
            query: Tensor of shape (batch * num_heads, query_len, head_dim).
            key: Tensor of shape (batch * num_heads, key_len, head_dim).
            mask: Optional tensor broadcastable to
                (batch * num_heads, query_len, key_len). Positions where the
                mask equals 0 are blocked.

        Returns:
            Tensor of shape (batch * num_heads, query_len, key_len) whose
            last dimension sums to 1.
        """
        # Scale by sqrt(head_dim) so the logits do not grow with the head
        # size and saturate the softmax (this scaling was missing before).
        scores = torch.matmul(query, key.transpose(-2, -1)) / (query.size(-1) ** 0.5)
        if mask is not None:
            # A very large negative score becomes a zero weight after the
            # softmax. A finite value (rather than -inf) keeps fully masked
            # rows free of NaN.
            scores = scores.masked_fill(mask == 0, float("-1e20"))
        # torch.softmax instead of F.softmax: the class no longer relies on
        # an alias that is only imported in a later notebook cell.
        return torch.softmax(scores, dim=-1)

    def forward(self, query, key, value, mask=None):
        """Apply multi-head attention.

        Args:
            query: Tensor of shape (batch, query_len, d_model).
            key: Tensor of shape (batch, key_len, d_model).
            value: Tensor of shape (batch, key_len, d_model).
            mask: Optional mask, see `compute_attention`.

        Returns:
            Tensor of shape (batch, query_len, d_model).
        """
        batch_size = query.size(0)

        query = self.split_heads(self.query_linear(query), batch_size)
        key = self.split_heads(self.key_linear(key), batch_size)
        value = self.split_heads(self.value_linear(value), batch_size)

        attention_weights = self.compute_attention(query, key, mask)

        # Weighted sum of the values, then undo split_heads:
        # (batch * heads, seq, head_dim) -> (batch, seq, heads * head_dim).
        output = torch.matmul(attention_weights, value)
        output = (
            output.view(batch_size, self.num_heads, -1, self.head_dim)
            .permute(0, 2, 1, 3)
            .contiguous()
            .view(batch_size, -1, self.d_model)
        )
        return self.output_linear(output)


In [74]:
class FeedForwardSubLayer(nn.Module):
    """Two-layer feed-forward block: Linear -> ReLU -> Linear.

    Args:
        d_model: Size of the input and output feature vectors.
        d_ff: Size of the hidden layer between the two projections.
    """

    def __init__(self, d_model, d_ff):
        super().__init__()
        self.fc1 = nn.Linear(d_model, d_ff)   # expand to the hidden width
        self.fc2 = nn.Linear(d_ff, d_model)   # project back to d_model
        self.relu = nn.ReLU()

    def forward(self, x):
        """Transform the last dimension of `x`; the output shape equals the input shape."""
        hidden = self.fc1(x)
        activated = self.relu(hidden)
        return self.fc2(activated)

In [75]:
# Complete the initialization of elements in the encoder layer
class EncoderLayer(nn.Module):
    """One encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Feature size of the token representations.
        num_heads: Number of attention heads.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask):
        """Run the block on `x`, using `mask` for the self-attention step.

        In self-attention the same tensor `x` serves as query, key and value.
        """
        attended = self.self_attn(x, x, x, mask)
        after_attention = self.norm1(x + self.dropout(attended))
        transformed = self.feed_forward(after_attention)
        return self.norm2(after_attention + self.dropout(transformed))

In [76]:
class TransformerEncoder(nn.Module):
    """Encoder body: token embedding, positional encoding and a layer stack.

    Args:
        vocab_size: Number of rows in the embedding table.
        d_model: Feature size of the token representations.
        num_layers: Number of stacked `EncoderLayer` blocks.
        num_heads: Attention heads per layer.
        d_ff: Hidden width of each feed-forward sub-layer.
        dropout: Dropout probability used inside every layer.
        max_sequence_length: Number of positions the positional encoder
            precomputes.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoder(d_model, max_sequence_length)
        # Build the stack one layer at a time; ModuleList registers each
        # layer so its parameters belong to this module.
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(EncoderLayer(d_model, num_heads, d_ff, dropout))

    def forward(self, x, mask):
        """Embed the token ids in `x` and pass them through every layer.

        The same `mask` is handed to each layer's self-attention.
        """
        hidden = self.positional_encoding(self.embedding(x))
        for encoder_layer in self.layers:
            hidden = encoder_layer(hidden, mask)
        return hidden


In [84]:
import torch.nn.functional as F


class ClassifierHead(nn.Module):
    """Classification head applied to the first token of each sequence.

    Args:
        d_model: Feature size of the incoming token representations.
        num_classes: Number of output classes.
    """

    def __init__(self, d_model, num_classes):
        super().__init__()
        # Linear layer for multi-class classification.
        self.fc = nn.Linear(d_model, num_classes)

    def forward(self, x):
        """Return log class probabilities of shape (batch_size, num_classes).

        `x` is indexed as (batch_size, sequence_length, d_model). Only the
        feature vector at position 0 of every sequence is classified.
        NOTE(review): this presumes position 0 holds a summary token (e.g.
        [CLS]); nothing in this notebook enforces that - confirm upstream.
        """
        first_token = x[:, 0, :]
        logits = self.fc(first_token)
        return F.log_softmax(logits, dim=-1)

In [85]:
# Configuration for the hand-written encoder/decoder cells below.
# NOTE(review): `d_model` is rebound here (it was 256 for the nn.Transformer demo).
num_classes = 3          # outputs of the classification head
vocab_size = 10000       # rows in the embedding table / upper bound for random token ids
batch_size = 8           # sequences per batch
d_model = 512            # feature size of token representations
num_heads = 8            # attention heads per layer
num_layers = 6           # stacked encoder / decoder layers
d_ff = 2048              # hidden width of the feed-forward sub-layers
sequence_length = 256    # tokens per sequence and positional-encoding length
dropout = 0.1            # dropout probability

In [86]:
# Note: the token ids and the attention mask used here are random
# placeholders. In practice the mask should mark the real positions of the
# padding tokens that were added to bring all sequences to the same length.
# No random seed is set, so the values differ on every run.

# Random token ids in [0, vocab_size), shape (batch_size, sequence_length).
input_sequence = torch.randint(0, vocab_size, (batch_size, sequence_length))


print(input_sequence.shape)
input_sequence

torch.Size([8, 256])


tensor([[3276, 5084, 3060,  ..., 1444, 6205, 8953],
        [7178, 9596, 3913,  ..., 2767, 9638, 1859],
        [3357, 3281, 4249,  ..., 8888, 5844,  721],
        ...,
        [4958, 3438, 2315,  ..., 6940, 9966,  563],
        [6210, 2765, 2150,  ...,  895, 9827, 9617],
        [5198, 1807, 9069,  ...,  472, 4867, 2383]])

In [87]:
# Random 0/1 attention mask of shape (sequence_length, sequence_length).
# MultiHeadAttention blocks the positions where the mask equals 0.
mask = torch.randint(0, 2, (sequence_length, sequence_length))

print(mask.shape)
mask

torch.Size([256, 256])


tensor([[1, 1, 0,  ..., 1, 0, 1],
        [0, 1, 1,  ..., 0, 0, 0],
        [1, 0, 0,  ..., 1, 1, 0],
        ...,
        [1, 1, 0,  ..., 0, 0, 0],
        [1, 0, 1,  ..., 0, 0, 1],
        [0, 0, 1,  ..., 1, 0, 1]])

In [88]:
# Instantiate the encoder transformer's body (the classification head is
# created in the next cell).
encoder = TransformerEncoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length=sequence_length)

# Bare last expression: displays the module tree.
encoder

TransformerEncoder(
  (embedding): Embedding(10000, 512)
  (positional_encoding): PositionalEncoder()
  (layers): ModuleList(
    (0-5): 6 x EncoderLayer(
      (self_attn): MultiHeadAttention(
        (query_linear): Linear(in_features=512, out_features=512, bias=True)
        (key_linear): Linear(in_features=512, out_features=512, bias=True)
        (value_linear): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): FeedForwardSubLayer(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in_features=2048, out_features=512, bias=True)
        (relu): ReLU()
      )
      (norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (norm2): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
)

In [89]:
# Instantiate the classification head.
# NOTE(review): this rebinds `classifier`, which earlier held the Hugging Face sentiment pipeline.
classifier = ClassifierHead(d_model, num_classes)

classifier

ClassifierHead(
  (fc): Linear(in_features=512, out_features=3, bias=True)
)

In [127]:
# Complete the forward pass through the encoder body.
# The result has shape (batch_size, sequence_length, d_model).
enc_output = encoder(input_sequence, mask)

print(enc_output.shape)
enc_output


torch.Size([8, 256, 512])


tensor([[[-1.5396e+00, -7.0473e-01, -1.6752e-01,  ..., -4.3265e-01,
           1.6643e-01, -6.7266e-01],
         [ 3.7615e-01, -2.1864e-01,  1.9548e+00,  ...,  7.2642e-01,
          -5.6769e-01,  4.9699e-01],
         [-5.7968e-01, -4.6237e-01,  8.9962e-01,  ...,  5.6324e-01,
          -1.7061e+00,  3.1116e-01],
         ...,
         [ 8.2552e-02, -1.3685e+00, -1.0649e-01,  ...,  8.2046e-01,
          -3.9226e-01,  6.9439e-01],
         [-1.7032e-01, -5.6079e-01, -1.4322e+00,  ...,  1.2878e+00,
           7.4063e-01,  2.2125e+00],
         [-6.5154e-01, -6.4432e-01,  3.9084e-01,  ...,  2.0036e+00,
           9.3471e-01,  6.8842e-01]],

        [[-1.1413e+00, -1.9727e-01,  1.2967e+00,  ...,  9.2610e-01,
          -3.9590e-02,  1.1990e+00],
         [ 4.1502e-01,  8.3782e-01, -1.3256e-01,  ...,  9.2458e-01,
          -1.0622e+00,  8.6573e-05],
         [-6.5585e-01, -2.5852e-01, -5.0404e-01,  ...,  6.7345e-01,
           1.0727e+00,  3.8947e-02],
         ...,
         [-3.7751e-02, -7

In [92]:
# Classify every sequence from the encoder's hidden states.
# The encoder result is stored in `enc_output`; the previous argument
# `output` is never defined, so this cell raised NameError on a fresh kernel.
classification = classifier(enc_output)
print("Classification outputs for a batch of ", batch_size, "sequences:")
# One row of log class probabilities per sequence: (batch_size, num_classes).
print(classification.shape)
classification

Classification outputs for a batch of  8 sequences:
torch.Size([8, 3])


tensor([[-0.6070, -2.4803, -0.9907],
        [-1.0721, -1.7006, -0.7441],
        [-0.9410, -0.9812, -1.4486],
        [-0.5018, -1.3718, -1.9598],
        [-0.6680, -2.1002, -1.0083],
        [-0.6394, -2.7262, -0.8991],
        [-0.8657, -1.0693, -1.4439],
        [-0.5899, -1.5080, -1.4949]], grad_fn=<LogSoftmaxBackward0>)

In [118]:
class DecoderLayer(nn.Module):
    """One decoder block: self-attention, cross-attention and feed-forward.

    Every sub-layer output goes through dropout, is added back to its input
    (residual connection) and is then layer-normalised.

    Args:
        d_model: Feature size of the token representations.
        num_heads: Number of attention heads in both attention sub-layers.
        d_ff: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability applied to each sub-layer output.
    """

    def __init__(self, d_model, num_heads, d_ff, dropout):
        super().__init__()

        # Self-attention over the decoder input and cross-attention over
        # the encoder output.
        self.self_attn = MultiHeadAttention(d_model, num_heads)
        self.cross_attn = MultiHeadAttention(d_model, num_heads)
        self.feed_forward = FeedForwardSubLayer(d_model, d_ff)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.norm3 = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, causal_mask, encoder_output, cross_mask):
        """Run the block.

        Args:
            x: Decoder-side representations.
            causal_mask: Mask for the self-attention step.
            encoder_output: Encoder representations, used as keys and values
                of the cross-attention step.
            cross_mask: Mask for the cross-attention step.
        """
        self_attended = self.self_attn(x, x, x, causal_mask)
        hidden = self.norm1(x + self.dropout(self_attended))

        # Queries come from the decoder; keys and values from the encoder.
        cross_attended = self.cross_attn(hidden, encoder_output, encoder_output, cross_mask)
        hidden = self.norm2(hidden + self.dropout(cross_attended))

        transformed = self.feed_forward(hidden)
        return self.norm3(hidden + self.dropout(transformed))

In [119]:
class TransformerDecoder(nn.Module):
    """Decoder with a next-token prediction head.

    Args:
        vocab_size: Rows in the embedding table and size of the output
            distribution.
        d_model: Feature size of the token representations.
        num_layers: Number of stacked `DecoderLayer` blocks.
        num_heads: Attention heads per attention sub-layer.
        d_ff: Hidden width of each feed-forward sub-layer.
        dropout: Dropout probability used inside every layer.
        max_sequence_length: Number of positions the positional encoder
            precomputes.
    """

    def __init__(self, vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, d_model)
        self.positional_encoding = PositionalEncoder(d_model, max_sequence_length)
        # Stack of decoder layers, registered one at a time.
        self.layers = nn.ModuleList()
        for _ in range(num_layers):
            self.layers.append(DecoderLayer(d_model, num_heads, d_ff, dropout))

        # Linear layer (head) for next-word prediction.
        self.fc = nn.Linear(d_model, vocab_size)

    def forward(self, x, self_mask, encoder_output, cross_mask):
        """Return log-probabilities over the vocabulary for every position.

        Args:
            x: Token ids of the decoder input.
            self_mask: Mask for each layer's self-attention.
            encoder_output: Encoder representations attended to by each
                layer's cross-attention.
            cross_mask: Mask for each layer's cross-attention.

        Returns:
            Tensor whose last dimension has size `vocab_size`. log_softmax
            runs over that last dimension, so each position holds its own
            normalised log-distribution.
        """
        hidden = self.positional_encoding(self.embedding(x))
        for decoder_layer in self.layers:
            hidden = decoder_layer(hidden, self_mask, encoder_output, cross_mask)

        # Apply the forward pass through the model head.
        logits = self.fc(hidden)
        return F.log_softmax(logits, dim=-1)

In [120]:
# Fresh random token ids for the decoder demo, shape (batch_size, sequence_length).
# NOTE(review): this rebinds `input_sequence`; `enc_output` was computed from the earlier tensor of the same name.
input_sequence = torch.randint(0, vocab_size, (batch_size, sequence_length))

print(input_sequence.shape)
input_sequence

torch.Size([8, 256])


tensor([[3781, 4176, 9546,  ...,  608,  945, 8430],
        [6604, 7862, 5303,  ..., 7011, 8362,  568],
        [6679, 6208, 8972,  ..., 4628, 3468, 2262],
        ...,
        [7556, 6363, 4086,  ..., 7937, 2155, 6590],
        [ 692,  910, 1684,  ..., 2832, 3182, 2349],
        [2623, 2797, 6853,  ..., 8498, 2130,  256]])

In [121]:
# Scratch step 1 of building a causal mask: an all-ones (1, 8, 8) tensor.
torch.ones(1, 8, 8)

tensor([[[1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.],
         [1., 1., 1., 1., 1., 1., 1., 1.]]])

In [122]:
# Scratch step 2: diagonal=1 keeps only the entries strictly above the main
# diagonal, i.e. the "future" positions.
torch.triu(torch.ones(1, 8, 8), diagonal=1)

tensor([[[0., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 0., 1., 1., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1., 1., 1.],
         [0., 0., 0., 0., 0., 1., 1., 1.],
         [0., 0., 0., 0., 0., 0., 1., 1.],
         [0., 0., 0., 0., 0., 0., 0., 1.],
         [0., 0., 0., 0., 0., 0., 0., 0.]]])

In [123]:
# Scratch step 3: subtracting from 1 flips the pattern, leaving ones on and
# below the diagonal (each position and the positions before it).
1 - torch.triu(torch.ones(1, sequence_length, sequence_length), diagonal=1)

tensor([[[1., 0., 0.,  ..., 0., 0., 0.],
         [1., 1., 0.,  ..., 0., 0., 0.],
         [1., 1., 1.,  ..., 0., 0., 0.],
         ...,
         [1., 1., 1.,  ..., 1., 0., 0.],
         [1., 1., 1.,  ..., 1., 1., 0.],
         [1., 1., 1.,  ..., 1., 1., 1.]]])

In [124]:
# Create a triangular attention mask for causal attention.
# The mask is lower-triangular: entry (i, j) is True when j <= i, so every
# position may attend to itself and to earlier positions only. The leading
# dimension of size 1 lets the mask broadcast over batch and heads.
self_attention_mask = torch.tril(torch.ones(1, sequence_length, sequence_length)).bool()

print(self_attention_mask.shape)
self_attention_mask

torch.Size([1, 256, 256])


tensor([[[ True, False, False,  ..., False, False, False],
         [ True,  True, False,  ..., False, False, False],
         [ True,  True,  True,  ..., False, False, False],
         ...,
         [ True,  True,  True,  ...,  True, False, False],
         [ True,  True,  True,  ...,  True,  True, False],
         [ True,  True,  True,  ...,  True,  True,  True]]])

In [125]:
# Instantiate the decoder transformer with the same hyperparameters as the encoder.
decoder = TransformerDecoder(vocab_size, d_model, num_layers, num_heads, d_ff, dropout, max_sequence_length=sequence_length)

# Bare last expression: displays the module tree.
decoder

TransformerDecoder(
  (embedding): Embedding(10000, 512)
  (positional_encoding): PositionalEncoder()
  (layers): ModuleList(
    (0-5): 6 x DecoderLayer(
      (self_attn): MultiHeadAttention(
        (query_linear): Linear(in_features=512, out_features=512, bias=True)
        (key_linear): Linear(in_features=512, out_features=512, bias=True)
        (value_linear): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
      )
      (cross_attn): MultiHeadAttention(
        (query_linear): Linear(in_features=512, out_features=512, bias=True)
        (key_linear): Linear(in_features=512, out_features=512, bias=True)
        (value_linear): Linear(in_features=512, out_features=512, bias=True)
        (output_linear): Linear(in_features=512, out_features=512, bias=True)
      )
      (feed_forward): FeedForwardSubLayer(
        (fc1): Linear(in_features=512, out_features=2048, bias=True)
        (fc2): Linear(in

In [130]:
# Decoder cross_mask: a random 0/1 placeholder of shape
# (sequence_length, sequence_length). In practice it should mark the
# padding positions of the encoder input.

padding_mask = torch.randint(0, 2, (sequence_length, sequence_length))
padding_mask

tensor([[0, 1, 0,  ..., 0, 0, 0],
        [1, 1, 0,  ..., 0, 1, 1],
        [1, 0, 1,  ..., 0, 0, 1],
        ...,
        [0, 1, 0,  ..., 0, 1, 0],
        [1, 0, 1,  ..., 1, 0, 0],
        [1, 0, 0,  ..., 0, 0, 0]])

In [132]:
# The output is a 3D tensor because the decoder scores every vocabulary
# entry for each position of each sequence in the batch.
# For each of the 8 sequences and each of the 256 positions there is a vector
# of length 10000 (the vocabulary size) holding log-probabilities (the decoder
# ends with log_softmax) over the possible next tokens.

dec_output = decoder(input_sequence, self_attention_mask, enc_output, padding_mask)
print(dec_output.shape)
print(dec_output)

torch.Size([8, 256, 10000])
tensor([[[ -8.7817, -10.2451,  -9.6885,  ..., -10.1293,  -9.3597,  -9.2839],
         [ -8.4072, -10.3035,  -9.8577,  ...,  -9.9263,  -9.5064, -10.0248],
         [ -8.6076,  -9.8351, -10.0339,  ...,  -9.2415,  -8.4927,  -8.5276],
         ...,
         [ -8.9352, -10.3473,  -9.9254,  ...,  -9.2700,  -9.4030,  -9.0087],
         [ -8.3065,  -9.9286, -10.0848,  ...,  -9.1453,  -9.5359,  -9.3849],
         [ -8.9925,  -9.0589,  -9.7228,  ...,  -9.2637,  -8.6402,  -9.2644]],

        [[ -9.4488,  -9.6935,  -9.9559,  ..., -10.2595,  -8.2290,  -9.0882],
         [ -8.0041,  -9.3526, -10.6797,  ...,  -9.4334,  -8.1505,  -8.3640],
         [ -8.5836,  -8.8375, -10.2864,  ..., -10.2429,  -9.6378,  -8.9388],
         ...,
         [ -8.3471,  -9.4997, -10.1958,  ...,  -9.1147,  -9.8314,  -9.1181],
         [ -7.8797, -10.4160, -10.3558,  ...,  -9.9459, -10.0784,  -9.8117],
         [ -7.8935,  -9.3017, -10.1916,  ...,  -9.3722,  -8.7363,  -8.8932]],

        [[ -8.44

In [9]:
from transformers import AutoModel, AutoTokenizer, AutoModelForSequenceClassification, AutoModelForSeq2SeqLM

model_name = "textattack/distilbert-base-uncased-SST-2"

# Load the tokenizer that belongs to this checkpoint (only the tokenizer is loaded in this cell)
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer



DistilBertTokenizerFast(name_or_path='textattack/distilbert-base-uncased-SST-2', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [10]:
# Load the same checkpoint with a sequence-classification head configured for 2 labels
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model



DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [13]:
text = ["The best movie I've ever watched!", "What an awful movie. I regret watching it."]

# Tokenize both reviews into one PyTorch batch. padding=True pads the shorter review
# so both rows have the same length (visible as the trailing 0 in the first row of
# input_ids and attention_mask below). The model is not called in this cell.
inputs = tokenizer(text, return_tensors="pt", padding=True)
inputs

{'input_ids': tensor([[ 101, 1996, 2190, 3185, 1045, 1005, 2310, 2412, 3427,  999,  102,    0],
        [ 101, 2054, 2019, 9643, 3185, 1012, 1045, 9038, 3666, 2009, 1012,  102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [14]:
# Forward pass: the tokenizer output is unpacked into keyword arguments.
# NOTE(review): autograd is enabled here (the logits below carry a grad_fn);
# for inference-only use, consider wrapping the call in torch.no_grad().
outputs = model(**inputs)
outputs

SequenceClassifierOutput(loss=None, logits=tensor([[-0.0542,  0.2731],
        [ 0.9809, -0.7639]], grad_fn=<AddmmBackward0>), hidden_states=None, attentions=None)

In [15]:
# Raw (unnormalised) class scores: one row per input text, one column per label
logits = outputs.logits

logits

tensor([[-0.0542,  0.2731],
        [ 0.9809, -0.7639]], grad_fn=<AddmmBackward0>)

In [16]:
# Index of the highest-scoring label in every row, converted to a plain Python list
predicted_classes = logits.argmax(dim=1).tolist()
predicted_classes

[1, 0]

In [17]:
# Walk the reviews and their predicted label ids side by side and report each pair
for review, predicted_class in zip(text, predicted_classes):
    print(f'Predicted class for "{review}": {predicted_class}')

Predicted class for "The best movie I've ever watched!": 1
Predicted class for "What an awful movie. I regret watching it.": 0


In [18]:
from datasets import load_dataset

# Load the 'opinosis' dataset from the Hugging Face dataset hub.
# NOTE(review): trust_remote_code=True permits the dataset's own loading script to be
# executed locally — keep it only for sources you trust.
dataset = load_dataset('opinosis', trust_remote_code=True)

dataset


DatasetDict({
    train: Dataset({
        features: ['review_sents', 'summaries'],
        num_rows: 51
    })
})

In [20]:
# Count the examples in the train split
print(f"Number of instances: {len(dataset['train'])}")


Number of instances: 51


In [21]:
# Show the column (feature) names of the train split
print(f"Feature names: {dataset['train'].column_names}")

Feature names: ['review_sents', 'summaries']


In [22]:
# Inspect the second-to-last example of the train split
dataset['train'][-2]

{'review_sents': "I bought the 8, gig Ipod Nano that has the built, in video camera .\r\n  Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod .\r\nI have lots of music cd's and dvd's, so currently I'm just interested in storing some of my music and videos on the ipod so I can enjoy them on my vacation, and while at work .\r\nThere's a right way and wrong way to store music and videos onto the ipod .\r\n Audio and video recording are a step above the competition .\r\n As always, the video screen is sharp and bright .\r\nipod nano even better with video camera !\r\n This time around, Apple is branching out of its iPod formula in a small, but not insignificant way by gracing the back of the Nano with its own video camera .\r\n Still, video fans should consider stepping up to a product with a larger screen, such as the Zune HD   or iPod Touch .\r\nThe user interface of the 5G Nano remains almost entirely unchanged with the ex

In [23]:
# Keep the review text of the second-to-last example as the text to summarise.
# Encoding, generation and decoding are not done in this cell.
example = dataset['train'][-2]['review_sents']

In [24]:
# Switch to the t5-small checkpoint. This rebinds `model_name`, `tokenizer` and `model`,
# so the DistilBERT classifier objects from the earlier cells are no longer reachable
# under these names.
model_name = "t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)
tokenizer

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


T5TokenizerFast(name_or_path='t5-small', vocab_size=32100, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_id_43>', '<extra_i

In [25]:
# Prepend the "summarize: " task prefix and encode the text as a PyTorch tensor,
# truncating the input to at most 512 tokens.
input_ids = tokenizer.encode("summarize: " + example, return_tensors="pt", max_length=512, truncation=True)
input_ids

tensor([[21603,    10,    27,  2944,     8,  9478, 10754,    27, 11410, 20556,
            24,    65,     8,  1192,     6,    16,   671,  1861,     3,     5,
            94,   444,     7,    65,    46,    30,     6,   689,  1078,     6,
           213,    25,   164,  1242,    11,   946,   723,    11,  3075,    84,
            56,  2438,  2400,     8,     3,    23, 11410,     3,     5,    27,
            43,  1995,    13,   723,     3,    75,    26,    31,     7,    11,
         30114,    31,     7,     6,    78,  1083,    27,    31,    51,   131,
          1638,    16,     3, 17445,   128,    13,    82,   723,    11,  3075,
            30,     8,     3,    23, 11410,    78,    27,    54,   777,   135,
            30,    82,  4257,     6,    11,   298,    44,   161,     3,     5,
           290,    31,     7,     3,     9,   269,   194,    11,  1786,   194,
            12,  1078,   723,    11,  3075,  2400,     8,     3,    23, 11410,
             3,     5,  9607,    11,   671,  5592,  

In [26]:
# Generate the summary token ids, capped at max_length=150;
# [0] selects the single generated sequence of the batch for display.
summary_ids = model.generate(input_ids, max_length=150)
summary_ids[0]

tensor([    0,    27,  2944,     8,  9478, 10754,    27, 11410, 20556,    24,
           65,     8,  1192,     6,    16,   671,  1861,     3,     5,    94,
          444,     7,    65,    46,    30,     6,   689,  1078,     6,   213,
           25,   164,  1242,    11,   946,   723,    11,  3075,    84,    56,
         2438,  2400,     8,     3,    23, 11410,     3,     5,     1])

In [27]:
# Turn the generated ids back into text; skip_special_tokens=True leaves the
# tokenizer's special tokens out of the decoded string.
summary = tokenizer.decode(summary_ids[0], skip_special_tokens=True)

# Show the first 400 characters of the source text next to the generated summary
print("\nOriginal Text (first 400 characters): \n", example[:400])
print("\nGenerated Summary: \n", summary)


Original Text (first 400 characters): 
 I bought the 8, gig Ipod Nano that has the built, in video camera .
  Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod .
I have lots of music cd's and dvd's, so currently I'm just interested in storing some of my music and videos on the ipod so I can enjoy them on my vacation, and while at work .
There's a right way and wrong wa

Generated Summary: 
 I bought the 8, gig Ipod Nano that has the built, in video camera. Itunes has an on, line store, where you may purchase and download music and videos which will install onto the ipod.


In [28]:
model_name = "Helsinki-NLP/opus-mt-en-es"

# Load the tokenizer and the seq2seq model for this translation checkpoint
# (rebinds `tokenizer` and `model`, replacing the t5-small objects)
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(model_name)

model



MarianMTModel(
  (model): MarianModel(
    (shared): Embedding(65001, 512, padding_idx=65000)
    (encoder): MarianEncoder(
      (embed_tokens): Embedding(65001, 512, padding_idx=65000)
      (embed_positions): MarianSinusoidalPositionalEmbedding(512, 512)
      (layers): ModuleList(
        (0-5): 6 x MarianEncoderLayer(
          (self_attn): MarianAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (self_attn_layer_norm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (activation_fn): SiLUActivation()
          (fc1): Linear(in_features=512, out_features=2048, bias=True)
          (fc2): Linear(in_features=2048, out_features=512, bias=True)
          (final_layer_norm): LayerNorm((512,),

In [29]:
# Why the id tensors contain more elements than the inputs have words:
# - tokenizer.encode maps text to token ids, and a token is not necessarily a whole
#   word: a word may be split into several sub-word pieces, each with its own id
#   (e.g. the three-word input "How are you?" yields four ids plus a trailing 0 below).
# - Special tokens are added as well. Every input_ids and translated_ids tensor below
#   ends with 0, which is presumably the end-of-sequence token, and every translated_ids
#   tensor starts with 65000, which matches the padding_idx of the model's embeddings
#   (presumably reused as the decoder start token) — confirm with
#   tokenizer.eos_token_id and tokenizer.pad_token_id.

english_inputs = ["Hello", "Thank you", "How are you?", "Sorry", "Goodbye"]

# For each phrase: encode it, generate the translation ids, decode them, and print
# every intermediate value so the id sequences can be inspected
for english_input in english_inputs:
    print('english_input', english_input)
    input_ids = tokenizer.encode(english_input, return_tensors="pt")
    print('input_ids', input_ids)
    translated_ids = model.generate(input_ids)
    print('translated_ids', translated_ids)
    translated_text = tokenizer.decode(translated_ids[0], skip_special_tokens=True)
    print(f"English: {english_input} | Spanish: {translated_text}")

english_input Hello
input_ids tensor([[3923,    0]])




translated_ids tensor([[65000,  2119,     3,     0]])
English: Hello | Spanish: Hola.
english_input Thank you
input_ids tensor([[1825,   40,    0]])
translated_ids tensor([[65000,  1124,     3,     0]])
English: Thank you | Spanish: Gracias.
english_input How are you?
input_ids tensor([[594,  53,  40,  21,   0]])
translated_ids tensor([[65000,    50,  1102,  1221,    21,     0]])
English: How are you? | Spanish: ¿Cómo estás?
english_input Sorry
input_ids tensor([[5099,    0]])
translated_ids tensor([[65000,   350,  1669,     3,     0]])
English: Sorry | Spanish: Lo siento.
english_input Goodbye
input_ids tensor([[22191,     0]])
translated_ids tensor([[65000,  8631,     3,     0]])
English: Goodbye | Spanish: Adiós.


In [30]:
# Load only the "MLQA.en.en" configuration (subset) of the "xtreme" dataset
mlqa = load_dataset("xtreme", name="MLQA.en.en")

mlqa

DatasetDict({
    test: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 11590
    })
    validation: Dataset({
        features: ['id', 'title', 'context', 'question', 'answers'],
        num_rows: 1148
    })
})

In [31]:
# Take the first example of the test split. Indexing the row first reads a single
# record, whereas mlqa["test"]["question"][0] asks for the whole "question" column
# (11,590 values in the test split) just to keep one of them.
first_example = mlqa["test"][0]
question = first_example["question"]
context = first_example["context"]
print("Question: ", question)
print("Context: ", context)

Question:  Who analyzed the biopsies?
Context:  In 1994, five unnamed civilian contractors and the widows of contractors Walter Kasza and Robert Frost sued the USAF and the United States Environmental Protection Agency. Their suit, in which they were represented by George Washington University law professor Jonathan Turley, alleged they had been present when large quantities of unknown chemicals had been burned in open pits and trenches at Groom. Biopsies taken from the complainants were analyzed by Rutgers University biochemists, who found high levels of dioxin, dibenzofuran, and trichloroethylene in their body fat. The complainants alleged they had sustained skin, liver, and respiratory injuries due to their work at Groom, and that this had contributed to the deaths of Frost and Kasza. The suit sought compensation for the injuries they had sustained, claiming the USAF had illegally handled toxic materials, and that the EPA had failed in its duty to enforce the Resource Conservation a

In [32]:
model_ckp = "deepset/minilm-uncased-squad2"

# Initialize the tokenizer from the question-answering checkpoint
# (rebinds `tokenizer`, replacing the translation tokenizer)
tokenizer = AutoTokenizer.from_pretrained(model_ckp)

tokenizer

BertTokenizerFast(name_or_path='deepset/minilm-uncased-squad2', vocab_size=30522, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

In [33]:
# Encode the question and the context as one sequence pair and return PyTorch tensors.
# This tokenizer reports no usable length limit (its model_max_length is a huge
# sentinel value), while the question-answering model has only 512 position
# embeddings, so the pair is capped explicitly. truncation="only_second" trims the
# context and never the question. The example used in this notebook already fits
# within the limit, so its encoding is unaffected.
inputs = tokenizer(
    question,
    context,
    return_tensors="pt",
    truncation="only_second",
    max_length=512,
)

inputs

{'input_ids': tensor([[  101,  2040, 16578,  1996, 16012,  4523,  3111,  1029,   102,  1999,
          2807,  1010,  2274, 13294,  6831, 16728,  1998,  1996, 24835,  1997,
         16728,  4787, 10556, 17112,  2050,  1998,  2728, 10097, 12923,  1996,
         18531,  1998,  1996,  2142,  2163,  4483,  3860,  4034,  1012,  2037,
          4848,  1010,  1999,  2029,  2027,  2020,  3421,  2011,  2577,  2899,
          2118,  2375,  2934,  5655, 10722, 12866,  1010,  6884,  2027,  2018,
          2042,  2556,  2043,  2312, 12450,  1997,  4242, 12141,  2018,  2042,
          5296,  1999,  2330, 14496,  1998, 19874,  2012, 18087,  1012, 16012,
          4523,  3111,  2579,  2013,  1996, 17612, 11390,  2020, 16578,  2011,
         18607,  2118, 16012, 24229,  2015,  1010,  2040,  2179,  2152,  3798,
          1997,  4487, 11636,  2378,  1010,  4487, 10609,  6844, 27942,  2319,
          1010,  1998, 13012,  2818, 10626,  8913, 16921, 11474,  1999,  2037,
          2303,  6638,  1012,  1996, 1

In [34]:
from transformers import AutoModelForQuestionAnswering

# Load the same checkpoint with its question-answering head
# (rebinds `model`, replacing the translation model)
model = AutoModelForQuestionAnswering.from_pretrained(model_ckp)

model

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


BertForQuestionAnswering(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 384, padding_idx=0)
      (position_embeddings): Embedding(512, 384)
      (token_type_embeddings): Embedding(2, 384)
      (LayerNorm): LayerNorm((384,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=384, out_features=384, bias=True)
              (key): Linear(in_features=384, out_features=384, bias=True)
              (value): Linear(in_features=384, out_features=384, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=384, out_features=384, bias=True)
              (LayerNorm): LayerNorm((384,), eps=1e-12, elem

In [35]:
import torch

# Inference only: no_grad() turns off gradient tracking for the forward pass
with torch.no_grad():
  # Forward-pass the input through the model
  outputs = model(**inputs)

outputs

QuestionAnsweringModelOutput(loss=None, start_logits=tensor([[ 0.4331, -5.9864, -5.7614, -6.2329, -5.9369, -6.4554, -6.6515, -6.6686,
          0.4331, -5.9137, -6.0544, -6.6783, -5.9388, -6.2935, -6.2757, -6.5190,
         -6.6694, -6.2961, -6.3753, -6.5280, -6.1553, -5.4943, -5.9384, -6.5034,
         -6.7191, -6.6250, -6.2206, -6.6423, -6.4067, -6.1051, -5.9176, -6.6871,
         -6.3237, -5.6972, -6.5879, -6.0408, -6.6248, -6.8032, -6.8567, -6.0126,
         -6.4246, -6.8355, -6.3049, -6.4013, -6.3018, -6.4767, -6.4598, -6.4260,
         -5.5132, -6.2999, -6.6227, -6.1841, -6.3011, -5.6165, -6.0477, -6.8492,
         -6.7725, -6.1979, -6.2060, -6.3269, -6.3990, -6.4921, -6.2939, -5.9060,
         -6.4909, -6.4594, -6.2571, -6.5393, -6.5011, -6.4583, -6.4658, -6.3752,
         -6.2070, -6.5106, -6.6355, -6.5982, -6.3821, -6.2631, -5.9546,  1.1200,
         -4.9416, -4.9593, -5.7370, -6.0798, -5.8981, -5.1775, -6.2021, -3.7974,
         -0.7555, -0.8330,  7.0906, -1.6840, -0.1442, -4

In [36]:
# Pick the highest-scoring start and end token positions from the model outputs.
# The end position is shifted by one so it can serve as an exclusive slice bound.
start_idx = outputs.start_logits.argmax()
end_idx = outputs.end_logits.argmax() + 1

start_idx, end_idx

(tensor(90), tensor(95))

In [37]:
# Slice the token ids of the first (and only) sequence in the batch between the
# predicted start position and the exclusive end position
answer_span = inputs["input_ids"][0][start_idx:end_idx]

answer_span

tensor([18607,  2118, 16012, 24229,  2015])

In [38]:
# Convert the answer token ids back into a readable string
answer = tokenizer.decode(answer_span)
print("Answer: ", answer)

Answer:  rutgers university biochemists


In [39]:
model_name = "distilbert-base-uncased"

# Load the base checkpoint with a sequence-classification head configured for 2 labels.
# NOTE(review): the load log below reports pre-training weights that were not used for
# this architecture; the classification head presumably starts untrained, so the model
# needs fine-tuning before its predictions are meaningful — confirm.
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)

model

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertForSequenceClassification: ['vocab_transform.bias', 'vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_layer_norm.bias', 'vocab_projector.bias']
- This IS expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForS

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [40]:
from transformers import TrainingArguments

# Training configuration: results are written to ./smaller_bert_finetuned, the batch
# size is 8 per device (the recorded run reports _n_gpu=0, i.e. no GPU), and training
# runs for 5 epochs. Every other option keeps its default value.
training_args = TrainingArguments(
    output_dir="./smaller_bert_finetuned",
    per_device_train_batch_size=8,
    num_train_epochs=5,
)

training_args

TrainingArguments(
_n_gpu=0,
adafactor=False,
adam_beta1=0.9,
adam_beta2=0.999,
adam_epsilon=1e-08,
auto_find_batch_size=False,
bf16=False,
bf16_full_eval=False,
data_seed=None,
dataloader_drop_last=False,
dataloader_num_workers=0,
dataloader_pin_memory=True,
ddp_backend=None,
ddp_bucket_cap_mb=None,
ddp_find_unused_parameters=None,
ddp_timeout=1800,
debug=[],
deepspeed=None,
disable_tqdm=False,
do_eval=False,
do_predict=False,
do_train=False,
eval_accumulation_steps=None,
eval_delay=0,
eval_steps=None,
evaluation_strategy=no,
fp16=False,
fp16_backend=auto,
fp16_full_eval=False,
fp16_opt_level=O1,
fsdp=[],
fsdp_config={'fsdp_min_num_params': 0, 'xla': False, 'xla_fsdp_grad_ckpt': False},
fsdp_min_num_params=0,
fsdp_transformer_layer_cls_to_wrap=None,
full_determinism=False,
gradient_accumulation_steps=1,
gradient_checkpointing=False,
greater_is_better=None,
group_by_length=False,
half_precision_backend=auto,
hub_model_id=None,
hub_private_repo=False,
hub_strategy=every_save,
hub_token=

In [53]:
# NOTE(review): this cell currently fails with the ImportError shown below. The missing
# name is looked up in mlflow's own environment_variables module, which suggests a
# broken or mismatched mlflow installation — TODO confirm and repair the environment
# (e.g. reinstall mlflow into the kernel's environment with `%pip` and a pinned version).
# Earlier repair attempts (installing entrypoints, sqlparse, databricks_cli and
# importlib_metadata, and upgrading / reinstalling mlflow and transformers) were left
# here as commented-out `!pip` commands and have been condensed into this note.

from transformers import Trainer

# Placeholder: an empty list gives the Trainer no examples to train on.
# TODO replace with the tokenized training split before calling trainer.train().
tokenized_datasets = []

# Set up trainer, assigning previously set up training arguments
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets,
)

trainer

ImportError: cannot import name 'MLFLOW_ENABLE_ASYNC_LOGGING' from 'mlflow.environment_variables' (C:\Users\Alienware\AppData\Roaming\Python\Python310\site-packages\mlflow\environment_variables.py)