In [1]:
import torch
# Environment report: installed PyTorch build, CUDA visibility, and (if any) the first GPU's name.
cuda_available = torch.cuda.is_available()
print("PyTorch version:", torch.__version__)
print("Is CUDA available?:", cuda_available)
if cuda_available:
    print("CUDA device name:", torch.cuda.get_device_name(0))

PyTorch version: 2.4.1
Is CUDA available?: True
CUDA device name: NVIDIA GeForce RTX 4070


In [12]:
import torch.nn as nn
from transformers import AutoModel, AutoTokenizer
from torch.nn.utils.rnn import pad_sequence
import torch
import torch.nn.functional as F
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset

## Task 1: Sentence Transformer Implementation

Sentence Transformers generate fixed-length sentence embeddings from variable-length texts. Unlike more complex methods, our implementation utilizes pre-trained BERT models with straightforward pooling techniques such as mean and max pooling to aggregate token information and convert token-level embeddings into sentence-level embeddings.

For classification tasks, sentence-level embeddings effectively categorize sentences. To adapt our Sentence Transformer model for multiple tasks, including Named Entity Recognition (NER), we've incorporated an option to generate token-level embeddings. This is achieved by specifying pooling='no_pool', which bypasses the pooling step and preserves the individual token embeddings needed for tasks requiring token-level granularity. The tokenize_and_encode method accepts raw sentences directly, so the model can be used effectively for generating sentence embeddings.

#### Model Architecture

**Initialization:**

* model_name: Utilizes a pre-trained BERT model, 'bert-base-uncased'.
* max_length: Sets a consistent maximum sequence length for input sentences to ensure uniform processing during tokenization.

**Forward Pass**

* Processes input_ids and attention_mask generated by the tokenizer.
* Performs a forward pass using BERT to obtain the last hidden states.
* Depending on the chosen pooling method, aggregates token embeddings into a single vector per sentence.

**Pooling Strategies:**

* **cls**: Utilizes the CLS token’s embedding, primarily for classification tasks.
* **mean**: Averages token embeddings to provide a generalized representation of the sentence context.
* **max**: Identifies the most significant features by applying max pooling across token embeddings.
* **no_pool**: Retains the full sequence of embeddings, suitable for token-level tasks or further processing.

**Tokenization and Encoding:**

The tokenize_and_encode method handles the tokenization and encoding of raw text inputs, preparing them for the model. It returns embeddings based on the selected pooling strategy, ready for downstream tasks.

In [3]:
# Default token budget per sentence; the model classes below use it as the
# default `max_length` when the tokenizer pads/truncates its input.
max_length = 32

In [13]:
class SentenceTransformer(nn.Module):
    """Pre-trained transformer encoder with a selectable pooling step.

    Wraps an ``AutoModel`` and its matching ``AutoTokenizer`` and turns the
    encoder's last hidden states into either one vector per sentence
    (``cls`` / ``mean`` / ``max``) or one vector per token (``no_pool``).
    """

    # Pooling names accepted by forward() and tokenize_and_encode().
    POOLING_METHODS = ("cls", "mean", "max", "no_pool")

    def __init__(self, model_name="bert-base-uncased",max_length=max_length):
        """Load the pre-trained encoder and tokenizer.

        Args:
            model_name: Checkpoint name passed to ``from_pretrained``.
            max_length: Length every tokenized sentence is padded/truncated to.
        """
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.max_length = max_length

    def forward(self, input_ids, attention_mask, pooling='mean'):
        """Encode a tokenized batch and pool the last hidden states.

        Args:
            input_ids: Token ids produced by the tokenizer.
            attention_mask: Mask produced by the tokenizer; zero entries are
                excluded from ``mean`` and ``max`` pooling.
            pooling: One of ``POOLING_METHODS``.

        Returns:
            The last hidden states reduced over the sequence axis (axis 1) for
            ``cls`` / ``mean`` / ``max``, or unreduced for ``no_pool``.

        Raises:
            ValueError: If ``pooling`` is not a supported name.
        """
        # Fail fast: reject a bad pooling name before paying for the encoder pass.
        if pooling not in self.POOLING_METHODS:
            raise ValueError("Invalid pooling method specified")

        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        last_hidden_state = outputs.last_hidden_state

        if pooling == "cls":
            # First position of every sequence.
            return last_hidden_state[:, 0]
        if pooling == "no_pool":
            return last_hidden_state

        # Trailing singleton axis lets the mask broadcast over the hidden axis.
        token_mask = attention_mask.unsqueeze(-1)

        if pooling == "mean":
            # Cast explicitly so the sum/clamp below never run on an integer tensor.
            weights = token_mask.to(last_hidden_state.dtype)
            summed = (last_hidden_state * weights).sum(dim=1)
            counts = weights.sum(dim=1)
            # clamp guards against division by zero for a fully masked row.
            return summed / torch.clamp(counts, min=1e-9)

        # pooling == "max": masked positions become -inf so they can never win.
        masked_hidden = last_hidden_state.masked_fill(~token_mask.bool(), float('-inf'))
        return torch.max(masked_hidden, dim=1)[0]

    # Returns the dimension of final hidden layer, to be used in Task 2
    def get_hidden_size(self):
        """Return ``hidden_size`` from the wrapped encoder's config."""
        return self.bert.config.hidden_size

    def tokenize_and_encode(self, sentences, pooling='mean'):
        """Tokenize raw text and return its embeddings.

        Args:
            sentences: A string or list of strings handed to the tokenizer.
            pooling: One of ``POOLING_METHODS``; forwarded to ``forward``.

        Returns:
            Whatever ``forward`` returns for the chosen pooling.
        """
        encoded = self.tokenizer(
            sentences,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        )
        # Tokenizer output is moved to wherever the encoder weights live.
        input_ids = encoded["input_ids"].to(self.bert.device)
        attention_mask = encoded["attention_mask"].to(self.bert.device)
        return self(input_ids, attention_mask, pooling)


**Testing the model:**

In [15]:
# Load Model: pick the GPU when one is visible, otherwise fall back to CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
sentence_transformer_model = SentenceTransformer().to(device)

# Example Sentences
sentences = ["Machine learning is amazing!", "Artificial intelligence is transforming industries."]

# Generate Sentence Embeddings with 'mean' pooling (the default of tokenize_and_encode)
embeddings = sentence_transformer_model.tokenize_and_encode(sentences)

print("Embeddings shape:", embeddings.shape)  # one pooled vector per input sentence
print("First sentence embedding (first 10 dims):", embeddings[0][:10])  # Preview first 10 dimensions


Embeddings shape: torch.Size([2, 768])
First sentence embedding (first 10 dims): tensor([ 0.0383,  0.1468, -0.0972,  0.0882,  0.2505, -0.5942,  0.2747,  0.7684,
        -0.3251, -0.5464], device='cuda:0', grad_fn=<SliceBackward0>)


In [16]:
# Example Sentences
sentences = ["Machine learning is amazing!", "Artificial intelligence is transforming industries."]

# Generate token-level embeddings with 'no_pool' (pooling step is skipped)
embeddings = sentence_transformer_model.tokenize_and_encode(sentences,pooling = 'no_pool')

print("Embeddings shape:", embeddings.shape)  # Expected: (2, max_length, 768), i.e. one vector per token position
# NOTE: with 'no_pool' embeddings[0] is 2-D, so [:10] selects the first 10 token rows, not 10 dimensions.
print("First sentence embedding (first 10 dims):", embeddings[0][:10])  # Preview first 10 token rows


Embeddings shape: torch.Size([2, 32, 768])
First sentence embedding (first 10 dims): tensor([[ 0.1668,  0.1314, -0.0622,  ..., -0.2837,  0.1851,  0.4111],
        [-0.1358,  0.0814, -0.1430,  ..., -0.2873,  0.4565,  0.5274],
        [-0.5486,  0.2052, -0.0356,  ..., -1.1440,  0.0069, -0.0233],
        ...,
        [ 0.3040, -0.1088,  0.4053,  ...,  0.1167,  0.0503, -0.1390],
        [ 0.1495, -0.3485,  0.2995,  ...,  0.2205,  0.1133, -0.3069],
        [ 0.4380,  0.0248,  0.4735,  ...,  0.0920, -0.0062, -0.0892]],
       device='cuda:0', grad_fn=<SliceBackward0>)


## Task 2: Multi-Task Learning Expansion

For the task of expanding a Sentence Transformer model to handle multi-task learning, the implementation has been designed to accommodate two distinct natural language processing tasks simultaneously. The architecture changes involve the integration of task-specific heads on top of a shared transformer-based encoder, allowing for both task-specific processing and resource-efficient learning. Here's a concise description of the modifications:

#### Multi-Task Model Description

The MultiTaskModel class is structured to utilize the encoding capabilities of a pre-defined transformer model, specifically designed to generate embeddings that serve multiple tasks. This design ensures that the model can perform different types of analyses on the same input data without redundant processing.

**Model Components:**
- **Shared Encoder:** The SentenceTransformer model defined in Task 1 is used as the shared backbone. This encoder processes input sentences and produces embeddings. The choice of transformer (model_name) can be specified, with "bert-base-uncased" being a common default for English language tasks.

- **Task-specific Heads:** Two linear layers serve as task-specific heads:

  * **Classifier Head:** This component handles Sentence Classification by predicting the class of a sentence from a predefined set. It utilizes embeddings derived from the pooled output of the sentence transformer. `embedding_dim`, representing the dimensionality of the input features to the classifier, defines the size of the embeddings, while `num_task_A`, specifies the number of categories in the classification task and serves as the output size.
  * **NER Head:** For a task such as Named Entity Recognition (NER), which requires token-level predictions. This head operates on the sequence of token embeddings directly output by the transformer without pooling. `num_task_B` represents the number of NER classes, which serve as the output parameter.

**Forward Pass:**
- **Input Handling:** The model first tokenizes input sentences using the SentenceTransformer’s tokenizer, applying padding and truncation to fit the specified max_length.
- **Task-based Processing:**
  * For sentence classification, the method selects a pooling strategy (like mean, max, or using the cls token) to condense the entire token sequence into a single vector, which is then fed into the classifier head.
  * For NER or similar tasks requiring token-level details, the no_pool strategy is employed, preserving the original sequence of embeddings for token-level classification.

This architecture effectively supports multi-task learning by leveraging shared representations for efficiency while allowing for task-specific adjustments via separate heads. This approach not only optimizes the training process but also enhances the model's ability to generalize across different types of tasks by learning shared features.

In [17]:
class MultiTaskModel(nn.Module):
    """Shared sentence encoder with two task-specific linear heads.

    Task A ("classifier") maps a pooled sentence embedding to ``num_task_A``
    logits; task B ("ner") maps every token embedding to ``num_task_B`` logits.
    """

    # Task names accepted by forward().
    TASKS = ("classifier", "ner")

    def __init__(self, model_name, num_task_A, num_task_B, device=None, max_length=max_length):
        """Build the encoder and both heads and move them to ``device``.

        Args:
            model_name: Checkpoint name for the shared encoder.
            num_task_A: Number of sentence-classification classes.
            num_task_B: Number of token-level (NER) classes.
            device: Target device; falls back to ``'cpu'`` when falsy.
            max_length: Length sentences are padded/truncated to.
        """
        super().__init__()
        self.max_length = max_length
        self.device = device or 'cpu'
        # Forward max_length so the encoder's own tokenize_and_encode() pads to
        # the same length as forward() below.
        self.encoder = SentenceTransformer(model_name=model_name, max_length=max_length)
        embedding_dim = self.encoder.get_hidden_size()
        self.classifier_head = nn.Linear(embedding_dim, num_task_A)
        self.ner_head = nn.Linear(embedding_dim, num_task_B)
        # Moves encoder and both heads in one call.
        self.to(self.device)

    def forward(self, sentences, task="classifier", pooling='mean'):
        """Tokenize ``sentences`` and return logits for the requested task.

        Args:
            sentences: A string or list of strings.
            task: ``"classifier"`` for sentence-level logits or ``"ner"`` for
                token-level logits.
            pooling: Pooling used for the classifier task; ignored for ``"ner"``,
                which always uses ``'no_pool'``.

        Returns:
            Output of the classifier head on the pooled embeddings, or of the
            NER head on the unpooled token embeddings.

        Raises:
            ValueError: If ``task`` is not one of ``TASKS``.
        """
        # Reject an unknown task before spending time on tokenization/encoding.
        if task not in self.TASKS:
            raise ValueError("Unknown task specified!")

        inputs = self.encoder.tokenizer(
            sentences,
            padding="max_length",
            truncation=True,
            return_tensors="pt",
            max_length=self.max_length,
        )
        # Use the device the weights are actually on, so the model keeps working
        # after a later `.to(...)` call that leaves `self.device` stale.
        target_device = self.classifier_head.weight.device
        input_ids = inputs["input_ids"].to(target_device)
        attention_mask = inputs["attention_mask"].to(target_device)

        if task == 'classifier':
            sentence_embeddings = self.encoder(input_ids, attention_mask, pooling=pooling)
            return self.classifier_head(sentence_embeddings)

        # task == 'ner': keep one embedding per token position.
        token_level_embeddings = self.encoder(input_ids, attention_mask, pooling='no_pool')
        return self.ner_head(token_level_embeddings)


**Testing the Model**

When testing the model, the output logits for the classifier task should be a tensor of size `[batch_size, number_of_classes]`, holding the unnormalized scores (logits) over the defined categories for each sentence in the batch; applying a softmax turns them into a probability distribution.

For the NER task, the expected output is `[batch_size, sequence_length, number_of_classes]`. The sequence_length corresponds to the number of tokens in each sentence, which equals the `max_length` due to padding. This ensures consistency in the output dimensions across different inputs.

In [18]:
# Build a multi-task model with 3 sentence classes and 5 token classes for a shape check
multi_task_model = MultiTaskModel(
    model_name="bert-base-uncased",
    num_task_A=3,   # e.g., positive, negative, neutral
    num_task_B=5,   # e.g., some entity tags
    device=torch.device("cuda" if torch.cuda.is_available() else "cpu")
)

# Classification forward pass: one row of logits per sentence
sentences_cls = ["I love this movie!", "It was terrible..."]
logits_cls = multi_task_model(sentences_cls, task="classifier")
print("Classifier logits:", logits_cls.shape)  

# NER forward pass: one row of logits per token position of each sentence
sentences_ner = ["John lives in New York", "Barack Obama was president"]
logits_ner = multi_task_model(sentences_ner, task="ner")
print("NER logits:", logits_ner.shape)  

Classifier logits: torch.Size([2, 3])
NER logits: torch.Size([2, 32, 5])


## Task 3: Training Considerations

Freezing the entire network and freezing only part of it each have advantages and disadvantages:

1. **Freezing the Entire Network:**
 When the entire network is frozen, weights remain unchanged, and no training is conducted. This approach saves computational resources, as it eliminates the need for updates during training. If the pretrained model's domain is similar to ours, freezing the network can produce effective results, particularly with small datasets, as it helps avoid overfitting. However, this method lacks flexibility and may not perform well unless the pretrained model is extremely close to your domain.
 
2. **Freezing Only Transformer Backbone**
The transformer backbone's weights are fixed, allowing the pretrained model to act as a feature extractor. The task-specific heads are trained to map these extracted embeddings to the specific categories or values relevant to our tasks. This method accelerates training compared to fine-tuning the entire model. It is particularly beneficial when dealing with small datasets, as it preserves the core features of the pre-trained model. This approach leverages the generalizability of pre-trained embeddings while customizing the model to specific tasks. It’s beneficial when the new tasks differ in finer details that the last layers can adapt to. However, if our task diverges significantly from the original training context of the model, performance may degrade.
  
   
3. **Freezing only one of the task-specific heads**
   If one of the task-specific heads is frozen, it enables targeted adjustments to the model’s performance. This strategy is beneficial when one task is well-tuned and stable, while another still requires optimization. By freezing the head of the well-performing task, we can focus training efforts on areas that need improvement, enhancing overall efficiency. This approach is particularly valuable when dealing with tasks of varying complexity and datasets of differing sizes. For example, if a task with limited data has already achieved satisfactory results, freezing its corresponding head prevents overfitting and conserves the integrity of its performance. Simultaneously, it allows continued training for the task with more abundant data, where further performance gains are possible. On the other hand, if one task has significantly more data, training the model jointly without adjustments could lead to dominance by the task with more data, potentially skewing the model's ability to learn from the smaller dataset effectively. Freezing the well-trained task might help mitigate this by allowing more training focus on the underrepresented task. Freezing the head for the more stable or adequately trained task also means fewer parameters require updates during training, which enhances computational efficiency and helps balance the model’s learning across its various tasks.
   

#### How to Approach a Transfer Learning Process

1. When selecting a pretrained model, I would prefer a robust option like BERT, which has been extensively trained on diverse and large datasets. Additionally, choosing a model pretrained on data that closely aligns with my specific domain is crucial to leverage transferrable features effectively.
2. First, we freeze the backbone and train only the task-specific layers to adapt the model to the new task. Depending on performance and overfitting concerns, we then gradually unfreeze the backbone, starting from the top layers, using a small learning rate and decreasing it further as more layers are unfrozen.
3. When utilizing transfer learning, selecting a pre-trained model that is already trained on vast datasets has several advantages, such as drastically reducing the need for additional data and shortening training times. However, fine-tuning such a model requires careful consideration.
     
    * **Gradual Unfreezing:** It's crucial to unfreeze layers progressively to avoid catastrophic forgetting, where the model loses previously learned information from the original training set. We begin with the last few layers, as they are more specialized to the task at hand, and we gradually work our way to earlier layers if necessary.

    * **Learning Rate Adjustments:** We start with a small learning rate to preserve the core features of the model and prevent it from deviating too much from its initial training. As we unfreeze more (earlier) layers, we keep the learning rate small or reduce it further, because those layers hold general-purpose features that large updates would overwrite.

    * **Preventing Overfitting:** By only updating the weights of the final layers, there's a risk of overfitting, where the model learns the training data too well and performs poorly on unseen data. To combat this, unfreezing additional layers allows the model to learn more generalizable features rather than just memorizing the training data.

    * **Adapting to New Domains:** If the new task diverges significantly from the data used in pre-training, it might be beneficial to unfreeze more layers. This approach allows the model to adjust its internal representations more extensively to better suit the new domain.

Following these steps can effectively leverage a pre-trained model for new tasks while maintaining a balance between adaptation and retention of learned knowledge.
   

## Task 4: Training Loop Implementation

#### Data

We will generate a simple dataset to test our training implementation and to explore how different properties of the data can be utilized for multi-tasking. This dataset is designed to support tasks that may require multiple labels per instance, with some data points labeled for both tasks and others labeled for only one.

#### Data Overview

Our dataset includes:

- **Sentences:** Each entry is a string representing a sentence.
- **Sentiment Labels:** Numerical labels indicating the sentiment of the sentence:
   * 1: Positive
   * 0: Neutral
   * -1: Negative
- **NER Labels:** Lists of integers, where each integer corresponds to a token in the sentence and represents an entity type. If NER labels are not applicable, the entry is None.
   * 0: O (No entity)
   * 1: PER (Person)
   * 2: ORG (Organization)
   * 3: LOC (Location)
     
- **Task Flags:** Descriptors indicating whether labels are provided for both tasks ("both") or a single task ("classification" for sentiment only, "ner" for NER only).

#### Data Structure
The structured data is as follows:

In [19]:
# Creating combined data with availability flags
# Each entry is (sentence, sentiment_label, ner_labels_or_None, task_flag).
# This raw form uses None for missing NER labels and -1 for negative sentiment;
# `data_combined` is reassigned in the next cell with DataLoader-friendly placeholders.
data_combined = [
    ("The weather is lovely in California today.", 1, [0, 0, 0, 0, 3, 0], "both"),  # Positive, mentions a location
    ("John ordered pizza from Domino's yesterday.", 0, [1, 0, 0, 0, 2, 0], "both"),  # Neutral, mentions a person and organization
    ("Tesla launches a new model next month.", 0, [2, 0, 0, 0, 0, 0], "both"),  # Neutral, mentions an organization
    ("The movie was great!", 1, None, "classification"),  # Positive, only classification
    ("Julia is now working remotely.", 0, [1, 0, 0, 0], "ner"),  # Neutral, mentions a person
    ("Microsoft introduces Windows 12 in July.", 0, [2, 0, 0, 3, 0], "both"),  # Neutral, mentions an organization and a time
    ("Sadly, the series finale was disappointing.", -1, None, "classification"),  # Negative, only classification
    ("Daniel travels to Spain for a conference.", 0, [1, 0, 0, 3, 0, 0], "both"),  # Neutral, mentions a person and a location
    ("Larry Page resigns from Google.", -1, [1, 0, 0, 2], "both"),  # Negative, mentions people and an organization
    ("The seminar in Boston was extremely informative.", 1, None, "classification")  # Positive, only classification
]

We will use -2 as a placeholder for missing values to resolve issues encountered with None values in the DataLoader. Additionally, we will represent the negative class in sentiment classification with 2 instead of -1. After making these changes we have the following:

In [20]:
# Each entry is (sentence, sentiment_label, ner_labels, task_flag).
# Sentiment: 1 = positive, 0 = neutral, 2 = negative. NER: 0 = O, 1 = PER, 2 = ORG, 3 = LOC.
# -2 is the "no label" placeholder used where a task is not annotated for a sentence.
data_combined = [
    ("The weather is lovely in California today.", 1, [0, 0, 0, 0, 3, 0], "both"),  # Positive, mentions a location
    ("John ordered pizza from Domino's yesterday.", 0, [1, 0, 0, 0, 2, 0], "both"),  # Neutral, mentions a person and organization
    ("Tesla launches a new model next month.", 0, [2, 0, 0, 0, 0, 0], "both"),  # Neutral, mentions an organization
    ("The movie was great!", 1, [-2,-2,-2,-2], "classification"),  # Positive, only classification
    ("Julia is now working remotely.", 0, [1, 0, 0, 0], "ner"),  # Neutral, mentions a person
    ("Microsoft introduces Windows 12 in July.", 0, [2, 0, 0, 3, 0], "both"),  # Neutral, mentions an organization and a time
    ("Sadly, the series finale was disappointing.", 2, [-2,-2,-2,-2,-2,-2], "classification"),  # Negative, only classification
    ("Daniel travels to Spain for a conference.", 0, [1, 0, 0, 3, 0, 0], "both"),  # Neutral, mentions a person and a location
    ("Larry Page resigns from Google.", 2, [1, 0, 0, 2], "both"),  # Negative, mentions people and an organization
    ("The seminar in Boston was extremely informative.", 1, [-2,-2,-2,-2,-2,-2,-2], "classification")  # Positive, only classification
]

#### Handling Mixed Task Data in MultiTaskDataset Class

The MultitaskDataset class is designed for handling datasets that contain mixed tasks like sentence classification and Named Entity Recognition (NER), suitable for integration with PyTorch's DataLoader. Initialized with a dataset and a predefined maximum sequence length, this class prepares data for efficient batch processing critical for training neural networks.

In the constructor (`__init__`), it accepts data_combined, a list of tuples containing sentences, classification labels (labelA), NER labels (labelsB), and a task flag (task_available), together with max_length. The padding of NER labels to max_length happens lazily in `__getitem__`, which keeps sequence lengths uniform for models requiring fixed-size inputs.

The `__getitem__` method retrieves and processes each data point by ensuring:

- Classification labels are converted to tensors; if missing, a placeholder value of -2 is used.
- NER labels are padded with -2 to match max_length if present, or filled entirely with -2 if absent, catering to the fixed-length requirements of certain network architectures.

This structured approach ensures every model input batch is consistent, supporting the model's ability to learn effectively from varied training data, especially when some tasks may not be annotated across all dataset samples.

In [21]:
class MultitaskDataset(Dataset):
    """Dataset of (sentence, classification label, NER labels, task flag) tuples.

    Every item comes back with a one-element classification tensor and an NER
    tensor of exactly ``max_length`` entries, so items can be stacked into batches.
    """

    # Placeholder written wherever a label is missing or a position is padding.
    MISSING_LABEL = -2

    def __init__(self, data_combined,max_length):
        """Store the raw tuples and the fixed NER label length.

        Args:
            data_combined: Sequence of ``(sentence, labelA, labelsB, task_available)``.
            max_length: Number of NER label positions returned per item.
        """
        self.data_combined = data_combined
        self.max_len_ner = max_length

    def __len__(self):
        """Return the number of examples."""
        return len(self.data_combined)

    def __getitem__(self, idx):
        """Return ``(sentence, labelA, labelsB, task_available)`` for one example.

        ``labelA`` is a tensor of one element (``MISSING_LABEL`` when the raw label
        is ``None``). ``labelsB`` is a tensor of ``max_length`` entries: the raw
        labels clipped to that length and right-padded with ``MISSING_LABEL``
        (all ``MISSING_LABEL`` when the raw value is ``None``).
        """
        sentence, labelA, labelsB, task_available = self.data_combined[idx]

        if labelA is None:
            labelA = self.MISSING_LABEL
        labelA = torch.tensor([labelA])

        # Clip first: a label list longer than max_length would otherwise get a
        # negative pad count (i.e. no padding) and produce a ragged tensor.
        tags = list(labelsB or [])[: self.max_len_ner]
        tags += [self.MISSING_LABEL] * (self.max_len_ner - len(tags))
        labelsB = torch.tensor(tags)

        return sentence, labelA, labelsB, task_available


#### Handling Mixed-Task Batches in Training

The training implementation employs a batch data loading approach to improve computational efficiency. Given the multi-label nature of our dataset, where each batch may contain data for different tasks, the approach is adapted to handle batches that require specific processing per task label.

**Batch Handling**

In a standard setup where each task's data is distinct and separate, batching and processing can be straightforward. However, in our multi-task scenario where batches contain mixed labels (both classification and NER), we cannot process the entire batch in a uniform manner. Instead, we iterate over each item within a batch, processing it according to its specific task label. This approach ensures that the model is accurately updated based on the relevant task, preserving task integrity and promoting effective learning across different types of data.

**Data Loader and Batch Processing**

While a DataLoader is used to manage batch creation efficiently, each batch's mixed nature requires individual processing of data points. This method is particularly suited to our small dataset and helps mitigate the complexities introduced by mixed-task batches.  Consideration for a more streamlined batch processing method could be explored in future iterations.

#### Training Process and Loss Function Calculation

**Loss Function Calculation**

During training, we calculate the loss separately for each task within a batch and sum these to get the total loss for the batch:

- **Classification Task:** We calculate the cross-entropy loss for sentence classification.
- **NER Task:** Similarly, cross-entropy loss is computed for named entity recognition, with adjustments for sequence length and padding. This adjustment is necessary to handle the token-level details required by NER.

**Weighted Loss Considerations**

While the current implementation treats each task equally, introducing weighted loss could optimize training by prioritizing more critical or complex tasks. This adjustment would be particularly useful in scenarios where task importance or data representation varies significantly across tasks.

**Batch Processing and Loss Aggregation**

Each batch provided by the DataLoader may contain mixed tasks. We process these individually to respect their specific requirements:

For classification, the output logits are `[batch_size, num_labels]`.
For NER, the output shape is `[batch_size, seq_len, num_labels]`, requiring reshaping for effective loss computation.
The losses from both tasks are summed to form a total batch loss, which is then used for backpropagation. This approach ensures that both tasks influence the model's learning, maintaining a balanced focus across different types of data.

**Conclusion**

This method facilitates efficient training across various tasks by appropriately managing each task's specific requirements and leveraging shared model features. It aims to enhance the model's adaptability and performance in processing complex, diverse datasets.

In [1]:
def train_multitask(model, dataset, num_epochs=1, batch_size=1):
    """Train a two-task model on a dataset whose items mix task annotations.

    Each item of ``dataset`` must unpack to
    ``(sentence, labelA, labelsB, task_available)``. Items are grouped into
    batches by a shuffling ``DataLoader``, but every item is scored on its own
    according to its flag: ``"classification"`` / ``"both"`` add a sentence-level
    cross-entropy loss, ``"ner"`` / ``"both"`` add a token-level one. The
    per-item losses of a batch are summed and drive one Adam step (lr=1e-4).

    Args:
        model: Module called as ``model(sentence, task=..., pooling=...)`` that
            returns logits; its first parameter determines the device used for labels.
        dataset: Map-style dataset (anything ``DataLoader`` accepts).
        num_epochs: Number of passes over ``dataset``.
        batch_size: Number of items whose losses are summed per optimizer step.

    Returns:
        None. The model is updated in place and per-item losses are printed.
    """
    # Honour the caller's batch_size (it used to be hard-coded to 2 here).
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=True)
    optimizer = Adam(model.parameters(), lr=1e-4)
    device = next(model.parameters()).device  # Get the device model is currently on
    PAD_INDEX = -2  # Label value skipped by the loss; must not collide with a real class index

    model.train()  # nothing below leaves training mode, so once is enough
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}")
        for sentences, labelsA, labelsBs, task_available in data_loader:
            optimizer.zero_grad()

            # Transfer label tensors to the model's device
            labelsA = labelsA.to(device)
            labelsBs = labelsBs.to(device)

            # A batch can mix tasks, so every element is handled on its own.
            losses = []
            for i, task in enumerate(task_available):
                if task in ("classification", "both"):
                    logits_cls = model(sentences[i], task="classifier", pooling='mean')
                    loss_cls = F.cross_entropy(logits_cls, labelsA[i], ignore_index=PAD_INDEX)
                    losses.append(loss_cls)
                    print(f"Task A loss: {loss_cls.item():.4f} for sentence '{sentences[i]}'")

                if task in ("ner", "both"):
                    logits_ner = model(sentences[i], task="ner", pooling='no_pool')
                    num_labels = logits_ner.shape[-1]
                    # Flatten to (positions, num_labels) vs (positions,) for the per-token loss.
                    # NOTE(review): labels are matched to logits purely by position; this
                    # presumably does not account for special tokens or sub-word splits
                    # introduced by the model's tokenizer. Confirm the alignment.
                    loss_ner = F.cross_entropy(
                        logits_ner.reshape(-1, num_labels),
                        labelsBs[i].reshape(-1),
                        ignore_index=PAD_INDEX,
                    )
                    losses.append(loss_ner)
                    print(f"Task B loss: {loss_ner.item():.4f} for sentence '{sentences[i]}'")

            # No recognised task flag in this batch: nothing to optimise.
            if not losses:
                continue

            total_loss = torch.stack(losses).sum()
            total_loss.backward()
            optimizer.step()


### Initialize the Model

In [23]:
# Define the model parameters (output sizes must cover every label value used in data_combined)
model_name = 'bert-base-uncased'
num_task_A = 3  # Three sentiment classes (positive, neutral, negative)
num_task_B = 4  # Four NER tags (Person, Location, Organization, Other)

# Initialize the model; this rebinds `multi_task_model` from the earlier shape-check cell
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")  # Using GPU if available, otherwise CPU
multi_task_model = MultiTaskModel(model_name=model_name, num_task_A=num_task_A, num_task_B=num_task_B, device=device)

### Train the Model

We will not train the model on real-world data; instead, we will use the small sample dataset provided above to verify the model's functionality

In [28]:
# Generate dataset; NER labels are padded to the same max_length the model tokenizes to
dataset = MultitaskDataset(data_combined,max_length)  

# Define training parameters
num_epochs = 3
# Each example is scored individually inside train_multitask, so the batch size
# only controls how many per-example losses are summed into one optimizer step.
batch_size = 1 # we chose batch size to be one since it is already handling data one by one in the code (explained above)

# Train the model (prints one loss line per example and task)
train_multitask(multi_task_model, dataset, num_epochs=num_epochs, batch_size=batch_size)


Epoch 1
Task A loss: 0.0028 for sentence 'The seminar in Boston was extremely informative.'
Task A loss: 0.0133 for sentence 'Larry Page resigns from Google.'
Task B loss: 0.1035 for sentence 'Larry Page resigns from Google.'
Task A loss: 0.0133 for sentence 'Microsoft introduces Windows 12 in July.'
Task B loss: 0.1423 for sentence 'Microsoft introduces Windows 12 in July.'
Task A loss: 0.0362 for sentence 'Daniel travels to Spain for a conference.'
Task B loss: 0.0716 for sentence 'Daniel travels to Spain for a conference.'
Task A loss: 0.0303 for sentence 'The weather is lovely in California today.'
Task B loss: 0.1165 for sentence 'The weather is lovely in California today.'
Task A loss: 0.0025 for sentence 'Sadly, the series finale was disappointing.'
Task A loss: 0.0128 for sentence 'Tesla launches a new model next month.'
Task B loss: 0.0958 for sentence 'Tesla launches a new model next month.'
Task A loss: 0.0054 for sentence 'The movie was great!'
Task B loss: 0.0458 for sente