In [None]:

# Import necessary libraries
import torch
import torch.nn as nn
from transformers import BertModel, BertTokenizer
from torch.utils.data import DataLoader, Dataset, random_split

# Task 1: Make fixed length embeddings for input sentences



*  Mean pooling was implemented to obtain a fixed-length embedding for any input sentence. The BERT model is an encoder that produces an embedding vector for every token in a sentence. Instead of stopping at this output, where we have one vector per token, we add a mean pooling layer at the output that averages the token vectors (padding tokens are masked out via the attention mask), which results in a single fixed-length vector.
* The vector has a fixed size because the hidden size (embedding dimension) of `bert-base-uncased` is fixed at 768, regardless of how many tokens the sentence contains; when you average n vectors of size 768 you get one final vector of size 768, which is our fixed-length embedding vector.



In [None]:


# Pretrained checkpoint name shared by the tokenizer and the encoder; both are
# loaded by name through the Hugging Face `from_pretrained` API.
model_name = "bert-base-uncased"
tokenizer, model = (
    BertTokenizer.from_pretrained(model_name),
    BertModel.from_pretrained(model_name),
)

# Define a function to encode sentences and obtain fixed-length embeddings
def _masked_mean_pool(last_hidden_state, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    Args:
        last_hidden_state: Token embeddings, documented by the encoder as
            [batch_size, seq_length, hidden_size].
        attention_mask: [batch_size, seq_length] mask with 1 for real tokens
            and 0 for padding.

    Returns:
        Tensor of shape [batch_size, hidden_size]: one vector per sentence.
    """
    # Broadcast the mask over the hidden dimension so padded tokens contribute zero
    mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
    sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
    # Clamp the token count so a fully masked row cannot cause a division by zero
    token_counts = torch.clamp(mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / token_counts


def encode_sentences(sentences, tok=None, encoder=None):
    """Encode sentences into fixed-length embeddings via masked mean pooling.

    Args:
        sentences: A sentence or list of sentences accepted by the tokenizer.
        tok: Optional tokenizer callable. Defaults to the notebook-level
            `tokenizer`.
        encoder: Optional encoder callable returning an object with a
            `last_hidden_state` attribute. Defaults to the notebook-level
            `model`. Pass it explicitly when `model` has been rebound to
            something other than the plain BERT encoder.

    Returns:
        Tensor with one pooled embedding row per input sentence.
    """
    # Fall back to the notebook globals only when no override is supplied
    tok = tokenizer if tok is None else tok
    encoder = model if encoder is None else encoder

    # Tokenize into token IDs and attention masks, padded to the longest sentence
    inputs = tok(sentences, padding=True, truncation=True, return_tensors="pt")

    # Inference only: no autograd graph is needed
    with torch.no_grad():
        outputs = encoder(**inputs)

    return _masked_mean_pool(outputs.last_hidden_state, inputs["attention_mask"])



In [None]:
# Sentences of varying length used to exercise encode_sentences
sample_sentences = [
    "Transformers are powerful models for natural language processing.",
    "They are widely used for various tasks, such as translation and sentiment analysis.",
    "I love ML",
    "Dakota Favors Rocks!"
]
# Echo the inputs, one per line
print("\n".join(sample_sentences))

print()

# Pool each sentence into a single vector, then show the values and the shape
sentence_embeddings = encode_sentences(sample_sentences)
print(f"Sentence Embeddings:\n{sentence_embeddings}")
print(sentence_embeddings.shape)


Transformers are powerful models for natural language processing.
They are widely used for various tasks, such as translation and sentiment analysis.
I love ML
Dakota Favors Rocks!

Sentence Embeddings:
tensor([[ 0.1937, -0.1926,  0.1387,  ..., -0.4427, -0.5905,  0.1295],
        [ 0.0890, -0.1854, -0.0093,  ..., -0.5218, -0.2757,  0.3439],
        [ 0.2508,  0.3918,  0.2517,  ..., -0.2790,  0.0437, -0.0609],
        [ 0.1860, -0.2024,  0.0250,  ...,  0.2248,  0.0859,  0.0298]])
torch.Size([4, 768])


# Task 2: Expand on Task 1 and build a model with multi-task capabilities

To support/incorporate multi-task learning into our existing model, we simply add two neural network heads whose inputs are the output of our mean pooling layer.
We can modify these heads to optimize/improve performance. Adding them was the major change needed to give our existing model multi-task learning capabilities.

##### Model

In [None]:
class MultiTaskModel(nn.Module):
    """Shared BERT backbone with two task-specific classification heads.

    The backbone's token embeddings are mean-pooled (ignoring padding) into one
    sentence embedding, which is fed to both heads.

    Args:
        model_name: Checkpoint name passed to `BertModel.from_pretrained`.
        num_classes_task1: Output size of the sentence-classification head.
        num_classes_task2: Output size of the sentiment head.
    """

    def __init__(self, model_name="bert-base-uncased", num_classes_task1=4, num_classes_task2=3):
        super().__init__()

        # Shared transformer backbone
        self.bert = BertModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size

        # Task-specific heads (built in this order to keep initialisation order stable)
        self.classification_head = self._build_head(hidden_size, num_classes_task1)
        self.sentiment_head = self._build_head(hidden_size, num_classes_task2)

    @staticmethod
    def _build_head(in_features, num_classes, hidden_features=128):
        """Return a Linear -> ReLU -> Linear head producing `num_classes` logits."""
        return nn.Sequential(
            nn.Linear(in_features, hidden_features),
            nn.ReLU(),
            nn.Linear(hidden_features, num_classes),
        )

    @staticmethod
    def _mean_pool(last_hidden_state, attention_mask):
        """Average token embeddings over the sequence axis, skipping masked positions."""
        mask_expanded = attention_mask.unsqueeze(-1).expand(last_hidden_state.size()).float()
        sum_embeddings = torch.sum(last_hidden_state * mask_expanded, 1)
        # Clamp the token count so a fully masked row cannot cause a division by zero
        sum_mask = torch.clamp(mask_expanded.sum(1), min=1e-9)
        return sum_embeddings / sum_mask

    def forward(self, input_ids, attention_mask):
        """Run the shared backbone and both heads.

        Args:
            input_ids: Token IDs for a batch of sentences.
            attention_mask: Mask with 1 for real tokens and 0 for padding.

        Returns:
            Tuple `(classification_logits, sentiment_logits)` of raw,
            un-normalised scores. No softmax is applied here, because
            `nn.CrossEntropyLoss` expects logits; apply `torch.softmax(..., dim=1)`
            to the outputs when probabilities are needed.
        """
        # Pass through the shared transformer model
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Mean pooling across token embedding vectors gives one vector per sentence
        sentence_embedding = self._mean_pool(outputs.last_hidden_state, attention_mask)

        # Pass the pooled embedding through each task-specific head
        classification_logits = self.classification_head(sentence_embedding)
        sentiment_logits = self.sentiment_head(sentence_embedding)

        return classification_logits, sentiment_logits


Let's look at the model's outputs on some sample sentences prior to training

In [None]:

# Build the tokenizer and a multi-task model on the pretrained backbone
model_name = "bert-base-uncased"
tokenizer = BertTokenizer.from_pretrained(model_name)
model = MultiTaskModel(model_name=model_name, num_classes_task1=4, num_classes_task2=3)

# Example input sentences
sentences = [
    "The underdog team pulled off an incredible victory in the championship game!",
    "The new policy has received widespread praise for addressing environmental issues.",
    "The latest Marvel movie was a thrilling experience with breathtaking visuals.",
    "The update introduced several bugs, making the app nearly unusable.",
]
print("\n".join(sentences))
print()

# Tokenize every sentence, padded/truncated to a fixed 128 tokens
inputs = tokenizer(
    sentences,
    max_length=128,
    padding="max_length",
    truncation=True,
    return_tensors="pt",
)

# Forward pass through the backbone and both heads
classification_logits, sentiment_logits = model(
    input_ids=inputs["input_ids"],
    attention_mask=inputs["attention_mask"],
)

# Show the raw logits and the arg-max class for each task
print(f"Classification Logits:\n{classification_logits}")
print()
print(f"Sentiment Logits:\n{sentiment_logits}")
print()
print(f"Predictions: {torch.argmax(classification_logits, dim=1)}")
print(f"Predictions: {torch.argmax(sentiment_logits, dim=1)}")

The underdog team pulled off an incredible victory in the championship game!
The new policy has received widespread praise for addressing environmental issues.
The latest Marvel movie was a thrilling experience with breathtaking visuals.
The update introduced several bugs, making the app nearly unusable.

Classification Logits:
tensor([[ 0.0114,  0.0303,  0.0204,  0.0879],
        [ 0.0924, -0.0323,  0.0219,  0.1392],
        [ 0.0115,  0.0479,  0.0034,  0.0806],
        [ 0.1248, -0.1183,  0.0250,  0.1327]], grad_fn=<AddmmBackward0>)

Sentiment Logits:
tensor([[0.3027, 0.1047, 0.0845],
        [0.1519, 0.1013, 0.0644],
        [0.2066, 0.1410, 0.1427],
        [0.1156, 0.1983, 0.1260]], grad_fn=<AddmmBackward0>)

Predictions: tensor([3, 3, 3, 3])
Predictions: tensor([0, 0, 0, 1])


###### Dataset

Let's:

* create a sample dataset
* perform six epochs of training on the data
* obtain loss

**n/b**:

Task 1: Classification [sentence categories:- Entertainment(0), Sports(1), Politics(2), Technology(3)]

Task 2: Sentiment [sentiment categories:- negative(0), neutral(1), positive(2)]

In [None]:
class MultiTaskDataset(Dataset):
    """Dataset pairing each sentence with one label per task.

    Args:
        sentences: Sequence of input sentences.
        labels_task1: Sequence of Task 1 labels, aligned with `sentences`.
        labels_task2: Sequence of Task 2 labels, aligned with `sentences`.
        tokenizer: Callable used to tokenize a single sentence; it is called
            with `max_length`, `padding`, `truncation` and `return_tensors`.
        max_length: Length every sentence is padded/truncated to.

    Raises:
        ValueError: If the three sequences do not have the same length.
    """

    def __init__(self, sentences, labels_task1, labels_task2, tokenizer, max_length=128):
        # Fail fast on misaligned inputs instead of raising IndexError mid-training
        # or silently ignoring surplus labels.
        if not (len(sentences) == len(labels_task1) == len(labels_task2)):
            raise ValueError(
                "sentences, labels_task1 and labels_task2 must have the same length "
                f"(got {len(sentences)}, {len(labels_task1)}, {len(labels_task2)})"
            )
        self.sentences = sentences
        self.labels_task1 = labels_task1
        self.labels_task2 = labels_task2
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of sentences."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return the tokenized sentence at `idx` together with both labels.

        Returns:
            Dict with keys "input_ids", "attention_mask", "label_task1" and
            "label_task2".
        """
        # Tokenize the sentence to a fixed length so default batching can stack items
        encoding = self.tokenizer(
            self.sentences[idx],
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            # return_tensors="pt" adds a leading batch axis of size 1; drop it
            "input_ids": encoding["input_ids"].squeeze(0),
            "attention_mask": encoding["attention_mask"].squeeze(0),
            "label_task1": torch.tensor(self.labels_task1[idx]),
            "label_task2": torch.tensor(self.labels_task2[idx]),
        }

In [None]:
# Initialize the tokenizer and a small example dataset
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
sentences = [
    "The movie was fantastic!",
    "The game was boring.",
    "Who is the president of the United States?",
    "This movie is amazing!",
    "What are the latest news in tech?",
    "Who won the football game?",
    "This new tv series 'the penguin' is very good, it has a high IMDB rating"
]
labels_task1 = [0, 1, 2, 0, 3, 1, 0]  # Task 1: Classification (sentence categories: 0 = Entertainment, 1 = Sports, 2 = Politics, 3 = Technology)
labels_task2 = [2, 0, 1, 2, 0, 1, 2]  # Task 2: Sentiment (sentiment categories: 0 = negative, 1 = neutral, 2 = positive)



dataset = MultiTaskDataset(sentences, labels_task1, labels_task2, tokenizer)

# Split the dataset
dataset_size = len(dataset)
train_size = int(0.7 * dataset_size)  # 70% for training (rounded down)
val_size = dataset_size - train_size  # remainder (~30%) for validation
gen = torch.Generator().manual_seed(1442) # for reproducibility
train_dataset, val_dataset = random_split(dataset, [train_size, val_size], generator=gen)

# Create DataLoaders. The seeded generator is also handed to the training loader
# so the shuffled batch order, like the split, is repeatable from a fresh run.
train_dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True, generator=gen)
val_dataloader = DataLoader(val_dataset, batch_size=2, shuffle=False)

In [None]:
# Sanity check: print the shape of every tensor in each training batch ...
for batch in train_dataloader:
    print(
        *(batch[field].shape for field in ("input_ids", "attention_mask", "label_task1", "label_task2")),
        sep="\n",
    )

print()

# ... and in each validation batch
for batch in val_dataloader:
    print(
        *(batch[field].shape for field in ("input_ids", "attention_mask", "label_task1", "label_task2")),
        sep="\n",
    )

torch.Size([2, 128])
torch.Size([2, 128])
torch.Size([2])
torch.Size([2])
torch.Size([2, 128])
torch.Size([2, 128])
torch.Size([2])
torch.Size([2])

torch.Size([2, 128])
torch.Size([2, 128])
torch.Size([2])
torch.Size([2])
torch.Size([1, 128])
torch.Size([1, 128])
torch.Size([1])
torch.Size([1])


##### Training & Eval

In [None]:
# Fresh multi-task model: 4 sentence categories (Task 1) and 3 sentiment classes (Task 2)
model = MultiTaskModel(num_classes_task1=4, num_classes_task2=3)

# One optimizer over every parameter (backbone and both heads) at a single learning rate
optimizer = torch.optim.Adam(params=model.parameters(), lr=1e-5)

# Each task gets its own cross-entropy criterion, applied to that head's logits
criterion_task1, criterion_task2 = nn.CrossEntropyLoss(), nn.CrossEntropyLoss()

In [None]:
# Training and Eval loop

num_epochs = 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    train_total_loss = 0
    val_total_loss = 0
    correct_task1 = 0 # for accuracy calculation
    correct_task2 = 0 # for accuracy calculation
    total_samples = 0 # for accuracy calculation

    # train
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named batch_* so the notebook-level `labels_task1` / `labels_task2`
        # lists used to build the dataset are not overwritten by batch tensors.
        batch_labels_task1 = batch["label_task1"].to(device)
        batch_labels_task2 = batch["label_task2"].to(device)

        # Forward pass
        classification_logits, sentiment_logits = model(input_ids, attention_mask)

        # Compute losses for each task
        loss_task1 = criterion_task1(classification_logits, batch_labels_task1)
        loss_task2 = criterion_task2(sentiment_logits, batch_labels_task2)

        # Combine the losses (equal weighting of both tasks)
        combined_loss = loss_task1 + loss_task2

        # Backward pass and optimization
        optimizer.zero_grad()
        combined_loss.backward()
        optimizer.step()

        train_total_loss += combined_loss.item()

    # Mean of the per-batch losses
    train_avg_loss = train_total_loss / len(train_dataloader)

    # eval
    model.eval()
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            batch_labels_task1 = batch["label_task1"].to(device)
            batch_labels_task2 = batch["label_task2"].to(device)

            # Forward pass
            classification_logits, sentiment_logits = model(input_ids, attention_mask)

            # Calculate losses
            loss_task1 = criterion_task1(classification_logits, batch_labels_task1)
            loss_task2 = criterion_task2(sentiment_logits, batch_labels_task2)
            combined_loss = loss_task1 + loss_task2
            val_total_loss += combined_loss.item()

            # Calculate predictions
            predictions_task1 = torch.argmax(classification_logits, dim=1)
            predictions_task2 = torch.argmax(sentiment_logits, dim=1)

            # Update correct counts for accuracy calculation
            correct_task1 += (predictions_task1 == batch_labels_task1).sum().item()
            correct_task2 += (predictions_task2 == batch_labels_task2).sum().item()
            total_samples += batch_labels_task1.size(0)

    # Calculate average loss and accuracy
    val_avg_loss = val_total_loss / len(val_dataloader)
    accuracy_task1 = correct_task1 / total_samples
    accuracy_task2 = correct_task2 / total_samples

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_avg_loss:.4f}, Val Loss: {val_avg_loss:.4f}, Accuracy Task 1: {accuracy_task1:.4f}, Accuracy Task 2: {accuracy_task2:.4f}")

Epoch 1/6, Train Loss: 2.5255, Val Loss: 2.4704, Accuracy Task 1: 0.3333, Accuracy Task 2: 0.3333
Epoch 2/6, Train Loss: 2.4225, Val Loss: 2.4467, Accuracy Task 1: 0.3333, Accuracy Task 2: 0.3333
Epoch 3/6, Train Loss: 2.3524, Val Loss: 2.4260, Accuracy Task 1: 0.3333, Accuracy Task 2: 0.3333
Epoch 4/6, Train Loss: 2.2099, Val Loss: 2.4020, Accuracy Task 1: 0.3333, Accuracy Task 2: 0.3333
Epoch 5/6, Train Loss: 2.1313, Val Loss: 2.3800, Accuracy Task 1: 0.3333, Accuracy Task 2: 0.6667
Epoch 6/6, Train Loss: 2.0498, Val Loss: 2.3561, Accuracy Task 1: 0.3333, Accuracy Task 2: 0.6667


Let's see the model results on the same test sentences from last time

In [None]:

# Initialize the tokenizer and the example input
tokenizer = BertTokenizer.from_pretrained(model_name)

# Example input sentences
sentences = ["The underdog team pulled off an incredible victory in the championship game!",
            "The new policy has received widespread praise for addressing environmental issues.",
            "The latest Marvel movie was a thrilling experience with breathtaking visuals.",
            "The update introduced several bugs, making the app nearly unusable."]
print("\n".join(sentences))
print()

# Tokenize input
inputs = tokenizer(sentences,
            max_length=128,
            padding="max_length",
            truncation=True,
            return_tensors="pt")

# The training cell moves the model to `device` (the GPU when available), while
# the tokenizer returns CPU tensors. Send the inputs to wherever the model's
# parameters live so the forward pass does not fail with a device mismatch.
model_device = next(model.parameters()).device
input_ids = inputs["input_ids"].to(model_device)
attention_mask = inputs["attention_mask"].to(model_device)

# Forward pass (inference only, so no autograd graph is built)
model.eval()
with torch.no_grad():
    classification_logits, sentiment_logits = model(input_ids=input_ids,
                                                    attention_mask=attention_mask)

# Bring the results back to the CPU for display
classification_logits = classification_logits.cpu()
sentiment_logits = sentiment_logits.cpu()

# Output
print(f"Classification Logits:\n{classification_logits}")
print()
print(f"Sentiment Logits:\n{sentiment_logits}")
print()
print(f"Sentence Classification Predictions: {torch.argmax(classification_logits, dim=1)}")
print(f"Sentiment Predictions: {torch.argmax(sentiment_logits, dim=1)}")


The underdog team pulled off an incredible victory in the championship game!
The new policy has received widespread praise for addressing environmental issues.
The latest Marvel movie was a thrilling experience with breathtaking visuals.
The update introduced several bugs, making the app nearly unusable.

Classification Logits:
tensor([[ 0.2231,  0.1663, -0.1411, -0.1210],
        [ 0.1068,  0.0857, -0.1120, -0.0523],
        [ 0.2597,  0.0160, -0.1406, -0.0841],
        [ 0.1937, -0.0676, -0.1111, -0.0796]], grad_fn=<AddmmBackward0>)

Sentiment Logits:
tensor([[-0.0496,  0.0575, -0.0610],
        [ 0.0081, -0.1173, -0.0834],
        [-0.0371, -0.0248,  0.0296],
        [ 0.0675,  0.0008, -0.1106]], grad_fn=<AddmmBackward0>)

Sentence Classification Predictions: tensor([0, 0, 0, 0])
Sentiment Predictions: tensor([1, 0, 2, 0])


# Task 3: Training Considerations

### Freezing Scenarios

1.Freezing the entire network parameters.

**Implications**: This means the model's parameters would not be updated during training; in other words, there would be no training of the network and the model would rely entirely on the pre-trained weights.

**Advantages**: This can be advantageous when we have a very small dataset and want to avoid overfitting. Since we are not performing any training and only doing inference using the pre-trained model's weights, it's more computationally efficient and less costly too, since there are no gradient calculations for backpropagation.

**How it should be trained:** Here training doesn't happen cause all the parameters are frozen.

2.Freezing only the transformer backbone

**Implications:** If just the transformer backbone is frozen and we train just the heads, we can see the transformer backbone as an encoder or feature extractor that helps yield the fixed-length embedding vectors to be used by our several task heads.

**Advantages:** Training is more efficient and less costly computationally since we are updating only the task head parameters during backpropagation. It focuses the learning on the task heads because we are only updating those parameters. It could eliminate a vanishing gradient problem, if one exists. It also eliminates the risk of degrading the pre-trained model's general knowledge, which fine-tuning can cause.

**How it should be trained:** Since we are optimizing only the task-specific heads, I'll consider the following ideas when setting the hyperparameters:
* I'll use a relatively higher learning rate since only small task-specific heads are being updated e.g. 1e-3 or 1e-2
* I'll use larger batch sizes since freezing the transformer backbone frees up some memory for us
* I'll use Adam or AdamW as choice of optimizer since we are making small scale parameter updates
* I'll make sure to regularize properly since the small task-specific heads are prone to overfitting due to the backbone embeddings.
* Won't use that many epochs, will use 5-19 epochs because the backbone already provides robust embeddings for the sentences.

In terms of hyperparameter optimization, after defining my hyperparameter search space, I'll use either random search or grid search for the search, and ClearML for easy hyperparameter optimization and experimentation.






3.Freezing either of the task heads

**Implications:** This allows one task head to leverage learned representations from the pre-trained model. The frozen head retains its learned parameters while other parts of the network are updated. But this means the frozen head should already be trained in order to be useful during inference.

**Advantages:** Can be great when we want to retain the performance/previously learned behavior of the frozen task head and want the rest of the network to adapt to the new task. In terms of computational cost, it can be relatively less costly as well. This technique can also be used to prevent overfitting for the frozen task.

**How it should be trained:**
In setting the hyperparameters I'll consider the following:

*  I'll set a low learning rate for the backbone and the trainable head e.g. 1e-5 to 4e-4
*  Set the weight decay within 1e-4 to 1e-2 to ensure regularization of the backbone and trainable head.
*  Set a linear decay scheduler with warmup for the backbone
* Use smaller batch sizes since we are computing gradients for the backbone.

During the trials, I'll gradually unfreeze the trainable head for a few epochs, increase the learning rate in the initial steps to avoid destabilizing the backbone and use optuna or hyperband for hyperparameter search since tuning both the backbone and a task head can get complex


### Transfer Learning

**Choice of Pre-trained Model** I'll use either Sentence-BERT, BERT, or RoBERTa. Which variant I'll use depends on the task at hand and the demands of the task. If I need just general text understanding, BERT would suffice; if the texts involve specialized domains like science, sports, research, or medicine, I'll use SciBERT or BioBERT, or look for a BERT variant that has been optimized to encode sentences within that domain; and finally, if the task requires high-quality sentence embeddings, I'll use Sentence-BERT. In summary, the pre-trained model I'll consider using as the backbone would be an encoder, because this is what encoders are generally great at: creating token embeddings that capture context and semantics.

**Which layers I'll freeze:** I'll start by freezing the entire transformer backbone. The rationale behind this is because the backbone already creates embeddings that generalizes well to tasks like sentence classification, etc. This would reduce overfitting on the dataset that would be used to implement the multi-task heads. After some epochs of training the other task heads on the dataset, I'll then gradually unfreeze the later layers of the transformer backbone and fine-tune them as these layers can be useful in capturing the task-specific features

# Task4: Layer-wise Learning Rate

The 4 layers to implement layerwise learning rate are:
* The early layers (closer to input) of the transformer backbone,
* the later layers of the backbone,
* the task head layers and
* the embeddings layer at the input of the backbone (`model.bert.embeddings`)

For the **early layers of the backbone** (those closest to the input), *very low learning rates* are used to keep the gradient updates small and to preserve what the backbone already does well, as those layers perform well in general language understanding.

For the **later layers of the backbone**, we can *increase the learning rate* used for the earlier layers a little more to encourage faster learning as these layers are closer to the task heads and can update their weights based on those tasks.

For the **task head layers**, *high learning rates to encourage faster learning* because they need to adapt quickly to the tasks they are trained on

A *low learning rate* is used in the **embedding layer** because its shared across both tasks and requires little to no changes during training

**Potential Benefits**:

Layer-wise learning rates can help with stability in training. This simply means we control the rate of learning across the network, removing the need to freeze some layers. This can be crucial for preserving the already good weights in the early layers of the backbone, which carry general language understanding and do not need to be heavily fine-tuned on any new task. We can control the rate of learning across the layers and increase the learning rates for the later layers, as these layers need to adapt to the task.

Layer-wise learning rates can also help with computational costs and efficiency. It can also introduce some regularization because, since we limit the rate of change of the weights in some of these layers, we can reduce the risk of overfitting.

Layer-wise learning rates can also help preserve domain knowledge coming from the transformer backbone, which can be super useful during fine-tuning. The model will balance preserving those weights and adapting to domain-specific features.

In [None]:
# Define different learning rates for each layer
def implement_layerwise_lr(model, base_lr=1e-5, layer_decay=0.9, head_lr=1e-4):
    """Build optimizer parameter groups with exponentially decayed layer-wise learning rates.

    This only constructs the groups; pass the result to an optimizer to apply them.

    Args:
        model: Object exposing `bert.encoder.layer`, `bert.embeddings`,
            `classification_head` and `sentiment_head`.
        base_lr: Learning rate of the last (closest to the heads) encoder layer.
        layer_decay: Multiplicative decay applied once per layer moving towards
            the input, so earlier layers get smaller learning rates.
        head_lr: Learning rate shared by both task heads.

    Returns:
        List of `{"params": [...], "lr": float}` dicts ordered as: encoder
        layers (first to last), embeddings, classification head, sentiment head.
        Parameters outside these modules are not included in any group.
    """
    bert_layers = list(model.bert.encoder.layer)  # All encoder layers, input side first
    num_layers = len(bert_layers)

    # `parameters()` returns a one-shot generator; materialise it as a list so the
    # groups can be inspected or reused without ending up empty.
    optimizer_grouped_parameters = [
        {
            "params": list(layer.parameters()),
            # The last layer gets base_lr; each earlier layer is decayed once more
            "lr": base_lr * (layer_decay ** (num_layers - i - 1)),
        }
        for i, layer in enumerate(bert_layers)
    ]

    # The embeddings sit below the first encoder layer, so they get the smallest rate
    optimizer_grouped_parameters.append({
        "params": list(model.bert.embeddings.parameters()),
        "lr": base_lr * (layer_decay ** num_layers),
    })

    # Task-specific heads share the head learning rate
    optimizer_grouped_parameters.append({"params": list(model.classification_head.parameters()), "lr": head_lr})
    optimizer_grouped_parameters.append({"params": list(model.sentiment_head.parameters()), "lr": head_lr})

    return optimizer_grouped_parameters

# Fresh model whose optimizer takes its per-group learning rates from implement_layerwise_lr
model = MultiTaskModel(num_classes_task1=4, num_classes_task2=3)
optimizer = torch.optim.AdamW(params=implement_layerwise_lr(model))

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
# One cross-entropy criterion per task, each applied to that head's logits
criterion_task1 = nn.CrossEntropyLoss()
criterion_task2 = nn.CrossEntropyLoss()


num_epochs = 6
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(num_epochs):
    model.train()
    train_total_loss = 0
    val_total_loss = 0
    correct_task1 = 0 # for accuracy calculation
    correct_task2 = 0 # for accuracy calculation
    total_samples = 0 # for accuracy calculation

    # train
    for batch in train_dataloader:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        # Named batch_* so the notebook-level `labels_task1` / `labels_task2`
        # lists used to build the dataset are not overwritten by batch tensors.
        batch_labels_task1 = batch["label_task1"].to(device)
        batch_labels_task2 = batch["label_task2"].to(device)

        # Forward pass
        classification_logits, sentiment_logits = model(input_ids, attention_mask)

        # Compute losses for each task
        loss_task1 = criterion_task1(classification_logits, batch_labels_task1)
        loss_task2 = criterion_task2(sentiment_logits, batch_labels_task2)

        # Combine the losses (equal weighting of both tasks)
        combined_loss = loss_task1 + loss_task2

        # Backward pass and optimization
        optimizer.zero_grad()
        combined_loss.backward()
        optimizer.step()

        train_total_loss += combined_loss.item()

    # Mean of the per-batch losses
    train_avg_loss = train_total_loss / len(train_dataloader)

    # eval
    model.eval()
    with torch.no_grad():
        for batch in val_dataloader:
            input_ids = batch["input_ids"].to(device)
            attention_mask = batch["attention_mask"].to(device)
            batch_labels_task1 = batch["label_task1"].to(device)
            batch_labels_task2 = batch["label_task2"].to(device)

            # Forward pass
            classification_logits, sentiment_logits = model(input_ids, attention_mask)

            # Calculate losses
            loss_task1 = criterion_task1(classification_logits, batch_labels_task1)
            loss_task2 = criterion_task2(sentiment_logits, batch_labels_task2)
            combined_loss = loss_task1 + loss_task2
            val_total_loss += combined_loss.item()

            # Calculate predictions
            predictions_task1 = torch.argmax(classification_logits, dim=1)
            predictions_task2 = torch.argmax(sentiment_logits, dim=1)

            # Update correct counts for accuracy calculation
            correct_task1 += (predictions_task1 == batch_labels_task1).sum().item()
            correct_task2 += (predictions_task2 == batch_labels_task2).sum().item()
            total_samples += batch_labels_task1.size(0)

    # Calculate average loss and accuracy
    val_avg_loss = val_total_loss / len(val_dataloader)
    accuracy_task1 = correct_task1 / total_samples
    accuracy_task2 = correct_task2 / total_samples

    print(f"Epoch {epoch + 1}/{num_epochs}, Train Loss: {train_avg_loss:.4f}, Val Loss: {val_avg_loss:.4f}, Accuracy Task 1: {accuracy_task1:.4f}, Accuracy Task 2: {accuracy_task2:.4f}")

Epoch 1/6, Train Loss: 2.5181, Val Loss: 2.5300, Accuracy Task 1: 0.3333, Accuracy Task 2: 0.3333
Epoch 2/6, Train Loss: 2.3565, Val Loss: 2.5379, Accuracy Task 1: 0.0000, Accuracy Task 2: 0.3333
Epoch 3/6, Train Loss: 2.2526, Val Loss: 2.5561, Accuracy Task 1: 0.0000, Accuracy Task 2: 0.3333
Epoch 4/6, Train Loss: 2.1142, Val Loss: 2.5756, Accuracy Task 1: 0.0000, Accuracy Task 2: 0.3333
Epoch 5/6, Train Loss: 2.0544, Val Loss: 2.5957, Accuracy Task 1: 0.0000, Accuracy Task 2: 0.3333
Epoch 6/6, Train Loss: 1.9560, Val Loss: 2.6176, Accuracy Task 1: 0.0000, Accuracy Task 2: 0.3333


For this solution, I purposely used a small sample dataset and a light weight model just to demo the implementation. I was more focused on building out the model and answering the questions than building something big.