## Aditi Nabar Take Home Assessment

In [1]:
import ast
import pandas as pd

import torch
import torch.nn as nn
from transformers import AutoModel, AutoConfig, AutoTokenizer

# Placeholder so the name exists up front; it is rebound to the toy dataset when the CSV is read further down.
annotated_df = None

In [2]:
# Pre-trained checkpoint name; reused below for the tokenizer and for both model classes.
model_name = "bert-base-uncased"

### Step 1: Implement a Sentence Transformer Model

#### Transformer Backbone

In [3]:
class BERTTransformer(nn.Module):
    """Pre-trained encoder wrapper that yields one embedding per input sequence.

    The embedding is the last-layer hidden state at token position 0, which is
    the [CLS] token for BERT-style inputs.
    """

    def __init__(self, model_name):
        """Load the pre-trained encoder identified by ``model_name``."""
        super().__init__()
        self.bert = AutoModel.from_pretrained(model_name)

    def forward(self, input_ids, attention_mask, **kwargs):
        """Return the position-0 hidden state for every sequence in the batch.

        Extra keyword arguments are accepted and ignored, so this model can be
        driven by the same helper that passes a ``task`` argument to other models.
        """
        # Encoder only (no decoder): take the final layer's hidden states.
        encoder_output = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Index 0 on the token axis picks the leading token's vector for each sequence.
        return encoder_output.last_hidden_state[:, 0, :]


Above we have a basic model class that loads a pre-trained model, and can generate a sentence embedding using the .forward() method. Below are some helper functions you can use to pass in a sentence and return an embedding. These helper functions can also be used with the class created below.

#### Helper functions

In [4]:
# Toy dataset that supplies example sentences for the helpers below.
# The CSV's "annotations" column is exposed under the name "ner_annotations".
annotated_df = pd.read_csv("annotated_w_quotes.csv")
annotated_df = annotated_df.rename(columns={"annotations": "ner_annotations"})

# Text columns that are joined to form each input sentence.
included_cols = ["headline", "short_description"]

# Names of the label columns for the classification and NER tasks.
classification_label_col = "category"
ner_label_col = "ner_annotations"

def get_input_sentence(row, included_columns=None):
    """Build the text to embed from a dataframe row, or pass a plain sentence through.

    Args:
        row: Either a ``pd.Series`` (one dataframe row) or an already-built
            sentence, which is returned unchanged.
        included_columns: Column labels whose values are joined, in order, with
            single spaces. Required when ``row`` is a ``pd.Series``.

    Returns:
        The joined text for a ``pd.Series``; otherwise ``row`` itself.

    Raises:
        ValueError: If ``row`` is a ``pd.Series`` and ``included_columns`` is None.
    """
    if not isinstance(row, pd.Series):
        return row

    if included_columns is None:
        raise ValueError(
            "included_columns must be provided when row is a pandas Series"
        )

    parts = []
    for col in included_columns:
        value = row[col]
        # Missing cells (NaN/None) are skipped: str.join would raise on a float NaN,
        # and the literal text "nan" should not leak into the sentence.
        if pd.api.types.is_scalar(value) and pd.isna(value):
            continue
        # str() lets numeric or other non-string cells be joined as well.
        parts.append(str(value))
    return ' '.join(parts)

def generate_sentence_embeddings(model, input_text, included_columns=None, task=None):
    """Tokenize one input and run it through ``model``.

    Relies on the module-level ``tokenizer`` having been created before this
    function is called.

    Args:
        model: Module called with ``input_ids`` positionally plus
            ``attention_mask`` and ``task`` keyword arguments.
        input_text: A ``pd.Series`` row or a plain sentence; see
            ``get_input_sentence``.
        included_columns: Columns to join when ``input_text`` is a row.
        task: Forwarded to the model unchanged (e.g. "classification" or "ner"
            for the multi-task model; ignored by the plain embedding model).

    Returns:
        Whatever ``model`` returns for the encoded input.
    """
    # build input sentence
    sentence = get_input_sentence(input_text, included_columns)

    # truncation=True cuts over-long text to the tokenizer's maximum length
    # instead of handing the encoder more positions than it supports.
    encoded_input = tokenizer(sentence, padding=True, truncation=True, return_tensors="pt")

    # Call the module itself rather than .forward() so that any registered
    # forward hooks run, as PyTorch recommends.
    return model(
        encoded_input["input_ids"],
        attention_mask=encoded_input["attention_mask"],
        task=task,
    )


In [5]:
# Build the tokenizer and the plain sentence-embedding model from the same checkpoint name.
tokenizer = AutoTokenizer.from_pretrained(model_name)
basicBERTmodel = BERTTransformer(model_name=model_name)


In [6]:
# Smoke test: embed the first five dataset rows and print each result's shape.
for i in range(5):
    record = annotated_df.iloc[i]
    # included_cols selects which text columns are joined into the input sentence.
    embedding = generate_sentence_embeddings(basicBERTmodel, record, included_columns=included_cols)
    print(embedding.shape)
    # print(embedding[0][:50])

torch.Size([1, 768])
torch.Size([1, 768])
torch.Size([1, 768])
torch.Size([1, 768])
torch.Size([1, 768])


In [7]:
# If you are passing in a text string, you don't need to pass a second argument.
embedding = generate_sentence_embeddings(basicBERTmodel, "INSERT SENTENCE HERE")
print(embedding.shape)
# [0] selects the single sequence in the batch; [:50] shows the first 50 values of its embedding.
print(embedding[0][:50])

torch.Size([1, 768])
tensor([ 3.2509e-02,  1.1863e-01, -7.8482e-02, -3.7044e-02, -7.4845e-02,
        -1.2509e-01,  8.3830e-02,  4.3325e-01,  6.2973e-02, -6.9573e-02,
         2.0910e-02,  3.4076e-02, -7.5872e-02,  1.8824e-02,  1.4187e-01,
         8.7889e-02, -1.8660e-01,  3.9010e-01,  3.9392e-02, -2.0885e-01,
        -2.0547e-02, -1.0078e-01, -7.3176e-02, -1.0138e-01,  1.9656e-01,
        -1.1594e-01, -3.9264e-02, -7.7164e-02, -1.5536e-01,  2.4024e-01,
         2.0571e-01,  2.7743e-04, -1.4508e-01,  1.3075e-01,  1.3907e-02,
        -8.1446e-04,  1.4078e-02, -3.3752e-02,  7.1549e-02, -2.4380e-03,
        -5.2628e-02, -7.7874e-02,  3.4912e-01, -1.1846e-01, -6.4522e-03,
        -1.5248e-01, -1.6053e+00,  1.4721e-02, -2.4472e-01, -2.0926e-01],
       grad_fn=<SliceBackward0>)


### Step 2: Multi-Task Learning Expansion

In [11]:
# def get_label_list(row):
#     return ast.literal_eval(row["ner_annotations"].replace("‘", "'").replace("’", "'"))

# Output widths passed to the two task heads when the multi-task model is built below.
# NOTE(review): both counts are hard-coded; confirm they match the label sets in the dataset.
num_ner_labels = 5
num_classification_labels = 10

In [12]:
class MultiTaskBERT(nn.Module):
    """Shared pre-trained encoder with two task heads.

    One linear head classifies a mean-pooled sentence embedding; the other
    linear head scores every token for NER.
    """

    # Task names accepted by ``forward``.
    _TASKS = ("classification", "ner")

    def __init__(self, model_name, num_category_classes, num_ner_entities):
        """Load the encoder and create both heads.

        Args:
            model_name: Pre-trained checkpoint name passed to ``AutoModel``.
            num_category_classes: Output width of the sentence-classification head.
            num_ner_entities: Output width of the per-token NER head.
        """
        super().__init__()

        self.bert = AutoModel.from_pretrained(model_name)
        hidden_size = self.bert.config.hidden_size
        self.sentence_classifier_head = nn.Linear(hidden_size, num_category_classes)
        self.ner_head = nn.Linear(hidden_size, num_ner_entities)

    def forward(self, input_ids, attention_mask, task):
        """Run the shared encoder and the head selected by ``task``.

        Args:
            input_ids: Token ids for the batch.
            attention_mask: Mask marking real tokens (non-zero) vs. padding (zero).
            task: "classification" or "ner".

        Returns:
            For "classification": one row of class scores per sequence.
            For "ner": one row of entity scores per token of each sequence.

        Raises:
            ValueError: If ``task`` is not one of the supported task names.
        """
        # Fail fast with a clear message; otherwise an unknown task would only
        # surface as an unbound-variable error after the encoder has already run.
        if task not in self._TASKS:
            raise ValueError(
                f"Unknown task {task!r}; expected one of {self._TASKS}"
            )

        # model is initialized without decoder; take the encoder's last hidden state
        output = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        if task == "classification":
            # Mean-pool the token embeddings rather than using the CLS token /
            # output.pooler_output as the sentence embedding.
            pooled_sentence_embedding = self._mean_pooling(output.last_hidden_state, attention_mask)
            return self.sentence_classifier_head(pooled_sentence_embedding)

        # task == "ner": score every token position independently.
        return self.ner_head(output.last_hidden_state)

    def _mean_pooling(self, hidden_states, attention_mask):
        """Average the token embeddings of each sequence, ignoring masked-out tokens.

        Args:
            hidden_states: Token embeddings indexed as (sequence, token, feature).
            attention_mask: Mask indexed as (sequence, token); zero entries are excluded.

        Returns:
            One averaged embedding per sequence. A sequence whose mask is entirely
            zero yields a zero vector instead of NaN.
        """
        # expand the mask to the dimensions of the hidden states, in the same dtype
        expanded_mask = (
            attention_mask.unsqueeze(-1).expand(hidden_states.size()).to(hidden_states.dtype)
        )

        # sum embeddings along the dimension of the tokens so that our output remains a vector
        sum_embeddings = torch.sum(hidden_states * expanded_mask, dim=1)

        # Guard against division by zero for a fully masked sequence.
        # Assumes mask entries are 0/1, so any non-empty count is already >= 1
        # and is left unchanged by the clamp.
        token_counts = torch.clamp(expanded_mask.sum(dim=1), min=1.0)
        return sum_embeddings / token_counts


Discussion: 

On lines 6 and 7, you can see the addition of two task heads - one for the sentence classification task, and one for an NER task. These are defined in the `__init__` and then implemented in the `.forward`. 

A few things to note: 
- I've added linear layers as the task heads, to take the input embeddings from their higher dimensional space to the lower dimensional space of the output.
- For the sentence embedding used by the sentence classification head, I had two options. I could use the embedding of the [CLS] token that the BERT tokenizer adds to the beginning of the input sentence, which is _a_ representation of the sentence, but apparently not a good one. The other option was to mean-pool the token embeddings in the last hidden state and use the result as the sentence embedding. The mean pooling works by first applying the attention mask so that padding tokens contribute nothing, then summing the remaining token embeddings along the token dimension and dividing by the number of real tokens. I've chosen to go the route of mean-pooling the last hidden state, and then use that embedding as the sentence embedding.

In [13]:
# Build the multi-task model on the same checkpoint, with one output per classification
# label and one output per NER label.
mtlBERT = MultiTaskBERT(
    model_name=model_name,
    num_category_classes=num_classification_labels,
    num_ner_entities=num_ner_labels
)

#### Generate NER predictions

In [14]:
# If you are passing in a text string, you don't need to pass the included_columns argument.
# NOTE: with task="ner" the result is the NER head's per-token output rather than a sentence
# embedding, despite the variable name; its last dimension is num_ner_labels.
embedding = generate_sentence_embeddings(mtlBERT, "INSERT SENTENCE HERE", task="ner")
print(embedding.shape)
# [0] selects the single sequence in the batch; [:50] shows at most its first 50 token rows.
print(embedding[0][:50])

torch.Size([1, 5, 5])
tensor([[ 0.6581, -0.3127,  0.2183, -0.3534,  0.2040],
        [ 0.0270, -0.2295,  0.1007, -0.0386,  0.2704],
        [ 0.0393, -0.1144, -0.1133,  0.1671,  0.2099],
        [-0.0279, -0.1757,  0.0821, -0.5505,  0.4746],
        [-0.1393, -0.2641,  0.0574,  0.4134, -0.4826]],
       grad_fn=<SliceBackward0>)


#### Generate sentence classification predictions

In [15]:
# If you are passing in a text string, you don't need to pass the included_columns argument.
# NOTE: with task="classification" the result is the classification head's output (one score
# per class) rather than a sentence embedding, despite the variable name.
embedding = generate_sentence_embeddings(mtlBERT, "INSERT SENTENCE HERE", task="classification")
print(embedding.shape)
# [0] selects the single sequence in the batch; [:50] shows at most its first 50 class scores.
print(embedding[0][:50])

torch.Size([1, 10])
tensor([-0.0495, -0.0392,  0.1757, -0.1240,  0.0898,  0.0321, -0.2603, -0.0134,
         0.1902, -0.2885], grad_fn=<SliceBackward0>)


### Step 3: Discussion Questions

#### 1. How would you decide which portions of the network to train and which to keep frozen? 
- a. When would it make sense to freeze the transformer backbone and only train the task specific layers? 
- b. When would it make sense to freeze one head while training the other?

At a high level, a multi-task learning pipeline comprises a shared backbone with various task-specific heads that allow the model to learn features specific to each task. The general case of a multi-task learning (MTL) pipeline entails running a training loop through the backbone and each task head to get weight updates across the board. There may be times when it is appropriate to deviate from this general flow and freeze parts of the network while training others, for example when there are imbalanced amounts of data across tasks, or when the various heads have reached different levels of performance. I'll get into more specifics on how to decide a path forward below.

a. I think there are two cases in which it would make sense to freeze the transformer backbone and only train the task-specific layers of an MTL pipeline: <br>1) When there is a limited amount of high-quality, annotated data, freezing the backbone and training only the task-specific layers could help mitigate overfitting by reducing the number of parameters being trained (i.e. only training the small number of parameters in the task heads rather than the millions of parameters in the transformer).  <br>2) When you don't want to interfere with the learned language understanding of the transformer backbone because it could be useful to the specialized task being learned. Conversely, the more complex your task, the more layers of the pre-trained model you will want to unfreeze to provide additional capacity for learning.

b. I think this question is in some ways an extension of the previous one. It would make sense to freeze one head while training the other when you have limited data available for a task and don't want to train its head on that data, but still want to use the frozen head for inference on top of the transformer's prior learning, and still want the other task to benefit from the multi-task setup. Additionally, it could be that you've achieved an acceptable level of performance on one task head but want to continue training the other.

#### 2. Discuss how you would decide when to implement a multi-task model like the one in this assignment and when it would make more sense to use two completely separate models for each task.

I would use a multi-task model when I want to train multiple tasks that are somewhat related because they can benefit from shared feature representation. Since they will share a transformer backbone, the selection of backbone should be relevant to the various tasks I want the model to learn. Conversely, the various tasks should be appropriate given the selection of the transformer backbone. 

1. Task type<br>
It would make sense to combine tasks that operate and learn at the same level of granularity into one model, because the model will be learning a representation specific to that kind of task. For example, tasks that operate at the token level (ie various document NER tasks) would be good candidates for a shared MTL model. Tasks that are at the document level (ex. document classification, document pair similarity tasks, etc) would be good candidates for another shared MTL model. 

2. Data availability<br>
If the size of datasets for a set of similar tasks is quite imbalanced, and if you do want to train a model for each task as opposed to freezing a head as discussed earlier, it may make more sense to train models separately for those tasks so as to not have the model overfit the smaller-dataset task while it trains the other.

3. Differing modalities<br>
The model architecture of a pre-trained transformer backbone needed for one modality is going to be different from that needed for another modality, thus tasks of differing modalities should be trained as separate models.

#### 3. When training the multi-task model, assume that Task A has abundant data, while Task B has limited data. Explain how you would handle this imbalance.

As discussed above, when there is a data imbalance as described in this question, I would propose two options: 1) freeze the head of Task B, and train Task A. Evaluate model performance of each head. If Task B has the desired performance off the bat using the pre-trained weights and untrained task head for inference, you could keep them as an MTL model and continue training Task A.  2) If the Task B head is not performant, then it may make sense to split the tasks into separate models.