In [1]:
import torch, torch.nn as nn, numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import emoji
import os

## Data Preprocessing

In this section, we load and process data using the `SarcasmDataset` class. To create a `SarcasmDataset`, input the path of the data csv file and the tokenizer. Later use pytorch to crate a dataloader for the dataset (in the main script).

In [3]:
class SarcasmDataset(torch.utils.data.Dataset):
    def __init__(self, data_path, tokenizer, max_len, target_col_names=['tweet', 'sarcastic']):
        ''' 
        data_path: path to csv file
        tokenizer: tokenizer to use, likely load from AutoTokenizer
        max_len: max length of input sequence
        '''
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = self.load_data(target_col_names)

    def load_data(self, target_col_names):
        # use pandas to read csv file
        df = pd.read_csv(self.data_path)
        # only need certain columns
        df = df[target_col_names]
        # replace nan with empty string
        df = df.fillna('')
        # convert to np array
        data = df.values
        # process text
        data = [[self.process_tweet(text), label] for text, label in data]
        return data
    
    def process_tweet(self, text):
        # convert emoji to text
        text = emoji.demojize(text)
        # if contain user name (word start with @), replace with @USER
        text = ' '.join(['@USER' if word.startswith('@') else word for word in text.split()])
        # if contian url, replace with HTTPURL
        text = ' '.join(['HTTPURL' if word.startswith('http') else word for word in text.split()])
        return text

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        '''
        Convert text to tokens, add special tokens, and create attention mask
        return: input_ids, attention_mask, label
        '''
        text, label = self.data[idx]
        # tokenize text
        tokens = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True
        )
        # get input ids: numerical representation of tokens, used as input to model
        input_ids = tokens['input_ids']
        # get attention mask: binary mask to indicate which tokens should be attended to, used as input to model
        attention_mask = tokens['attention_mask']
        # convert to tensor and return
        return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(label)

## Model

In this section, we define the model. We will use the ensemble method, which would use multiple models and combine their outputs to get the final prediction.

The pretrained models are defined in `PretrainedModelPlus` class, which can take in any pretrained model and add a hidden layer and output layer on top of it. 

### Ensembling

The models are trained separately and the outputs are combined using combined average probability. This is implemented in the predict function.


In [4]:
class PretrainedModelPlus(nn.Module):
    def __init__(self, pretrained_model, output_size, linear_layer_size):
        super(PretrainedModelPlus, self).__init__()
        self.pretrained_model = pretrained_model
        # Add linear layer on top of the pretrained model
        self.linear = nn.Linear(self.pretrained_model.config.hidden_size, linear_layer_size)
        self.linear_final = nn.Linear(linear_layer_size, output_size)

        # Activation function
        self.activation = nn.ReLU()
        # Add a sigmoid layer to get the probabilities
        self.sigmoid = nn.Sigmoid()

        # Define the loss function
        self.loss = nn.BCELoss()


    def forward(self, x, attention_mask, labels):
        pretrained_outputs = self.pretrained_model(input_ids=x, attention_mask=attention_mask, 
                                                   return_dict=True, output_hidden_states=True)

        # Get all hidden states, size: (num_hidden_layers, batch_size, sequence_length, pretrained_hidden_size)
        hidden_states = torch.stack(pretrained_outputs["hidden_states"])
        # Sum the last 4 layers, size: (batch_size, sequence_length, pretrained_hidden_size)
        summed_last_4_layers = torch.sum(hidden_states[-4:], dim=0)
        # Use the first token as the output, size: (batch_size, pretrained_hidden_size)
        first_token = summed_last_4_layers[:, 0, :]

        # linear layer with activation
        linear_outputs = self.linear(first_token)
        activation_outputs = self.activation(linear_outputs)

        # get output of size 1 for classification
        output = self.linear_final(activation_outputs)
        # turn output into probabilities
        probs = self.sigmoid(output)
        # calculate loss
        loss = self.loss(probs.view(-1), labels.float())
        return loss, probs

In [22]:
def train(model, dataloader, epochs=3, learning_rate=1e-5):
    ''' Train a model
    model: model to train
    dataloader: data loader to use
    epochs: number of epochs to train
    learning_rate: learning rate to use
    return: trained model
    '''
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    for epoch in range(epochs):
        # keep track of tp, fp, tn, fn
        num_tp = 0
        num_fp = 0
        num_tn = 0
        num_fn = 0

        model.train()
        for input_ids, attention_mask, labels in tqdm(dataloader):
            optimizer.zero_grad()
            loss, probs = model(input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()

            # get predictions from probabilities
            preds = torch.round(probs)
            # turn into 1d tensor
            preds = preds.squeeze(1)
            # loop through this batch of predictions, and update tp, fp, tn, fn
            for p, l in zip(preds, labels):
                num_tp += int(p == 1 and l == 1)
                num_fp += int(p == 1 and l == 0)
                num_tn += int(p == 0 and l == 0)
                num_fn += int(p == 0 and l == 1)

        # calculate accuracy, precision, recall, f1
        accuracy = (num_tp + num_tn) / (num_tp + num_fp + num_tn + num_fn)
        precision = num_tp / (num_tp + num_fp)
        recall = num_tp / (num_tp + num_fn)
        f1 = 2 * precision * recall / (precision + recall)

        # print out stats
        print(f'Epoch: {epoch + 1}/{epochs} | Loss: {loss.item():.4f} | Accuracy: {accuracy:.4f} | F1: {f1:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f}')
    return model

In [6]:
def predict(models, data_loaders):
    ''' Combine the predictions of models
    models: models to use
    data_loaders: data loaders to use
    return: list of predictions
    '''
    all_models_probs = [] # list of prob by all models

    for data_loader, model in zip(data_loaders, models):
        probs = [] # probs for each tweet predicted by one model
        for input_ids, attention_mask, labels in tqdm(data_loader):
            _, prob = model(input_ids, attention_mask, labels)
            prob = prob.detach().numpy()[0][0]
            probs.append(prob)
        all_models_probs.append(probs)

    # average the probs
    ensemble_probs = np.array(all_models_probs).mean(axis=0)
    # set threshold to 0.5, convert to 0 or 1
    preds = [1 if prob > 0.5 else 0 for prob in ensemble_probs]
    return preds

In [7]:
def generate_pred_from_test_file(models, model_names, test_file, output_file="output.csv"):
    ''' Generate predictions from a file
    models: list of pretrained models
    model_names: list of names of models to use
    test_file: file to use for testing
    output_file: file to save the predictions
    '''
    # Load the test file and create a data loader
    tokenizers = [AutoTokenizer.from_pretrained(model_name) for model_name in model_names]
    test_datasets = [SarcasmDataset(test_file, tokenizer, max_len=128, target_col_names=['text', 'sarcastic']) for tokenizer in tokenizers]
    test_dataloaders = [torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False) for test_dataset in test_datasets]

    # Generate predictions
    preds = predict(models, test_dataloaders)

    # Use pandas to save the predictions into a csv file
    df = pd.read_csv(test_file)
    df['sarcastic'] = preds
    df = df[['text', 'sarcastic']]
    df.to_csv(output_file, index=False)


## Evaluation Matrics

We use the f1 score as the evaluation matrics.

In [8]:
def evaluate_f1(test_file, model_generated_file):
  '''
  Inputs a test file and file generated by the model and returns the f1 score using f1_score from sklearn.metrics
  :param test_file: csv of shape(num_samples, num_classifications)
  :param model_generated_file: csv of shape(num_samples, num_classifications)
  :return: f1_score of test_file and model_generated_file of shape(1)
  '''
  # load file into pandas dataframe
  df1 = pd.read_csv(test_file)
  df2 = pd.read_csv(model_generated_file)
  # get the sarcastic column as a numpy array
  arr1 = df1['sarcastic'].to_numpy()
  arr2 = df2['sarcastic'].to_numpy()

  return f1_score(arr1, arr2)

## Main Script

**Instructions for running the main script:**

1. Download the data from [here](https://github.com/iabufarha/iSarcasmEval).

2. Create the dataset and dataloader for each of the models.

3. Initialize the three models.

4. a. Train the three models OR <br>
   b. Load the three previously trained and continue to fine tune models.

4. Predict and evaluate f1 score on test set for each individual model.

5. Predict and evaluate f1 score on test set for ensemble of models.




In [14]:
# Create dataset #1
dataset_bertweet = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                   tokenizer=AutoTokenizer.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis'),
                                   max_len=128)
# Create data loader #1
dataloader_bertweet = torch.utils.data.DataLoader(dataset_bertweet, batch_size=64, shuffle=True)


In [15]:
# Create dataset #2
dataset_bertweet_irony = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                        tokenizer=AutoTokenizer.from_pretrained('pysentimiento/bertweet-irony'),
                                        max_len=128)
# Create data loader #2
dataloader_bertweet_irony = torch.utils.data.DataLoader(dataset_bertweet_irony, batch_size=64, shuffle=True)

In [16]:
# Create dataset #3
dataset_bertweet_c = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                            tokenizer=AutoTokenizer.from_pretrained('cardiffnlp/bertweet-base-irony'),    
                                            max_len=128)
# Create data loader #3
dataloader_bertweet_c = torch.utils.data.DataLoader(dataset_bertweet_c, batch_size=64, shuffle=True)

In [17]:
# Some sanity checks
assert len(dataset_bertweet) == 3468
assert len(dataset_bertweet_irony) == 3468
assert len(dataset_bertweet_c) == 3468

In [23]:
# Initialize model 1 architecture
model1 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis'), output_size=1, linear_layer_size=64)

Some weights of the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

In [24]:
# Initialize model 2 architecture
model2 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('pysentimiento/bertweet-irony'), output_size=1, linear_layer_size=64)

Some weights of the model checkpoint at pysentimiento/bertweet-irony were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at pysentimiento/bertweet-irony and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
# Initialize model 3 architecture
model3 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('cardiffnlp/bertweet-base-irony'), output_size=1, linear_layer_size=64)

Some weights of the model checkpoint at cardiffnlp/bertweet-base-irony were not used when initializing RobertaModel: ['classifier.out_proj.weight', 'classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/bertweet-base-irony and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [26]:
'''
OPTION 4A.
We used this to fine tune model 1. Run this to train model 1.
'''
model1 = train(model1, dataloader_bertweet, epochs=2, learning_rate=1e-5) # larger learning rate at first
model1 = train(model1, dataloader_bertweet, epochs=2, learning_rate=5e-6) # smaller learning rate later

  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 1/2 | Loss: 0.4596 | Accuracy: 0.7413 | F1: 0.0588 | Precision: 0.3256 | Recall: 0.0323


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 2/2 | Loss: 0.2242 | Accuracy: 0.7693 | F1: 0.2806 | Precision: 0.6367 | Recall: 0.1799


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 1/2 | Loss: 0.2449 | Accuracy: 0.8238 | F1: 0.5852 | Precision: 0.7112 | Recall: 0.4971


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 2/2 | Loss: 0.5268 | Accuracy: 0.8417 | F1: 0.6409 | Precision: 0.7402 | Recall: 0.5652


In [29]:
model1 = train(model1, dataloader_bertweet, epochs=1, learning_rate=5e-6)

  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 1/1 | Loss: 0.2624 | Accuracy: 0.8590 | F1: 0.6915 | Precision: 0.7632 | Recall: 0.6321


In [32]:
'''
OPTION 4A.
We used this to fine tune model 2. Run this to train model 2.
'''
model2 = train(model2, dataloader_bertweet_irony, epochs=3, learning_rate=1e-5) # larger learning rate at first
model2 = train(model2, dataloader_bertweet_irony, epochs=2, learning_rate=5e-6) # smaller learning rate later

  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 1/3 | Loss: 0.5600 | Accuracy: 0.7523 | F1: 0.0359 | Precision: 0.6667 | Recall: 0.0185


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 2/3 | Loss: 0.2728 | Accuracy: 0.8106 | F1: 0.5383 | Precision: 0.6888 | Recall: 0.4418


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 3/3 | Loss: 0.2639 | Accuracy: 0.8486 | F1: 0.6553 | Precision: 0.7607 | Recall: 0.5755


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 1/2 | Loss: 0.1756 | Accuracy: 0.8821 | F1: 0.7373 | Precision: 0.8319 | Recall: 0.6621


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 2/2 | Loss: 0.3310 | Accuracy: 0.8976 | F1: 0.7717 | Precision: 0.8721 | Recall: 0.6920


In [38]:
'''
OPTION 4A.
We used this to fine tune model 3. Run this to train model 3.
'''
model3 = train(model3, dataloader_bertweet_c, epochs=3, learning_rate=1e-5) # larger learning rate at first
model3 = train(model3, dataloader_bertweet_c, epochs=1, learning_rate=5e-6) # smaller learning rate later

  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 1/3 | Loss: 0.4524 | Accuracy: 0.7506 | F1: 0.0159 | Precision: 0.5833 | Recall: 0.0081


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 2/3 | Loss: 0.5498 | Accuracy: 0.7982 | F1: 0.4745 | Precision: 0.6796 | Recall: 0.3645


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 3/3 | Loss: 0.3274 | Accuracy: 0.8244 | F1: 0.5849 | Precision: 0.7150 | Recall: 0.4948


  0%|          | 0/55 [00:00<?, ?it/s]

Epoch: 1/1 | Loss: 0.2012 | Accuracy: 0.8518 | F1: 0.6701 | Precision: 0.7554 | Recall: 0.6021


In [63]:
'''
OPTION 4B.
We used this cell to load our previously fine tuned model 2. Run this if you have and wish to use a
pretrained model 1.
'''
# load model1 if it exists
if os.path.exists('model1.pt'):
    model1.load_state_dict(torch.load('model1.pt'))
    print('Loaded model1')

Some weights of the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

Loaded model1


In [22]:
'''
OPTION 4B.
We used this cell to load our previously fine tuned model 2. Run this if you have and wish to use
a pretrained model 2.
'''
# load model2 if it exists
if os.path.exists('model2.pt'):
    model2.load_state_dict(torch.load('model2.pt'))
    print('Loaded model2')

Loaded model2


In [66]:
'''
OPTION 4B.
We used this cell to load our fine tuned model 3. Run this if you have and wish to use
a pretrained model 3.
'''
# load model3 if it exists
if os.path.exists('model3.pt'):
    model3.load_state_dict(torch.load('model3.pt'))
    print('Loaded model3')

Some weights of the model checkpoint at cardiffnlp/bertweet-base-irony were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/bertweet-base-irony and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model3


### Individual Model Performance

In [30]:
generate_pred_from_test_file([model1],
                             ['finiteautomata/bertweet-base-sentiment-analysis'],    
                             'iSarcasmEval/test/task_A_En_test.csv', 'output-1.csv')

  0%|          | 0/1400 [00:00<?, ?it/s]

In [36]:
generate_pred_from_test_file([model2],
                            ['pysentimiento/bertweet-irony'],
                            'iSarcasmEval/test/task_A_En_test.csv', 'output-2.csv')

  0%|          | 0/1400 [00:00<?, ?it/s]

In [39]:
generate_pred_from_test_file([model3],
                            ['cardiffnlp/bertweet-base-irony'],
                            'iSarcasmEval/test/task_A_En_test.csv', 'output-3.csv')

  0%|          | 0/1400 [00:00<?, ?it/s]

In [31]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output-1.csv')

0.475

In [37]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output-2.csv')

0.46808510638297873

In [40]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output-3.csv')

0.4924406047516198

### Ensemble Prediction

In [42]:
generate_pred_from_test_file([model1, model2, model3],
                                ['finiteautomata/bertweet-base-sentiment-analysis', 
                                'pysentimiento/bertweet-irony',
                                'cardiffnlp/bertweet-base-irony'],
                                'iSarcasmEval/test/task_A_En_test.csv', 'output-123.csv')

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/1400 [00:00<?, ?it/s]

In [43]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output-123.csv')

0.5030674846625767

## Save model

In [35]:
torch.save(model1.state_dict(), 'model1.pt')

In [26]:
torch.save(model2.state_dict(), 'model2.pt')

In [41]:
torch.save(model3.state_dict(), 'model3.pt')