In [56]:
import torch, torch.nn as nn, numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import emoji
import os

## Data Preprocessing

In this section, we load and process data using the `SarcasmDataset` class. To create a `SarcasmDataset`, input the path of the data csv file and the tokenizer. Later use pytorch to crate a dataloader for the dataset (in the main script).

In [2]:
class SarcasmDataset(torch.utils.data.Dataset):
    def __init__(self, data_path, tokenizer, max_len, target_col_names=['tweet', 'sarcastic']):
        ''' 
        data_path: path to csv file
        tokenizer: tokenizer to use, likely load from AutoTokenizer
        max_len: max length of input sequence
        '''
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = self.load_data(target_col_names)

    def load_data(self, target_col_names):
        # use pandas to read csv file
        df = pd.read_csv(self.data_path)
        # only need certain columns
        df = df[target_col_names]
        # replace nan with empty string
        df = df.fillna('')
        # convert to np array
        data = df.values
        # convert posible emoji to text
        data = [[emoji.demojize(text), label] for text, label in data]
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        '''
        Convert text to tokens, add special tokens, and create attention mask
        return: input_ids, attention_mask, label
        '''
        text, label = self.data[idx]
        tokens = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True
        )
        input_ids = tokens['input_ids']
        attention_mask = tokens['attention_mask']
        return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(label)

## Model

In this section, we define the model. We will use the ensemble method, which would use multiple models and combine their outputs to get the final prediction.

The pretrained models are defined in `PretrainedModelPlus` class, which can take in any pretrained model and add a hidden layer and output layer on top of it. 

### Ensembling

The models are trained separately and the outputs are combined using combined probability. This is implemented in the predict function.


In [3]:
class PretrainedModelPlus(nn.Module):
    def __init__(self, pretrained_model, output_size, linear_layer_size):
        super(PretrainedModelPlus, self).__init__()
        self.pretrained_model = pretrained_model
        # Add a linear layer on top of the pretrained model
        self.linear = nn.Linear(self.pretrained_model.config.hidden_size, linear_layer_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(linear_layer_size, output_size)
        # Add a sigmoid layer to get the probabilities
        self.sigmoid = nn.Sigmoid()
        # Define the loss function
        self.loss = nn.BCELoss()


    def forward(self, x, attention_mask, labels):
        pretrained_outputs = self.pretrained_model(input_ids=x, attention_mask=attention_mask, 
                                                   return_dict=True, output_hidden_states=True)

        hidden_states = torch.stack(pretrained_outputs["hidden_states"])
        cat_hidden_states = torch.cat([hidden_states[i] for i in [-1, -2, -3, -4]], dim=1)
        first_token = cat_hidden_states[:, 0, :]

        linear_outputs = self.linear(first_token)
        activation_outputs = self.relu(linear_outputs)
        output = self.linear2(activation_outputs)
        probs = self.sigmoid(output)
        loss = self.loss(probs.view(-1), labels.float())
        return loss, probs

In [47]:
def train(model, dataloader, epochs=3, learning_rate=1e-5):
    ''' Train a model
    model: model to train
    dataloader: data loader to use
    epochs: number of epochs to train
    learning_rate: learning rate to use
    return: trained model
    '''
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    for epoch in range(epochs):
        num_tp = 0
        num_fp = 0
        num_tn = 0
        num_fn = 0

        model.train()
        for input_ids, attention_mask, labels in tqdm(dataloader):
            optimizer.zero_grad()
            loss, probs = model(input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()

            preds = torch.round(probs)
            # compare to labels and update tp, fp, tn, fn
            num_tp += ((preds == 1) & (labels == 1)).sum().item()
            num_fp += ((preds == 1) & (labels == 0)).sum().item()
            num_tn += ((preds == 0) & (labels == 0)).sum().item()
            num_fn += ((preds == 0) & (labels == 1)).sum().item()

        accuracy = (num_tp + num_tn) / (num_tp + num_fp + num_tn + num_fn)
        f1 = 2 * (num_tp / (2 * num_tp + num_fp + num_fn))

        # print out stats
        print(f'Epoch: {epoch + 1}/{epochs} | Loss: {loss.item():.4f} | Accuracy: {accuracy:.4f} | F1: {f1:.4f}')
    return model

In [5]:
def predict(models, data_loaders):
    ''' Combine the predictions of models
    models: models to use
    data_loaders: data loaders to use
    return: list of predictions
    '''
    all_models_probs = []
    for data_loader, model in zip(data_loaders, models):
        probs = []
        for input_ids, attention_mask, labels in tqdm(data_loader):
            _, prob = model(input_ids, attention_mask, labels)
            prob = prob.detach().numpy()[0][0]
            probs.append(prob)
        all_models_probs.append(probs)
    ensemble_probs = np.array(all_models_probs).mean(axis=0)
    preds = [1 if prob > 0.5 else 0 for prob in ensemble_probs]
    return preds

In [6]:
def generate_pred_from_test_file(models, model_names, test_file, output_file="output.csv"):
    ''' Generate predictions from a file
    models: list of pretrained models
    model_names: list of names of models to use
    test_file: file to use for testing
    output_file: file to save the predictions
    '''
    tokenizers = [AutoTokenizer.from_pretrained(model_name) for model_name in model_names]
    test_datasets = [SarcasmDataset(test_file, tokenizer, max_len=128, target_col_names=['text', 'sarcastic']) for tokenizer in tokenizers]
    test_dataloaders = [torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False) for test_dataset in test_datasets]

    preds = predict(models, test_dataloaders)
    df = pd.read_csv(test_file)
    df['sarcastic'] = preds
    df = df[['text', 'sarcastic']]
    df.to_csv(output_file, index=False)


## Evaluation Matrics

We use the f1 score as the evaluation matrics.

In [7]:
def evaluate_f1(test_file, model_generated_file):
  '''
  Inputs a test file and file generated by the model and returns the f1 score using f1_score from sklearn.metrics
  :param test_file: csv of shape(num_samples, num_classifications)
  :param model_generated_file: csv of shape(num_samples, num_classifications)
  :return: f1_score of test_file and model_generated_file of shape(1)
  '''
  df1 = pd.read_csv(test_file)
  df2 = pd.read_csv(model_generated_file)
  arr1 = df1['sarcastic'].to_numpy()
  arr2 = df2['sarcastic'].to_numpy()

  return f1_score(arr1, arr2)

## Main Script

**Instructions for running the main script:**

1. Download the data from [here](https://github.com/iabufarha/iSarcasmEval).

2. Create the dataset and dataloader for each of the models.

3. Initialize the three models.

4. a. Train the three models OR <br>
   b. Load the three previously trained and fine tuned models.

4. Predict and evaluate f1 score on test set for each individual model.

5. Predict and evaluate f1 score on test set for ensemble of models.




In [8]:
# Create dataset #1
dataset_bertweet = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                   tokenizer=AutoTokenizer.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis'),
                                   max_len=128)
# Create data loader #1
dataloader_bertweet = torch.utils.data.DataLoader(dataset_bertweet, batch_size=32, shuffle=True)


Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [None]:
# Create dataset #2
dataset_bertweet_irony = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                        tokenizer=AutoTokenizer.from_pretrained('pysentimiento/bertweet-irony'),
                                        max_len=128)
# Create data loader #2
dataloader_bertweet_irony = torch.utils.data.DataLoader(dataset_bertweet_irony, batch_size=32, shuffle=True)

In [42]:
# Create dataset #3
dataset_bertweet_c = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                            tokenizer=AutoTokenizer.from_pretrained('cardiffnlp/bertweet-base-irony'),    
                                            max_len=128)
# Create data loader #3
dataloader_bertweet_c = torch.utils.data.DataLoader(dataset_bertweet_c, batch_size=32, shuffle=True)

In [43]:
# Some sanity checks
assert len(dataset_bertweet) == 3468
assert len(dataset_bertweet_irony) == 3468
assert len(dataset_bertweet_c) == 3468

In [None]:
# Initialize model 1 architecture
model1 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis'), output_size=1, linear_layer_size=64)

In [None]:
# Initialize model 2 architecture
model2 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('pysentimiento/bertweet-irony'), output_size=1, linear_layer_size=64)

In [None]:
# Initialize model 3 architecture
model3 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('cardiffnlp/bertweet-base-irony'), output_size=1, linear_layer_size=64)

In [15]:
'''
OPTION 4A.
We used this to fine tune model 1. Run this to train model 1.

Accuracy metric decreases because less non-sarcastic tweets are being labeled, but more sarcastic tweets are 
labeled correctly. This works for our overall model because we are trying to detect this sarcasm, indicated by 
the increasing f1 score.
'''
model1 = train(model1, dataloader_bertweet, epochs=10, learning_rate=1e-5)

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 1/10 | Loss: 0.3089 | Accuracy: 0.7447 | F1: 0.0180


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 2/10 | Loss: 0.5727 | Accuracy: 0.6972 | F1: 0.1683


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 3/10 | Loss: 0.3584 | Accuracy: 0.6619 | F1: 0.2333


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 4/10 | Loss: 0.3926 | Accuracy: 0.6527 | F1: 0.2445


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 5/10 | Loss: 0.0951 | Accuracy: 0.6442 | F1: 0.2453


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 6/10 | Loss: 0.0721 | Accuracy: 0.6483 | F1: 0.2656


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 7/10 | Loss: 0.5480 | Accuracy: 0.6451 | F1: 0.2656


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 8/10 | Loss: 0.0594 | Accuracy: 0.6444 | F1: 0.2611


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 9/10 | Loss: 0.0462 | Accuracy: 0.6411 | F1: 0.2625


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 10/10 | Loss: 0.0915 | Accuracy: 0.6409 | F1: 0.2595


In [17]:
'''
OPTION 4A.
We used this to fine tune model 2. Run this to train model 2.

Accuracy metric decreases because less non-sarcastic tweets are being labeled, but more sarcastic tweets are 
labeled correctly. This works for our overall model because we are trying to detect this sarcasm, indicated by 
the increasing f1 score.
'''
model2 = train(model2, dataloader_bertweet_irony, epochs=10, learning_rate=1e-5)

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 1/10 | Loss: 0.6022 | Accuracy: 0.7222 | F1: 0.0940


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 2/10 | Loss: 0.3108 | Accuracy: 0.6698 | F1: 0.2168


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 3/10 | Loss: 0.3952 | Accuracy: 0.6530 | F1: 0.2432


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 4/10 | Loss: 0.2992 | Accuracy: 0.6488 | F1: 0.2488


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 5/10 | Loss: 0.1148 | Accuracy: 0.6451 | F1: 0.2539


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 6/10 | Loss: 0.1430 | Accuracy: 0.6449 | F1: 0.2528


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 7/10 | Loss: 0.4921 | Accuracy: 0.6460 | F1: 0.2493


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 8/10 | Loss: 0.2663 | Accuracy: 0.6440 | F1: 0.2513


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 9/10 | Loss: 0.0670 | Accuracy: 0.6437 | F1: 0.2579


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 10/10 | Loss: 0.0474 | Accuracy: 0.6457 | F1: 0.2647


In [48]:
'''
OPTION 4A.
We used this to fine tune model 3. Run this to train model 3.

Accuracy metric decreases because less non-sarcastic tweets are being labeled, but more sarcastic tweets are 
labeled correctly. This works for our overall model because we are trying to detect this sarcasm, indicated by 
the increasing f1 score.
'''
model3 = train(model3, dataloader_bertweet_c, epochs=10, learning_rate=1e-5)

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 1/10 | Loss: 0.6016 | Accuracy: 0.7240 | F1: 0.0854


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 2/10 | Loss: 0.3997 | Accuracy: 0.6746 | F1: 0.2039


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 3/10 | Loss: 0.2911 | Accuracy: 0.6617 | F1: 0.2238


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 4/10 | Loss: 0.2739 | Accuracy: 0.6546 | F1: 0.2381


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 5/10 | Loss: 0.2856 | Accuracy: 0.6552 | F1: 0.2488


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 6/10 | Loss: 0.1053 | Accuracy: 0.6538 | F1: 0.2499


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 7/10 | Loss: 0.0865 | Accuracy: 0.6467 | F1: 0.2503


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 8/10 | Loss: 0.3014 | Accuracy: 0.6517 | F1: 0.2591


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 9/10 | Loss: 0.0591 | Accuracy: 0.6495 | F1: 0.2557


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 10/10 | Loss: 0.0529 | Accuracy: 0.6479 | F1: 0.2593


In [63]:
'''
OPTION 4B.
We used this cell to load our previously fine tuned model 2. Run this if you have and wish to use a
pretrained model 1.
'''
# load model1 if it exists
if os.path.exists('model1.pt'):
    model1.load_state_dict(torch.load('model1.pt'))
    print('Loaded model1')

Some weights of the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at finiteautomata/bertweet-base-sentiment-analysis and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

Loaded model1


In [60]:
'''
OPTION 4B.
We used this cell to load our previously fine tuned model 2. Run this if you have and wish to use
a pretrained model 2.
'''
# load model2 if it exists
if os.path.exists('model2.pt'):
    model2.load_state_dict(torch.load('model2.pt'))
    print('Loaded model2')

Some weights of the model checkpoint at pysentimiento/bertweet-irony were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at pysentimiento/bertweet-irony and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model2


In [66]:
'''
OPTION 4B.
We used this cell to load our fine tuned model 3. Run this if you have and wish to use
a pretrained model 3.
'''
# load model3 if it exists
if os.path.exists('model3.pt'):
    model3.load_state_dict(torch.load('model3.pt'))
    print('Loaded model3')

Some weights of the model checkpoint at cardiffnlp/bertweet-base-irony were not used when initializing RobertaModel: ['classifier.dense.weight', 'classifier.dense.bias', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at cardiffnlp/bertweet-base-irony and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Loaded model3


### Individual Model Performance

In [64]:
generate_pred_from_test_file([model1],
                             ['finiteautomata/bertweet-base-sentiment-analysis'],    
                             'iSarcasmEval/test/task_A_En_test.csv', 'output-1.csv')

  0%|          | 0/1400 [00:00<?, ?it/s]

In [61]:
generate_pred_from_test_file([model2],
                            ['pysentimiento/bertweet-irony'],
                            'iSarcasmEval/test/task_A_En_test.csv', 'output-2.csv')

  0%|          | 0/1400 [00:00<?, ?it/s]

In [67]:
generate_pred_from_test_file([model3],
                            ['cardiffnlp/bertweet-base-irony'],
                            'iSarcasmEval/test/task_A_En_test.csv', 'output-3.csv')

  0%|          | 0/1400 [00:00<?, ?it/s]

In [65]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output-1.csv')

0.47791164658634533

In [62]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output-2.csv')

0.4771784232365145

In [68]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output-3.csv')

0.46630236794171215

### Ensemble Prediction

In [69]:
generate_pred_from_test_file([model1, model2, model3],
                                ['finiteautomata/bertweet-base-sentiment-analysis', 
                                'pysentimiento/bertweet-irony',
                                'cardiffnlp/bertweet-base-irony'],
                                'iSarcasmEval/test/task_A_En_test.csv', 'output-123.csv')

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/1400 [00:00<?, ?it/s]

  0%|          | 0/1400 [00:00<?, ?it/s]

In [71]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output-123.csv')

0.5

## Save model

In [19]:
torch.save(model1.state_dict(), 'model1.pt')

In [20]:
torch.save(model2.state_dict(), 'model2.pt')

In [21]:
torch.save(model3.state_dict(), 'model3.pt')