In [1]:
import torch, torch.nn as nn, numpy as np
from tqdm.notebook import tqdm
from sklearn.metrics import f1_score
from transformers import AutoTokenizer, AutoModel
import pandas as pd
import emoji

## Data Preprocessing

In this section, we load and process data using the `SarcasmDataset` class. To create a `SarcasmDataset`, input the path of the data csv file and the tokenizer. Later use pytorch to crate a dataloader for the dataset (in the main script).

In [2]:
class SarcasmDataset(torch.utils.data.Dataset):
    def __init__(self, data_path, tokenizer, max_len, target_col_names=['tweet', 'sarcastic']):
        ''' 
        data_path: path to csv file
        tokenizer: tokenizer to use, likely load from AutoTokenizer
        max_len: max length of input sequence
        '''
        self.data_path = data_path
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.data = self.load_data(target_col_names)

    def load_data(self, target_col_names):
        # use pandas to read csv file
        df = pd.read_csv(self.data_path)
        # only need certain columns
        df = df[target_col_names]
        # replace nan with empty string
        df = df.fillna('')
        # convert to np array
        data = df.values
        # convert posible emoji to text
        data = [[emoji.demojize(text), label] for text, label in data]
        return data

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        '''
        Convert text to tokens, add special tokens, and create attention mask
        return: input_ids, attention_mask, label
        '''
        text, label = self.data[idx]
        tokens = self.tokenizer(
            text,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            add_special_tokens=True
        )
        input_ids = tokens['input_ids']
        attention_mask = tokens['attention_mask']
        return torch.tensor(input_ids), torch.tensor(attention_mask), torch.tensor(label)

## Model

In this section, we define the model. We will use the ensemble method, which would use multiple models and combine their outputs to get the final prediction.

The pretrained models are defined in `PretrainedModelPlus` class, which can take in any pretrained model and add a hidden layer and output layer on top of it. 

### Ensembling

The models are trained separately and the outputs are combined using combined probability. This is implemented in the predict function.


In [31]:
class PretrainedModelPlus(nn.Module):
    def __init__(self, pretrained_model, output_size, linear_layer_size):
        super(PretrainedModelPlus, self).__init__()
        self.pretrained_model = pretrained_model
        # Add a linear layer on top of the pretrained model
        self.linear = nn.Linear(self.pretrained_model.config.hidden_size, linear_layer_size)
        self.relu = nn.ReLU()
        self.linear2 = nn.Linear(linear_layer_size, output_size)
        # Add a sigmoid layer to get the probabilities
        self.sigmoid = nn.Sigmoid()
        # Define the loss function
        self.loss = nn.BCELoss()


    def forward(self, x, attention_mask, labels):
        pretrained_outputs = self.pretrained_model(input_ids=x, attention_mask=attention_mask, 
                                                   return_dict=True, output_hidden_states=True)

        hidden_states = torch.stack(pretrained_outputs["hidden_states"])
        cat_hidden_states = torch.cat([hidden_states[i] for i in [-1, -2, -3, -4]], dim=1)
        first_token = cat_hidden_states[:, 0, :]

        linear_outputs = self.linear(first_token)
        activation_outputs = self.relu(linear_outputs)
        output = self.linear2(activation_outputs)
        probs = self.sigmoid(output)
        loss = self.loss(probs.view(-1), labels.float())
        return loss, probs

In [5]:
def trian(model, dataloader, epochs=3, learning_rate=1e-5):
    ''' Train a model
    model: model to train
    dataloader: data loader to use
    epochs: number of epochs to train
    learning_rate: learning rate to use
    return: trained model
    '''
    optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
    for epoch in range(epochs):
        num_tp = 0
        num_fp = 0
        num_tn = 0
        num_fn = 0

        model.train()
        for input_ids, attention_mask, labels in tqdm(dataloader):
            optimizer.zero_grad()
            loss, probs = model(input_ids, attention_mask, labels)
            loss.backward()
            optimizer.step()

            preds = torch.round(probs)
            # compare to labels and update tp, fp, tn, fn
            num_tp += ((preds == 1) & (labels == 1)).sum().item()
            num_fp += ((preds == 1) & (labels == 0)).sum().item()
            num_tn += ((preds == 0) & (labels == 0)).sum().item()
            num_fn += ((preds == 0) & (labels == 1)).sum().item()

        accuracy = (num_tp + num_tn) / (num_tp + num_fp + num_tn + num_fn)
        f1 = 2 * (num_tp / (2 * num_tp + num_fp + num_fn))

        # print out stats
        print(f'Epoch: {epoch + 1}/{epochs} | Loss: {loss.item():.4f} | Accuracy: {accuracy:.4f} | F1: {f1:.4f}')
    return model

In [6]:
def predict(models, data_loaders):
    ''' Combine the predictions of models
    models: models to use
    data_loaders: data loaders to use
    return: list of predictions
    '''
    all_models_probs = []
    for data_loader, model in zip(data_loaders, models):
        probs = []
        for input_ids, attention_mask, labels in tqdm(data_loader):
            _, prob = model(input_ids, attention_mask, labels)
            prob = prob.detach().numpy()[0][0]
            probs.append(prob)
        all_models_probs.append(probs)
    ensemble_probs = np.array(all_models_probs).mean(axis=0)
    preds = [1 if prob > 0.5 else 0 for prob in ensemble_probs]
    return preds

In [7]:
def generate_pred_from_test_file(models, model_names, test_file, output_file="output.csv"):
    ''' Generate predictions from a file
    models: list of pretrained models
    model_names: list of names of models to use
    test_file: file to use for testing
    output_file: file to save the predictions
    '''
    tokenizers = [AutoTokenizer.from_pretrained(model_name) for model_name in model_names]
    test_datasets = [SarcasmDataset(test_file, tokenizer, max_len=128, target_col_names=['text', 'sarcastic']) for tokenizer in tokenizers]
    test_dataloaders = [torch.utils.data.DataLoader(test_dataset, batch_size=1, shuffle=False) for test_dataset in test_datasets]

    preds = predict(models, test_dataloaders)
    df = pd.read_csv(test_file)
    df['sarcastic'] = preds
    df = df[['text', 'sarcastic']]
    df.to_csv(output_file, index=False)


## Evaluation Matrics

We use the f1 score as the evaluation matrics.

In [8]:
def evaluate_f1(test_file, model_generated_file):
  '''
  Inputs a test file and file generated by the model and returns the f1 score using f1_score from sklearn.metrics
  :param test_file: csv of shape(num_samples, num_classifications)
  :param model_generated_file: csv of shape(num_samples, num_classifications)
  :return: f1_score of test_file and model_generated_file of shape(1)
  '''
  df1 = pd.read_csv(test_file)
  df2 = pd.read_csv(model_generated_file)
  arr1 = df1['sarcastic'].to_numpy()
  arr2 = df2['sarcastic'].to_numpy()

  return f1_score(arr1, arr2)

## Main Script

**Instructions for running the main script:**

1. Download the data from [here](https://github.com/iabufarha/iSarcasmEval).

2. Create the dataset and dataloader for each of the models.

3. Create and Train model

4. Predict and evaluate f1 score on test set




In [None]:
# Create dataset #1
dataset_roberta = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                 tokenizer=AutoTokenizer.from_pretrained('roberta-base'),
                                 max_len=128)
# Create data loader #1
dataloader_roberta = torch.utils.data.DataLoader(dataset_roberta, batch_size=32, shuffle=True)

# Create dataset #2
dataset_bertweet = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                   tokenizer=AutoTokenizer.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis'),
                                   max_len=128)
# Create data loader #2
dataloader_bertweet = torch.utils.data.DataLoader(dataset_bertweet, batch_size=32, shuffle=True)

# Create dataset #3
dataset_deberta = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                tokenizer=AutoTokenizer.from_pretrained('microsoft/deberta-v3-base'),
                                max_len=128)
# Create data loader #3
dataloader_deberta = torch.utils.data.DataLoader(dataset_deberta, batch_size=32, shuffle=True)

# Create dataset #4
dataset_bertweet_irony = SarcasmDataset(data_path='iSarcasmEval/train/train.En.csv',
                                        tokenizer=AutoTokenizer.from_pretrained('pysentimiento/bertweet-irony'),
                                        max_len=128)
# Create data loader #4
dataloader_bertweet_irony = torch.utils.data.DataLoader(dataset_bertweet_irony, batch_size=32, shuffle=True)


In [17]:
# Some sanity checks
assert len(dataset_roberta) == 3468
assert len(dataset_bertweet) == 3468
assert len(dataset_deberta) == 3468

In [26]:
# Create models
model1 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('roberta-base'), output_size=1, linear_layer_size=64)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaModel: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model2 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('finiteautomata/bertweet-base-sentiment-analysis'), output_size=1, linear_layer_size=64)

In [None]:
model3 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('microsoft/deberta-v3-base'), output_size=1, linear_layer_size=64)

In [21]:
model4 = PretrainedModelPlus(pretrained_model=AutoModel.from_pretrained('pysentimiento/bertweet-irony'), output_size=1, linear_layer_size=64)

Downloading (…)lve/main/config.json:   0%|          | 0.00/939 [00:00<?, ?B/s]

Downloading (…)"pytorch_model.bin";:   0%|          | 0.00/540M [00:00<?, ?B/s]

Some weights of the model checkpoint at pysentimiento/bertweet-irony were not used when initializing RobertaModel: ['classifier.out_proj.bias', 'classifier.dense.bias', 'classifier.out_proj.weight', 'classifier.dense.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaModel were not initialized from the model checkpoint at pysentimiento/bertweet-irony and are newly initialized: ['roberta.pooler.dense.weight', 'roberta.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
# Train models
model1 = trian(model1, dataloader_roberta, epochs=7, learning_rate=1e-5)

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 1/3 | Loss: 0.3252 | Accuracy: 0.6413 | F1: 0.2533


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 2/3 | Loss: 0.1488 | Accuracy: 0.6394 | F1: 0.2649


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 3/3 | Loss: 0.0265 | Accuracy: 0.6365 | F1: 0.2677


In [19]:
model2 = trian(model2, dataloader_bertweet, epochs=7, learning_rate=1e-5)

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 1/7 | Loss: 0.5087 | Accuracy: 0.7492 | F1: 0.0039


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 2/7 | Loss: 0.3477 | Accuracy: 0.7111 | F1: 0.1272


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 3/7 | Loss: 0.4841 | Accuracy: 0.6701 | F1: 0.2242


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 4/7 | Loss: 0.1441 | Accuracy: 0.6553 | F1: 0.2416


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 5/7 | Loss: 0.7052 | Accuracy: 0.6489 | F1: 0.2508


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 6/7 | Loss: 0.1064 | Accuracy: 0.6498 | F1: 0.2593


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 7/7 | Loss: 0.0627 | Accuracy: 0.6458 | F1: 0.2619


In [12]:
model3 = trian(model3, dataloader_deberta, epochs=3, learning_rate=1e-5)

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 1/3 | Loss: 0.7745 | Accuracy: 0.7472 | F1: 0.0128


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 2/3 | Loss: 0.5254 | Accuracy: 0.7318 | F1: 0.0589


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 3/3 | Loss: 0.2870 | Accuracy: 0.6588 | F1: 0.2273


In [22]:
model4 = trian(model4, dataloader_bertweet_irony, epochs=5, learning_rate=1e-5)

  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 1/5 | Loss: 0.7236 | Accuracy: 0.7141 | F1: 0.1344


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 2/5 | Loss: 0.3141 | Accuracy: 0.6729 | F1: 0.2181


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 3/5 | Loss: 0.4340 | Accuracy: 0.6633 | F1: 0.2290


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 4/5 | Loss: 0.2646 | Accuracy: 0.6595 | F1: 0.2410


  0%|          | 0/109 [00:00<?, ?it/s]

Epoch: 5/5 | Loss: 0.1246 | Accuracy: 0.6539 | F1: 0.2452


In [None]:
generate_pred_from_test_file([model2, model3, model4],
                             ['finiteautomata/bertweet-base-sentiment-analysis', 'microsoft/deberta-v3-base', 'pysentimiento/bertweet-irony'],
                             'iSarcasmEval/test/task_A_En_test.csv', 'output-234.csv')

In [73]:
generate_pred_from_test_file([model1], 
                             ['roberta-base'], 
                             'iSarcasmEval/test/task_A_En_test.csv', 'output.csv')

  0%|          | 0/1400 [00:00<?, ?it/s]

In [None]:
generate_pred_from_test_file([model1, model2, model3, model4],
                                ['roberta-base', 'finiteautomata/bertweet-base-sentiment-analysis', 'microsoft/deberta-v3-base', 'pysentimiento/bertweet-irony'],
                                'iSarcasmEval/test/task_A_En_test.csv', 'output-1234.csv')

In [74]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output.csv')

0.362116991643454

In [24]:
evaluate_f1('iSarcasmEval/test/task_A_En_test.csv', 'output-234.csv')

0.485207100591716

## Save model