In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install sentence_transformers

In [None]:
import pandas as pd
import re
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertModel
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np

In [None]:
def clean_text(input):
  # Pre-processing
  remove_symbols = re.compile('[^0-9A-Za-z ]') 

  input = remove_symbols.sub(' ', str(input)) 
  # remove  empty spaces
  re.sub('\s+', '' ,input)
  return input
    

In [None]:
from transformers import BertTokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
def preprocessing_for_bert(data_tweet, data_labels, max_length=200):

  """Perform required preprocessing steps for pretrained BERT.
  @param    data (np.array): Array of texts to be processed.
  @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
  @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                tokens should be attended to by the model.
  """
  # Create empty lists to store outputs
  input_ids = []
  attention_masks = []

  # For every sentence...
  for element in data_tweet:
    # `encode_plus` will:
    #    (1) Tokenize the sentence
    #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
    #    (3) Truncate/Pad sentence to max length
    #    (4) Map tokens to their IDs
    #    (5) Create attention mask
    #    (6) Return a dictionary of outputs
    encoded_dict = tokenizer.encode_plus(
        text=clean_text(element),  # Preprocess sentence
        add_special_tokens=True, # Add `[CLS]` and `[SEP]`
        max_length=max_length,   # Max length to truncate/pad
        pad_to_max_length=True,  # Pad sentence to max length
        return_tensors='pt',     # Return PyTorch tensor
        return_attention_mask=True      # Return attention mask
        )
      
      # Add the outputs to the lists
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  # Convert lists to tensors
  input_ids = torch.cat(input_ids,dim = 0)
  attention_masks = torch.cat(attention_masks, dim = 0)
  labels = torch.tensor(data_labels)

  return input_ids, attention_masks, labels

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [None]:
def load_data(input_ids, attention_masks, labels, batch_size = 64):
  train_set = TensorDataset(input_ids, 
                          attention_masks, 
                          labels)
  train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )
  return train_dataloader


In [None]:
class BertClassifier(nn.Module):

  """Bert Model for Classification Tasks.
  """
  def __init__(self):
      """
      @param    bert: a BertModel object
      @param    classifier: a torch.nn.Module classifier
      @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
      """
      super(BertClassifier, self).__init__()
      # Specify hidden size of BERT, hidden size of our classifier, and number of labels
      #dimension_in, hidden_layer, dimension_out = 768, 50, 6

      # Instantiate BERT model
      self.bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)
      
      self.classifier = nn.Sequential(
          #nn.Linear(dimension_in, hidden_layer),
          #nn.ReLU(),
          nn.Dropout(0.1),
          nn.Linear(768,6)
      )
      
  def forward(self, input_ids, attention_mask):
      """
      Feed input to BERT and the classifier to compute logits.
      @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                    max_length)
      @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                    information with shape (batch_size, max_length)
      @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                    num_labels)
      """
      # Feed input to BERT
      outputs = self.bert(input_ids=input_ids,
                          attention_mask=attention_mask)
      
      # Extract the last hidden state of the token `[CLS]` for classification task
      # This will have the size of (batch_size, 768)
      last_hidden_state_cls = outputs.hidden_states[-1][:, 0, :]

      # Feed input to classifier to compute logits
      logits = self.classifier(last_hidden_state_cls)

      return logits


In [None]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(train_dataloader, epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier()

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [None]:
#from sklearn.utils.class_weight import compute_class_weight
# def loss_fn(logits,labels):
#   classes = np.unique(labels.numpy())
#   weights = compute_class_weight(class_weight='balanced', classes = classes, y=labels.numpy())
#   criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float), reduction='mean')
#   loss = criterion(logits, labels)
#   return loss

loss_fn = nn.BCEWithLogitsLoss()

def train(model, train_dataloader,test_dataloader,optimizer, scheduler,epochs=4):
  """Train the BertClassifier model.
  """
  # Start training loop
  print("Start training...\n")
  for epoch_i in range(epochs):
    model_save_name = 'task-b-bert.pt'
    col_name = 'epoch-' + str(epoch_i+1)

    # =======================================
    #               Training
    # =======================================

    # Reset tracking variables at the beginning of each epoch
    # total_loss, batch_loss, batch_counts = 0, 0, 0

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Put the model into the training mode
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

      # Load batch to GPU
      batch = tuple(t for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      # Zero out any previously calculated gradients
      model.zero_grad()
      # Perform a forward pass. This will return logits.
      logits = model(b_input_ids, attention_mask = b_input_mask)
      
      # Compute loss and accumulate the loss values
      loss = loss_fn(logits, b_labels)
      tr_loss += loss.item()

      # Perform a backward pass to calculate gradients
      loss.backward()
      nb_tr_examples += b_input_ids.size(0)
      nb_tr_steps += 1

      # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
      #torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Update parameters and the learning rate
      optimizer.step()
      scheduler.step()

    model_save_name = 'epoch-' + str(epoch_i+1) + '-' + model_save_name
    path = F"/content/drive/MyDrive/Dataset/train/Task-B/{model_save_name}" 
    torch.save(model.state_dict(), path)

    print("Training complete!")


    prediction = bert_predict(model, test_dataloader)
    col_name = "probabilities-" + col_name + ".csv"
    path = "/content/drive/MyDrive/Dataset/test/Task-B/sigmoid-output" + col_name
    pd.DataFrame(prediction).to_csv(path)



In [None]:
import torch.nn.functional as F
def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attention_mask,_ = tuple(t for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attention_mask)
        all_logits.append(logits)

        
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = all_logits.sigmoid().cpu().numpy()
    return probs

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Dataset/train/train.En.csv')
target = ['sarcasm' , 'irony', 'satire' ,'understatement', 'overstatement', 'rhetorical_question']
train_tweets = df.tweet.values
train_labels = df[['sarcasm' , 'irony', 'satire' ,'understatement', 'overstatement', 'rhetorical_question']].values
train_labels = pd.DataFrame(train_labels)
train_labels = pd.DataFrame.to_numpy(train_labels.fillna(0))

In [None]:
df_test = pd.read_csv('/content/drive/MyDrive/Dataset/test/Task-B/task_B_En_test.csv')
test_tweets = df_test.tweet.values
test_labels = df_test[['sarcasm' , 'irony', 'satire' ,'understatement', 'overstatement', 'rhetorical_question']].values

input_ids, attention_masks, labels = preprocessing_for_bert(train_tweets,train_labels)
test_input_ids, test_attention_masks, test_labels = preprocessing_for_bert(test_tweets, test_labels)

train_dataloader =  load_data(input_ids,attention_masks,labels)
test_dataloader = load_data(test_input_ids,test_attention_masks,test_labels)

# df_results = pd.DataFrame()
# df_results['True-Value'] = df_test['sarcastic'].values

bert_classifier, optimizer, scheduler = initialize_model(train_dataloader)
train(bert_classifier, train_dataloader, test_dataloader, optimizer, scheduler)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Start training...

Training complete!
Training complete!
Training complete!
Training complete!


In [12]:
import numpy as np
import pandas as pd
prob = pd.read_csv("/content/drive/MyDrive/Dataset/test/Task-B/sigmoid-outputprobabilities-epoch-2.csv", sep=",",names=['sarcasm' , 'irony', 'satire' ,'understatement', 'overstatement', 'rhetorical_question'],header=None, skiprows=1)

df_test = pd.read_csv('/content/drive/MyDrive/Dataset/test/Task-B/task_B_En_test.csv')
test_labels = df_test[['sarcasm' , 'irony', 'satire' ,'understatement', 'overstatement', 'rhetorical_question']].values

In [13]:
predictions = (np.array(prob) > 0.2).astype(int)
pd_predictions = pd.DataFrame(predictions, columns= ['sarcasm' , 'irony', 'satire' ,'understatement', 'overstatement', 'rhetorical_question'])
pd_predictions.head(20)

0    1400
Name: rhetorical_question, dtype: int64

In [14]:
# from sklearn.metrics import confusion_matrix, classification_report
from sklearn.metrics import multilabel_confusion_matrix
print(multilabel_confusion_matrix(test_labels,predictions))

[[[ 561  659]
  [  84   96]]

 [[1380    0]
  [  20    0]]

 [[1351    0]
  [  49    0]]

 [[1399    0]
  [   1    0]]

 [[1390    0]
  [  10    0]]

 [[1389    0]
  [  11    0]]]


In [15]:
from sklearn.metrics import classification_report

label_names = ['sarcasm' , 'irony', 'satire' ,'understatement', 'overstatement', 'rhetorical_question']

print(classification_report(test_labels,predictions,target_names=label_names))

                     precision    recall  f1-score   support

            sarcasm       0.13      0.53      0.21       180
              irony       0.00      0.00      0.00        20
             satire       0.00      0.00      0.00        49
     understatement       0.00      0.00      0.00         1
      overstatement       0.00      0.00      0.00        10
rhetorical_question       0.00      0.00      0.00        11

          micro avg       0.13      0.35      0.19       271
          macro avg       0.02      0.09      0.03       271
       weighted avg       0.08      0.35      0.14       271
        samples avg       0.07      0.06      0.06       271



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
