In [65]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [66]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [67]:
import pandas as pd
import re
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import BertModel
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np

In [68]:
def clean_text(input):
  # Pre-processing
  remove_symbols = re.compile('[^0-9A-Za-z ]') 

  input = remove_symbols.sub(' ', str(input)) 
  # remove  empty spaces
  re.sub('\s+', '' ,input)
  return input

In [69]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)
def preprocessing_for_bert(data, max_length=200):

  """Perform required preprocessing steps for pretrained BERT.
  @param    data (np.array): Array of texts to be processed.
  @return   input_ids (torch.Tensor): Tensor of token ids to be fed to a model.
  @return   attention_masks (torch.Tensor): Tensor of indices specifying which
                tokens should be attended to by the model.
  """
  # Create empty lists to store outputs
  data_tweet = data.tweet.values
  labels = data.sarcastic.values
  input_ids = []
  attention_masks = []

  # For every sentence...
  for element in data_tweet:
    # `encode_plus` will:
    #    (1) Tokenize the sentence
    #    (2) Add the `[CLS]` and `[SEP]` token to the start and end
    #    (3) Truncate/Pad sentence to max length
    #    (4) Map tokens to their IDs
    #    (5) Create attention mask
    #    (6) Return a dictionary of outputs
    encoded_dict = tokenizer.encode_plus(
        text=clean_text(element),  # Preprocess sentence
        add_special_tokens=True, # Add `[CLS]` and `[SEP]`
        max_length=max_length,   # Max length to truncate/pad
        pad_to_max_length=True,  # Pad sentence to max length
        return_tensors='pt',     # Return PyTorch tensor
        return_attention_mask=True      # Return attention mask
        )
      
      # Add the outputs to the lists
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

  # Convert lists to tensors
  input_ids = torch.cat(input_ids,dim = 0)
  attention_masks = torch.cat(attention_masks, dim = 0)
  labels = torch.tensor(labels)

  return input_ids, attention_masks, labels

In [70]:
def load_data(input_ids, attention_masks, labels, batch_size = 64):
  train_set = TensorDataset(input_ids, 
                          attention_masks, 
                          labels)
  train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )
  return train_dataloader


In [71]:
class BertClassifier(nn.Module):

  """Bert Model for Classification Tasks.
  """
  def __init__(self):
      """
      @param    bert: a BertModel object
      @param    classifier: a torch.nn.Module classifier
      @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
      """
      super(BertClassifier, self).__init__()
      # Specify hidden size of BERT, hidden size of our classifier, and number of labels
      dimension_in, hidden_layer, dimension_out = 768, 50, 2

      # Instantiate BERT model
      self.bert = BertModel.from_pretrained('bert-base-uncased', output_hidden_states = True)

      # Instantiate an one-layer feed-forward classifier
      self.classifier = nn.Sequential(
          nn.Linear(dimension_in, hidden_layer),
          nn.ReLU(),
          #nn.Dropout(0.5),
          nn.Linear(hidden_layer, dimension_out)
      )
      
  def forward(self, input_ids, attention_mask):
      """
      Feed input to BERT and the classifier to compute logits.
      @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                    max_length)
      @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                    information with shape (batch_size, max_length)
      @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                    num_labels)
      """
      # Feed input to BERT
      outputs = self.bert(input_ids=input_ids,
                          attention_mask=attention_mask)
      
      # Extract the last hidden state of the token `[CLS]` for classification task
      # This will have the size of (batch_size, 768)
      last_hidden_state_cls = outputs.hidden_states[-1][:, 0, :]

      # Feed input to classifier to compute logits
      logits = self.classifier(last_hidden_state_cls)

      return logits


In [72]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(train_dataloader, epochs=4):
    """Initialize the Bert Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    bert_classifier = BertClassifier()

    # Tell PyTorch to run the model on GPU
    # bert_classifier.to(device)

    # Create the optimizer
    optimizer = AdamW(bert_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return bert_classifier, optimizer, scheduler

In [73]:
loss_fn = nn.CrossEntropyLoss()

def train(model, train_dataloader,test_dataloader,optimizer, scheduler,epochs=4):
  """Train the BertClassifier model.
  """
  # Start training loop
  print("Start training...\n")
  for epoch_i in range(epochs):
    model_save_name = 'task-a-bert-with-classifier.pt'
    col_name = 'epoch-' + str(epoch_i+1)

    # =======================================
    #               Training
    # =======================================

    # Reset tracking variables at the beginning of each epoch
    # total_loss, batch_loss, batch_counts = 0, 0, 0

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Put the model into the training mode
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

      # Load batch to GPU
      batch = tuple(t for t in batch)
      b_input_ids, b_input_mask, b_labels = batch

      # Zero out any previously calculated gradients
      model.zero_grad()
      # Perform a forward pass. This will return logits.
      logits = model(b_input_ids, attention_mask = b_input_mask)
      
      # Compute loss and accumulate the loss values
      loss = loss_fn(logits, b_labels)
      tr_loss += loss.item()

      # Perform a backward pass to calculate gradients
      loss.backward()

      # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Update parameters and the learning rate
      optimizer.step()
      scheduler.step()

    model_save_name = 'epoch-' + str(epoch_i+1) + '-' + model_save_name
    path = F"/content/drive/MyDrive/Dataset/train/Task-A/{model_save_name}" 
    torch.save(model.state_dict(), path)

    print("Training complete!")


    prediction = bert_predict(model, test_dataloader)
    col_name = "probabilities-" + col_name
    path = "/content/drive/MyDrive/Dataset/test/Task-A/" + col_name
    pd.DataFrame(prediction).to_csv(path)



In [74]:
import torch.nn.functional as F
def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attention_mask,_ = tuple(t for t in batch)

        # Compute logits
        with torch.no_grad():
            logits = model(b_input_ids, b_attention_mask)
        all_logits.append(logits)

        
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().numpy()

    return probs

In [75]:
df = pd.read_csv('/content/drive/MyDrive/Dataset/train/train.En.csv')
df_test = pd.read_csv('/content/drive/MyDrive/Dataset/test/task_A_En_test.csv')

input_ids, attention_masks, labels = preprocessing_for_bert(df)
test_input_ids, test_attention_masks, test_labels = preprocessing_for_bert(df_test)

train_dataloader =  load_data(input_ids,attention_masks,labels)
test_dataloader = load_data(test_input_ids,test_attention_masks,test_labels)

# df_results = pd.DataFrame()
# df_results['True-Value'] = df_test['sarcastic'].values

bert_classifier, optimizer, scheduler = initialize_model(train_dataloader)
train(bert_classifier, train_dataloader, test_dataloader, optimizer, scheduler)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This 

Start training...

Training complete!
Training complete!
Training complete!
Training complete!


In [None]:
from sklearn.metrics import confusion_matrix, classification_report
prob = pd.read_csv("/content/drive/MyDrive/Dataset/test/Task-A/probabilities-epoch-1", sep=",",names=['Class-0', 'Class-1'])
prob.drop(index=prob.index[0], axis=0, inplace=True)
prob.head(2)
threshold = 0.5
preds = np.where(prob.iloc[:, 1] > threshold, 1, 0)
preds

In [101]:
print(confusion_matrix(df_test['sarcastic'], preds))


[[1194    6]
 [ 199    1]]


In [102]:
print(classification_report(df_test['sarcastic'], preds))

              precision    recall  f1-score   support

           0       0.86      0.99      0.92      1200
           1       0.14      0.01      0.01       200

    accuracy                           0.85      1400
   macro avg       0.50      0.50      0.47      1400
weighted avg       0.76      0.85      0.79      1400

