In [1]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [3]:
import pandas as pd
import re
from transformers import RobertaTokenizer, RobertaModel, AutoTokenizer, RobertaForSequenceClassification
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import numpy as np

In [4]:
# class RobertaClassifier(torch.nn.Module):
#     def __init__(self, dropout_rate=0.3):
#         super(RobertaClassifier, self).__init__()
        
#         self.model = RobertaForSequenceClassification.from_pretrained('roberta-base', output_hidden_states = True, num_labels=2)
#         self.model.roberta.config.type_vocab_size = 2 
#         # single_emb = self.model.roberta.embeddings.token_type_embeddings
#         # self.model.roberta.embeddings.token_type_embeddings = torch.nn.Embedding(2, single_emb.embedding_dim)
#         # self.roberta.embeddings.token_type_embeddings.weight = torch.nn.Parameter(single_emb.weight.repeat([2, 1]))

#         # self.d1 = torch.nn.Dropout(dropout_rate)
#         # self.l1 = torch.nn.Linear(768, 2)
        
#     def forward(self, input_ids, attention_mask,token_type_ids,labels):
#         output = self.roberta(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,labels = labels)
#         return output
#         # last_cls = output['last_hidden_state'][:,0,:]
#         # x = self.d1(last_cls)
#         # x = self.l1(x)
#         # return x

In [5]:
def load_data(input_id, attention_masks,token_type_ids, labels, batch_size = 64):
  train_set = TensorDataset(input_id, 
                          attention_masks, 
                          token_type_ids,
                          labels)
  train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )
  return train_dataloader


In [6]:
from transformers import AdamW, get_linear_schedule_with_warmup

def initialize_model(train_dataloader, epochs=4):
    """Initialize the Roberta Classifier, the optimizer and the learning rate scheduler.
    """
    # Instantiate Bert Classifier
    model = RobertaForSequenceClassification.from_pretrained('roberta-base', output_hidden_states = True, num_labels=2)
    model.roberta.config.type_vocab_size = 3
    single_emb = model.roberta.embeddings.token_type_embeddings
    model.roberta.embeddings.token_type_embeddings = torch.nn.Embedding(3, single_emb.embedding_dim)
    roberta_classifier = model

    # Create the optimizer
    optimizer = AdamW(roberta_classifier.parameters(),
                      lr=5e-5,    # Default learning rate
                      eps=1e-8    # Default epsilon value
                      )

    # Total number of training steps
    total_steps = len(train_dataloader) * epochs

    # Set up the learning rate scheduler
    scheduler = get_linear_schedule_with_warmup(optimizer,
                                                num_warmup_steps=0, # Default value
                                                num_training_steps=total_steps)
    return roberta_classifier, optimizer, scheduler

In [7]:
#loss_fn = nn.CrossEntropyLoss()

def train(model, train_dataloader,test_dataloader,optimizer,df_results, scheduler,epochs=4):
  """Train the roberta classifier model.
  """
  # Start training loop
  print("Start training...\n")
  for epoch_i in range(epochs):
    model_save_name = 'task-c-roberta-token-id.pt'
    col_name = 'roberta-base-token-id2-epoch-' + str(epoch_i+4)

    # =======================================
    #               Training
    # =======================================

    # Reset tracking variables at the beginning of each epoch
    # total_loss, batch_loss, batch_counts = 0, 0, 0

    tr_loss = 0
    nb_tr_examples, nb_tr_steps = 0, 0

    # Put the model into the training mode
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

      # Load batch to GPU
      batch = tuple(t for t in batch)
      b_input_ids, b_input_mask, b_token_type_ids, b_labels = batch

      # Zero out any previously calculated gradients
      model.zero_grad()
      # Perform a forward pass. This will return logits.
      output = model(input_ids = b_input_ids, attention_mask = b_input_mask, token_type_ids = b_token_type_ids,labels=b_labels)

      # Compute loss and accumulate the loss values
      # loss = loss_fn(output, b_labels)
      loss = output.loss
      tr_loss += loss.item()

      # Perform a backward pass to calculate gradients
      loss.backward()

      # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
      
      torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

      # Update parameters and the learning rate
      optimizer.step()
      scheduler.step()

    model_save_name = 'epoch-' + str(epoch_i+4) + '-' + model_save_name
    path = F"/content/drive/MyDrive/Dataset/train/Task-C/{model_save_name}" 
    torch.save(model.state_dict(), path)

    print("Training complete!")


    predictions = roberta_predict(model, test_dataloader)
    path = "/content/drive/MyDrive/Dataset/test/Task-C/" + col_name + ".csv"
    df_results[col_name] = predictions
    df_results.to_csv(path)

In [8]:
import torch.nn.functional as F
def roberta_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities
    on the test set.
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_logits = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to GPU
        b_input_ids, b_attention_mask, b_token_type_ids,b_labels = tuple(t for t in batch)

        # Compute logits
        with torch.no_grad():
            output = model(b_input_ids, b_attention_mask, token_type_ids = b_token_type_ids, labels=b_labels)
        # all_logits.append(output)
        all_logits.append(output.logits)

        
    
    # Concatenate logits from each batch
    all_logits = torch.cat(all_logits, dim=0)

    # Apply softmax to calculate probabilities
    probs = F.softmax(all_logits, dim=1).cpu().detach().numpy()
    predictions = np.argmax(probs, axis=1)
    # threshold = 0.5
    # preds = np.where(probs.iloc[:, 1] > threshold, 1, 0)

    return predictions

In [9]:
df = pd.read_csv('/content/drive/MyDrive/Dataset/train/sarcastic_tweets.csv')

# Pre-processing
remove_symbols = re.compile('[^0-9A-Za-z ]') 

def clean_text(input):
  input = remove_symbols.sub(' ', str(input)) 
  # remove  empty spaces
  re.sub('\s+', '' ,input)
  return input

df['tweet'] = df['tweet'].apply(clean_text)

In [10]:
df_sarcastic = df[df['sarcastic'] == 1 ][['tweet','sarcastic']]
df_rephrase = df[df['sarcastic'] == 0 ][['tweet', 'sarcastic']]

text_sarcastic = df_sarcastic.tweet.values
# labels_sarcastic = df_sarcastic.sarcastic.values

text_rephrase = df_rephrase.tweet.values
# labels_rephrase = df_rephrase.sarcastic.values

In [11]:
# def create_token_type_id(input_ids, pad_length = 200):
#   token_type_id = []
#   input_ids = input_ids.squeeze().numpy()
#   token_two_1 = np.where(input_ids == 2)[0][0] + 1
#   token_two_3 = np.where(input_ids == 2)[0][2] + 1
#   padding = pad_length - token_two_3 
#   for i in range(token_two_1):
#     token_type_id.append(0)
#   for i in range(token_two_1,token_two_3):
#     token_type_id.append(1)
#   for i in range(token_two_3,200):
#     token_type_id.append(0)
#   return(torch.Tensor([token_type_id]).to(torch.int32))
  

In [12]:
def create_token_type_id(input_ids, pad_length = 200):
  token_type_id = []
  input_ids = input_ids.squeeze().numpy()
  token_two_1 = np.where(input_ids == 2)[0][0] + 1
  token_two_3 = np.where(input_ids == 2)[0][2] + 1
  padding = pad_length - token_two_3 
  for i in range(token_two_1):
    token_type_id.append(0)
  for i in range(token_two_1,token_two_3):
    token_type_id.append(1)
  for i in range(token_two_3,200):
    token_type_id.append(2)
  return(torch.Tensor([token_type_id]).to(torch.int32))

In [13]:
tokenizer = RobertaTokenizer.from_pretrained("roberta-base", do_lower_case=True)

In [14]:
input_id = []
token_type_id = []
attention_masks = []
labels = []

for element1, element2 in zip(text_sarcastic, text_rephrase):
  encoded_dict_0 = tokenizer(element1, element2, max_length=200,pad_to_max_length=True,return_tensors = 'pt')
  encoded_dict_1 = tokenizer(element2, element1, max_length=200,pad_to_max_length=True,return_tensors = 'pt')
  input_id.append(encoded_dict_0['input_ids'])
  input_id.append(encoded_dict_1['input_ids'])
  token_type_id.append(create_token_type_id(encoded_dict_0['input_ids']))
  token_type_id.append(create_token_type_id(encoded_dict_1['input_ids']))
  attention_masks.append(encoded_dict_0['attention_mask'])
  attention_masks.append(encoded_dict_1['attention_mask'])
  labels.append(0)
  labels.append(1)


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [15]:
input_id = torch.cat(input_id, dim = 0)
token_type_id = torch.cat(token_type_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [16]:
df_test = pd.read_csv('/content/drive/MyDrive/Dataset/test/task_C_En_test.csv')

df_test['text_0'] = df_test['text_0'].apply(clean_text)
df_test['text_1'] = df_test['text_1'].apply(clean_text)

text_0 = df_test.text_0.values
text_1 = df_test.text_1.values

test_input_id = []
test_token_type_id = []
test_attention_masks = []
for element1, element2 in zip(text_0, text_1):
  encoded_dict = tokenizer(element1, element2, max_length=200,pad_to_max_length=True,return_tensors = 'pt')
  test_input_id.append(encoded_dict['input_ids'])
  test_token_type_id.append(create_token_type_id(encoded_dict['input_ids']))
  test_attention_masks.append(encoded_dict['attention_mask'])
     

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.


In [17]:
test_input_id = torch.cat(test_input_id, dim = 0)
test_token_type_id = torch.cat(test_token_type_id, dim = 0)
test_attention_masks = torch.cat(test_attention_masks, dim = 0)
test_labels = torch.tensor(df_test['sarcastic_id'])

In [18]:
test_input_id.shape

torch.Size([200, 200])

In [19]:
df_test.head()

Unnamed: 0,text_0,text_1,sarcastic_id
0,I see that your team played well today,I m sorry that your team didn t win yesterday,0
1,Anthony Taylor is such a fair referee I wish ...,I hope Anthony Taylor is never put in charge o...,0
2,the weather is gloomy just raining and dull,What a glorious weather today,1
3,People going out to get there boosters without...,Nice to see the sheep getting their boosters t...,1
4,Really great weather we re having love a bit ...,Really cold January so far looking forward t...,0


In [20]:
train_dataloader =  load_data(input_id,attention_masks,token_type_id,labels)
test_dataloader = load_data(test_input_id,test_attention_masks,test_token_type_id ,test_labels)

df_results = pd.DataFrame()
df_results['True-Value'] = df_test['sarcastic_id'].values

robert_classifier, optimizer, scheduler = initialize_model(train_dataloader)
#train(robert_classifier, train_dataloader, test_dataloader, optimizer,df_results, scheduler)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

In [21]:
robert_classifier, optimizer, scheduler = initialize_model(train_dataloader)
robert_classifier.load_state_dict(torch.load("/content/drive/MyDrive/Dataset/train/Task-C/epoch-4-task-c-roberta-token-id.pt"))
train(robert_classifier, train_dataloader, test_dataloader, optimizer,df_results, scheduler)

Some weights of the model checkpoint at roberta-base were not used when initializing RobertaForSequenceClassification: ['lm_head.bias', 'lm_head.decoder.weight', 'lm_head.layer_norm.weight', 'roberta.pooler.dense.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'roberta.pooler.dense.weight', 'lm_head.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.weight', 'classifie

Start training...

Training complete!
Training complete!
Training complete!
Training complete!


In [30]:
import pandas as pd
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
pred = pd.read_csv("/content/drive/MyDrive/Dataset/test/Task-C/roberta-base-token-id2-epoch-3.csv", sep=",")
# prob.drop(index=prob.index[0], axis=0, inplace=True)
# prob.head(2)
# threshold = 0.5
# preds = np.where(prob.iloc[:, 1] > threshold, 1, 0)
# preds
pred.head(15)

Unnamed: 0.1,Unnamed: 0,True-Value,roberta-base-token-id2-epoch-1,roberta-base-token-id2-epoch-2,roberta-base-token-id2-epoch-3
0,0,0,1,0,1
1,1,0,1,0,1
2,2,1,1,0,0
3,3,1,1,1,1
4,4,0,1,1,0
5,5,1,1,1,0
6,6,0,1,0,1
7,7,1,1,1,1
8,8,0,1,1,1
9,9,0,1,1,1


In [31]:
print(confusion_matrix(pred['True-Value'], pred['roberta-base-token-id2-epoch-3']))

[[33 74]
 [36 57]]


In [32]:
print(classification_report(pred['True-Value'], pred['roberta-base-token-id2-epoch-3']))


              precision    recall  f1-score   support

           0       0.48      0.31      0.37       107
           1       0.44      0.61      0.51        93

    accuracy                           0.45       200
   macro avg       0.46      0.46      0.44       200
weighted avg       0.46      0.45      0.44       200



In [None]:
model = RobertaForSequenceClassification.from_pretrained('roberta-base', output_hidden_states = True)
# model.config.type_vocab_size = 2 
# single_emb = model.embeddings.token_type_embeddings
# single_emb.embedding_dim
#self.roberta.embeddings.token_type_embeddings = torch.nn.Embedding(2, single_emb.embedding_dim)

In [None]:
model.roberta.embeddings.token_type_embeddings
