In [34]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
import pandas as pd
import re

In [36]:
!pip install sentence_transformers

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [37]:
df = pd.read_csv('/content/drive/MyDrive/Dataset/train/train.En.csv')
# df = pd.read_csv('/content/drive/MyDrive/Dataset/train/sarcastic_tweets.csv')
# Pre-processing
remove_symbols = re.compile('[^0-9A-Za-z ]') 

def clean_text(input):
  input = remove_symbols.sub(' ', str(input)) 
  # remove  empty spaces
  re.sub('\s+', '' ,input)
  return input

df['tweet'] = df['tweet'].apply(clean_text)

In [43]:
df_sarcastic = df[df['sarcastic'] == 1 ][['tweet','sarcastic']]
df_not_sarcastic = df[df['sarcastic'] == 0 ][['tweet', 'sarcastic']]

count = df_sarcastic.shape[0]

df_not_sarcastic_under = df_not_sarcastic.sample(count * 2, random_state=120)
df_sarcastic_over = df_sarcastic.sample(count * 2, replace=True, random_state = 120)

text_sarcastic = df_sarcastic_over.tweet.values
# labels_sarcastic = df_sarcastic.sarcastic.values

text_not_sarcastic = df_not_sarcastic_under.tweet.values
# labels_rephrase = df_rephrase.sarcastic.values


In [44]:
import torch
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertForSequenceClassification
from sklearn.model_selection import train_test_split

import pandas as pd
import numpy as np

from tabulate import tabulate
from tqdm import trange
import random

import torch.nn as nn
from sklearn.utils.class_weight import compute_class_weight

In [45]:
tokenizer = BertTokenizer.from_pretrained(
    'bert-base-uncased'
    )

In [46]:
token_id = []
attention_masks = []
labels = []

for element in text_sarcastic:
  encoded_dict = tokenizer(element , max_length=200,pad_to_max_length=True,return_tensors = 'pt')
  token_id.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])
  labels.append(1)
for element in text_not_sarcastic:
  encoded_dict = tokenizer(element , max_length=200,pad_to_max_length=True,return_tensors = 'pt')
  token_id.append(encoded_dict['input_ids'])
  attention_masks.append(encoded_dict['attention_mask'])
  labels.append(0)

Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


In [47]:
token_id = torch.cat(token_id, dim = 0)
attention_masks = torch.cat(attention_masks, dim = 0)
labels = torch.tensor(labels)

In [48]:
labels
len(labels)

3468

In [49]:
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels = 2,
    output_attentions = False,
    output_hidden_states = False,
)

# Recommended learning rates (Adam): 5e-5, 3e-5, 2e-5. See: https://arxiv.org/pdf/1810.04805.pdf
optimizer = torch.optim.AdamW(model.parameters(), 
                              lr = 5e-5,
                              eps = 1e-08
                              )

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at

In [50]:
batch_size = 8

train_set = TensorDataset(token_id, 
                          attention_masks, 
                          labels)

train_dataloader = DataLoader(
            train_set,
            sampler = RandomSampler(train_set),
            batch_size = batch_size
        )

In [51]:
df_test = pd.read_csv('/content/drive/MyDrive/Dataset/test/task_A_En_test.csv')
df_test['text'] = df_test['text'].apply(clean_text)
test_sarcastic_value = df_test.sarcastic.values
test_tweet = df_test.text.values

test_token_id = []
test_attention_masks = []

for element in test_tweet:
  encoded_dict = tokenizer(element,  max_length=200,pad_to_max_length=True,return_tensors = 'pt')
  test_token_id.append(encoded_dict['input_ids'])
  test_attention_masks.append(encoded_dict['attention_mask'])

test_token_id = torch.cat(test_token_id, dim = 0)
test_attention_masks = torch.cat(test_attention_masks, dim = 0)



In [52]:
df_results = pd.DataFrame()
df_results['True-Value'] = df_test['sarcastic'].values

In [53]:
df_results

Unnamed: 0,True-Value
0,0
1,0
2,1
3,0
4,0
...,...
1395,0
1396,0
1397,0
1398,1


In [54]:
epochs = 4

for i in trange(epochs, desc = 'Epoch'):
  model_save_name = 'task-a-bert-undersampled_and_oversampled.pt'
  col_name = 'epoch-' + str(i+1)

  model.train()
    
    # Tracking variables
  tr_loss = 0
  nb_tr_examples, nb_tr_steps = 0, 0

  for step, batch in enumerate(train_dataloader):
    batch = tuple(t for t in batch)
    b_input_ids, b_input_mask, b_labels = batch
    optimizer.zero_grad()
    # Forward pass
    train_output = model(b_input_ids, 
                         attention_mask = b_input_mask, 
                         labels = b_labels)
    # num = np.bincount(b_labels)
    # if len(num) == 2:
    #   classes = np.unique(b_labels.numpy())
    #   weights = compute_class_weight(class_weight='balanced',classes=classes,y =b_labels.numpy())
    #   criterion = torch.nn.CrossEntropyLoss(weight=torch.tensor(weights, dtype=torch.float),reduction='mean')
    #   loss = criterion(train_output['logits'], b_labels)
    # else:
    #   loss = train_output.loss
    
    train_output.loss.backward()

    optimizer.step()
    # Update tracking variables
    tr_loss += train_output.loss.item()
    nb_tr_examples += b_input_ids.size(0)
    nb_tr_steps += 1

  model_save_name = 'epoch-' + str(i+1) + '-' + model_save_name
  path = F"/content/drive/MyDrive/Dataset/train/Task-A/{model_save_name}" 
  torch.save(model.state_dict(), path)
  
  model.eval()
  with torch.no_grad():
    output = model(test_token_id, attention_mask = test_attention_masks)
    logits = output.logits.detach().cpu().numpy()
    preds = np.argmax(logits, axis = 1).flatten()
  df_results[col_name] = preds
  pd.DataFrame(df_results).to_csv("/content/drive/MyDrive/Dataset/test/Task-A/task_a_results_bert_undersampled_and_oversampled.csv")

Epoch: 100%|██████████| 4/4 [1:03:38<00:00, 954.63s/it]


In [None]:
# model = BertForSequenceClassification.from_pretrained(
#     'bert-base-uncased',
#     num_labels = 2
# )
# model.load_state_dict(torch.load("/content/drive/MyDrive/Dataset/train/Task-A/epoch-2-task-a-2.pt"))
# model.eval()
# with torch.no_grad():
#     output = model(test_token_id,attention_mask = test_attention_masks)
#     logits = output.logits.detach().cpu().numpy()
#     preds = np.argmax(logits, axis = 1).flatten()

In [None]:
# df_results = pd.read_csv("/content/drive/MyDrive/Dataset/test/Task-A/task_a_undersampled.csv")
# df_results.head(2)

In [60]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
print(classification_report(df_test['sarcastic'],df_results['epoch-1']))

              precision    recall  f1-score   support

           0       0.91      0.58      0.71      1200
           1       0.21      0.66      0.31       200

    accuracy                           0.59      1400
   macro avg       0.56      0.62      0.51      1400
weighted avg       0.81      0.59      0.65      1400



In [61]:
print(confusion_matrix(df_test['sarcastic'], df_results['epoch-1']))

[[692 508]
 [ 69 131]]


In [62]:
print(classification_report(df_test['sarcastic'],df_results['epoch-2']))

              precision    recall  f1-score   support

           0       0.88      0.78      0.83      1200
           1       0.23      0.38      0.28       200

    accuracy                           0.73      1400
   macro avg       0.55      0.58      0.56      1400
weighted avg       0.79      0.72      0.75      1400



In [63]:
print(confusion_matrix(df_test['sarcastic'], df_results['epoch-2']))

[[939 261]
 [124  76]]


In [64]:
print(classification_report(df_test['sarcastic'],df_results['epoch-3']))

              precision    recall  f1-score   support

           0       0.89      0.43      0.59      1200
           1       0.17      0.69      0.27       200

    accuracy                           0.47      1400
   macro avg       0.53      0.56      0.43      1400
weighted avg       0.79      0.47      0.54      1400



In [66]:
print(confusion_matrix(df_test['sarcastic'], df_results['epoch-3']))

[[522 678]
 [ 62 138]]


In [67]:
print(classification_report(df_test['sarcastic'],df_results['epoch-4']))

              precision    recall  f1-score   support

           0       0.91      0.52      0.66      1200
           1       0.19      0.70      0.30       200

    accuracy                           0.54      1400
   macro avg       0.55      0.61      0.48      1400
weighted avg       0.81      0.54      0.61      1400



In [68]:
print(confusion_matrix(df_test['sarcastic'], df_results['epoch-4']))

[[620 580]
 [ 60 140]]


In [69]:
print(model.classifier)

Linear(in_features=768, out_features=2, bias=True)


In [57]:
pd.DataFrame(df_results).to_csv("/content/drive/MyDrive/Dataset/test/Task-A/task_a_results_bert_undersampled_and_oversampled.csv")