In [1]:
!pip install transformers

Collecting transformers
  Downloading transformers-4.30.2-py3-none-any.whl (7.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.2/7.2 MB[0m [31m55.6 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m27.2 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m107.0 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m75.9 MB/s[0m eta [36m0:00:

In [2]:
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertForSequenceClassification, AdamW, BertConfig, BertTokenizer, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split

import datetime
import numpy as np
import pandas
import random
import time
import torch

# Mount Google Drive

In [3]:
# Mount Google Drive so the project folder is reachable from this runtime.
from google.colab import drive
drive.mount('/content/drive')
# Work from the notebooks folder; the relative data path used when loading the CSV
# ("../data/...") resolves against this directory.
%cd /content/drive/MyDrive/sarcasm-detection/notebooks
# List the folder contents as a quick sanity check.
!ls

Mounted at /content/drive
/content/drive/MyDrive/sarcasm-detection/notebooks
training.ipynb


# Get Data

In [4]:
# Load the preprocessed sarcasm dataset (path is relative to the notebooks folder).
df = pandas.read_csv("../data/processed_sarcasm-dataset.csv", index_col=False)
# Fix dtypes up front: integer labels, both text columns as plain strings.
df = df.astype({'label': 'int32', 'comment': 'str', 'parent_comment': 'str'})
# Shuffle all rows. The global seeds are only set further down the notebook, so the
# shuffle gets its own fixed random_state; otherwise the row order (and therefore
# which rows land in the train/validation split) changes on every run.
df = df.sample(frac=1, random_state=42)

In [5]:
# Preview the shuffled frame; the rich display truncates it to the first and last rows.
display(df)

Unnamed: 0,label,comment,parent_comment
428011,1,new rule COLON art lucina breasts bigger cub b...,mean COMMA get wrong COMMA big tits awesome PE...
634632,1,must white,honestly problem COMMA cops work hard tough jo...
447804,0,someone set patreon buy new key holidays PERIOD,last COMMA mama diablos keyboard know rest PERIOD
418102,1,another death caused alcohol QUESTION_MARK,one COMMA aborted PERIOD use history count one...
27256,0,2016 strikes,fidel castro dead 90 PERIOD
...,...,...,...
336549,0,used cannon fodder,know copious amounts women front lines would a...
903945,0,going mono blue cipher cards hoping going good...,play black cards talrand general COMMA though ...
963321,0,comcast content channels QUESTION_MARK,comcast PERIOD increased prices 30 since bough...
106759,1,oh glues COMMA platinum EXCLAMATIONMARK,need unturned platinum view movie


# Get Model & Tokenizer

In [6]:
# Pretrained BERT encoder with a 2-way sequence-classification head.
model = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
# Move the weights to the GPU; Module.cuda() returns the module itself.
model = model.cuda()

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

Downloading (…)lve/main/config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForSequenceClassification: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly i

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

# Tokenize and segment data

In [None]:
# Per-example encodings; each entry is the tensor returned for one (parent, comment) pair.
input_ids = []
attention_masks = []

# Walk the two text columns directly instead of df.iterrows(): iterrows() builds a
# pandas Series for every row, which is needlessly slow on a large frame, while
# zipping the columns yields exactly the same strings.
for parent_text, comment_text in zip(df['parent_comment'], df['comment']):
    encoded_dict = tokenizer.encode_plus(
                        parent_text,                    # first segment: the parent comment
                        comment_text,                   # second segment: the reply being classified
                        add_special_tokens = True,
                        max_length = 320,
                        padding='max_length',           # pad every pair to max_length
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',          # Return pytorch tensors.
                        truncation='longest_first'      # trim the longer segment first when over max_length
                   )
    # NOTE(review): only input ids and attention masks are kept; the segment ids
    # produced for the pair are dropped here and training passes token_type_ids=None.
    input_ids.append(encoded_dict['input_ids'])
    attention_masks.append(encoded_dict['attention_mask'])

Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pairs with the 'longest_first' truncation strategy. So the returned list will always be empty even if some tokens have been removed.
Be aware, overflowing tokens are not returned for the setting you have chosen, i.e. sequence pai

In [None]:
# Stack the per-example encodings (one row each) into single 2-D tensors.
input_ids = torch.cat(input_ids, dim=0)
attention_masks = torch.cat(attention_masks, dim=0)
# df['label'] was cast to int32 when the data was loaded, but the classification
# (cross-entropy) loss expects int64 class indices, so build the labels as long.
labels = torch.tensor(df['label'].values, dtype=torch.long)

In [None]:
#print(input_ids[:3])
#print(attention_masks[:3])
#print(labels[:3])

In [None]:
# Hold out 10% of the examples for validation; the fixed seed makes the split repeatable.
train_x, val_x, train_y, val_y = train_test_split(
    input_ids,
    labels,
    test_size=0.1,
    random_state=2018,
)

In [None]:
# Split the attention masks with the same seed and test size used for the input ids,
# so mask rows stay aligned with their inputs. Only the first two outputs (the mask
# splits) are kept; the label splits are discarded.
train_masks, val_masks = train_test_split(
    attention_masks,
    labels,
    test_size=0.1,
    random_state=2018,
)[:2]

# Generate batches

In [None]:
# Number of examples per batch, shared by the training and validation DataLoaders.
batch_size = 32

In [None]:
# Training loader: (input ids, attention mask, label) triples drawn in random order.
train_data = TensorDataset(train_x, train_masks, train_y)
train_sampler = RandomSampler(train_data)
train_dataloader = DataLoader(
    train_data,
    batch_size=batch_size,
    sampler=train_sampler,
)

# Validation loader: same triple layout, read sequentially.
validation_data = TensorDataset(val_x, val_masks, val_y)
validation_sampler = SequentialSampler(validation_data)
validation_dataloader = DataLoader(
    validation_data,
    batch_size=batch_size,
    sampler=validation_sampler,
)

# Train Config

In [None]:
# Use PyTorch's AdamW: the AdamW class shipped with transformers is deprecated in
# favour of torch.optim.AdamW. eps and weight_decay are spelled out to match the
# defaults of the transformers implementation, so the optimisation setup is unchanged.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)

In [None]:
# Number of full passes over the training set.
epochs = 5

# The scheduler is stepped once per batch, so its horizon is batches-per-epoch * epochs.
total_steps = epochs * len(train_dataloader)

# Linear decay of the learning rate over the whole run, with no warmup phase
# (0 warmup steps is the default value in run_glue.py).
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=total_steps,
)

In [None]:
def flat_accuracy(preds, labels):
    """Return the fraction of rows whose highest-scoring class equals the label.

    preds:  2-D array of per-class scores, one row per example.
    labels: array of true class indices; it is flattened before comparison.
    """
    predicted_classes = np.argmax(preds, axis=1).reshape(-1)
    true_classes = labels.reshape(-1)
    hits = predicted_classes == true_classes
    return np.sum(hits) / len(true_classes)

In [None]:
def format_time(elapsed):
    '''
    Render a duration given in seconds as a clock-style string.

    The value is rounded to whole seconds and formatted by datetime.timedelta,
    e.g. 65.4 -> '0:01:05'; durations of a day or more gain a day prefix.
    '''
    whole_seconds = int(round(elapsed))
    duration = datetime.timedelta(seconds=whole_seconds)
    return str(duration)

In [None]:
# Seed every random number generator in play (Python, NumPy, PyTorch CPU and all
# CUDA devices) with the same value so runs are repeatable.
seed_val = 42

for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
    seed_fn(seed_val)

In [None]:
# Intended to hold one record per epoch with quantities such as training and
# validation loss, validation accuracy, and timings. Starts out empty.
training_stats = []

In [None]:
# Fine-tune the model for `epochs` passes over the training DataLoader.
# Measure the total training time for the whole run.
total_t0 = time.time()

# For each epoch...
for epoch_i in range(epochs):

    print('\n======== Epoch {:} / {:} ========'.format(epoch_i + 1, epochs))

    # Measure how long the training epoch takes.
    t0 = time.time()

    # Running sum of the per-batch losses; averaged at the end of the epoch.
    total_train_loss = 0

    # Put the model into training mode. Don't be misled--the call to
    # `train` just changes the *mode*, it doesn't *perform* the training.
    model.train()

    # For each batch of training data...
    for step, batch in enumerate(train_dataloader):

        # Progress update every 40 batches.
        if step % 40 == 0 and not step == 0:
            # Elapsed time since the start of this epoch, formatted by format_time.
            elapsed = format_time(time.time() - t0)

            # Report progress on a single, continuously rewritten line.
            print('\r  Batch {:>5,}  of  {:>5,}.    Elapsed: {:}.'.format(step, len(train_dataloader), elapsed), end='')

        # Batch layout follows the TensorDataset: (input ids, attention mask, labels).
        b_input_ids = batch[0].to('cuda')
        b_input_mask = batch[1].to('cuda')
        b_labels = batch[2].to('cuda')

        # Clear gradients left over from the previous step.
        model.zero_grad()

        # The model returns an output object rather than a plain tuple. Tuple-unpacking
        # it (`loss, logits = model(...)`) iterates over its keys and yields strings,
        # so the loss is read by attribute instead.
        outputs = model(b_input_ids,
                        token_type_ids=None,
                        attention_mask=b_input_mask,
                        labels=b_labels)
        loss = outputs.loss

        # NOTE: the per-batch `wandb.log` call was dropped; `wandb` is never imported
        # or initialised in this notebook, so the call raised NameError.
        total_train_loss += loss.item()

        loss.backward()

        # Clip the norm of the gradients to 1.0.
        # This is to help prevent the "exploding gradients" problem.
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # Update the weights, then advance the learning-rate schedule (one step per batch).
        optimizer.step()
        lr_scheduler.step()

    # Epoch summary: the accumulated loss was previously computed but never reported.
    avg_train_loss = total_train_loss / max(len(train_dataloader), 1)
    training_time = format_time(time.time() - t0)

    print('\n  Average training loss: {:.4f}'.format(avg_train_loss))
    print('  Training epoch took: {:}'.format(training_time))

    training_stats.append({
        'epoch': epoch_i + 1,
        'Training Loss': avg_train_loss,
        'Training Time': training_time,
    })

print('\nTraining complete. Total time: {:}'.format(format_time(time.time() - total_t0)))

In [None]:
# Prepare the test comment and parent_comment (same preprocessed token style as the dataset).
test_comment = ["man COMMA rush limbaugh going field day one EXCLAMATIONMARK"]
test_parent_comment = ["man beats military woman front child shouting racial slurs PERIOD vote story check races QUESTION_MARK"]

# Joined text, kept for reference only: the model input is the (parent, comment)
# pair encoded below, not this concatenation.
test_text = test_parent_comment[0] + ' ' + test_comment[0]

# Encode the pair the same way as the training data. Pass the *strings*, not the
# one-element lists: encode_plus treats a list of str as an already-tokenized
# sequence, so each whole sentence would be looked up as a single vocabulary token
# (i.e. an unknown token) and the prediction would ignore the actual text.
encoded_dict = tokenizer.encode_plus(
                        test_parent_comment[0],
                        test_comment[0],
                        add_special_tokens = True,
                        max_length = 320,
                        padding='max_length',
                        return_attention_mask = True,   # Construct attn. masks.
                        return_tensors = 'pt',     # Return pytorch tensors.
                        truncation='longest_first')

# Get the input IDs and attention mask from the encoded result.
# NOTE: this rebinds `input_ids`, which earlier held the full training tensor.
input_ids = encoded_dict['input_ids'].to('cuda')
attention_mask = encoded_dict['attention_mask'].to('cuda')


In [None]:
# Switch the model to evaluation mode before predicting.
model.eval()

# Prediction needs no gradients, so run the forward pass with autograd disabled.
with torch.no_grad():
    outputs = model(
        input_ids,
        attention_mask=attention_mask,
        token_type_ids=None,
    )

# No labels were passed, so the first element of the output holds the logits.
logits = outputs[0]

In [None]:
# Bring the logits back to host memory as a NumPy array.
logits = logits.detach().cpu().numpy()

# The predicted class is the column with the largest logit in each row.
predictions = logits.argmax(axis=1)

print('Predicted label:', predictions[0])