In [1]:
import os
import numpy as np
import pandas as pd
from tqdm.auto import tqdm

from datasets import load_dataset, Dataset, DatasetDict, load_metric
from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, AutoModel, AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
from transformers import AdamW, get_scheduler

import torch
import torch.nn as nn
from torch.utils.data import DataLoader

In [2]:
for dirname, _, filenames in os.walk('../input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json
../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset.json


In [3]:
dataset_path = "../input/news-headlines-dataset-for-sarcasm-detection/Sarcasm_Headlines_Dataset_v2.json"

df = pd.read_json(dataset_path, lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


# Load dataset with HF load_dataset

In [4]:
dataset_hf = load_dataset("json", data_files=dataset_path)
dataset_hf

Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-3ac6d2c7c86069e4/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-3ac6d2c7c86069e4/0.0.0/ac0ca5f5289a6cf108e706efcf040422dbbfa8e658dee6a819f20d76bb84d26b. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['is_sarcastic', 'headline', 'article_link'],
        num_rows: 28619
    })
})

In [5]:
dataset_hf = dataset_hf.remove_columns(['article_link'])

dataset_hf.set_format('pandas')
dataset_hf = dataset_hf.rename_columns({'is_sarcastic': 'label'})
dataset_hf = dataset_hf['train'][:]
dataset_hf

Unnamed: 0,label,headline
0,1,thirtysomething scientists unveil doomsday clo...
1,0,dem rep. totally nails why congress is falling...
2,0,eat your veggies: 9 deliciously different recipes
3,1,inclement weather prevents liar from getting t...
4,1,mother comes pretty close to using word 'strea...
...,...,...
28614,1,jews to celebrate rosh hashasha or something
28615,1,internal affairs investigator disappointed con...
28616,0,the most beautiful acceptance speech this week...
28617,1,mars probe destroyed by orbiting spielberg-gat...


In [6]:
dataset_hf.drop_duplicates(subset=['headline'], inplace=True)
dataset_hf = dataset_hf.reset_index()[['headline', 'label']]
dataset_hf = Dataset.from_pandas(dataset_hf)
dataset_hf

Dataset({
    features: ['headline', 'label'],
    num_rows: 28503
})

# Train Valid Test split 

In [7]:
train_test_valid_split = dataset_hf.train_test_split(test_size=0.2, seed=15)
test_valid_split = train_test_valid_split['test'].train_test_split(test_size=0.5, seed=15)

dataset_hf = DatasetDict({
    'train': train_test_valid_split['train'],
    'test': test_valid_split['test'],
    'valid': test_valid_split['train']
})
dataset_hf

DatasetDict({
    train: Dataset({
        features: ['headline', 'label'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label'],
        num_rows: 2850
    })
})

In [8]:
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)lve/main/config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

# Vector size "distilbert-base_uncased"

The shape of the output from the base model is

**(batch_size, max_sequence_length, embedding_vector_size=768)**

In [9]:
def tokenize(batch):
    return tokenizer(batch['headline'], truncation=True, max_length=512)

tokenized_dataset = dataset_hf.map(tokenize, batched=True)
tokenized_dataset

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'label', 'input_ids', 'attention_mask'],
        num_rows: 2850
    })
})

In [10]:
# set_format method from HF to set the dataset format compatible with pytorch
tokenized_dataset.set_format('torch', columns=["input_ids", "attention_mask", "label"])

data_collator = DataCollatorWithPadding(tokenizer=tokenizer)


In [11]:
class MyTaskSpecificCustomModel(nn.Module):
    def __init__(self, checkpoint, num_labels):
        super(MyTaskSpecificCustomModel, self).__init__()
        self.num_labels = num_labels
        self.model = AutoModel.from_pretrained(checkpoint, config=AutoConfig.from_pretrained(
            checkpoint,
            output_attention=True,
            output_hidden_state=True
        ))
        
        # New Layer
        self.dropout = nn.Dropout(0.1)
        self.classifier= nn.Linear(768, num_labels)
    
    def forward(self, input_ids=None, attention_mask=None, labels=None):
        outputs = self.model(
            input_ids=input_ids,
            attention_mask=attention_mask
        )
        last_hidden_state = outputs[0]
        sequence_outputs = self.dropout(last_hidden_state)
        logits = self.classifier(sequence_outputs[:, 0, :].view(-1, 768))
        
        loss = None
        if labels is not None:
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(logits.view(-1, self.num_labels), labels.view(-1))
            
        return TokenClassifierOutput(loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions)

In [12]:
# PyTorch DataLoader
train_dataloader = DataLoader(
    tokenized_dataset['train'],
    shuffle=True,
    batch_size=32,
    collate_fn=data_collator
)

eval_dataloader = DataLoader(
    tokenized_dataset['valid'],
    shuffle=True,
    collate_fn=data_collator
)

In [13]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model_task_specific = MyTaskSpecificCustomModel(checkpoint=checkpoint, num_labels=2).to(device)

Downloading pytorch_model.bin:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [14]:
optimizer = AdamW(model_task_specific.parameters(), lr=5e-5)

num_epoch = 3
num_training_steps = num_epoch * len(train_dataloader)

lr_scheduler = get_scheduler(
    'linear',
    optimizer = optimizer,
    num_warmup_steps = 0,
    num_training_steps = num_training_steps,
)



In [15]:
metric = load_metric("f1")

Downloading builder script:   0%|          | 0.00/2.06k [00:00<?, ?B/s]

# Training

In [16]:
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader)))

for epoch in range(num_epoch):
    model_task_specific.train()
    
    for batch in train_dataloader:
        batch = {k: v.to(device) for k,v in batch.items()}
        outputs = model_task_specific(**batch)
        loss = outputs.loss
        loss.backward()
        
        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)
        
    model_task_specific.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k,v in batch.items()}
        with torch.no_grad():
            outputs = model_task_specific(**batch)
            
        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        metric.add_batch(predictions=predictions, references=batch['labels'])
        progress_bar_eval.update(1)
        
    print(metric.compute())

  0%|          | 0/2139 [00:00<?, ?it/s]

  0%|          | 0/8550 [00:00<?, ?it/s]

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


{'f1': 0.9167309175019276}
{'f1': 0.9319526627218935}
{'f1': 0.9270353302611367}


# Post Training Evaluation

In [17]:
model_task_specific.eval()

test_dataloader = DataLoader(
    tokenized_dataset['test'],
    batch_size=32,
    collate_fn=data_collator
)

for batch in test_dataloader:
    batch = {k: v.to(device) for k,v in batch.items()}
    with torch.no_grad():
        outputs = model_task_specific(**batch)
    
    logits = outputs.logits
    predictions = torch.argmax(logits, dim=-1)
    metric.add_batch(predictions=predictions, references=batch['labels'])
    
metric.compute()


{'f1': 0.9252615844544095}