<a href="https://colab.research.google.com/github/agnxsh/task-specific-hf/blob/main/task_specific_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [81]:
from google.colab import drive

# Attach Google Drive to the Colab runtime at the path the later cells read from.
DRIVE_MOUNT_POINT = "/content/drive"
drive.mount(DRIVE_MOUNT_POINT)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [82]:
# Install the third-party packages this notebook imports.
# `%pip` (unlike `!pip`) always installs into the environment of the running kernel.
%pip install datasets transformers torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [83]:
import numpy as np
import pandas as pd

from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification,Trainer,TrainingArguments,AutoTokenizer, AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import os

# Print the full path of every file found under the Drive input folder.
for folder, _, names in os.walk("/content/drive/MyDrive/input"):
    for name in names:
        print(os.path.join(folder, name))


/content/drive/MyDrive/input/Sarcasm_Headlines_Dataset.json
/content/drive/MyDrive/input/Sarcasm_Headlines_Dataset_v2.json


In [84]:
# Location of the v2 sarcasm-headlines file on the mounted Drive.
dataset_v2_path = "/content/drive/MyDrive/input/Sarcasm_Headlines_Dataset_v2.json"

In [85]:
# lines=True: parse the file as one JSON record per line.
df = pd.read_json(path_or_buf=dataset_v2_path, lines=True)
df.head(5)

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


In [86]:
# Load the same file through the Hugging Face `datasets` "json" loader.
dataset_hf = load_dataset(path="json", data_files=dataset_v2_path)



  0%|          | 0/1 [00:00<?, ?it/s]

In [87]:
import pandas as pd

# The article URL is not used by the rest of the notebook, so drop it.
dataset_hf = dataset_hf.remove_columns(["article_link"])

# Switch the output format to pandas, then slice the whole 'train' split out.
dataset_hf.set_format("pandas")
dataset_hf = dataset_hf["train"][:]


In [89]:
# Keep one row per distinct headline and retain only the text and label columns.
dataset_hf = (
    dataset_hf
    .drop_duplicates(subset=["headline"])
    .reset_index()[["headline", "is_sarcastic"]]
)

# Back to a Hugging Face Dataset so the split helpers are available.
dataset_hf = Dataset.from_pandas(dataset_hf)

# Train / test / valid split: hold out 20% first, then cut the hold-out in half.
train_testvalid = dataset_hf.train_test_split(seed=15, test_size=0.2)
test_valid = train_testvalid["test"].train_test_split(seed=15, test_size=0.5)

dataset_hf = DatasetDict(
    {
        "train": train_testvalid["train"],
        "test": test_valid["test"],
        "valid": test_valid["train"],
    }
)

dataset_hf

DatasetDict({
    train: Dataset({
        features: ['headline', 'is_sarcastic'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'is_sarcastic'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'is_sarcastic'],
        num_rows: 2850
    })
})

In [90]:
# Alternative checkpoint to try: "cardiffnlp/twitter-roberta-base-emotion"
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
# The tokenizer attribute is `model_max_length`; assigning to `model_max_len`
# only creates an unused attribute and leaves the real limit untouched.
tokenizer.model_max_length = 512


Vector Size : "distilbert-base-uncased"
In the model distilbert-base-uncased, each token is embedded into a vector of size 768. The shape of the output from the base model is

(batch_size,max_sequence_length,embedding_vector_size=768)

In [91]:
def tokenize(batch):
    """Tokenize the ``headline`` entries of ``batch``, truncating at 512 tokens.

    Returns whatever the module-level ``tokenizer`` produces for those texts.
    """
    headlines = batch["headline"]
    return tokenizer(headlines, max_length=512, truncation=True)

# batched=True passes `tokenize` a batch of rows per call instead of a single row.
tokenized_dataset = dataset_hf.map(function=tokenize, batched=True)
tokenized_dataset

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'is_sarcastic', 'input_ids', 'attention_mask'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'is_sarcastic', 'input_ids', 'attention_mask'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'is_sarcastic', 'input_ids', 'attention_mask'],
        num_rows: 2850
    })
})

Making the datasets compatible with PyTorch

In [93]:
# Return only the model inputs and the label column, as torch tensors.
tokenized_dataset.set_format(
    type="torch",
    columns=["input_ids", "attention_mask", "is_sarcastic"],
)

# Collator that pads the examples of a batch using the tokenizer.
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

Data collators are objects that will form a batch by using a list of dataset elements as input. These elements are of the same type as the elements of train_dataset or eval_dataset.

To be able to build batches, data collators may apply some processing (like padding). Some of them (like DataCollatorForLanguageModeling) also apply some random data augmentation (like random masking) on the formed batch.

data_collator automatically pads the model inputs in a batch to the length of the longest example. This bypasses the need to set a global maximum sequence length, and in practice leads to faster training since we perform fewer redundant computations on the padded tokens and attention masks.



We construct a `CustomModel` class that inherits from `nn.Module`.

In [102]:
class CustomModel(nn.Module):
    """Pretrained transformer body followed by a dropout + linear classification head.

    Args:
        checkpoint: Name or path handed to ``AutoConfig.from_pretrained`` and
            ``AutoModel.from_pretrained``.
        num_labels: Number of output classes produced by the linear head.
    """

    def __init__(self, checkpoint, num_labels):
        super().__init__()
        self.num_labels = num_labels

        # The config flags are spelled `output_attentions` / `output_hidden_states`
        # (plural); the singular spellings are accepted silently but have no effect.
        config = AutoConfig.from_pretrained(
            checkpoint,
            output_attentions=True,
            output_hidden_states=True,
        )
        self.model = AutoModel.from_pretrained(checkpoint, config=config)

        # New layers: size the head from the loaded model's config rather than
        # hard-coding 768, so other checkpoints work too.
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(self.model.config.hidden_size, num_labels)

    def forward(self, input_ids=None, attention_mask=None, is_sarcastic=None):
        """Run the body and the head; compute a loss when labels are supplied.

        Args:
            input_ids: Token ids forwarded to the underlying model.
            attention_mask: Attention mask forwarded to the underlying model.
            is_sarcastic: Optional class labels. When given, a cross-entropy
                loss against the logits is included in the result.

        Returns:
            A ``TokenClassifierOutput`` carrying ``loss`` (``None`` without
            labels), ``logits`` of shape ``(batch, num_labels)``, and the
            ``hidden_states`` / ``attentions`` of the underlying model.
        """
        outputs = self.model(input_ids=input_ids, attention_mask=attention_mask)

        sequence_outputs = self.dropout(outputs.last_hidden_state)

        # Classify from the representation at the first token position.
        logits = self.classifier(sequence_outputs[:, 0, :])

        loss = None
        if is_sarcastic is not None:
            loss_func = nn.CrossEntropyLoss()
            loss = loss_func(logits.view(-1, self.num_labels), is_sarcastic.view(-1))

        # Always return an output object so label-free inference works as well.
        return TokenClassifierOutput(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
            attentions=outputs.attentions,
        )
        

attention_mask
From Doc - This argument indicates to the model which tokens should be attended to, and which should not.

If a token's attention_mask value is 0, that token is ignored. For instance, when a sequence is padded to reach a common length, the padding tokens should be ignored, so their attention_mask values are 0.

torch.nn.Linear(in_features, out_features, bias=True)
Parameters: in_features – size of each input sample; out_features – size of each output sample.

Making sense of nn.Linear
As a generic example, self.hidden = nn.Linear(784, 256) defines a hidden (i.e. sitting between the input and output layers), fully connected linear layer. It takes an input x of shape (batch_size, 784), where batch_size is the number of inputs (each of size 784) passed to the network at once as a single tensor, and transforms it by the linear equation y = x*W^T + b into a tensor y of shape (batch_size, 256). In this notebook the classifier layer plays the same role, mapping each 768-dimensional embedding to num_labels scores.

## Create PyTorch DataLoader

In [103]:
from torch.utils.data import DataLoader

# Training batches are reshuffled every epoch.
train_dataloader = DataLoader(
    tokenized_dataset['train'], shuffle=True, batch_size=32, collate_fn=data_collator
)

# Validation needs no shuffling, and without an explicit batch_size the
# DataLoader default of 1 would evaluate one example per forward pass.
eval_dataloader = DataLoader(
    tokenized_dataset['valid'], batch_size=32, collate_fn=data_collator
)

In [104]:
# Use the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Two output classes.
model_task_specific = CustomModel(checkpoint=checkpoint, num_labels=2)
model_task_specific = model_task_specific.to(device)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_transform.weight', 'vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_projector.bias', 'vocab_layer_norm.bias', 'vocab_transform.bias']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [105]:
from transformers import get_scheduler

# `transformers.AdamW` is deprecated (and absent from recent releases); use the
# PyTorch implementation. eps and weight_decay are set explicitly to the values
# the transformers optimizer used by default, so the update rule is unchanged.
optimizer = torch.optim.AdamW(
    model_task_specific.parameters(), lr=5e-5, eps=1e-6, weight_decay=0.0
)

num_epoch = 3

# One scheduler step is taken per optimizer step, i.e. per training batch.
num_training_steps = num_epoch * len(train_dataloader)

# Linear decay of the learning rate over all training steps, with no warm-up.
lr_scheduler = get_scheduler(
    'linear',
    optimizer=optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps,
)

In [106]:
from datasets import load_metric

# F1 metric object: predictions are accumulated with add_batch() and scored with compute().
metric = load_metric("f1")

## Training

In [None]:
from tqdm.auto import tqdm

# One progress bar per phase, each sized for all epochs.
progress_bar_train = tqdm(range(num_training_steps))
progress_bar_eval = tqdm(range(num_epoch * len(eval_dataloader)))


for epoch in range(num_epoch):
    # ---- training pass ----
    model_task_specific.train()
    for batch in train_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        outputs = model_task_specific(**batch)
        loss = outputs.loss
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()
        progress_bar_train.update(1)

    # ---- validation pass ----
    model_task_specific.eval()
    for batch in eval_dataloader:
        batch = {k: v.to(device) for k, v in batch.items()}
        with torch.no_grad():
            outputs = model_task_specific(**batch)

        logits = outputs.logits
        predictions = torch.argmax(logits, dim=-1)
        # The label column in each batch is 'is_sarcastic' (the same key the
        # model's forward() takes); there is no 'labels' key to look up.
        metric.add_batch(predictions=predictions, references=batch['is_sarcastic'])
        progress_bar_eval.update(1)

    # Validation F1 for this epoch.
    print(metric.compute())
       

## Post-Training Evaluation

In [None]:
# Batch the held-out test split in its stored order (no shuffling).
test_dataloader = DataLoader(
    tokenized_dataset["test"],
    collate_fn=data_collator,
    batch_size=32,
)

model_task_specific.eval()

for batch in test_dataloader:
  batch = {key: tensor.to(device) for key, tensor in batch.items()}
  with torch.no_grad():
    outputs = model_task_specific(**batch)

  # Predicted class = index of the largest logit.
  logits = outputs.logits
  predictions = logits.argmax(dim=-1)
  metric.add_batch(references=batch['is_sarcastic'], predictions=predictions)