<a href="https://colab.research.google.com/github/agnxsh/task-specific-hf/blob/main/task_specific_hf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Adding a task specific head on top of the BERT model, for dealing with custom NLP tasks

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


###Apart from the last layer, which is the Model head, a layer which is task specific, apart from the other shared layers that are task agnostic, these layers mainly consists of transformer layers and token embeddings.

#### The base model in HuggingFace is usually represented as a python class depicting the BERT class or the GPT-2 class, and what it returns is usually the last hidden layer of the transformer, this is represented in the following way:

output[0] => this is the last hidden state

####In the custom layer for HF, we pass this last hidden state as the input to our new task-specific layer

##Practically speaking the base model has to support the task that we want to add, for example we can't just use DistilBERT for machine translation, because it does not support the same, however DistilBERT can be used for Question-Answering tasks.

In [None]:
# Here we're using a Kaggle Dataset which consists of News Headlines for Sarcasm Detection

In [2]:
!pip install datasets
!pip install transformers
# !pip install transformers.modeling_ouputs
!pip install torch

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting datasets
  Downloading datasets-2.8.0-py3-none-any.whl (452 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m452.9/452.9 KB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting responses<0.19
  Downloading responses-0.18.0-py3-none-any.whl (38 kB)
Collecting huggingface-hub<1.0.0,>=0.2.0
  Downloading huggingface_hub-0.11.1-py3-none-any.whl (182 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m182.4/182.4 KB[0m [31m21.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash
  Downloading xxhash-3.2.0-cp38-cp38-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (213 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m213.0/213.0 KB[0m [31m27.3 MB/s[0m eta [36m0:00:00[0m
Collecting multiprocess
  Downloading multiprocess-0.70.14-py38-none-any.whl (132 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m132.0/132.0 

In [3]:
import numpy as np
import pandas as pd

from datasets import load_dataset,Dataset,DatasetDict
from transformers import DataCollatorWithPadding, AutoModelForSequenceClassification,Trainer,TrainingArguments,AutoTokenizer, AutoModel,AutoConfig
from transformers.modeling_outputs import TokenClassifierOutput
import torch
import torch.nn as nn
import os
import numpy as np
import pandas as pd
for dirname, _ ,filenames in os.walk("/content/drive/MyDrive/input/"):
  for filename in filenames:
    print(os.path.join(dirname,filename))


/content/drive/MyDrive/input/Sarcasm_Headlines_Dataset.json
/content/drive/MyDrive/input/Sarcasm_Headlines_Dataset_v2.json


In [17]:
dataset_v2_path = "/content/drive/MyDrive/input/Sarcasm_Headlines_Dataset_v2.json"

In [18]:
df = pd.read_json(dataset_v2_path, lines=True)
df.head()

Unnamed: 0,is_sarcastic,headline,article_link
0,1,thirtysomething scientists unveil doomsday clo...,https://www.theonion.com/thirtysomething-scien...
1,0,dem rep. totally nails why congress is falling...,https://www.huffingtonpost.com/entry/donna-edw...
2,0,eat your veggies: 9 deliciously different recipes,https://www.huffingtonpost.com/entry/eat-your-...
3,1,inclement weather prevents liar from getting t...,https://local.theonion.com/inclement-weather-p...
4,1,mother comes pretty close to using word 'strea...,https://www.theonion.com/mother-comes-pretty-c...


#Load Dataset with HF's load_dataset

In [19]:
dataset_hf = load_dataset("json",data_files = dataset_v2_path)



  0%|          | 0/1 [00:00<?, ?it/s]

In [20]:
dataset_hf = dataset_hf.remove_columns(["article_link"])
dataset_hf.set_format('pandas')
dataset_hf=dataset_hf['train'][:]

##Reevaluation of the parameters and labels

In [16]:
dataset_hf.drop_duplicates(subset=['headline'],inplace=True)

dataset_hf=dataset_hf.reset_index()[['headline','label']]

dataset_hf=Dataset.from_pandas(dataset_hf)


# Train Test Valid Split
train_testvalid = dataset_hf.train_test_split(test_size=0.2,seed=15)


test_valid = train_testvalid['test'].train_test_split(test_size=0.5,seed=15)

dataset_hf = DatasetDict({
    'train': train_testvalid['train'],
    'test': test_valid['test'],
    'valid': test_valid['train']})

dataset_hf

KeyError: ignored

In [10]:
#check --> "cardiffnlp/twitter-roberta-base-emotion"
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
tokenizer.model_max_len=512

Downloading:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/483 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

#Vector Size : "distilbert-base-uncase"
In the model distilbert-base-uncased, each token is embedded into a vector of size 768. The shape of the output from the base model is
##(batch_size,max_sequence_length,embedding_vector_size=768)

In [12]:
def tokenize(batch):
  return tokenizer(batch["headline"], truncation=True, max_length=512)

tokenized_dataset = dataset_hf.map(tokenize,batched=True)
#divide the dataset into batches
tokenized_dataset

  0%|          | 0/23 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

  0%|          | 0/3 [00:00<?, ?ba/s]

DatasetDict({
    train: Dataset({
        features: ['headline', 'input_ids', 'attention_mask'],
        num_rows: 22802
    })
    test: Dataset({
        features: ['headline', 'input_ids', 'attention_mask'],
        num_rows: 2851
    })
    valid: Dataset({
        features: ['headline', 'input_ids', 'attention_mask'],
        num_rows: 2850
    })
})

###Making the datasets compatible with PyTorch

In [None]:
tokenized_dataset.set_format("torch", columns=["input_ids","attention_mask","label"])