https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/token_classification.ipynb#scrollTo=GWiVUF0jIrIv

https://huggingface.co/transformers/custom_datasets.html#token-classification-with-w-nut-emerging-entities

https://huggingface.co/blog/ray-tune


# Installing and importing libraries

In [6]:
# !sudo apt install git-lfs
!git config --global user.email aditeya.baral@gmail.com
!git config --global user.name Aditeya
HF_HUB_API_TOKEN = "api_vOKpxbhcEnOqvVNYfPxOSzFoMsFNxOqttx"

In [7]:
!pip install transformers datasets seqeval ray



In [8]:
import nltk
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\adite\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [9]:
import json
import math
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from nltk.tokenize import word_tokenize
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset, load_metric, DatasetDict
from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
from transformers import DataCollatorForTokenClassification
from transformers import GPT2LMHeadModel, GPT2TokenizerFast, RobertaTokenizerFast

# Loading Dataset

In [10]:
df = pd.read_json('../ppl-grammar-dataset.json')
print(df.shape)
df.head()

(54629, 5)


Unnamed: 0,poem,haiku,indices,ppl-gpt2,grammar-check
0,Did the CIA tell the FBI that it knows the wor...,cia fbi the biggest weapon,"[2, 5, 9, 24, 25]",2447.34648,False
1,Did the CIA tell the FBI that it knows the wor...,cia fbi the biggest weapon,"[2, 5, 9, 24, 25]",2447.34648,False
2,"Dark clouds gathered overhead,\nExpelling bull...",clouds overhead bullets of the valley,"[1, 3, 5, 6, 10, 11]",3639.095887,False
3,A vigilante lacking of heroic qualities that\n...,lacking qualities that damn criminals,"[2, 5, 6, 11, 12]",8305.684147,False
4,"(A Diamante Poem)\nBrain\nHeavenly, hellish\nF...",diamante poem the sybaritic pathetic,"[1, 2, 10, 18, 19]",1982.106818,False


In [11]:
df["ppl-gpt2"].describe()

count     54629.000000
mean       6511.533484
std       13967.007052
min           4.944884
25%        1245.401788
50%        2755.530497
75%        6398.639543
max      560770.619276
Name: ppl-gpt2, dtype: float64

In [12]:
df["grammar-check"].describe()

count     54629
unique        2
top       False
freq      54557
Name: grammar-check, dtype: object

There are some hindi sentences here - clean them in the preprocessing notebook

In [13]:
filtered_df = df[df["ppl-gpt2"] <= 1500]
filtered_df.reset_index(drop=True, inplace=True)
print(filtered_df.shape)
filtered_df.head()

(16576, 5)


Unnamed: 0,poem,haiku,indices,ppl-gpt2,grammar-check
0,"You sit left to me,\nYou don't wanna see,\nWha...",name extentions problem was about a tightrope,"[38, 53, 56, 60, 75, 89, 90]",850.798879,False
1,"Life,\nFrom ABC to XYZ;\nLearning all the time...",abc xyz all perfect laws of all rules,"[2, 4, 6, 11, 14, 15, 18, 20]",1279.585696,False
2,"Life,\nFrom ABC to XYZ;\nLearning all the time...",perfect laws all rules like the beginning,"[11, 14, 18, 20, 24, 25, 26]",1097.296416,False
3,"Alfons Schuhbeck top chef.\nBavaria, Germany i...",chef chefs a wonderful cookbook,"[3, 11, 14, 15, 16]",974.032754,False
4,"Actress Angela Oberer passing a moment, Angela...",moment eyes a long time,"[5, 27, 34, 35, 36]",445.035082,False


# Preprocessing Dataset

In [14]:
filtered_df["tokens"] = filtered_df["poem"].apply(lambda x: word_tokenize(x))
ner_tags = list()
for i in range(filtered_df.shape[0]):
  indices = filtered_df["indices"][i]
  length = len(filtered_df["tokens"][i])
  ner_tag = ['O' for _ in range(length)]
  for idx in indices:
    ner_tag[idx] = 'W'
  ner_tags.append(ner_tag)
filtered_df["ner_tags"] = ner_tags

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["tokens"] = filtered_df["poem"].apply(lambda x: word_tokenize(x))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  filtered_df["ner_tags"] = ner_tags


In [15]:
filtered_df.head()

Unnamed: 0,poem,haiku,indices,ppl-gpt2,grammar-check,tokens,ner_tags
0,"You sit left to me,\nYou don't wanna see,\nWha...",name extentions problem was about a tightrope,"[38, 53, 56, 60, 75, 89, 90]",850.798879,False,"[You, sit, left, to, me, ,, You, do, n't, wan,...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"Life,\nFrom ABC to XYZ;\nLearning all the time...",abc xyz all perfect laws of all rules,"[2, 4, 6, 11, 14, 15, 18, 20]",1279.585696,False,"[Life, ,, From, ABC, to, XYZ, ;, Learning, all...","[O, O, W, O, W, O, W, O, O, O, O, W, O, O, W, ..."
2,"Life,\nFrom ABC to XYZ;\nLearning all the time...",perfect laws all rules like the beginning,"[11, 14, 18, 20, 24, 25, 26]",1097.296416,False,"[Life, ,, From, ABC, to, XYZ, ;, Learning, all...","[O, O, O, O, O, O, O, O, O, O, O, W, O, O, W, ..."
3,"Alfons Schuhbeck top chef.\nBavaria, Germany i...",chef chefs a wonderful cookbook,"[3, 11, 14, 15, 16]",974.032754,False,"[Alfons, Schuhbeck, top, chef, ., Bavaria, ,, ...","[O, O, O, W, O, O, O, O, O, O, O, W, O, O, W, ..."
4,"Actress Angela Oberer passing a moment, Angela...",moment eyes a long time,"[5, 27, 34, 35, 36]",445.035082,False,"[Actress, Angela, Oberer, passing, a, moment, ...","[O, O, O, O, O, W, O, O, O, O, O, O, O, O, O, ..."


In [16]:
train_df, test_df = train_test_split(filtered_df, test_size=0.05)
print(train_df.shape, test_df.shape)

(15747, 7) (829, 7)


In [17]:
tokens = filtered_df["tokens"]
tags = filtered_df["ner_tags"]

In [18]:
unique_tags = ["O", "W"]
tag2id = {tag: id for id, tag in enumerate(unique_tags)}
id2tag = {id: tag for tag, id in tag2id.items()}

In [19]:
train_dataset = Dataset.from_pandas(train_df)
test_dataset = Dataset.from_pandas(test_df)

# Initialising GPT-2 for Perplexity Loss

In [20]:
gpt2_model_name = "gpt2"
gpt2_model = GPT2LMHeadModel.from_pretrained(gpt2_model_name)
gpt2_tokenizer = GPT2TokenizerFast.from_pretrained(gpt2_model_name)

def perplexityGPT2(sentences):
  ppl = list()
  total_length = len(sentences)
  for index, sent in enumerate(sentences):
    tokenize_input = gpt2_tokenizer.encode(sent)
    tensor_input = torch.tensor([tokenize_input])
    loss = gpt2_model(tensor_input, labels=tensor_input)[0]
    ppl.append(math.exp(loss))
  return ppl

# Setting Model Parameters

In [24]:
model_name = "distilgpt2"

In [26]:
if "roberta" in model_name:
  tokenizer = RobertaTokenizerFast.from_pretrained(model_name, add_prefix_space=True)
elif "gpt2" in model_name:
  tokenizer = GPT2TokenizerFast.from_pretrained(model_name, add_prefix_space=True) 
  tokenizer.add_special_tokens({'pad_token': '[PAD]'}) 
else:
  tokenizer = AutoTokenizer.from_pretrained(model_name)

In [27]:
train_texts = list(train_df["tokens"].values)
val_texts = list(test_df["tokens"].values)

train_tags = list(train_df["ner_tags"].values)
val_tags = list(test_df["ner_tags"].values)

In [28]:
train_encodings = tokenizer(train_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)
val_encodings = tokenizer(val_texts, is_split_into_words=True, return_offsets_mapping=True, padding=True, truncation=True)

In [29]:
def encode_tags(tags, encodings):
    labels = [[tag2id[tag] for tag in doc] for doc in tags]
    encoded_labels = []
    for doc_labels, doc_offset in zip(labels, encodings.offset_mapping):
        doc_enc_labels = np.ones(len(doc_offset),dtype=int) * -100
        arr_offset = np.array(doc_offset)

        try:
          doc_enc_labels[(arr_offset[:,0] == 0) & (arr_offset[:,1] != 0)] = doc_labels
          encoded_labels.append(doc_enc_labels.tolist())
        except:
          pass

    return encoded_labels

train_labels = encode_tags(train_tags, train_encodings)
val_labels = encode_tags(val_tags, val_encodings)

In [30]:
class MapleDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

offset_mapping_train = train_encodings.pop("offset_mapping")
offset_mapping_val = val_encodings.pop("offset_mapping")
train_dataset = MapleDataset(train_encodings, train_labels)
val_dataset = MapleDataset(val_encodings, val_labels)

In [31]:
'''def model_init():
  return AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2, return_dict=True)'''

model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=2)

Some weights of the model checkpoint at distilgpt2 were not used when initializing GPT2ForTokenClassification: ['lm_head.weight']
- This IS expected if you are initializing GPT2ForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing GPT2ForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at distilgpt2 and are newly initialized: ['transformer.h.5.attn.masked_bias', 'classifier.bias', 'transformer.h.3.attn.masked_bias', 'transformer.h.2.attn.masked_bias', 'classifier.weight', 'transformer.h.0.attn.masked_bias', 'transformer.h.1.attn.masked_bias', 'transformer.h.4.attn.masked_bi

In [32]:
class PerplexityTrainer(Trainer):
  def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
  def compute_loss(self, model, inputs, return_outputs=False):
    batch_size = len(inputs["input_ids"])
    poems = list()
    for i in range(batch_size):
      text = tokenizer.decode(inputs["input_ids"][i])
      tokenized_text = tokenizer.tokenize(text)
      labels = inputs["labels"][i].tolist()
      words = list()
      for idx, value in enumerate(labels):
        if value == 1:
          words.append(tokenized_text[idx])
      poem = ' '.join(words)
      poems.append(poem)
    perplexity_scores = perplexityGPT2(poems)
    perplexity_loss = sum(perplexity_scores)
    loss = torch.tensor(perplexity_loss, requires_grad=True)
    return (loss, poems) if return_outputs else loss

In [33]:
args = TrainingArguments(
    f"modelfolder",
    save_strategy="epoch",
    evaluation_strategy="epoch",
    load_best_model_at_end=True,
    learning_rate=2e-8,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    weight_decay=0.01,
)

In [34]:
trainer = PerplexityTrainer(
    # model_init=model_init,                        
    model=model,
    args=args,                  
    train_dataset=train_dataset,         
    eval_dataset=val_dataset             
)

In [35]:
'''trainer.hyperparameter_search(
    direction="maximize", 
    backend="ray", 
    n_trials=10
)'''

'trainer.hyperparameter_search(\n    direction="maximize", \n    backend="ray", \n    n_trials=10\n)'

# Training and Evaluating Model

In [36]:
trainer.train()

***** Running training *****
  Num examples = 10730
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 6710


Epoch,Training Loss,Validation Loss


Token indices sequence length is longer than the specified maximum sequence length for this model (1026 > 1024). Running this sequence through the model will result in indexing errors


KeyboardInterrupt: 

In [None]:
trainer.evaluate()

# Saving Model

In [None]:
model.push_to_hub(f"maple-{model_name.lower()}", use_auth_token=HF_HUB_API_TOKEN)
# tokenizer.push_to_hub(f"maple-{model_name.lower()}", use_auth_token=HF_HUB_API_TOKEN)

In [None]:
model.save_pretrained(f"/content/drive/MyDrive/Maple/maple-{model_name.lower()}")
trainer.save_model(f"/content/drive/MyDrive/Maple/maple-{model_name.lower()}")