<a href="https://colab.research.google.com/github/ankit-kothari/Data-Science-Journey/blob/master/pos_tagging.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers 
!pip install dask
!pip install 'fsspec>=0.3.3'
!pip install datasets
!pip install torchinfo

In [2]:
gpu_info = !nvidia-smi
gpu_info = '\n'.join(gpu_info)
if gpu_info.find('failed') >= 0:
  print('Not connected to a GPU')
else:
  print(gpu_info)

Tue Jun 28 03:42:07 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   36C    P0    26W / 250W |      0MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [3]:
from transformers import pipeline
import torch
import spacy
import tqdm as notebook_tqdm
from torchinfo import summary
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score,roc_auc_score,accuracy_score
from sklearn.metrics import plot_confusion_matrix,classification_report
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
import  matplotlib.pyplot as plt
from datasets import load_dataset
from datasets import get_dataset_config_names
from spacy.lang.en import English
import warnings
from tqdm import tqdm
import re
import string
warnings.filterwarnings("ignore")
import nltk
from nltk.corpus import brown 

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)
pd.set_option('display.max_colwidth', None)

In [None]:
# Fetch the NLTK resources. Of these, the cells visible in this notebook only
# read the Brown corpus with tagset='universal'; 'treebank' and 'conll2000'
# are downloaded but not referenced in the cells shown here.
import nltk
nltk.download('treebank')
nltk.download('brown')
nltk.download('conll2000')
nltk.download('universal_tagset')

In [5]:
# Brown corpus as sentences of (word, tag) pairs, requested with the universal tagset.
corpus = brown.tagged_sents(tagset='universal')
# Peek at the first five (word, tag) pairs of the first sentence.
corpus[0][0:5]

[('The', 'DET'),
 ('Fulton', 'NOUN'),
 ('County', 'NOUN'),
 ('Grand', 'ADJ'),
 ('Jury', 'NOUN')]

In [6]:
# Convert the tagged corpus into two parallel lists:
#   sentence[i] -> list of word tokens of the i-th sentence
#   target[i]   -> list of POS tags of the i-th sentence (same length)
sentence = []
target = []
for sample in corpus:
    tokens = []
    tags = []
    for word, tag in sample:
        tokens.append(word)
        tags.append(tag)
    # These appends must live inside the sentence loop; when they sit after
    # the loop only the last sentence of the corpus is stored.
    sentence.append(tokens)
    target.append(tags)

## Preprocessing the dataset 
- It needs to be in a format acceptable to Hugging Face's `load_dataset`.

In [7]:
import json
# Export the corpus as JSON lines: one object per sentence with the word list
# under 'sentence' and the matching tag list under 'label'.
with open('data.json','w') as f:
    for sample in corpus:
        record = {
            'sentence': [word for word, _ in sample],
            'label': [tag for _, tag in sample],
        }
        f.write(json.dumps(record) + "\n")

In [8]:
# Load the JSON-lines file written above; the rows are read from its 'train' split.
big_data = load_dataset("json",data_files='data.json')
# Work on a 20,000-sentence subset: shuffle with a fixed seed, then keep the first 20,000 rows.
data = big_data['train'].shuffle(seed=42).select(range(20_000))
data

Using custom data configuration default-edd352445934480d


Downloading and preparing dataset json/default to /root/.cache/huggingface/datasets/json/default-edd352445934480d/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5...


Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

0 tables [00:00, ? tables/s]

Dataset json downloaded and prepared to /root/.cache/huggingface/datasets/json/default-edd352445934480d/0.0.0/da492aad5680612e4028e7f6ddc04b1dfcec4b64db470ed7cc5f2bb265b9b6b5. Subsequent calls will reuse this data.


  0%|          | 0/1 [00:00<?, ?it/s]

Dataset({
    features: ['sentence', 'label'],
    num_rows: 20000
})

#### Splitting the dataset into Train and Test Split

In [9]:
# 80/20 train/test split of the sampled sentences, with a fixed seed.
split_data = data.train_test_split(train_size=0.8, seed=42)

In [10]:
split_data

DatasetDict({
    train: Dataset({
        features: ['sentence', 'label'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['sentence', 'label'],
        num_rows: 4000
    })
})

In [11]:
split_data['train'].features

{'label': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None),
 'sentence': Sequence(feature=Value(dtype='string', id=None), length=-1, id=None)}

In [12]:
# Take one training example (index 1) to illustrate tokenization and label alignment below.
input_tokens = split_data['train'][1]['sentence']
output_tags =  split_data['train'][1]['label']
# Show words and tags side by side, one column per word.
pd.DataFrame([input_tokens,output_tags], index=['input','output'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21
input,The,Treasury,arrives,at,substantially,the,same,conclusion,",",but,skirts,the,problem,of,section,203,of,the,United,States,Code,.
output,DET,NOUN,VERB,ADP,ADV,DET,ADJ,NOUN,.,CONJ,VERB,DET,NOUN,ADP,NOUN,NUM,ADP,DET,VERB,NOUN,NOUN,.


#### Distribution of labels across Train and Test Set

In [13]:
from collections import Counter
from collections import defaultdict
# One Counter per split name, counting how often each tag occurs in that split.
split2freqs = defaultdict(Counter)
for split_name, split in split_data.items():
    # Walk every tag of every row; tags are counted in first-seen order.
    for tag in (t for row_tags in split['label'] for t in row_tags):
        split2freqs[split_name][tag] += 1

In [14]:
sample_data = pd.DataFrame.from_dict(split2freqs, orient='index')
sample_data

Unnamed: 0,.,ADV,VERB,PRON,DET,NOUN,ADP,ADJ,CONJ,NUM,PRT,X
train,41363,15517,50831,13900,38079,76297,40141,23254,10583,4187,8473,387
test,10463,4093,12950,3498,9501,19419,10142,5823,2718,1047,2102,128


#### Creating Dictionaries for Target Columns

In [15]:
# Tags in the order they were first counted in the training split; the
# position of a tag in this list is its integer id.
train_tags = list(split2freqs['train'].keys())
label2idx = dict(zip(train_tags, range(len(train_tags))))
print(label2idx)
# Inverse mapping: integer id -> tag.
idx2label = dict(enumerate(train_tags))
print(idx2label)

{'.': 0, 'ADV': 1, 'VERB': 2, 'PRON': 3, 'DET': 4, 'NOUN': 5, 'ADP': 6, 'ADJ': 7, 'CONJ': 8, 'NUM': 9, 'PRT': 10, 'X': 11}
{0: '.', 1: 'ADV', 2: 'VERB', 3: 'PRON', 4: 'DET', 5: 'NOUN', 6: 'ADP', 7: 'ADJ', 8: 'CONJ', 9: 'NUM', 10: 'PRT', 11: 'X'}


## Tokenization 

In [None]:
from transformers import AutoTokenizer
# Checkpoint name; it is reused further down when the model is loaded.
checkpoint = 'distilbert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

In [17]:
# Tokenize the example; is_split_into_words=True because the input is already a list of words.
sample_tokens = tokenizer(input_tokens ,is_split_into_words=True)
# Integer ids of the tokens.
sample_input_ids = sample_tokens.input_ids
# Token strings (despite the "_ids" suffix), including the [CLS]/[SEP] special tokens.
sample_tokens_ids = sample_tokens.tokens()
# For every token, the index of the word it came from; None for special tokens.
sample_words_ids = sample_tokens.word_ids()

In [18]:
sample_tokens.keys()

dict_keys(['input_ids', 'attention_mask'])

In [19]:
type(sample_tokens)

transformers.tokenization_utils_base.BatchEncoding

In [20]:
pd.DataFrame([sample_input_ids,sample_tokens_ids,sample_words_ids], index=['input','tokens','words'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
input,101,1109,11712,8121,1120,12613,1103,1269,6593,117,1133,20103,1103,2463,1104,2237,20022,1104,1103,1244,1311,6741,119,102
tokens,[CLS],The,Treasury,arrives,at,substantially,the,same,conclusion,",",but,skirts,the,problem,of,section,203,of,the,United,States,Code,.,[SEP]
words,,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,


- After tokenization the `input_ids` are misaligned with the labels: the tokenized sequence and the list of output tags have different lengths.

In [21]:
# Compare the tokenized length with the number of tags for the sample sentence;
# the tokenizer adds [CLS] and [SEP], so the two lengths differ.
print(f' Tokenized Input ID Length for Smaple 1 is equal to  {len(sample_input_ids)}')
print(f' Output Tag Length for Sample 1 is equal to {len(output_tags)}')

 Tokenized Input ID Length for Smaple 1 is equal to  24
 Output Tag Length for Sample 1 is equal to 22


In [22]:
# Align the word-level tags with the tokenizer output for the sample sentence:
# tokens that map to no word (word id None, i.e. the special tokens) get the
# placeholder -100, every other token gets the tag of the word it belongs to.
# A comprehension keeps the loop variable local, so the builtin `id` is not
# overwritten in the notebook's global namespace.
alligned_output_tags = [
    -100 if word_id is None else output_tags[word_id]
    for word_id in sample_words_ids
]

In [23]:
pd.DataFrame([sample_input_ids,sample_tokens_ids,sample_words_ids,alligned_output_tags], index=['input','tokens','words','alligned_output_tags'])


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23
input,101,1109,11712,8121,1120,12613,1103,1269,6593,117,1133,20103,1103,2463,1104,2237,20022,1104,1103,1244,1311,6741,119,102
tokens,[CLS],The,Treasury,arrives,at,substantially,the,same,conclusion,",",but,skirts,the,problem,of,section,203,of,the,United,States,Code,.,[SEP]
words,,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,
alligned_output_tags,-100,DET,NOUN,VERB,ADP,ADV,DET,ADJ,NOUN,.,CONJ,VERB,DET,NOUN,ADP,NOUN,NUM,ADP,DET,VERB,NOUN,NOUN,.,-100


In [24]:
# Goal
# 1. Get the tokenized inputs
# 2. Use the word_ids to align the labels with the tokenized inputs
# 3. For each token, take the output tag at the index given by its word id
# 4. Convert that output tag to its integer id using label2idx
def tokenize_and_allign_labels(examples):
    """Tokenize a batch of pre-split sentences and attach token-level label ids.

    Args:
        examples: Batch mapping with 'sentence' (a list of word lists) and
            'label' (a list of tag lists, one tag per word).

    Returns:
        The tokenizer output for the batch with an added 'labels' entry: one
        list per sentence holding, for every token, ``label2idx`` of the tag of
        the word the token belongs to, or -100 when the token has no word id.
    """
    encoded = tokenizer(examples['sentence'], is_split_into_words=True, truncation=True)
    encoded['labels'] = [
        [
            -100 if word_id is None else label2idx[tags[word_id]]
            for word_id in encoded.word_ids(batch_index=row)
        ]
        for row, tags in enumerate(examples['label'])
    ]
    return encoded
    
      

In [25]:
split_data['train'].column_names

['sentence', 'label']

In [26]:
def encode_dataset(split):
    """Tokenize `split` and attach aligned label ids.

    Applies ``tokenize_and_allign_labels`` through ``split.map`` in batched
    mode and asks ``map`` to remove the raw 'sentence' and 'label' columns.
    Returns whatever ``split.map`` returns.
    """
    raw_columns = ['sentence', 'label']
    encoded = split.map(
        tokenize_and_allign_labels,
        batched=True,
        remove_columns=raw_columns,
    )
    return encoded

In [27]:
tokenized_dataset = encode_dataset(split_data)



  0%|          | 0/16 [00:00<?, ?ba/s]

  0%|          | 0/4 [00:00<?, ?ba/s]

In [28]:
tokenized_dataset 

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 16000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 4000
    })
})

## Data Collator

In [29]:
##for batching
# Collator built around the tokenizer; it is handed to the Trainer below as `data_collator`.
from transformers import DataCollatorForTokenClassification
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

## Metrics 

In [30]:
import itertools
def flatten_list(final_list):
    """Flatten one level of nesting: an iterable of iterables becomes one flat list."""
    flattened = itertools.chain.from_iterable(final_list)
    return list(flattened)

In [31]:
def compute_metrics(logits_and_labels):
    """Compute token-level accuracy and macro-F1, skipping positions labelled -100.

    Args:
        logits_and_labels: A ``(logits, labels)`` pair.
            ``logits`` is array-like with the per-class scores on the last
            axis, e.g. ``(batch_size, sequence_length, num_labels)``.
            ``labels`` is array-like with the label ids, e.g.
            ``(batch_size, sequence_length)``, using -100 for positions that
            must not be scored.

    Returns:
        dict: ``{"f1": macro-averaged F1, "acc": accuracy}``, both computed
        only over the positions whose label is not -100.
    """
    logits, labels = logits_and_labels
    labels = np.asarray(labels)
    # Predicted class id per token = index of the highest score on the class axis.
    preds = np.argmax(logits, axis=-1)
    # Boolean mask of the positions that carry a real label. Indexing with it
    # flattens in row-major order, i.e. the same order a nested
    # batch/sequence loop would visit the tokens.
    keep = labels != -100
    y_true = labels[keep]
    y_pred = preds[keep]
    acc = accuracy_score(y_true, y_pred)
    f1 = f1_score(y_true, y_pred, average='macro')
    return {"f1": f1, "acc": acc}
    

#### Dummy walkthrough of how `compute_metrics` works

In [32]:
# NOTE(review): this bare `argmax` import is not used below; `np.argmax` is called instead.
from numpy import argmax
#creating one prediction for every token in the sequence
#creating a dummy array of shape (batch_size,sequence_length, num_labels)
# NOTE(review): no random seed is set, so the numbers printed by this cell change on every run.
a = np.random.randint(9, size=(2,3,3))
print(f'shape of the logits coming out of the model output {np.shape(a)}')
#argmax over the label axis (axis=2) reduces it to (batch_size, sequence_length)
argmax_a = np.argmax(a, axis=2)
print(f"shape after argamx batch_size,seq_len {np.shape(argmax_a)}")
# Dummy labels in [0, 6); they can exceed the 3 classes of the dummy logits,
# which is harmless here because the cell only demonstrates shapes and the call.
labels = np.random.randint(6,size=(2,3))
print(f"shape of actual_labels batch_size,seq_len {np.shape(labels)}")
# compute_metrics takes a single (logits, labels) pair.
a_lalels = a, labels
compute_metrics(a_lalels)


shape of the logits coming out of the model output (2, 3, 3)
shape after argamx batch_size,seq_len (2, 3)
shape of actual_labels batch_size,seq_len (2, 3)


{'acc': 0.16666666666666666, 'f1': 0.13333333333333333}

## Modeling

In [None]:
from transformers import AutoModelForTokenClassification

# Load the checkpoint for token classification and pass the id<->tag mappings
# built above, so predictions can be reported with tag names (as in the
# pipeline output further down).
model = AutoModelForTokenClassification.from_pretrained(checkpoint,
    id2label =idx2label ,
    label2id = label2idx
)

## Training Arguments

In [34]:
from transformers import TrainingArguments
batch_size= 16
# Number of training examples // batch size, i.e. log about once per pass over the training set.
logging_steps = len(tokenized_dataset["train"]) // batch_size
# NOTE(review): model_name is not referenced by any other cell visible in this notebook.
model_name = "ner_model"
# Evaluate and save once per epoch, for 3 epochs, with the same batch size for
# training and evaluation.
# NOTE(review): the first positional argument is presumably the output
# directory ('ner_lp_model') — confirm against the TrainingArguments signature.
training_args = TrainingArguments(
    'ner_lp_model',
    evaluation_strategy='epoch',
    num_train_epochs=3,
    learning_rate=2e-5,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    save_strategy='epoch',
    logging_steps=logging_steps,
    log_level="error",
    weight_decay = 0.01,
    push_to_hub=False,
    disable_tqdm=False
    )

## Trainer 

In [35]:
from transformers import Trainer

In [36]:
# Wire everything together: the model, the training arguments, the tokenized
# train/test splits, the batching collator and the metric function, which
# produces the F1/Acc columns of the training log.
trainer = Trainer(model=model, args=training_args,
                  compute_metrics=compute_metrics,
                  train_dataset=tokenized_dataset['train'],
                  eval_dataset = tokenized_dataset['test'],
                  data_collator= data_collator,
                  tokenizer=tokenizer)

In [37]:
trainer.train()

Epoch,Training Loss,Validation Loss,F1,Acc
1,0.174,0.066159,0.923042,0.981146
2,0.0424,0.054277,0.950831,0.984419
3,0.0266,0.056952,0.951024,0.984545


TrainOutput(global_step=3000, training_loss=0.08100966453552245, metrics={'train_runtime': 217.7826, 'train_samples_per_second': 220.403, 'train_steps_per_second': 13.775, 'total_flos': 728592156703872.0, 'train_loss': 0.08100966453552245, 'epoch': 3.0})

In [38]:
trainer.save_model('pos_tag_model')

## Prediction 

In [39]:
# Build an inference pipeline from the model saved above.
# aggregation_strategy="simple" merges neighbouring tokens with the same tag
# into one group (see 'entity_group' in the output).
# The device is chosen at run time: GPU 0 when CUDA is available, otherwise
# -1 (CPU), so the cell also works on a runtime without a GPU.
pos = pipeline("token-classification",
               model='pos_tag_model',
               aggregation_strategy="simple",
               device=0 if torch.cuda.is_available() else -1,)

In [40]:
s = "Sohini Sarkar lives in Ashburn Virginia"
pos(s)

[{'end': 13,
  'entity_group': 'NOUN',
  'score': 0.9985427,
  'start': 0,
  'word': 'Sohini Sarkar'},
 {'end': 19,
  'entity_group': 'VERB',
  'score': 0.9969187,
  'start': 14,
  'word': 'lives'},
 {'end': 22,
  'entity_group': 'ADP',
  'score': 0.9991215,
  'start': 20,
  'word': 'in'},
 {'end': 39,
  'entity_group': 'NOUN',
  'score': 0.9992576,
  'start': 23,
  'word': 'Ashburn Virginia'}]

In [41]:
s1 = "Shane who was a good player, died in Thiland of heart attack"

In [43]:
pred = pos(s1)

In [44]:
# Pull the word, confidence score and predicted tag out of every pipeline result.
token = [entry['word'] for entry in pred]
score = [entry['score'] for entry in pred]
label = [entry['entity_group'] for entry in pred]

# One column per predicted group; rows are the word, its tag and the confidence.
pd.DataFrame([token,label,score],index=['word','POS_tag','Confidence'])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
word,Shane,who,was,a,good,player,",",died,in,Thiland,of,heart attack
POS_tag,NOUN,PRON,VERB,DET,ADJ,NOUN,.,VERB,ADP,NOUN,ADP,NOUN
Confidence,0.999596,0.998916,0.999729,0.999144,0.998818,0.99964,0.999875,0.999643,0.999546,0.999108,0.99944,0.998124
